extract.go 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. "github.com/PuerkitoBio/goquery"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
  22. var (
  23. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  24. cut = ju.NewCut() //获取正文并清理
  25. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  26. TaskList map[string]*ExtractTask //任务列表
  27. ClearTaskList map[string]*ClearTask //清理任务列表
  28. saveLimit = 100 //抽取日志批量保存
  29. PageSize = 5000 //查询分页
  30. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
  31. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  32. )
  33. var Luacodes = sync.Map{}
  34. var SiteManages = sync.Map{}
  35. //启动测试抽取
  36. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  37. defer qu.Catch()
  38. ext := &ExtractTask{}
  39. ext.Id = taskId
  40. ext.IsRun = true
  41. ext.InitTestTaskInfo(resultcoll, trackcoll)
  42. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  43. ext.InitSite()
  44. ext.InitRulePres()
  45. ext.InitRuleBacks(false)
  46. ext.InitRuleBacks(true)
  47. ext.InitRuleCore(false)
  48. ext.InitRuleCore(true)
  49. ext.InitPkgCore()
  50. ext.InitBlockRule()
  51. ext.InfoTypeList()
  52. ext.InitTag(false)
  53. ext.InitTag(true)
  54. ext.InitClearFn(false)
  55. ext.InitClearFn(true)
  56. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  57. //初始化城市DFA信息
  58. ext.InitCityInfo()
  59. //ext.InitCityDFA()
  60. ext.InitAreaCode()
  61. ext.InitPostCode()
  62. }
  63. //质量审核
  64. ext.InitAuditFields()
  65. ext.InitAuditRule()
  66. ext.InitAuditClass()
  67. ext.InitAuditRecogField()
  68. //品牌抽取是否开启
  69. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  70. //附件抽取是否开启
  71. ext.InitFile()
  72. return RunExtractTestTask(ext, startId, num)
  73. }
  74. func IdTrans(startId string) bson.ObjectId {
  75. defer qu.Catch()
  76. return bson.ObjectIdHex(startId)
  77. }
  78. //开始测试任务抽取
  79. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  80. n, _ := strconv.Atoi(num)
  81. id := IdTrans(startId)
  82. if id.Valid() {
  83. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  84. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  85. for _, v := range *list {
  86. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  87. continue
  88. }
  89. var j, jf *ju.Job
  90. var isSite bool
  91. var codeSite string
  92. if ext.IsFileField && v["projectinfo"] != nil {
  93. v["isextFile"] = true
  94. j, jf, isSite,codeSite = ext.PreInfo(v)
  95. } else {
  96. j, _, isSite,codeSite = ext.PreInfo(v)
  97. }
  98. ext.TaskInfo.ProcessPool <- true
  99. go ext.ExtractProcess(j, jf, isSite,codeSite)
  100. }
  101. return true
  102. } else {
  103. return false
  104. }
  105. }
  106. //启动抽取
  107. func StartExtractTaskId(taskId string) bool {
  108. defer qu.Catch()
  109. isgo := false
  110. ext := TaskList[taskId]
  111. if ext == nil {
  112. ext = &ExtractTask{}
  113. ext.Id = taskId
  114. ext.InitTaskInfo()
  115. isgo = true
  116. } else {
  117. ext.Id = taskId
  118. ext.InitTaskInfo()
  119. }
  120. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  121. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  122. ext.InitSite()
  123. ext.InitRulePres()
  124. ext.InitRuleBacks(false)
  125. ext.InitRuleBacks(true)
  126. ext.InitRuleCore(false)
  127. ext.InitRuleCore(true)
  128. ext.InitPkgCore()
  129. ext.InitBlockRule()
  130. ext.InfoTypeList()
  131. ext.InitTag(false)
  132. ext.InitTag(true)
  133. ext.InitClearFn(false)
  134. ext.InitClearFn(true)
  135. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  136. //初始化城市DFA信息
  137. //ext.InitCityDFA()
  138. ext.InitCityInfo()
  139. ext.InitAreaCode()
  140. ext.InitPostCode()
  141. }
  142. //质量审核
  143. ext.InitAuditFields()
  144. ext.InitAuditRule()
  145. ext.InitAuditClass()
  146. ext.InitAuditRecogField()
  147. //品牌抽取是否开启
  148. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  149. //附件抽取是否开启
  150. ext.InitFile()
  151. ext.IsRun = true
  152. go ext.ResultSave(true)
  153. go ext.BidSave(true)
  154. if isgo {
  155. go RunExtractTask(taskId)
  156. }
  157. TaskList[taskId] = ext
  158. return true
  159. }
  160. //停止抽取
  161. func StopExtractTaskId(taskId string) bool {
  162. defer qu.Catch()
  163. ext := TaskList[taskId]
  164. if ext != nil {
  165. ext.IsRun = false
  166. TaskList[taskId] = ext
  167. }
  168. //更新task.s_extlastid
  169. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  170. return true
  171. }
  172. //开始抽取
  173. func RunExtractTask(taskId string) {
  174. defer qu.Catch()
  175. ext := TaskList[taskId]
  176. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  177. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  178. pageNum := (count + PageSize - 1) / PageSize
  179. limit := PageSize
  180. if count < PageSize {
  181. limit = count
  182. }
  183. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  184. for i := 0; i < pageNum; i++ {
  185. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  186. fmt.Printf("page=%d,query=%v", i+1, query)
  187. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  188. for _, v := range *list {
  189. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  190. continue
  191. }
  192. //根据标题判断是否抽取
  193. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  194. if !b {
  195. continue
  196. }
  197. _id := qu.BsonIdToSId(v["_id"])
  198. //log.Debug(_id)
  199. if !ext.IsRun {
  200. break
  201. }
  202. var j, jf *ju.Job
  203. var isSite bool
  204. var codeSite string
  205. if ext.IsFileField && v["projectinfo"] != nil {
  206. v["isextFile"] = true
  207. j, jf, isSite,codeSite = ext.PreInfo(v)
  208. } else {
  209. j, _, isSite,codeSite = ext.PreInfo(v)
  210. }
  211. ext.TaskInfo.ProcessPool <- true
  212. go ext.ExtractProcess(j, jf, isSite,codeSite)
  213. ext.TaskInfo.LastExtId = _id
  214. }
  215. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  216. if !ext.IsRun {
  217. break
  218. }
  219. }
  220. //更新task.s_extlastid
  221. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  222. }
  223. //信息预处理-不和版本关联,取最新版本的配置项
  224. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool,codeSite string) {
  225. return (&ExtractTask{}).PreInfo(doc)
  226. }
  227. //信息预处理-和版本关联
  228. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool,codeSite string) {
  229. defer qu.Catch()
  230. //判断是否有附件这个字段
  231. var isextFile bool
  232. if doc["isextFile"] != nil {
  233. isextFile = doc["isextFile"].(bool)
  234. }
  235. detail := ""
  236. d1, _ := doc["detail"].(string)
  237. d2, _ := doc["contenthtml"].(string)
  238. if len(d1) >= len(d2) || d2 == "" {
  239. detail = d1
  240. } else {
  241. detail = d2
  242. }
  243. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  244. d3, _ := doc["summary"].(string)
  245. detail = ju.CutLableStr(d3 + "\n" + detail)
  246. detail = cut.ClearHtml(d3 + "\n" + detail)
  247. doc["detail"] = detail
  248. if isextFile {
  249. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  250. }
  251. //正文小于50个字,有附件把附件内容加到正文
  252. tmpDeatil := detail
  253. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  254. if err == nil {
  255. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  256. if conlen < 50 {
  257. if isextFile {
  258. detail += qu.ObjToString(doc["detailfile"])
  259. doc["detail"] = detail
  260. }
  261. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  262. //防止文本过长,造成抽取阻塞
  263. log.Debug("文本太长", doc["_id"], conlen)
  264. doc["detail"] = d3
  265. }
  266. }
  267. toptype := qu.ObjToString(doc["toptype"])
  268. subtype := qu.ObjToString(doc["subtype"])
  269. if qu.ObjToString(doc["type"]) == "bid" {
  270. toptype = "结果"
  271. }
  272. if toptype == "" {
  273. toptype = "all"
  274. }
  275. if subtype == "" {
  276. subtype = "all"
  277. }
  278. j = &ju.Job{
  279. SourceMid: qu.BsonIdToSId(doc["_id"]),
  280. Category: toptype,
  281. CategorySecond: subtype,
  282. Content: qu.ObjToString(doc["detail"]),
  283. SpiderCode: qu.ObjToString(doc["spidercode"]),
  284. //Domain: qu.ObjToString(doc["domain"]),
  285. //Href: qu.ObjToString(doc["href"]),
  286. Title: qu.ObjToString(doc["title"]),
  287. Data: &doc,
  288. City: qu.ObjToString(doc["city"]),
  289. Province: qu.ObjToString(doc["area"]),
  290. Jsondata: qu.ObjToMap(doc["jsondata"]),
  291. Result: map[string][]*ju.ExtField{},
  292. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  293. RuleBlock: e.RuleBlock,
  294. }
  295. if isextFile {
  296. jf = &ju.Job{
  297. SourceMid: qu.BsonIdToSId(doc["_id"]),
  298. Category: toptype,
  299. Content: qu.ObjToString(doc["detailfile"]),
  300. SpiderCode: qu.ObjToString(doc["spidercode"]),
  301. Title: qu.ObjToString(doc["title"]),
  302. Data: &doc,
  303. City: qu.ObjToString(doc["city"]),
  304. Province: qu.ObjToString(doc["area"]),
  305. Jsondata: qu.ObjToMap(doc["jsondata"]),
  306. Result: map[string][]*ju.ExtField{},
  307. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  308. RuleBlock: e.RuleBlock,
  309. IsFile: isextFile,
  310. }
  311. }
  312. //是否配置站点
  313. codeSite = qu.ObjToString(doc["spidercode"])
  314. exp, isSite := Luacodes.Load(codeSite)
  315. if isSite{
  316. if exp.( map[string]interface{})["e.SiteClearFn"]!= nil{
  317. e.SiteClearFn = exp.( map[string]interface{})["e.SiteClearFn"].( map[string][]string)
  318. }
  319. if exp.( map[string]interface{})["e.SiteTag"]!= nil{
  320. e.SiteTag = exp.( map[string]interface{})["e.SiteTag"].( map[string][]*Tag)
  321. }
  322. if exp.( map[string]interface{})["e.SiteRuleCores"] != nil{
  323. e.SiteRuleCores = exp.( map[string]interface{})["e.SiteRuleCores"].( map[string]map[string][]*RuleCore)
  324. }
  325. if exp.( map[string]interface{})["e.SiteRuleBacks"]!= nil{
  326. e.SiteRuleBacks = exp.( map[string]interface{})["e.SiteRuleBacks"].( []*RegLuaInfo)
  327. }
  328. }
  329. qu.Try(func() {
  330. pretreated.AnalyStart(j, isSite,codeSite) //job.Block分块
  331. if isextFile {
  332. pretreated.AnalyStart(jf, isSite,codeSite)
  333. }
  334. }, func(err interface{}) {
  335. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  336. })
  337. return j, jf, isSite,codeSite
  338. }
  339. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  340. func file2text(doc *map[string]interface{}) {
  341. var strfileinfo bytes.Buffer
  342. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  343. if va, ok := v["attachments"].(map[string]interface{}); ok {
  344. for _, vaatt := range va {
  345. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  346. if qu.ObjToString(fileinfo["content"]) != "" {
  347. switch fileinfo["content"].(type) {
  348. case string:
  349. lock.Lock()
  350. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  351. lock.Unlock()
  352. case []map[string]interface{}:
  353. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  354. if fv["context"] != nil {
  355. lock.Lock()
  356. strfileinfo.WriteString(fv["context"].(string) + " \n")
  357. lock.Unlock()
  358. }
  359. }
  360. }
  361. }
  362. }
  363. }
  364. }
  365. }
  366. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  367. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  368. }
  369. }
  370. //抽取
  371. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool,codeSite string) {
  372. e.ExtractDetail(j, isSite,codeSite)
  373. if jf != nil && jf.IsFile {
  374. e.ExtractFile(jf, isSite,codeSite)
  375. }
  376. //分析抽取结果并保存 todo
  377. AnalysisSaveResult(j, jf, e)
  378. <-e.TaskInfo.ProcessPool
  379. }
  380. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
  381. qu.Try(func() {
  382. doc := *j.Data
  383. //全局前置规则,结果覆盖doc属性
  384. //for _, v := range e.RulePres {
  385. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  386. //}
  387. tmprules := map[string][]*RuleCore{}
  388. lockrule.Lock()
  389. if j.Category == "all" || j.CategorySecond == "all" {
  390. if isSite {
  391. for k, vc1 := range e.SiteRuleCores["all_all"] {
  392. tmprules[k] = vc1
  393. }
  394. } else {
  395. for k, vc1 := range e.RuleCores["all_all"] {
  396. tmprules[k] = vc1
  397. }
  398. }
  399. } else {
  400. if isSite {
  401. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  402. tmprules[k] = vc1
  403. }
  404. } else {
  405. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  406. tmprules[k] = vc1
  407. }
  408. }
  409. }
  410. if len(tmprules) < 1 { //分类未覆盖部分
  411. if isSite {
  412. for k, vc1 := range e.RuleCores["all_all"] {
  413. tmprules[k] = vc1
  414. }
  415. } else {
  416. for k, vc1 := range e.SiteRuleCores["all_all"] {
  417. tmprules[k] = vc1
  418. }
  419. }
  420. }
  421. lockrule.Unlock()
  422. //抽取规则
  423. for _, vc1 := range tmprules {
  424. for _, vc := range vc1 {
  425. tmp := ju.DeepCopy(doc).(map[string]interface{})
  426. //是否进入逻辑
  427. if !ju.Logic(vc.LuaLogic, tmp) {
  428. continue
  429. }
  430. ////抽取-前置规则
  431. //for _, v := range vc.RulePres {
  432. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  433. //}
  434. // log.Debug("抽取-前置规则", tmp)
  435. //抽取-规则
  436. ExtRuleCore(tmp, e, vc, j)
  437. // log.Debug("抽取-规则", tmp)
  438. //抽取-后置规则
  439. for _, v := range vc.RuleBacks {
  440. ExtRegBack(j, v, e.TaskInfo)
  441. }
  442. // log.Debug("抽取-后置规则", tmp)
  443. //项目名称未能抽取到,标题来凑
  444. if vc.Field == "projectname" {
  445. if vc.ExtFrom == "title" {
  446. isextitle := true
  447. for _, v := range j.Result[vc.Field] {
  448. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  449. isextitle = false
  450. break
  451. }
  452. }
  453. if isextitle { //标题加入选举
  454. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  455. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  456. }
  457. }
  458. for i := 0; i < 3; i++ {
  459. for _, v := range vc.RuleBacks {
  460. ExtRegBack(j, v, e.TaskInfo)
  461. }
  462. }
  463. }
  464. }
  465. }
  466. //全局后置规则
  467. if isSite {
  468. for _, v := range e.SiteRuleBacks {
  469. ExtRegBack(j, v, e.TaskInfo)
  470. }
  471. } else {
  472. for _, v := range e.RuleBacks {
  473. ExtRegBack(j, v, e.TaskInfo)
  474. }
  475. }
  476. //函数清理
  477. for key, val := range j.Result {
  478. for _, v := range val {
  479. lockclear.Lock()
  480. var cfn = []string{}
  481. if isSite {
  482. cfn = e.SiteClearFn[key]
  483. } else {
  484. cfn = e.ClearFn[key]
  485. }
  486. lockclear.Unlock()
  487. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  488. before, _ := v.Value.(string)
  489. v.Value = data[0]
  490. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  491. //添加行数清理的日志
  492. //清理特殊符号
  493. lockclear.Lock()
  494. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  495. text := qu.ObjToString(v.Value)
  496. before = text
  497. v.Value = clear.OtherClean(key, text)
  498. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  499. }
  500. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  501. lockclear.Unlock()
  502. }
  503. }
  504. PackageDetail(j, e, isSite,codeSite) //处理分包信息
  505. // bs, _ := json.Marshal(j.Result)
  506. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  507. }, func(err interface{}) {
  508. log.Debug("ExtractProcess err", err)
  509. })
  510. }
  511. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool,codeSite string) {
  512. qu.Try(func() {
  513. doc := *j.Data
  514. //全局前置规则,结果覆盖doc属性
  515. // for _, v := range e.RulePres {
  516. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  517. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  518. // }
  519. // }
  520. //抽取规则
  521. tmprules := map[string][]*RuleCore{}
  522. lockrule.Lock()
  523. if j.Category == "all" || j.CategorySecond == "all" {
  524. for k, vc1 := range e.RuleCores["all_all"] {
  525. tmprules[k] = vc1
  526. }
  527. } else {
  528. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  529. tmprules[k] = vc1
  530. }
  531. }
  532. lockrule.Unlock()
  533. for _, vc1 := range tmprules {
  534. for _, vc := range vc1 {
  535. tmp := ju.DeepCopy(doc).(map[string]interface{})
  536. //是否进入逻辑
  537. if !ju.Logic(vc.LuaLogic, tmp) {
  538. continue
  539. }
  540. //抽取-前置规则
  541. // for _, v := range vc.RulePres {
  542. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  543. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  544. // }
  545. // }
  546. // log.Debug("抽取-前置规则", tmp)
  547. //抽取-规则
  548. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  549. ExtRuleCore(tmp, e, vc, j)
  550. }
  551. // log.Debug("抽取-规则", tmp)
  552. //抽取-后置规则
  553. for _, v := range vc.RuleBacks {
  554. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  555. ExtRegBack(j, v, e.TaskInfo)
  556. }
  557. }
  558. // log.Debug("抽取-后置规则", tmp)
  559. }
  560. }
  561. //全局后置规则
  562. for _, v := range e.RuleBacks {
  563. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  564. ExtRegBack(j, v, e.TaskInfo)
  565. }
  566. }
  567. //函数清理
  568. for key, val := range j.Result {
  569. for _, v := range val {
  570. lockclear.Lock()
  571. cfn := e.ClearFn[key]
  572. lockclear.Unlock()
  573. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  574. v.Value = data[0]
  575. //清理特殊符号
  576. lockclear.Lock()
  577. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  578. clear.MesField[key] != nil {
  579. text := qu.ObjToString(v.Value)
  580. text = clear.OtherClean(key, text)
  581. v.Value = text
  582. }
  583. lockclear.Unlock()
  584. }
  585. }
  586. PackageDetail(j, e, isSite,codeSite) //处理分包信息
  587. // bs, _ := json.Marshal(j.Result)
  588. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  589. }, func(err interface{}) {
  590. log.Debug("ExtractProcess err", err)
  591. })
  592. }
  593. //前置过滤
  594. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  595. defer qu.Catch()
  596. before := ju.DeepCopy(doc).(map[string]interface{})
  597. extinfo := map[string]interface{}{}
  598. if in.IsLua {
  599. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  600. if j != nil {
  601. lua.Block = j.Block
  602. }
  603. extinfo = lua.RunScript("pre")
  604. for k, v := range extinfo { //结果覆盖原doc
  605. doc[k] = v
  606. }
  607. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  608. } else {
  609. var key string
  610. if !j.IsFile {
  611. key = qu.If(in.Field == "", "detail", in.Field).(string)
  612. } else {
  613. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  614. }
  615. text := qu.ObjToString(doc[key])
  616. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  617. doc[key] = extinfo[key] //结果覆盖原doc
  618. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  619. }
  620. return doc
  621. }
  622. //抽取-规则
  623. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job) {
  624. //候选人加入
  625. var kvMap map[string][]map[string]interface{}
  626. extByReg := true
  627. if vc.ExtFrom != "title" {
  628. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  629. }
  630. for _, v := range vc.RuleCores {
  631. if v.IsLua {
  632. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, kvMap,e)
  633. } else if extByReg {
  634. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
  635. }
  636. }
  637. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  638. if vc.Field == "budget" && len(kvMap) == 0 {
  639. if len(j.BlockPackage) == 1 {
  640. for _, bp := range j.BlockPackage {
  641. for fieldname, field := range vc.LFields {
  642. if field != vc.Field {
  643. continue
  644. }
  645. tp := ""
  646. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  647. if k == 0 {
  648. tp = "colon"
  649. } else if k == 1 {
  650. tp = "space"
  651. } else if k == 2 {
  652. tp = "table"
  653. }
  654. if v == nil || v.KvTags == nil {
  655. continue
  656. }
  657. for _, vv := range v.KvTags[fieldname] {
  658. text := ju.TrimLRSpace(vv.Value, "")
  659. if text != "" {
  660. j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{
  661. Field: vc.Field,
  662. Code: "CL_分包",
  663. Type: tp,
  664. MatchType: "package",
  665. RuleText: bp.Text,
  666. SourceValue: vv.Key,
  667. Value: text,
  668. })
  669. }
  670. }
  671. }
  672. }
  673. break
  674. }
  675. }
  676. } else {
  677. for k, v := range kvMap {
  678. if j.Result[k] == nil {
  679. j.Result[k] = [](*ju.ExtField){}
  680. }
  681. for _, tmp := range v {
  682. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
  683. if tmp["blocktag"] != nil {
  684. btag := make(map[string]string)
  685. for k := range tmp["blocktag"].(map[string]bool) {
  686. blocktag.Lock()
  687. if TagConfigDesc[k] != "" {
  688. btag[k] = TagConfigDesc[k]
  689. }
  690. blocktag.Unlock()
  691. }
  692. field.BlockTag = btag
  693. }
  694. j.Result[k] = append(j.Result[k], field)
  695. }
  696. }
  697. }
  698. }
  699. //抽取-规则-kv
  700. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap map[string][]map[string]interface{}, et *ExtractTask) {
  701. defer qu.Catch()
  702. if extfrom == "title" || !in.IsLua {
  703. return
  704. }
  705. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  706. lua.KvMap = kvMap
  707. lua.Block = j.Block
  708. extinfo := lua.RunScript("core")
  709. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  710. for _, v := range tmps {
  711. v["core"] = in.Code
  712. }
  713. kvMap[in.Field] = tmps
  714. }
  715. if len(extinfo) > 0 {
  716. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  717. }
  718. }
  719. //抽取-规则-正则
  720. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  721. defer qu.Catch()
  722. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  723. b := IsExtract(in.Field, j.Title, j.Content)
  724. if !b {
  725. return
  726. }
  727. //全文正则
  728. //text := qu.ObjToString(doc[extfrom])
  729. //if in.Field != "" {
  730. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  731. // if len(extinfo) > 0 {
  732. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  733. // }
  734. //}
  735. //块抽取
  736. if in.Field != "" {
  737. if extfrom == "title" {
  738. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
  739. if len(extinfo) > 0 {
  740. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  741. }
  742. } else {
  743. for _, v := range j.Block {
  744. btag := make(map[string]string)
  745. for k := range v.Classify {
  746. blocktag.Lock()
  747. btag[k] = TagConfigDesc[k]
  748. blocktag.Unlock()
  749. }
  750. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
  751. if len(extinfo) > 0 {
  752. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  753. }
  754. }
  755. }
  756. }
  757. }
  758. //lua脚本根据属性设置提取kv值
  759. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  760. kvmap := map[string][]map[string]interface{}{}
  761. if len(j.Winnerorder) > 1 {
  762. if vc.Field == "bidamount" {
  763. for _, v := range j.Winnerorder {
  764. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  765. "code": "winnerorder",
  766. "field": vc.Field,
  767. "ruletext": "中标候选人",
  768. "extfrom": vc.ExtFrom,
  769. "sourcevalue": "中标候选人",
  770. "value": v["price"],
  771. "type": "winnerorder",
  772. "matchtype": "winnerorder",
  773. })
  774. }
  775. //候选人中标金额
  776. if price := j.Winnerorder[0]["price"]; price != nil {
  777. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  778. "code": "CL_中标候选人",
  779. "field": vc.Field,
  780. "ruletext": "中标候选人",
  781. "extfrom": vc.ExtFrom,
  782. "sourcevalue": "中标候选人",
  783. "value": price,
  784. "type": "winnerorder",
  785. "matchtype": "winnerorder",
  786. })
  787. return kvmap, false
  788. }
  789. } else if vc.Field == "winner" {
  790. for _, v := range j.Winnerorder {
  791. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  792. "code": "winnerorder",
  793. "field": vc.Field,
  794. "ruletext": "中标候选人",
  795. "extfrom": vc.ExtFrom,
  796. "sourcevalue": "中标候选人",
  797. "value": v["entname"],
  798. "type": "winnerorder",
  799. "matchtype": "winnerorder",
  800. })
  801. }
  802. //候选人中标单位
  803. if entname := j.Winnerorder[0]["entname"]; entname != nil {
  804. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  805. "code": "CL_中标候选人",
  806. "field": vc.Field,
  807. "ruletext": "中标候选人",
  808. "extfrom": vc.ExtFrom,
  809. "sourcevalue": "中标候选人",
  810. "value": entname,
  811. "type": "winnerorder",
  812. "matchtype": "winnerorder",
  813. })
  814. return kvmap, false
  815. }
  816. }
  817. }
  818. for fieldname, field := range vc.LFields {
  819. if field != vc.Field {
  820. continue
  821. }
  822. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  823. }
  824. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  825. return kvmap, true
  826. }
  827. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  828. for _, bl := range blocks {
  829. tp := ""
  830. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  831. if k == 0 {
  832. tp = "colon"
  833. } else if k == 1 {
  834. tp = "space"
  835. } else if k == 2 {
  836. tp = "table"
  837. }
  838. if v == nil || v.KvTags == nil {
  839. continue
  840. }
  841. for _, vv := range v.KvTags[fieldname] {
  842. text := ju.TrimLRSpace(vv.Value, "")
  843. if text != "" {
  844. kvmap[field] = append(kvmap[field], map[string]interface{}{
  845. "code": "CL_" + vv.Key,
  846. "field": field,
  847. "ruletext": vv.Key,
  848. "extfrom": vc.ExtFrom,
  849. "sourcevalue": text,
  850. "value": text,
  851. "type": tp,
  852. "matchtype": "tag_string",
  853. "blocktag": bl.Classify,
  854. "weight": vv.Weight,
  855. })
  856. }
  857. }
  858. }
  859. if len(kvmap[field]) == 0 {
  860. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  861. }
  862. }
  863. }
  864. //正则提取结果
  865. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  866. defer qu.Catch()
  867. extinfo := map[string][]map[string]interface{}{}
  868. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  869. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  870. if len(apos) > 0 {
  871. pos := apos[0]
  872. for k, p := range v.RegCore.ExtractPos {
  873. if len(pos) > p {
  874. if pos[p] == -1 || pos[p+1] == -1 {
  875. continue
  876. }
  877. val := text[pos[p]:pos[p+1]]
  878. sourcevalue := val
  879. if val == "招标公告" {
  880. return extinfo
  881. }
  882. if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
  883. val = text
  884. }
  885. tmps := []map[string]interface{}{}
  886. tmp := map[string]interface{}{
  887. "field": v.Field,
  888. "code": v.Code,
  889. "ruletext": v.RuleText,
  890. "extfrom": text,
  891. "value": val,
  892. "type": "regexp",
  893. "matchtype": "regcontent",
  894. "blocktag": *tag,
  895. }
  896. tmps = append(tmps, tmp)
  897. extinfo[k] = tmps
  898. if strings.TrimSpace(val) != "" {
  899. if v.RegCore.NumSign == -1 { //正负值修正
  900. val = "-" + val
  901. }
  902. exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
  903. if tmp["blocktag"] != nil {
  904. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  905. }
  906. j.Result[k] = append(j.Result[k], &exfield)
  907. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  908. }
  909. }
  910. }
  911. if len(extinfo) == 0 {
  912. regArr := strings.Split(v.RuleText, "__")
  913. //fmt.Println(regArr[0])
  914. if len(regArr) > 0 {
  915. reg, err := regexp.Compile(regArr[0])
  916. if err == nil {
  917. datavals := reg.FindStringSubmatch(text)
  918. tmps := []map[string]interface{}{}
  919. for _, value := range datavals {
  920. if value == "" {
  921. continue
  922. }
  923. tmp := map[string]interface{}{
  924. "field": v.Field,
  925. "code": v.Code,
  926. "ruletext": regArr[0],
  927. "extfrom": text,
  928. "value": value,
  929. "type": "regexp",
  930. "matchtype": "regcontent",
  931. "blocktag": *tag,
  932. }
  933. tmps = append(tmps, tmp)
  934. extinfo[v.Field] = tmps
  935. exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
  936. if tmp["blocktag"] != nil {
  937. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  938. }
  939. j.Result[v.Field] = append(j.Result[v.Field], &exfield)
  940. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  941. }
  942. }
  943. }
  944. }
  945. }
  946. } else {
  947. pos := v.RegCore.Reg.FindStringIndex(text)
  948. val := ""
  949. if len(pos) == 2 {
  950. text = text[pos[1]:]
  951. rs := regexp.MustCompile("[^\r\n\t]+")
  952. tmp := rs.FindAllString(text, -1)
  953. if len(tmp) > 0 {
  954. val = tmp[0]
  955. }
  956. }
  957. if val != "" {
  958. tmps := []map[string]interface{}{}
  959. tmp := map[string]interface{}{
  960. "field": v.Field,
  961. "code": v.Code,
  962. "ruletext": v.RuleText,
  963. "extfrom": text,
  964. "value": val,
  965. "type": "regexp",
  966. "matchtype": "regcontent",
  967. "blocktag": *tag,
  968. }
  969. tmps = append(tmps, tmp)
  970. extinfo[v.Field] = tmps
  971. if j.Result[v.Field] == nil {
  972. j.Result[v.Field] = [](*ju.ExtField){}
  973. }
  974. field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
  975. if tmp["blocktag"] != nil {
  976. field.BlockTag = tmp["blocktag"].(map[string]string)
  977. }
  978. j.Result[v.Field] = append(j.Result[v.Field], field)
  979. }
  980. }
  981. return extinfo
  982. }
  983. //后置过滤
  984. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  985. defer qu.Catch()
  986. if in.IsLua {
  987. result := GetResultMapForLua(j)
  988. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  989. if j != nil {
  990. lua.Block = j.Block
  991. }
  992. extinfo := lua.RunScript("back")
  993. for k, v := range extinfo {
  994. if tmps, ok := v.([]map[string]interface{}); ok {
  995. j.Result[k] = [](*ju.ExtField){}
  996. for _, tmp := range tmps {
  997. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  998. if tmp["blocktag"] != nil {
  999. field.BlockTag = tmp["blocktag"].(map[string]string)
  1000. }
  1001. j.Result[k] = append(j.Result[k], field)
  1002. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1003. }
  1004. }
  1005. }
  1006. if len(extinfo) > 0 {
  1007. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1008. }
  1009. } else {
  1010. extinfo := map[string]interface{}{}
  1011. if in.Field != "" {
  1012. if j.Result[in.Field] != nil {
  1013. tmp := j.Result[in.Field]
  1014. exts := []interface{}{}
  1015. for k, v := range tmp {
  1016. //table抽取到的数据不清理
  1017. // if v.Type == "table" && v.Field != "projectname" {
  1018. // continue
  1019. // }
  1020. text := qu.ObjToString(v.Value)
  1021. if text != "" {
  1022. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1023. }
  1024. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1025. continue
  1026. }
  1027. j.Result[in.Field][k].Value = text
  1028. exts = append(exts, map[string]interface{}{
  1029. "field": v.Field,
  1030. "code": v.Code,
  1031. "ruletext": v.RuleText,
  1032. "type": v.Type,
  1033. "matchtype": v.MatchType,
  1034. "extfrom": v.ExtFrom,
  1035. "value": text,
  1036. })
  1037. }
  1038. extinfo[in.Field] = exts
  1039. if len(extinfo) > 0 {
  1040. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1041. }
  1042. }
  1043. } else {
  1044. for key, tmp := range j.Result {
  1045. exts := []interface{}{}
  1046. for k, v := range tmp {
  1047. if v.Type == "table" { //table抽取到的数据不清理
  1048. continue
  1049. }
  1050. text := qu.ObjToString(v.Value)
  1051. if text != "" {
  1052. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1053. }
  1054. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1055. continue
  1056. }
  1057. j.Result[key][k].Value = text
  1058. exts = append(exts, map[string]interface{}{
  1059. "field": v.Field,
  1060. "code": v.Code,
  1061. "ruletext": v.RuleText,
  1062. "type": v.Type,
  1063. "matchtype": v.MatchType,
  1064. "extfrom": v.ExtFrom,
  1065. "value": text,
  1066. })
  1067. }
  1068. extinfo[key] = exts
  1069. }
  1070. if len(extinfo) > 0 {
  1071. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1072. }
  1073. }
  1074. }
  1075. }
  1076. //获取抽取结果map[string][]interface{},lua脚本使用
  1077. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1078. defer qu.Catch()
  1079. result := map[string][]map[string]interface{}{}
  1080. for key, val := range j.Result {
  1081. if result[key] == nil {
  1082. result[key] = []map[string]interface{}{}
  1083. }
  1084. for _, v := range val {
  1085. tmp := map[string]interface{}{
  1086. "field": v.Field,
  1087. "code": v.Code,
  1088. "ruletext": v.RuleText,
  1089. "value": v.Value,
  1090. "type": v.Type,
  1091. "matchtype": v.MatchType,
  1092. "extfrom": v.ExtFrom,
  1093. }
  1094. result[key] = append(result[key], tmp)
  1095. }
  1096. }
  1097. return result
  1098. }
  1099. //抽取日志
  1100. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1101. defer qu.Catch()
  1102. if !t.IsEtxLog {
  1103. return
  1104. }
  1105. logdata := map[string]interface{}{
  1106. "code": qu.If(v.Code == "", "kv", v.Code),
  1107. "name": v.Name,
  1108. "type": ftype,
  1109. "ruletext": v.RuleText,
  1110. "islua": v.IsLua,
  1111. "field": v.Field,
  1112. "version": t.Version,
  1113. "taskname": t.Name,
  1114. "before": before,
  1115. "extinfo": extinfo,
  1116. "sid": sid,
  1117. "comeintime": time.Now().Unix(),
  1118. }
  1119. lock.Lock()
  1120. ExtLogs[t] = append(ExtLogs[t], logdata)
  1121. lock.Unlock()
  1122. }
  1123. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1124. exts := []map[string]interface{}{}
  1125. exts = append(exts, map[string]interface{}{
  1126. "field": ext.Field,
  1127. "code": ext.Code,
  1128. "type": ftype,
  1129. "matchtype": matchtype,
  1130. "extfrom": ext.ExtFrom,
  1131. "value": ext.Value,
  1132. })
  1133. extinfo := map[string]interface{}{
  1134. ext.Field: exts,
  1135. }
  1136. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1137. }
  1138. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1139. defer qu.Catch()
  1140. if !t.IsEtxLog {
  1141. return
  1142. }
  1143. logdata := map[string]interface{}{
  1144. "code": code,
  1145. "name": name,
  1146. "type": ftype,
  1147. "ruletext": "",
  1148. "islua": false,
  1149. "field": field,
  1150. "version": t.Version,
  1151. "taskname": t.Name,
  1152. "before": before,
  1153. "extinfo": extinfo,
  1154. "sid": sid,
  1155. "comeintime": time.Now().Unix(),
  1156. }
  1157. lock.Lock()
  1158. ExtLogs[t] = append(ExtLogs[t], logdata)
  1159. lock.Unlock()
  1160. }
  1161. //保存抽取日志
  1162. func SaveExtLog() {
  1163. defer qu.Catch()
  1164. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1165. lock.Lock()
  1166. tmpLogs = ExtLogs
  1167. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1168. lock.Unlock()
  1169. for k, v := range tmpLogs {
  1170. if len(v) < saveLimit {
  1171. db.Mgo.SaveBulk(k.TrackColl, v...)
  1172. } else {
  1173. for {
  1174. if len(v) > saveLimit {
  1175. tmp := v[:saveLimit]
  1176. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1177. v = v[saveLimit:]
  1178. } else {
  1179. db.Mgo.SaveBulk(k.TrackColl, v...)
  1180. break
  1181. }
  1182. }
  1183. }
  1184. }
  1185. time.AfterFunc(10*time.Second, SaveExtLog)
  1186. }
  // FieldValue pairs an arbitrary value with an integer count.
// NOTE(review): no use of this type is visible in this part of the file;
// presumably Count is the number of times Value was seen — confirm.
type FieldValue struct {
	// Value is the stored value; any type is accepted.
	Value interface{}
	// Count is the counter associated with Value.
	Count int
}
  1191. //分析抽取结果并保存
  1192. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1193. qu.Try(func() {
  1194. //重新取出清理过后的中标候选人
  1195. resetWinnerorder(j)
  1196. doc, result, _id := funcAnalysis(j, e)
  1197. if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
  1198. go otherNeedSave(j, result, e)
  1199. }
  1200. auxinfo := auxInfo(j)
  1201. //从排序结果中取值
  1202. tmp := map[string]interface{}{} //抽取值
  1203. tmp["fieldall"] = auxinfo
  1204. for _, val := range result {
  1205. for _, v := range val { //取第一个非负数,项目名称除外
  1206. if v.Score > -1 {
  1207. tmp[v.Field] = v.Value
  1208. break
  1209. } else if v.Field == "projectname" {
  1210. tmp[v.Field] = v.Value
  1211. break
  1212. }
  1213. }
  1214. }
  1215. if len(j.PackageInfo) > 0 { //分包信息
  1216. tmp["package"] = j.PackageInfo
  1217. }
  1218. if len(j.Winnerorder) > 0 { //候选人信息
  1219. tmp["winnerorder"] = j.Winnerorder
  1220. }
  1221. //处理附件
  1222. var resultf map[string][]*ju.ExtField
  1223. if jf != nil {
  1224. _, resultf, _ = funcAnalysis(jf, e)
  1225. auxinfof := auxInfo(jf)
  1226. tmp["fieldallf"] = auxinfof
  1227. ffield := map[string]interface{}{}
  1228. for _, val := range resultf {
  1229. for _, v := range val { //取第一个非负数
  1230. if v.Score > -1 {
  1231. ffield[v.Field] = v.Value
  1232. break
  1233. }
  1234. }
  1235. }
  1236. if len(jf.PackageInfo) > 0 { //分包信息
  1237. ffield["package"] = jf.PackageInfo
  1238. }
  1239. if len(jf.Winnerorder) > 0 { //候选人信息
  1240. ffield["winnerorder"] = jf.Winnerorder
  1241. }
  1242. tmp["ffield"] = ffield
  1243. }
  1244. for k, v := range *doc {
  1245. //去重冗余字段
  1246. if delFiled(k) {
  1247. continue
  1248. }
  1249. if tmp[k] == nil {
  1250. tmp[k] = v
  1251. }
  1252. }
  1253. //质量审核
  1254. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1255. e.QualityAudit(tmp)
  1256. }
  1257. if e.IsExtractCity { //城市抽取
  1258. //e.ExtractCity(j, tmp, _id)
  1259. e.NewExtractCity(j, tmp, _id)
  1260. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1261. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1262. // tmp["district"] = d
  1263. // if b {
  1264. // tmp["city"] = c
  1265. // tmp["area"] = p
  1266. // }
  1267. }
  1268. //品牌抽取
  1269. if ju.IsBrandGoods {
  1270. tmp["checkhas"] = map[string]int{
  1271. "hastable": j.HasTable,
  1272. "hasgoods": j.HasGoods,
  1273. "hasbrand": j.HasBrand,
  1274. "haskey": j.HasKey,
  1275. }
  1276. if len(j.BrandData) > 0 {
  1277. tmp["tablebrand"] = j.BrandData
  1278. }
  1279. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1280. }
  1281. //所有kv组成的字符串
  1282. var kvtext bytes.Buffer
  1283. blocks := make([]ju.BlockAndTag, 0)
  1284. for _, v := range j.Block {
  1285. //分包和标签
  1286. if ju.Config["saveblock"].(bool) {
  1287. xx, _ := json.Marshal(v)
  1288. tmpblock := new(ju.TmpBlock)
  1289. err := json.Unmarshal(xx, &tmpblock)
  1290. if err != nil {
  1291. if v.BPackage != nil {
  1292. bpb, _ := json.Marshal(v.BPackage)
  1293. tmpblock.BPackage = string(bpb)
  1294. }
  1295. tmpblock = rangeBlockToJson(v, *tmpblock)
  1296. }
  1297. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1298. }
  1299. //把所有kv组装成一个字符串,存库
  1300. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1301. if jv == nil {
  1302. continue
  1303. }
  1304. for jv_k, jv_v := range jv.KvTags {
  1305. for _, jv_vv := range jv_v {
  1306. kvtext.WriteString(jv_k)
  1307. kvtext.WriteString(":")
  1308. kvtext.WriteString(jv_vv.Value)
  1309. kvtext.WriteString(" ")
  1310. }
  1311. }
  1312. }
  1313. }
  1314. if kvtext.Len() > 0 {
  1315. tmp["kvtext"] = kvtext.String()
  1316. }
  1317. if len(blocks) > 0 {
  1318. if blocksBytes, err := json.Marshal(blocks); err == nil {
  1319. if utf8.RuneCount(blocksBytes) < 100000 {
  1320. tmp["blocks"] = string(blocksBytes)
  1321. }
  1322. }
  1323. }
  1324. //tmp["extract_content"] = j.Content
  1325. if e.TaskInfo.TestColl == "" {
  1326. if len(tmp) > 0 { //保存抽取结果
  1327. for field, _ := range e.Fields {
  1328. if tmp[field] == nil {
  1329. tmp[field] = "" //覆盖之前版本数据
  1330. }
  1331. }
  1332. tmp["repeat"] = 0
  1333. tmparr := []map[string]interface{}{
  1334. map[string]interface{}{
  1335. "_id": qu.StringTOBsonId(_id),
  1336. },
  1337. map[string]interface{}{"$set": tmp},
  1338. }
  1339. e.RWMutex.Lock()
  1340. e.BidArr = append(e.BidArr, tmparr)
  1341. e.BidTotal++
  1342. e.RWMutex.Unlock()
  1343. }
  1344. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1345. id := tmp["_id"]
  1346. tmp["result"] = result
  1347. tmp["resultf"] = resultf
  1348. delete(tmp, "_id")
  1349. tmparr := []map[string]interface{}{
  1350. map[string]interface{}{
  1351. "_id": id,
  1352. },
  1353. map[string]interface{}{"$set": tmp},
  1354. }
  1355. e.RWMutex.Lock()
  1356. e.ResultArr = append(e.ResultArr, tmparr)
  1357. e.RWMutex.Unlock()
  1358. }
  1359. } else { //测试结果
  1360. delete(tmp, "_id")
  1361. if len(j.BlockPackage) > 0 { //分包详情
  1362. bs, _ := json.Marshal(j.BlockPackage)
  1363. tmp["epackage"] = string(bs)
  1364. }
  1365. tmp["result"] = result
  1366. tmp["resultf"] = resultf
  1367. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1368. if !b {
  1369. log.Debug(e.TaskInfo.TestColl, _id)
  1370. }
  1371. }
  1372. }, func(err interface{}) {
  1373. log.Debug("AnalysisSaveResult err", err)
  1374. })
  1375. }
  1376. //保存其他
  1377. //kv、表格、块上的标签凡是新的标签都入库
  1378. //val type times firstid createtime 判定field
  1379. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1380. now := time.Now().Unix()
  1381. coll := e.TaskInfo.TestColl
  1382. if coll == "" {
  1383. coll = "extract_tag_result"
  1384. } else {
  1385. coll += "_tag"
  1386. }
  1387. datas := []map[string]interface{}{}
  1388. kv := map[string]int{}
  1389. for _, v := range j.Block {
  1390. //
  1391. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1392. if vv == nil || vv.KvTags == nil {
  1393. continue
  1394. }
  1395. for kkk, vvv := range vv.KvTags {
  1396. for _, vvvv := range vvv {
  1397. if vvvv.IsInvalid {
  1398. kv[kkk] = kv[kkk] + 1
  1399. break
  1400. }
  1401. }
  1402. }
  1403. }
  1404. for _, vv := range v.NotClassifyTitles {
  1405. datas = append(datas, map[string]interface{}{
  1406. "val": vv,
  1407. "times": 0,
  1408. "type": "block",
  1409. "firstid": j.SourceMid,
  1410. "createtime": now,
  1411. })
  1412. if len(datas) == saveLimit {
  1413. db.Mgo.SaveBulk(coll, datas...)
  1414. datas = []map[string]interface{}{}
  1415. }
  1416. }
  1417. }
  1418. for k, v := range kv {
  1419. datas = append(datas, map[string]interface{}{
  1420. "val": k,
  1421. "times": v,
  1422. "type": "kv",
  1423. "firstid": j.SourceMid,
  1424. "createtime": now,
  1425. })
  1426. if len(datas) == saveLimit {
  1427. db.Mgo.SaveBulk(coll, datas...)
  1428. datas = []map[string]interface{}{}
  1429. }
  1430. }
  1431. if len(datas) > 0 {
  1432. db.Mgo.SaveBulk(coll, datas...)
  1433. }
  1434. }
  1435. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1436. if j == nil {
  1437. return nil
  1438. }
  1439. if len(j.Block) > 0 {
  1440. for i, v := range j.Block {
  1441. rangetmp := new(ju.TmpBlock)
  1442. vb, _ := json.Marshal(v)
  1443. json.Unmarshal(vb, &rangetmp)
  1444. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1445. }
  1446. }
  1447. if j.ColonKV != nil {
  1448. cb, _ := json.Marshal(j.ColonKV)
  1449. tmpblock.ColonKV = string(cb)
  1450. }
  1451. if j.SpaceKV != nil {
  1452. sb, _ := json.Marshal(j.SpaceKV)
  1453. tmpblock.SpaceKV = string(sb)
  1454. }
  1455. if j.TableKV != nil {
  1456. tb, _ := json.Marshal(j.TableKV)
  1457. tmpblock.TableKV = string(tb)
  1458. }
  1459. return &tmpblock
  1460. }
  // delFiled reports whether k names a redundant source-document field that
// must not be copied into the saved extraction result.
func delFiled(k string) bool {
	switch k {
	case "summary", "detail", "contenthtml", "site", "spidercode", "projectinfo", "jsondata":
		return true
	default:
		return false
	}
}
  1465. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1466. defer qu.Catch()
  1467. doc := j.Data
  1468. result := j.Result
  1469. _id := qu.BsonIdToSId((*doc)["_id"])
  1470. result = ScoreFields(j, e.Tag)
  1471. //结果排序
  1472. for _, val := range result {
  1473. ju.Sort(val)
  1474. }
  1475. j.Result = JsonDataMergeProcessing(j, e)
  1476. return doc, result, _id
  1477. }
  1478. //辅助信息,如果没有排序先排序
  1479. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1480. fieldalls := map[string][]map[string]interface{}{}
  1481. for field, val := range j.Result {
  1482. //ju.Sort(val)
  1483. sfields := []map[string]interface{}{}
  1484. for _, v := range val {
  1485. standardized := false
  1486. if field == "buyer" || field == "winner" || field == "agency" {
  1487. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1488. if i > 0 {
  1489. standardized = true
  1490. }
  1491. }
  1492. sfield := map[string]interface{}{
  1493. "val": v.Value,
  1494. "type": v.Type,
  1495. "score": v.Score,
  1496. "blocktag": v.BlockTag,
  1497. "sourceval": v.SourceValue,
  1498. "standardized": standardized,
  1499. }
  1500. sfields = append(sfields, sfield)
  1501. }
  1502. fieldalls[field] = sfields
  1503. }
  1504. return fieldalls
  1505. }
  1506. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1507. defer qu.Catch()
  1508. //获取审核字段
  1509. for _, field := range e.AuditFields {
  1510. //1.分包
  1511. if resulttmp["package"] != nil {
  1512. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1513. for _, val := range packagedata {
  1514. if val[field] != nil {
  1515. fv := qu.ObjToString(val[field])
  1516. if fv != "" {
  1517. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1518. e.RedisMatch(field, fv, val) //redis匹配
  1519. } else { //除了buyer和winner,其他字段走规则匹配
  1520. e.RuleMatch(field, fv, val)
  1521. }
  1522. }
  1523. }
  1524. }
  1525. }
  1526. //2.外围
  1527. if resulttmp[field] != nil {
  1528. fv := qu.ObjToString(resulttmp[field])
  1529. if fv != "" {
  1530. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1531. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1532. } else { //除了buyer和winner,其他字段走规则匹配
  1533. e.RuleMatch(field, fv, resulttmp)
  1534. }
  1535. }
  1536. }
  1537. }
  1538. }
  1539. //Redis匹配
  1540. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1541. defer qu.Catch()
  1542. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1543. if i == 0 { //reids未找到,执行规则匹配
  1544. val[field+"_isredis"] = false
  1545. e.RuleMatch(field, fv, val) //规则匹配
  1546. } else { //redis找到,打标识存库
  1547. val[field+"_isredis"] = true
  1548. }
  1549. }
  1550. //规则匹配
  1551. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1552. defer qu.Catch()
  1553. if fieldval != "" {
  1554. SMap := e.StartMatch(field, fieldval)
  1555. //SMap.AddKey(field+"_isaudit", false)
  1556. for _, k := range SMap.Keys {
  1557. tmpMap[k] = SMap.Map[k]
  1558. }
  1559. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1560. }
  1561. }
  1562. //开始规则匹配
  1563. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1564. defer qu.Catch()
  1565. SMap := pretreated.NewSortMap()
  1566. lock.Lock()
  1567. f := e.RecogFieldMap[field]
  1568. lock.Unlock()
  1569. if len(f) > 0 {
  1570. fid := qu.BsonIdToSId(f["_id"])
  1571. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1572. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1573. if textAfterRecogFieldPrerule != "" {
  1574. lock.Lock()
  1575. classMap := e.FidClassMap[fid]
  1576. lock.Unlock()
  1577. L:
  1578. for _, c := range classMap { //class
  1579. classid := qu.BsonIdToSId(c["_id"])
  1580. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1581. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1582. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1583. if textAfterClassPrerule != "" {
  1584. lock.Lock()
  1585. ruleMap := e.CidRuleMap[classid]
  1586. lock.Unlock()
  1587. for _, r := range ruleMap { //rule
  1588. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1589. s_name := qu.ObjToString(r["s_name"])
  1590. rule := r["rule"].([]interface{})
  1591. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1592. if textAfterRulePrerule != "" {
  1593. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1594. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1595. if savefield != "" { //保存字段不为空,存储代码信息
  1596. SMap.AddKey(field+"_"+savefield, s_name)
  1597. }
  1598. break L
  1599. }
  1600. }
  1601. }
  1602. }
  1603. }
  1604. }
  1605. }
  1606. return SMap
  1607. }
  1608. //中标候选人经过清理之后,重新取出赋值
  1609. func resetWinnerorder(j *ju.Job) {
  1610. if len(j.Winnerorder) == 0 {
  1611. return
  1612. }
  1613. //中标单位
  1614. i := 0
  1615. winners := []*ju.ExtField{}
  1616. for _, v := range j.Result["winner"] {
  1617. if v.Code == "winnerorder" {
  1618. j.Winnerorder[i]["entname"] = v.Value
  1619. i++
  1620. } else {
  1621. winners = append(winners, v)
  1622. }
  1623. }
  1624. j.Result["winner"] = winners
  1625. //中标金额
  1626. i = 0
  1627. bidamounts := []*ju.ExtField{}
  1628. for _, v := range j.Result["bidamount"] {
  1629. if v.Code == "winnerorder" {
  1630. j.Winnerorder[i]["price"] = v.Value
  1631. i++
  1632. } else {
  1633. bidamounts = append(bidamounts, v)
  1634. }
  1635. }
  1636. j.Result["bidamount"] = bidamounts
  1637. }