extract.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. log "github.com/donnie4w/go-logger/logger"
  19. "gopkg.in/mgo.v2/bson"
  20. )
  21. var (
  22. lock sync.RWMutex
  23. cut = ju.NewCut() //获取正文并清理
  24. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  25. TaskList map[string]*ExtractTask //任务列表
  26. ClearTaskList map[string]*ClearTask //清理任务列表
  27. saveLimit = 200 //抽取日志批量保存
  28. PageSize = 5000 //查询分页
  29. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
  30. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  31. )
  32. //启动测试抽取
  33. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  34. defer qu.Catch()
  35. ext := &ExtractTask{}
  36. ext.Id = taskId
  37. ext.IsRun = true
  38. ext.InitTestTaskInfo(resultcoll, trackcoll)
  39. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  40. ext.InitRulePres()
  41. ext.InitRuleBacks()
  42. ext.InitRuleCore()
  43. ext.InitPkgCore()
  44. ext.InitBlockRule()
  45. ext.InfoTypeList()
  46. ext.InitTag()
  47. ext.InitClearFn()
  48. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  49. //初始化城市DFA信息
  50. ext.InitCityDFA()
  51. ext.InitAreaCode()
  52. ext.InitPostCode()
  53. }
  54. //质量审核
  55. ext.InitAuditFields()
  56. ext.InitAuditRule()
  57. ext.InitAuditClass()
  58. ext.InitAuditRecogField()
  59. //品牌抽取是否开启
  60. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  61. //附件抽取是否开启
  62. ext.InitFile()
  63. return RunExtractTestTask(ext, startId, num)
  64. }
  65. func IdTrans(startId string) bson.ObjectId {
  66. defer qu.Catch()
  67. return bson.ObjectIdHex(startId)
  68. }
  69. //开始测试任务抽取
  70. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  71. n, _ := strconv.Atoi(num)
  72. id := IdTrans(startId)
  73. if id.Valid() {
  74. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  75. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  76. for _, v := range *list {
  77. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  78. continue
  79. }
  80. var j, jf *ju.Job
  81. if ext.IsFileField && v["projectinfo"] != nil {
  82. v["isextFile"] = true
  83. j, jf = ext.PreInfo(v)
  84. } else {
  85. j, _ = ext.PreInfo(v)
  86. }
  87. ext.TaskInfo.ProcessPool <- true
  88. go ext.ExtractProcess(j, jf)
  89. }
  90. return true
  91. } else {
  92. return false
  93. }
  94. }
  95. //启动抽取
  96. func StartExtractTaskId(taskId string) bool {
  97. defer qu.Catch()
  98. isgo := false
  99. ext := TaskList[taskId]
  100. if ext == nil {
  101. ext = &ExtractTask{}
  102. ext.Id = taskId
  103. ext.InitTaskInfo()
  104. isgo = true
  105. } else {
  106. ext.Id = taskId
  107. ext.InitTaskInfo()
  108. }
  109. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  110. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  111. ext.InitRulePres()
  112. ext.InitRuleBacks()
  113. ext.InitRuleCore()
  114. ext.InitPkgCore()
  115. ext.InitBlockRule()
  116. ext.InfoTypeList()
  117. ext.InitTag()
  118. ext.InitClearFn()
  119. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  120. //初始化城市DFA信息
  121. ext.InitCityDFA()
  122. ext.InitAreaCode()
  123. ext.InitPostCode()
  124. }
  125. //质量审核
  126. ext.InitAuditFields()
  127. ext.InitAuditRule()
  128. ext.InitAuditClass()
  129. ext.InitAuditRecogField()
  130. //品牌抽取是否开启
  131. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  132. //附件抽取是否开启
  133. ext.InitFile()
  134. ext.IsRun = true
  135. go ext.ResultSave(true)
  136. go ext.BidSave(true)
  137. if isgo {
  138. go RunExtractTask(taskId)
  139. }
  140. TaskList[taskId] = ext
  141. return true
  142. }
  143. //停止抽取
  144. func StopExtractTaskId(taskId string) bool {
  145. defer qu.Catch()
  146. ext := TaskList[taskId]
  147. if ext != nil {
  148. ext.IsRun = false
  149. TaskList[taskId] = ext
  150. }
  151. //更新task.s_extlastid
  152. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  153. return true
  154. }
  155. //开始抽取
  156. func RunExtractTask(taskId string) {
  157. defer qu.Catch()
  158. ext := TaskList[taskId]
  159. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  160. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  161. pageNum := (count + PageSize - 1) / PageSize
  162. limit := PageSize
  163. if count < PageSize {
  164. limit = count
  165. }
  166. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  167. for i := 0; i < pageNum; i++ {
  168. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  169. fmt.Printf("page=%d,query=%v", i+1, query)
  170. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  171. for _, v := range *list {
  172. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  173. continue
  174. }
  175. _id := qu.BsonIdToSId(v["_id"])
  176. //log.Debug(_id)
  177. if !ext.IsRun {
  178. break
  179. }
  180. var j, jf *ju.Job
  181. if ext.IsFileField && v["projectinfo"] != nil {
  182. v["isextFile"] = true
  183. j, jf = ext.PreInfo(v)
  184. } else {
  185. j, _ = ext.PreInfo(v)
  186. }
  187. ext.TaskInfo.ProcessPool <- true
  188. go ext.ExtractProcess(j, jf)
  189. ext.TaskInfo.LastExtId = _id
  190. }
  191. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  192. if !ext.IsRun {
  193. break
  194. }
  195. }
  196. //更新task.s_extlastid
  197. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  198. }
  199. //信息预处理-不和版本关联,取最新版本的配置项
  200. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  201. return (&ExtractTask{}).PreInfo(doc)
  202. }
  203. //信息预处理-和版本关联
  204. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  205. defer qu.Catch()
  206. //判断是否有附件这个字段
  207. var isextFile bool
  208. if doc["isextFile"] != nil {
  209. isextFile = doc["isextFile"].(bool)
  210. }
  211. detail := ""
  212. d1, _ := doc["detail"].(string)
  213. d2, _ := doc["contenthtml"].(string)
  214. if len(d1) >= len(d2) || d2 == "" {
  215. detail = d1
  216. } else {
  217. detail = d2
  218. }
  219. detail = ju.CutLableStr(detail)
  220. detail = cut.ClearHtml(detail)
  221. doc["detail"] = detail
  222. if isextFile {
  223. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  224. }
  225. toptype := qu.ObjToString(doc["toptype"])
  226. subtype := qu.ObjToString(doc["subtype"])
  227. if qu.ObjToString(doc["type"]) == "bid" {
  228. toptype = "结果"
  229. }
  230. if toptype == "" {
  231. toptype = "*"
  232. }
  233. j = &ju.Job{
  234. SourceMid: qu.BsonIdToSId(doc["_id"]),
  235. Category: toptype,
  236. CategorySecond: subtype,
  237. Content: qu.ObjToString(doc["detail"]),
  238. SpiderCode: qu.ObjToString(doc["spidercode"]),
  239. //Domain: qu.ObjToString(doc["domain"]),
  240. //Href: qu.ObjToString(doc["href"]),
  241. Title: qu.ObjToString(doc["title"]),
  242. Data: &doc,
  243. City: qu.ObjToString(doc["city"]),
  244. Province: qu.ObjToString(doc["area"]),
  245. Jsondata: qu.ObjToMap(doc["jsondata"]),
  246. Result: map[string][]*ju.ExtField{},
  247. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  248. RuleBlock: e.RuleBlock,
  249. }
  250. if isextFile {
  251. jf = &ju.Job{
  252. SourceMid: qu.BsonIdToSId(doc["_id"]),
  253. Category: toptype,
  254. Content: qu.ObjToString(doc["detailfile"]),
  255. SpiderCode: qu.ObjToString(doc["spidercode"]),
  256. Title: qu.ObjToString(doc["title"]),
  257. Data: &doc,
  258. City: qu.ObjToString(doc["city"]),
  259. Province: qu.ObjToString(doc["area"]),
  260. Jsondata: qu.ObjToMap(doc["jsondata"]),
  261. Result: map[string][]*ju.ExtField{},
  262. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  263. RuleBlock: e.RuleBlock,
  264. IsFile: isextFile,
  265. }
  266. }
  267. qu.Try(func() {
  268. pretreated.AnalyStart(j) //job.Block分块
  269. if isextFile {
  270. pretreated.AnalyStart(jf)
  271. }
  272. }, func(err interface{}) {
  273. log.Debug("pretreated.AnalyStart", err)
  274. })
  275. return j, jf
  276. }
  277. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  278. func file2text(doc *map[string]interface{}) {
  279. var strfileinfo bytes.Buffer
  280. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  281. if va, ok := v["attachments"].(map[string]interface{}); ok {
  282. for _, vaatt := range va {
  283. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  284. if qu.ObjToString(fileinfo["content"]) != "" {
  285. switch fileinfo["content"].(type) {
  286. case string:
  287. lock.Lock()
  288. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  289. lock.Unlock()
  290. case []map[string]interface{}:
  291. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  292. if fv["context"] != nil {
  293. lock.Lock()
  294. strfileinfo.WriteString(fv["context"].(string) + " \n")
  295. lock.Unlock()
  296. }
  297. }
  298. }
  299. }
  300. }
  301. }
  302. }
  303. }
  304. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  305. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  306. }
  307. }
  308. //抽取
  309. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  310. e.ExtractDetail(j)
  311. if jf != nil && jf.IsFile {
  312. e.ExtractFile(jf)
  313. }
  314. //分析抽取结果并保存 todo
  315. AnalysisSaveResult(j, jf, e)
  316. <-e.TaskInfo.ProcessPool
  317. }
  318. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  319. qu.Try(func() {
  320. doc := *j.Data
  321. //全局前置规则,结果覆盖doc属性
  322. //for _, v := range e.RulePres {
  323. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  324. //}
  325. if j.CategorySecond == "" {
  326. //抽取规则
  327. tmprules := map[string][]*RuleCore{}
  328. lock.Lock()
  329. if e.RuleCores[j.Category] == nil {
  330. j.Category = "*_其他"
  331. }
  332. for k, vc1 := range e.RuleCores[j.Category] {
  333. tmprules[k] = vc1
  334. }
  335. lock.Unlock()
  336. for _, vc1 := range tmprules {
  337. for _, vc := range vc1 {
  338. tmp := ju.DeepCopy(doc).(map[string]interface{})
  339. //是否进入逻辑
  340. if !ju.Logic(vc.LuaLogic, tmp) {
  341. continue
  342. }
  343. ////抽取-前置规则
  344. //for _, v := range vc.RulePres {
  345. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  346. //}
  347. // log.Debug("抽取-前置规则", tmp)
  348. //抽取-规则
  349. for _, v := range vc.RuleCores {
  350. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  351. }
  352. // log.Debug("抽取-规则", tmp)
  353. //项目名称未能抽取到,标题来凑
  354. if vc.Field == "projectname" {
  355. if len(j.Result[vc.Field]) < 1 {
  356. items := make([]*ju.ScoreItem, 1)
  357. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  358. field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
  359. if tmp["blocktag"] != nil {
  360. field.BlockTag = tmp["blocktag"].(map[string]bool)
  361. }
  362. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  363. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  364. }
  365. }
  366. //抽取-后置规则
  367. for _, v := range vc.RuleBacks {
  368. ExtRegBack(j, v, e.TaskInfo)
  369. }
  370. // log.Debug("抽取-后置规则", tmp)
  371. }
  372. }
  373. } else {
  374. var cores map[string][]*RuleCore
  375. if e.RuleCores[j.Category+"_"+j.CategorySecond] == nil {
  376. cores = e.RuleCores["*_其他"]
  377. } else {
  378. cores = e.RuleCores[j.Category+"_"+j.CategorySecond]
  379. }
  380. for _, vc1 := range cores {
  381. for _, vc := range vc1 {
  382. tmp := ju.DeepCopy(doc).(map[string]interface{})
  383. //是否进入逻辑
  384. if !ju.Logic(vc.LuaLogic, tmp) {
  385. continue
  386. }
  387. //抽取-前置规则
  388. for _, v := range vc.RulePres {
  389. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  390. }
  391. // log.Debug("抽取-前置规则", tmp)
  392. //抽取-规则
  393. for _, v := range vc.RuleCores {
  394. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  395. }
  396. // log.Debug("抽取-规则", tmp)
  397. //项目名称未能抽取到,标题来凑
  398. if vc.Field == "projectname" {
  399. items := make([]*ju.ScoreItem, 1)
  400. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  401. field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
  402. if len(j.Result[vc.Field]) < 1 {
  403. if tmp["blocktag"] != nil {
  404. field.BlockTag = tmp["blocktag"].(map[string]bool)
  405. }
  406. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  407. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  408. }
  409. }
  410. //抽取-后置规则
  411. for _, v := range vc.RuleBacks {
  412. ExtRegBack(j, v, e.TaskInfo)
  413. }
  414. // log.Debug("抽取-后置规则", tmp)
  415. }
  416. }
  417. }
  418. //全局后置规则
  419. for _, v := range e.RuleBacks {
  420. ExtRegBack(j, v, e.TaskInfo)
  421. }
  422. //候选人加入
  423. if len(j.Winnerorder) > 0 {
  424. winner := &ju.ExtField{
  425. Field: "winner",
  426. Code: "",
  427. RuleText: "",
  428. Type: "winnerorder",
  429. MatchType: "winnerorder",
  430. ExtFrom: "",
  431. Value: j.Winnerorder[0]["entname"],
  432. Score: 0,
  433. }
  434. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  435. winner.Score = -5
  436. }
  437. winners := j.Result["winner"]
  438. if winners != nil {
  439. winners = append(winners, winner)
  440. } else {
  441. winners = []*ju.ExtField{}
  442. winners = append(winners, winner)
  443. }
  444. j.Result["winner"] = winners
  445. }
  446. //函数清理
  447. for key, val := range j.Result {
  448. for _, v := range val {
  449. lock.Lock()
  450. cfn := e.ClearFn[key]
  451. lock.Unlock()
  452. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  453. v.Value = data[0]
  454. //清理特殊符号
  455. lock.Lock()
  456. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  457. clear.MesField[key] != nil {
  458. text := qu.ObjToString(v.Value)
  459. text = clear.OtherClean(key, text)
  460. if text != "" {
  461. v.Value = text
  462. }
  463. }
  464. lock.Unlock()
  465. }
  466. }
  467. PackageDetail(j, e) //处理分包信息
  468. // bs, _ := json.Marshal(j.Result)
  469. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  470. }, func(err interface{}) {
  471. log.Debug("ExtractProcess err", err)
  472. })
  473. }
  474. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  475. qu.Try(func() {
  476. doc := *j.Data
  477. //全局前置规则,结果覆盖doc属性
  478. for _, v := range e.RulePres {
  479. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  480. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  481. }
  482. }
  483. //抽取规则
  484. if j.CategorySecond == "" {
  485. for _, vc1 := range e.RuleCores[j.Category] {
  486. for _, vc := range vc1 {
  487. tmp := ju.DeepCopy(doc).(map[string]interface{})
  488. //是否进入逻辑
  489. if !ju.Logic(vc.LuaLogic, tmp) {
  490. continue
  491. }
  492. //抽取-前置规则
  493. for _, v := range vc.RulePres {
  494. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  495. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  496. }
  497. }
  498. // log.Debug("抽取-前置规则", tmp)
  499. //抽取-规则
  500. for _, v := range vc.RuleCores {
  501. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  502. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  503. }
  504. }
  505. // log.Debug("抽取-规则", tmp)
  506. //抽取-后置规则
  507. for _, v := range vc.RuleBacks {
  508. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  509. ExtRegBack(j, v, e.TaskInfo)
  510. }
  511. }
  512. // log.Debug("抽取-后置规则", tmp)
  513. }
  514. }
  515. } else {
  516. for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  517. for _, vc := range vc1 {
  518. tmp := ju.DeepCopy(doc).(map[string]interface{})
  519. //是否进入逻辑
  520. if !ju.Logic(vc.LuaLogic, tmp) {
  521. continue
  522. }
  523. //抽取-前置规则
  524. for _, v := range vc.RulePres {
  525. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  526. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  527. }
  528. }
  529. // log.Debug("抽取-前置规则", tmp)
  530. //抽取-规则
  531. for _, v := range vc.RuleCores {
  532. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  533. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  534. }
  535. }
  536. // log.Debug("抽取-规则", tmp)
  537. //抽取-后置规则
  538. for _, v := range vc.RuleBacks {
  539. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  540. ExtRegBack(j, v, e.TaskInfo)
  541. }
  542. }
  543. // log.Debug("抽取-后置规则", tmp)
  544. }
  545. }
  546. }
  547. //全局后置规则
  548. for _, v := range e.RuleBacks {
  549. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  550. ExtRegBack(j, v, e.TaskInfo)
  551. }
  552. }
  553. //候选人加入
  554. if len(j.Winnerorder) > 0 {
  555. winner := &ju.ExtField{
  556. Field: "winner",
  557. Code: "",
  558. RuleText: "",
  559. Type: "winnerorder",
  560. MatchType: "winnerorder",
  561. ExtFrom: "",
  562. Value: j.Winnerorder[0]["entname"],
  563. Score: 0,
  564. }
  565. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  566. winner.Score = -5
  567. }
  568. winners := j.Result["winner"]
  569. if winners != nil {
  570. winners = append(winners, winner)
  571. } else {
  572. winners = []*ju.ExtField{}
  573. winners = append(winners, winner)
  574. }
  575. j.Result["winner"] = winners
  576. }
  577. //函数清理
  578. for key, val := range j.Result {
  579. for _, v := range val {
  580. lock.Lock()
  581. cfn := e.ClearFn[key]
  582. lock.Unlock()
  583. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  584. v.Value = data[0]
  585. //清理特殊符号
  586. lock.Lock()
  587. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  588. clear.MesField[key] != nil {
  589. text := qu.ObjToString(v.Value)
  590. text = clear.OtherClean(key, text)
  591. v.Value = text
  592. }
  593. lock.Unlock()
  594. }
  595. }
  596. PackageDetail(j, e) //处理分包信息
  597. // bs, _ := json.Marshal(j.Result)
  598. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  599. }, func(err interface{}) {
  600. log.Debug("ExtractProcess err", err)
  601. })
  602. }
  603. //前置过滤
  604. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  605. defer qu.Catch()
  606. before := ju.DeepCopy(doc).(map[string]interface{})
  607. extinfo := map[string]interface{}{}
  608. if in.IsLua {
  609. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  610. if j != nil {
  611. lua.Block = j.Block
  612. }
  613. extinfo = lua.RunScript("pre")
  614. for k, v := range extinfo { //结果覆盖原doc
  615. doc[k] = v
  616. }
  617. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  618. } else {
  619. var key string
  620. if !j.IsFile {
  621. key = qu.If(in.Field == "", "detail", in.Field).(string)
  622. } else {
  623. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  624. }
  625. text := qu.ObjToString(doc[key])
  626. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  627. doc[key] = extinfo[key] //结果覆盖原doc
  628. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  629. }
  630. return doc
  631. }
  632. //抽取-规则
  633. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  634. defer qu.Catch()
  635. //废标、流标、ppp等跳过
  636. b := IsExtract(in.Field, j.Title, j.Content)
  637. if !b {
  638. return
  639. }
  640. if in.IsLua {
  641. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  642. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  643. lua.Block = j.Block
  644. extinfo := lua.RunScript("core")
  645. for k, v := range extinfo {
  646. if k == in.Field {
  647. if j.Result[k] == nil {
  648. j.Result[k] = [](*ju.ExtField){}
  649. }
  650. if tmps, ok := v.([]map[string]interface{}); ok {
  651. for _, tmp := range tmps {
  652. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
  653. if extfrom == "title" {
  654. field.Score = 4
  655. }
  656. if tmp["blocktag"] != nil {
  657. field.BlockTag = tmp["blocktag"].(map[string]bool)
  658. }
  659. item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
  660. if extfrom == "title" {
  661. item.Score = 4
  662. }
  663. if tmp["scoreitem"] == nil {
  664. scoreItems := make([]*ju.ScoreItem, 0)
  665. scoreItems = append(scoreItems, item)
  666. field.ScoreItem = scoreItems
  667. } else {
  668. field.ScoreItem = append(field.ScoreItem, item)
  669. }
  670. j.Result[k] = append(j.Result[k], field)
  671. }
  672. }
  673. }
  674. }
  675. if len(extinfo) > 0 {
  676. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  677. }
  678. } else {
  679. //全文正则
  680. //text := qu.ObjToString(doc[extfrom])
  681. //if in.Field != "" {
  682. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  683. // if len(extinfo) > 0 {
  684. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  685. // }
  686. //}
  687. //块抽取
  688. if in.Field != "" {
  689. if extfrom == "title" {
  690. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
  691. if len(extinfo) > 0 {
  692. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  693. }
  694. } else {
  695. for _, v := range j.Block {
  696. extinfo := extRegCoreToResult(extfrom, v.Text, &v.Classify, j, in)
  697. if len(extinfo) > 0 {
  698. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  699. }
  700. }
  701. }
  702. }
  703. }
  704. }
  705. //lua脚本根据属性设置提取kv值
  706. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  707. defer qu.Catch()
  708. kvmap := map[string][]map[string]interface{}{}
  709. for fieldname, field := range in.LFields {
  710. lock.Lock()
  711. tags := t[field] //获取对应标签库
  712. lock.Unlock()
  713. if tags == nil {
  714. continue
  715. }
  716. for _, bl := range j.Block {
  717. //冒号kv
  718. if bl.ColonKV != nil {
  719. kvs := bl.ColonKV.Kvs
  720. kvs2 := bl.ColonKV.Kvs_2
  721. // log.Debug("ColonKV1", kvs)
  722. // log.Debug("ColonKV2", kvs2)
  723. for _, tag := range tags {
  724. for _, kv := range kvs {
  725. if tag.Type == "string" {
  726. if kv.Key == tag.Key {
  727. text := ju.TrimLRSpace(kv.Value, "")
  728. if text != "" {
  729. kvmap[field] = append(kvmap[field], map[string]interface{}{
  730. "field": field,
  731. "code": in.Code,
  732. "ruletext": tag.Key,
  733. "extfrom": extfrom,
  734. "value": text,
  735. "type": "colon1",
  736. "matchtype": "tag_string",
  737. "blocktag": bl.Tag,
  738. })
  739. }
  740. break
  741. }
  742. } else if tag.Type == "regexp" {
  743. if tag.Reg.MatchString(kv.Key) {
  744. text := ju.TrimLRSpace(kv.Value, "")
  745. if text != "" {
  746. kvmap[field] = append(kvmap[field], map[string]interface{}{
  747. "field": field,
  748. "code": in.Code,
  749. "ruletext": tag.Key,
  750. "extfrom": extfrom,
  751. "value": text,
  752. "type": "colon1",
  753. "matchtype": "tag_regexp",
  754. "blocktag": bl.Tag,
  755. })
  756. }
  757. break
  758. }
  759. }
  760. }
  761. for _, kv := range kvs2 {
  762. if tag.Type == "string" {
  763. if kv.Key == tag.Key {
  764. text := ju.TrimLRSpace(kv.Value, "")
  765. if text != "" {
  766. kvmap[field] = append(kvmap[field], map[string]interface{}{
  767. "field": field,
  768. "code": in.Code,
  769. "ruletext": tag.Key,
  770. "extfrom": extfrom,
  771. "value": text,
  772. "type": "colon2",
  773. "matchtype": "tag_string",
  774. "blocktag": bl.Tag,
  775. })
  776. }
  777. break
  778. }
  779. } else if tag.Type == "regexp" {
  780. if tag.Reg.MatchString(kv.Key) {
  781. text := ju.TrimLRSpace(kv.Value, "")
  782. if text != "" {
  783. kvmap[field] = append(kvmap[field], map[string]interface{}{
  784. "field": field,
  785. "code": in.Code,
  786. "ruletext": tag.Key,
  787. "extfrom": extfrom,
  788. "value": text,
  789. "type": "colon2",
  790. "matchtype": "tag_regexp",
  791. "blocktag": bl.Tag,
  792. })
  793. }
  794. break
  795. }
  796. }
  797. }
  798. }
  799. }
  800. //空格kv
  801. if bl.SpaceKV != nil {
  802. kvs := bl.SpaceKV.Kvs
  803. // log.Debug("SpaceKV", kvs)
  804. for _, tag := range tags {
  805. for _, kv := range kvs {
  806. if tag.Type == "string" {
  807. if kv.Key == tag.Key {
  808. text := ju.TrimLRSpace(kv.Value, "")
  809. if text != "" {
  810. kvmap[field] = append(kvmap[field], map[string]interface{}{
  811. "field": field,
  812. "code": in.Code,
  813. "ruletext": tag.Key,
  814. "extfrom": extfrom,
  815. "value": text,
  816. "type": "space",
  817. "matchtype": "tag_string",
  818. "blocktag": bl.Tag,
  819. })
  820. }
  821. break
  822. }
  823. } else if tag.Type == "regexp" {
  824. if tag.Reg.MatchString(kv.Key) {
  825. text := ju.TrimLRSpace(kv.Value, "")
  826. if text != "" {
  827. kvmap[field] = append(kvmap[field], map[string]interface{}{
  828. "field": field,
  829. "code": in.Code,
  830. "ruletext": tag.Key,
  831. "extfrom": extfrom,
  832. "value": text,
  833. "type": "space",
  834. "matchtype": "tag_regexp",
  835. "blocktag": bl.Tag,
  836. })
  837. }
  838. break
  839. }
  840. }
  841. }
  842. }
  843. }
  844. //表格kv
  845. if bl.TableKV != nil {
  846. tkv := bl.TableKV
  847. // log.Debug("tkv", tkv)
  848. for k, v := range tkv.Kv {
  849. if k == fieldname {
  850. if len(tags) > -tkv.KvIndex[fieldname] {
  851. ruletext := ""
  852. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  853. ruletext = "项目名称"
  854. } else {
  855. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  856. }
  857. kvmap[field] = append(kvmap[field], map[string]interface{}{
  858. "field": field,
  859. "code": in.Code,
  860. "ruletext": ruletext,
  861. "extfrom": "table",
  862. "value": v,
  863. "type": "table",
  864. "matchtype": "tag_string",
  865. "blocktag": bl.Tag,
  866. })
  867. } else { //涉及其他待处理
  868. // log.Debug(tags)
  869. }
  870. }
  871. }
  872. }
  873. }
  874. }
  875. return kvmap
  876. }
  877. //正则提取结果
  878. func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  879. defer qu.Catch()
  880. extinfo := map[string][]map[string]interface{}{}
  881. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  882. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  883. if len(apos) > 0 {
  884. pos := apos[0]
  885. for k, p := range v.RegCore.ExtractPos {
  886. if len(pos) > p {
  887. if pos[p] == -1 || pos[p+1] == -1 {
  888. continue
  889. }
  890. val := text[pos[p]:pos[p+1]]
  891. if val == "招标公告" {
  892. return extinfo
  893. }
  894. if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
  895. val = text
  896. }
  897. tmps := []map[string]interface{}{}
  898. tmp := map[string]interface{}{
  899. "field": v.Field,
  900. "code": v.Code,
  901. "ruletext": v.RuleText,
  902. "extfrom": extfrom,
  903. "value": val,
  904. "type": "regexp",
  905. "matchtype": "regcontent",
  906. "blocktag": *tag,
  907. }
  908. tmps = append(tmps, tmp)
  909. extinfo[k] = tmps
  910. if strings.TrimSpace(val) != "" {
  911. if v.RegCore.NumSign == -1 { //正负值修正
  912. val = "-" + val
  913. }
  914. exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  915. if extfrom == "title" {
  916. exfield.Score = 4
  917. }
  918. if tmp["blocktag"] != nil {
  919. exfield.BlockTag = tmp["blocktag"].(map[string]bool)
  920. }
  921. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  922. if extfrom == "title" {
  923. item.Score = 4
  924. }
  925. if strings.Contains(val, "\n") {
  926. item.Score -= 1
  927. exfield.Score -= 1
  928. }
  929. if tmp["scoreitem"] == nil {
  930. sitems := make([]*ju.ScoreItem, 0)
  931. sitems = append(sitems, &item)
  932. exfield.ScoreItem = sitems
  933. } else {
  934. exfield.ScoreItem = append(exfield.ScoreItem, &item)
  935. }
  936. j.Result[k] = append(j.Result[k], &exfield)
  937. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  938. }
  939. }
  940. }
  941. }
  942. } else {
  943. pos := v.RegCore.Reg.FindStringIndex(text)
  944. val := ""
  945. if len(pos) == 2 {
  946. text = text[pos[1]:]
  947. rs := regexp.MustCompile("[^\r\n\t]+")
  948. tmp := rs.FindAllString(text, -1)
  949. if len(tmp) > 0 {
  950. val = tmp[0]
  951. }
  952. }
  953. if val != "" {
  954. tmps := []map[string]interface{}{}
  955. tmp := map[string]interface{}{
  956. "field": v.Field,
  957. "code": v.Code,
  958. "ruletext": v.RuleText,
  959. "extfrom": extfrom,
  960. "value": val,
  961. "type": "regexp",
  962. "matchtype": "regcontent",
  963. "blocktag": *tag,
  964. }
  965. tmps = append(tmps, tmp)
  966. extinfo[v.Field] = tmps
  967. if j.Result[v.Field] == nil {
  968. j.Result[v.Field] = [](*ju.ExtField){}
  969. }
  970. field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  971. if extfrom == "title" {
  972. field.Score = 4
  973. }
  974. if tmp["blocktag"] != nil {
  975. field.BlockTag = tmp["blocktag"].(map[string]bool)
  976. }
  977. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  978. if extfrom == "title" {
  979. item.Score = 4
  980. }
  981. if tmp["scoreitem"] == nil {
  982. sitems := make([]*ju.ScoreItem, 0)
  983. sitems = append(sitems, &item)
  984. field.ScoreItem = sitems
  985. } else {
  986. field.ScoreItem = append(field.ScoreItem, &item)
  987. }
  988. j.Result[v.Field] = append(j.Result[v.Field], field)
  989. }
  990. }
  991. return extinfo
  992. }
  993. //后置过滤
  994. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  995. defer qu.Catch()
  996. if in.IsLua {
  997. result := GetResultMapForLua(j)
  998. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  999. if j != nil {
  1000. lua.Block = j.Block
  1001. }
  1002. extinfo := lua.RunScript("back")
  1003. for k, v := range extinfo {
  1004. if tmps, ok := v.([]map[string]interface{}); ok {
  1005. j.Result[k] = [](*ju.ExtField){}
  1006. for _, tmp := range tmps {
  1007. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  1008. if tmp["blocktag"] != nil {
  1009. field.BlockTag = tmp["blocktag"].(map[string]bool)
  1010. }
  1011. item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  1012. if tmp["scoreitem"] == nil {
  1013. scoreItems := make([]*ju.ScoreItem, 0)
  1014. scoreItems = append(scoreItems, &item)
  1015. field.ScoreItem = scoreItems
  1016. } else {
  1017. field.ScoreItem = append(field.ScoreItem, &item)
  1018. }
  1019. j.Result[k] = append(j.Result[k], field)
  1020. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1021. }
  1022. }
  1023. }
  1024. if len(extinfo) > 0 {
  1025. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1026. }
  1027. } else {
  1028. extinfo := map[string]interface{}{}
  1029. if in.Field != "" {
  1030. if j.Result[in.Field] != nil {
  1031. tmp := j.Result[in.Field]
  1032. exts := []interface{}{}
  1033. for k, v := range tmp {
  1034. //table抽取到的数据不清理
  1035. // if v.Type == "table" && v.Field != "projectname" {
  1036. // continue
  1037. // }
  1038. text := qu.ObjToString(v.Value)
  1039. if text != "" && v.ExtFrom != "title" {
  1040. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1041. }
  1042. j.Result[in.Field][k].Value = text
  1043. exts = append(exts, map[string]interface{}{
  1044. "field": v.Field,
  1045. "code": v.Code,
  1046. "ruletext": v.RuleText,
  1047. "type": v.Type,
  1048. "matchtype": v.MatchType,
  1049. "extfrom": v.ExtFrom,
  1050. "value": text,
  1051. })
  1052. }
  1053. extinfo[in.Field] = exts
  1054. if len(extinfo) > 0 {
  1055. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1056. }
  1057. }
  1058. } else {
  1059. for key, tmp := range j.Result {
  1060. exts := []interface{}{}
  1061. for k, v := range tmp {
  1062. if v.Type == "table" { //table抽取到的数据不清理
  1063. continue
  1064. }
  1065. text := qu.ObjToString(v.Value)
  1066. if text != "" {
  1067. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1068. }
  1069. j.Result[key][k].Value = text
  1070. exts = append(exts, map[string]interface{}{
  1071. "field": v.Field,
  1072. "code": v.Code,
  1073. "ruletext": v.RuleText,
  1074. "type": v.Type,
  1075. "matchtype": v.MatchType,
  1076. "extfrom": v.ExtFrom,
  1077. "value": text,
  1078. })
  1079. }
  1080. extinfo[key] = exts
  1081. }
  1082. if len(extinfo) > 0 {
  1083. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1084. }
  1085. }
  1086. }
  1087. }
  1088. //获取抽取结果map[string][]interface{},lua脚本使用
  1089. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1090. defer qu.Catch()
  1091. result := map[string][]map[string]interface{}{}
  1092. for key, val := range j.Result {
  1093. if result[key] == nil {
  1094. result[key] = []map[string]interface{}{}
  1095. }
  1096. for _, v := range val {
  1097. tmp := map[string]interface{}{
  1098. "field": v.Field,
  1099. "code": v.Code,
  1100. "ruletext": v.RuleText,
  1101. "value": v.Value,
  1102. "type": v.Type,
  1103. "matchtype": v.MatchType,
  1104. "extfrom": v.ExtFrom,
  1105. }
  1106. result[key] = append(result[key], tmp)
  1107. }
  1108. }
  1109. return result
  1110. }
  1111. //抽取日志
  1112. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1113. defer qu.Catch()
  1114. if !t.IsEtxLog {
  1115. return
  1116. }
  1117. logdata := map[string]interface{}{
  1118. "code": v.Code,
  1119. "name": v.Name,
  1120. "type": ftype,
  1121. "ruletext": v.RuleText,
  1122. "islua": v.IsLua,
  1123. "field": v.Field,
  1124. "version": t.Version,
  1125. "taskname": t.Name,
  1126. "before": before,
  1127. "extinfo": extinfo,
  1128. "sid": sid,
  1129. "comeintime": time.Now().Unix(),
  1130. }
  1131. lock.Lock()
  1132. ExtLogs[t] = append(ExtLogs[t], logdata)
  1133. lock.Unlock()
  1134. }
  1135. //保存抽取日志
  1136. func SaveExtLog() {
  1137. defer qu.Catch()
  1138. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1139. lock.Lock()
  1140. tmpLogs = ExtLogs
  1141. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1142. lock.Unlock()
  1143. for k, v := range tmpLogs {
  1144. if len(v) < saveLimit {
  1145. db.Mgo.SaveBulk(k.TrackColl, v...)
  1146. } else {
  1147. for {
  1148. if len(v) > saveLimit {
  1149. tmp := v[:saveLimit]
  1150. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1151. v = v[saveLimit:]
  1152. } else {
  1153. db.Mgo.SaveBulk(k.TrackColl, v...)
  1154. break
  1155. }
  1156. }
  1157. }
  1158. }
  1159. time.AfterFunc(10*time.Second, SaveExtLog)
  1160. }
  1161. type FieldValue struct {
  1162. Value interface{}
  1163. Count int
  1164. }
  1165. //分析抽取结果并保存
  1166. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1167. qu.Try(func() {
  1168. doc, result, _id := funcAnalysis(j)
  1169. go otherNeedSave(j, result, e)
  1170. auxinfo := auxInfo(j)
  1171. //从排序结果中取值
  1172. tmp := map[string]interface{}{} //抽取值
  1173. tmp["fieldall"] = auxinfo
  1174. for _, val := range result {
  1175. for _, v := range val { //取第一个非负数
  1176. if v.Score > -1 {
  1177. tmp[v.Field] = v.Value
  1178. break
  1179. }
  1180. }
  1181. }
  1182. if len(j.PackageInfo) > 0 { //分包信息
  1183. tmp["package"] = j.PackageInfo
  1184. }
  1185. if len(j.Winnerorder) > 0 { //候选人信息
  1186. tmp["winnerorder"] = j.Winnerorder
  1187. }
  1188. //处理附件
  1189. var resultf map[string][]*ju.ExtField
  1190. if jf != nil {
  1191. _, resultf, _ = funcAnalysis(jf)
  1192. auxinfof := auxInfo(jf)
  1193. tmp["fieldallf"] = auxinfof
  1194. ffield := map[string]interface{}{}
  1195. for _, val := range resultf {
  1196. for _, v := range val { //取第一个非负数
  1197. if v.Score > -1 {
  1198. ffield[v.Field] = v.Value
  1199. break
  1200. }
  1201. }
  1202. }
  1203. if len(jf.PackageInfo) > 0 { //分包信息
  1204. ffield["package"] = jf.PackageInfo
  1205. }
  1206. if len(jf.Winnerorder) > 0 { //候选人信息
  1207. ffield["winnerorder"] = jf.Winnerorder
  1208. }
  1209. tmp["ffield"] = ffield
  1210. }
  1211. for k, v := range *doc {
  1212. //去重冗余字段
  1213. if delFiled(k) {
  1214. continue
  1215. }
  1216. if tmp[k] == nil {
  1217. tmp[k] = v
  1218. }
  1219. }
  1220. //质量审核
  1221. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1222. e.QualityAudit(tmp)
  1223. }
  1224. if e.IsExtractCity { //城市抽取
  1225. e.ExtractCity(j, tmp, _id)
  1226. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1227. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1228. // tmp["district"] = d
  1229. // if b {
  1230. // tmp["city"] = c
  1231. // tmp["area"] = p
  1232. // }
  1233. }
  1234. //品牌抽取
  1235. if ju.IsBrandGoods {
  1236. tmp["checkhas"] = map[string]int{
  1237. "hastable": j.HasTable,
  1238. "hasgoods": j.HasGoods,
  1239. "hasbrand": j.HasBrand,
  1240. "haskey": j.HasKey,
  1241. }
  1242. if len(j.BrandData) > 0 {
  1243. tmp["tablebrand"] = j.BrandData
  1244. }
  1245. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1246. }
  1247. //所有kv组成的字符串
  1248. var kvtext bytes.Buffer
  1249. blocks := make([]ju.BlockAndTag, 0)
  1250. for _, v := range j.Block {
  1251. //分包和标签
  1252. if ju.Config["saveblock"].(bool) {
  1253. xx, _ := json.Marshal(v)
  1254. tmpblock := new(ju.TmpBlock)
  1255. err := json.Unmarshal(xx, &tmpblock)
  1256. if err != nil {
  1257. if v.BPackage != nil {
  1258. bpb, _ := json.Marshal(v.BPackage)
  1259. tmpblock.BPackage = string(bpb)
  1260. }
  1261. tmpblock = rangeBlockToJson(v, *tmpblock)
  1262. }
  1263. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1264. }
  1265. //把所有kv组装成一个字符串,存库
  1266. for ck, cv := range v.ColonKV.Kv {
  1267. kvtext.WriteString(ck)
  1268. kvtext.WriteString(":")
  1269. kvtext.WriteString(cv)
  1270. kvtext.WriteString(" ")
  1271. }
  1272. for sk, sv := range v.SpaceKV.Kv {
  1273. kvtext.WriteString(sk)
  1274. kvtext.WriteString(":")
  1275. kvtext.WriteString(sv)
  1276. kvtext.WriteString(" ")
  1277. }
  1278. for tk, tv := range v.TableKV.Kv {
  1279. kvtext.WriteString(tk)
  1280. kvtext.WriteString(":")
  1281. kvtext.WriteString(tv)
  1282. kvtext.WriteString(" ")
  1283. }
  1284. }
  1285. if kvtext.Len() > 0 {
  1286. tmp["kvtext"] = kvtext.String()
  1287. }
  1288. if len(blocks) > 0 {
  1289. tmp["blocks"] = blocks
  1290. }
  1291. tmp["extract_content"] = j.Content
  1292. if e.TaskInfo.TestColl == "" {
  1293. if len(tmp) > 0 { //保存抽取结果
  1294. for field, _ := range e.Fields {
  1295. if tmp[field] == nil {
  1296. tmp[field] = "" //覆盖之前版本数据
  1297. }
  1298. }
  1299. tmp["repeat"] = 0
  1300. tmparr := []map[string]interface{}{
  1301. map[string]interface{}{
  1302. "_id": qu.StringTOBsonId(_id),
  1303. },
  1304. map[string]interface{}{"$set": tmp},
  1305. }
  1306. e.BidArr = append(e.BidArr, tmparr)
  1307. e.BidTotal++
  1308. }
  1309. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1310. id := tmp["_id"]
  1311. tmp["result"] = result
  1312. tmp["resultf"] = resultf
  1313. delete(tmp, "_id")
  1314. tmparr := []map[string]interface{}{
  1315. map[string]interface{}{
  1316. "_id": id,
  1317. },
  1318. map[string]interface{}{"$set": tmp},
  1319. }
  1320. e.ResultArr = append(e.ResultArr, tmparr)
  1321. }
  1322. } else { //测试结果
  1323. delete(tmp, "_id")
  1324. if len(j.BlockPackage) > 0 { //分包详情
  1325. bs, _ := json.Marshal(j.BlockPackage)
  1326. tmp["epackage"] = string(bs)
  1327. }
  1328. tmp["result"] = result
  1329. tmp["resultf"] = resultf
  1330. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1331. if !b {
  1332. log.Debug(e.TaskInfo.TestColl, _id)
  1333. }
  1334. }
  1335. }, func(err interface{}) {
  1336. log.Debug("AnalysisSaveResult err", err)
  1337. })
  1338. }
  1339. //保存其他
  1340. //kv、表格、块上的标签凡是新的标签都入库
  1341. //val type times firstid createtime 判定field
  1342. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1343. now := time.Now().Unix()
  1344. coll := e.TaskInfo.TestColl
  1345. if coll == "" {
  1346. coll = "extract_tag_result"
  1347. } else {
  1348. coll += "_tag"
  1349. }
  1350. datas := []map[string]interface{}{}
  1351. kv := map[string]int{}
  1352. for _, v := range j.Block {
  1353. //
  1354. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1355. if vv == nil || vv.KvTag == nil {
  1356. continue
  1357. }
  1358. for kkk, vvv := range vv.KvTag {
  1359. if vvv.Weight == ju.RetainKvWeight {
  1360. kv[kkk] = kv[kkk] + 1
  1361. }
  1362. }
  1363. }
  1364. for _, vv := range v.NotClassifyTitles {
  1365. datas = append(datas, map[string]interface{}{
  1366. "val": vv,
  1367. "times": 0,
  1368. "type": "block",
  1369. "firstid": j.SourceMid,
  1370. "createtime": now,
  1371. })
  1372. if len(datas) == 200 {
  1373. db.Mgo.SaveBulk(coll, datas...)
  1374. datas = []map[string]interface{}{}
  1375. }
  1376. }
  1377. }
  1378. for k, v := range kv {
  1379. datas = append(datas, map[string]interface{}{
  1380. "val": k,
  1381. "times": v,
  1382. "type": "kv",
  1383. "firstid": j.SourceMid,
  1384. "createtime": now,
  1385. })
  1386. if len(datas) == 200 {
  1387. db.Mgo.SaveBulk(coll, datas...)
  1388. datas = []map[string]interface{}{}
  1389. }
  1390. }
  1391. if len(datas) > 0 {
  1392. db.Mgo.SaveBulk(coll, datas...)
  1393. }
  1394. }
  1395. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1396. if j == nil {
  1397. return nil
  1398. }
  1399. if len(j.Block) > 0 {
  1400. for i, v := range j.Block {
  1401. rangetmp := new(ju.TmpBlock)
  1402. vb, _ := json.Marshal(v)
  1403. json.Unmarshal(vb, &rangetmp)
  1404. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1405. }
  1406. }
  1407. if j.ColonKV != nil {
  1408. cb, _ := json.Marshal(j.ColonKV)
  1409. tmpblock.ColonKV = string(cb)
  1410. }
  1411. if j.SpaceKV != nil {
  1412. sb, _ := json.Marshal(j.SpaceKV)
  1413. tmpblock.SpaceKV = string(sb)
  1414. }
  1415. if j.TableKV != nil {
  1416. tb, _ := json.Marshal(j.TableKV)
  1417. tmpblock.TableKV = string(tb)
  1418. }
  1419. return &tmpblock
  1420. }
  1421. //去重冗余字段
  1422. func delFiled(k string) bool {
  1423. return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1424. }
  1425. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1426. defer qu.Catch()
  1427. doc := j.Data
  1428. result := j.Result
  1429. _id := qu.BsonIdToSId((*doc)["_id"])
  1430. result = ScoreFields(j)
  1431. //结果排序
  1432. for _, val := range result {
  1433. ju.Sort(val)
  1434. }
  1435. return doc, result, _id
  1436. }
  1437. //辅助信息,如果没有排序先排序
  1438. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1439. fieldalls := map[string][]map[string]interface{}{}
  1440. for field, val := range j.Result {
  1441. //ju.Sort(val)
  1442. sfields := []map[string]interface{}{}
  1443. for _, v := range val {
  1444. standardized := false
  1445. if field == "buyer" || field == "winner" || field == "agency" {
  1446. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1447. if i > 0 {
  1448. standardized = true
  1449. }
  1450. }
  1451. sfield := map[string]interface{}{
  1452. "val": v.Value,
  1453. "type": v.Type,
  1454. "score": v.Score,
  1455. "blocktag": v.BlockTag,
  1456. "sourceval": v.SourceValue,
  1457. "standardized": standardized,
  1458. }
  1459. sfields = append(sfields, sfield)
  1460. }
  1461. fieldalls[field] = sfields
  1462. }
  1463. return fieldalls
  1464. }
  1465. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1466. defer qu.Catch()
  1467. //获取审核字段
  1468. for _, field := range e.AuditFields {
  1469. //1.分包
  1470. if resulttmp["package"] != nil {
  1471. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1472. for _, val := range packagedata {
  1473. if val[field] != nil {
  1474. fv := qu.ObjToString(val[field])
  1475. if fv != "" {
  1476. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1477. e.RedisMatch(field, fv, val) //redis匹配
  1478. } else { //除了buyer和winner,其他字段走规则匹配
  1479. e.RuleMatch(field, fv, val)
  1480. }
  1481. }
  1482. }
  1483. }
  1484. }
  1485. //2.外围
  1486. if resulttmp[field] != nil {
  1487. fv := qu.ObjToString(resulttmp[field])
  1488. if fv != "" {
  1489. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1490. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1491. } else { //除了buyer和winner,其他字段走规则匹配
  1492. e.RuleMatch(field, fv, resulttmp)
  1493. }
  1494. }
  1495. }
  1496. }
  1497. }
  1498. //Redis匹配
  1499. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1500. defer qu.Catch()
  1501. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1502. if i == 0 { //reids未找到,执行规则匹配
  1503. val[field+"_isredis"] = false
  1504. e.RuleMatch(field, fv, val) //规则匹配
  1505. } else { //redis找到,打标识存库
  1506. val[field+"_isredis"] = true
  1507. }
  1508. }
  1509. //规则匹配
  1510. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1511. defer qu.Catch()
  1512. if fieldval != "" {
  1513. SMap := e.StartMatch(field, fieldval)
  1514. //SMap.AddKey(field+"_isaudit", false)
  1515. for _, k := range SMap.Keys {
  1516. tmpMap[k] = SMap.Map[k]
  1517. }
  1518. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1519. }
  1520. }
  1521. //开始规则匹配
  1522. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1523. defer qu.Catch()
  1524. SMap := pretreated.NewSortMap()
  1525. lock.Lock()
  1526. f := e.RecogFieldMap[field]
  1527. lock.Unlock()
  1528. if len(f) > 0 {
  1529. fid := qu.BsonIdToSId(f["_id"])
  1530. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1531. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1532. if textAfterRecogFieldPrerule != "" {
  1533. lock.Lock()
  1534. classMap := e.FidClassMap[fid]
  1535. lock.Unlock()
  1536. L:
  1537. for _, c := range classMap { //class
  1538. classid := qu.BsonIdToSId(c["_id"])
  1539. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1540. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1541. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1542. if textAfterClassPrerule != "" {
  1543. lock.Lock()
  1544. ruleMap := e.CidRuleMap[classid]
  1545. lock.Unlock()
  1546. for _, r := range ruleMap { //rule
  1547. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1548. s_name := qu.ObjToString(r["s_name"])
  1549. rule := r["rule"].([]interface{})
  1550. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1551. if textAfterRulePrerule != "" {
  1552. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1553. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1554. if savefield != "" { //保存字段不为空,存储代码信息
  1555. SMap.AddKey(field+"_"+savefield, s_name)
  1556. }
  1557. break L
  1558. }
  1559. }
  1560. }
  1561. }
  1562. }
  1563. }
  1564. }
  1565. return SMap
  1566. }