extract.go 44 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. log "github.com/donnie4w/go-logger/logger"
  19. "gopkg.in/mgo.v2/bson"
  20. )
  21. var (
  22. lock, lockrule, lockclear sync.RWMutex
  23. cut = ju.NewCut() //获取正文并清理
  24. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  25. TaskList map[string]*ExtractTask //任务列表
  26. ClearTaskList map[string]*ClearTask //清理任务列表
  27. saveLimit = 200 //抽取日志批量保存
  28. PageSize = 5000 //查询分页
  29. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
  30. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  31. )
  32. //启动测试抽取
  33. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  34. defer qu.Catch()
  35. ext := &ExtractTask{}
  36. ext.Id = taskId
  37. ext.IsRun = true
  38. ext.InitTestTaskInfo(resultcoll, trackcoll)
  39. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  40. ext.InitRulePres()
  41. ext.InitRuleBacks()
  42. ext.InitRuleCore()
  43. ext.InitPkgCore()
  44. ext.InitBlockRule()
  45. ext.InfoTypeList()
  46. ext.InitTag()
  47. ext.InitClearFn()
  48. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  49. //初始化城市DFA信息
  50. ext.InitCityDFA()
  51. ext.InitAreaCode()
  52. ext.InitPostCode()
  53. }
  54. //质量审核
  55. ext.InitAuditFields()
  56. ext.InitAuditRule()
  57. ext.InitAuditClass()
  58. ext.InitAuditRecogField()
  59. //品牌抽取是否开启
  60. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  61. //附件抽取是否开启
  62. ext.InitFile()
  63. return RunExtractTestTask(ext, startId, num)
  64. }
  65. func IdTrans(startId string) bson.ObjectId {
  66. defer qu.Catch()
  67. return bson.ObjectIdHex(startId)
  68. }
  69. //开始测试任务抽取
  70. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  71. n, _ := strconv.Atoi(num)
  72. id := IdTrans(startId)
  73. if id.Valid() {
  74. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  75. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  76. for _, v := range *list {
  77. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  78. continue
  79. }
  80. var j, jf *ju.Job
  81. if ext.IsFileField && v["projectinfo"] != nil {
  82. v["isextFile"] = true
  83. j, jf = ext.PreInfo(v)
  84. } else {
  85. j, _ = ext.PreInfo(v)
  86. }
  87. ext.TaskInfo.ProcessPool <- true
  88. go ext.ExtractProcess(j, jf)
  89. }
  90. return true
  91. } else {
  92. return false
  93. }
  94. }
  95. //启动抽取
  96. func StartExtractTaskId(taskId string) bool {
  97. defer qu.Catch()
  98. isgo := false
  99. ext := TaskList[taskId]
  100. if ext == nil {
  101. ext = &ExtractTask{}
  102. ext.Id = taskId
  103. ext.InitTaskInfo()
  104. isgo = true
  105. } else {
  106. ext.Id = taskId
  107. ext.InitTaskInfo()
  108. }
  109. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  110. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  111. ext.InitRulePres()
  112. ext.InitRuleBacks()
  113. ext.InitRuleCore()
  114. ext.InitPkgCore()
  115. ext.InitBlockRule()
  116. ext.InfoTypeList()
  117. ext.InitTag()
  118. ext.InitClearFn()
  119. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  120. //初始化城市DFA信息
  121. ext.InitCityDFA()
  122. ext.InitAreaCode()
  123. ext.InitPostCode()
  124. }
  125. //质量审核
  126. ext.InitAuditFields()
  127. ext.InitAuditRule()
  128. ext.InitAuditClass()
  129. ext.InitAuditRecogField()
  130. //品牌抽取是否开启
  131. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  132. //附件抽取是否开启
  133. ext.InitFile()
  134. ext.IsRun = true
  135. go ext.ResultSave(true)
  136. go ext.BidSave(true)
  137. if isgo {
  138. go RunExtractTask(taskId)
  139. }
  140. TaskList[taskId] = ext
  141. return true
  142. }
  143. //停止抽取
  144. func StopExtractTaskId(taskId string) bool {
  145. defer qu.Catch()
  146. ext := TaskList[taskId]
  147. if ext != nil {
  148. ext.IsRun = false
  149. TaskList[taskId] = ext
  150. }
  151. //更新task.s_extlastid
  152. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  153. return true
  154. }
  155. //开始抽取
  156. func RunExtractTask(taskId string) {
  157. defer qu.Catch()
  158. ext := TaskList[taskId]
  159. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  160. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  161. pageNum := (count + PageSize - 1) / PageSize
  162. limit := PageSize
  163. if count < PageSize {
  164. limit = count
  165. }
  166. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  167. for i := 0; i < pageNum; i++ {
  168. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  169. fmt.Printf("page=%d,query=%v", i+1, query)
  170. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  171. for _, v := range *list {
  172. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  173. continue
  174. }
  175. _id := qu.BsonIdToSId(v["_id"])
  176. //log.Debug(_id)
  177. if !ext.IsRun {
  178. break
  179. }
  180. var j, jf *ju.Job
  181. if ext.IsFileField && v["projectinfo"] != nil {
  182. v["isextFile"] = true
  183. j, jf = ext.PreInfo(v)
  184. } else {
  185. j, _ = ext.PreInfo(v)
  186. }
  187. ext.TaskInfo.ProcessPool <- true
  188. go ext.ExtractProcess(j, jf)
  189. ext.TaskInfo.LastExtId = _id
  190. }
  191. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  192. if !ext.IsRun {
  193. break
  194. }
  195. }
  196. //更新task.s_extlastid
  197. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  198. }
  199. //信息预处理-不和版本关联,取最新版本的配置项
  200. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  201. return (&ExtractTask{}).PreInfo(doc)
  202. }
  203. //信息预处理-和版本关联
  204. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  205. defer qu.Catch()
  206. //判断是否有附件这个字段
  207. var isextFile bool
  208. if doc["isextFile"] != nil {
  209. isextFile = doc["isextFile"].(bool)
  210. }
  211. detail := ""
  212. d1, _ := doc["detail"].(string)
  213. d2, _ := doc["contenthtml"].(string)
  214. if len(d1) >= len(d2) || d2 == "" {
  215. detail = d1
  216. } else {
  217. detail = d2
  218. }
  219. detail = ju.CutLableStr(detail)
  220. detail = cut.ClearHtml(detail)
  221. doc["detail"] = detail
  222. if isextFile {
  223. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  224. }
  225. toptype := qu.ObjToString(doc["toptype"])
  226. subtype := qu.ObjToString(doc["subtype"])
  227. if qu.ObjToString(doc["type"]) == "bid" {
  228. toptype = "结果"
  229. }
  230. if toptype == "" {
  231. toptype = "all"
  232. }
  233. if subtype == "" {
  234. subtype = "all"
  235. }
  236. j = &ju.Job{
  237. SourceMid: qu.BsonIdToSId(doc["_id"]),
  238. Category: toptype,
  239. CategorySecond: subtype,
  240. Content: qu.ObjToString(doc["detail"]),
  241. SpiderCode: qu.ObjToString(doc["spidercode"]),
  242. //Domain: qu.ObjToString(doc["domain"]),
  243. //Href: qu.ObjToString(doc["href"]),
  244. Title: qu.ObjToString(doc["title"]),
  245. Data: &doc,
  246. City: qu.ObjToString(doc["city"]),
  247. Province: qu.ObjToString(doc["area"]),
  248. Jsondata: qu.ObjToMap(doc["jsondata"]),
  249. Result: map[string][]*ju.ExtField{},
  250. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  251. RuleBlock: e.RuleBlock,
  252. }
  253. if isextFile {
  254. jf = &ju.Job{
  255. SourceMid: qu.BsonIdToSId(doc["_id"]),
  256. Category: toptype,
  257. Content: qu.ObjToString(doc["detailfile"]),
  258. SpiderCode: qu.ObjToString(doc["spidercode"]),
  259. Title: qu.ObjToString(doc["title"]),
  260. Data: &doc,
  261. City: qu.ObjToString(doc["city"]),
  262. Province: qu.ObjToString(doc["area"]),
  263. Jsondata: qu.ObjToMap(doc["jsondata"]),
  264. Result: map[string][]*ju.ExtField{},
  265. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  266. RuleBlock: e.RuleBlock,
  267. IsFile: isextFile,
  268. }
  269. }
  270. qu.Try(func() {
  271. pretreated.AnalyStart(j) //job.Block分块
  272. if isextFile {
  273. pretreated.AnalyStart(jf)
  274. }
  275. }, func(err interface{}) {
  276. log.Debug("pretreated.AnalyStart", err)
  277. })
  278. return j, jf
  279. }
  280. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  281. func file2text(doc *map[string]interface{}) {
  282. var strfileinfo bytes.Buffer
  283. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  284. if va, ok := v["attachments"].(map[string]interface{}); ok {
  285. for _, vaatt := range va {
  286. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  287. if qu.ObjToString(fileinfo["content"]) != "" {
  288. switch fileinfo["content"].(type) {
  289. case string:
  290. lock.Lock()
  291. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  292. lock.Unlock()
  293. case []map[string]interface{}:
  294. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  295. if fv["context"] != nil {
  296. lock.Lock()
  297. strfileinfo.WriteString(fv["context"].(string) + " \n")
  298. lock.Unlock()
  299. }
  300. }
  301. }
  302. }
  303. }
  304. }
  305. }
  306. }
  307. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  308. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  309. }
  310. }
  311. //抽取
  312. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  313. e.ExtractDetail(j)
  314. if jf != nil && jf.IsFile {
  315. e.ExtractFile(jf)
  316. }
  317. //分析抽取结果并保存 todo
  318. AnalysisSaveResult(j, jf, e)
  319. <-e.TaskInfo.ProcessPool
  320. }
  321. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  322. qu.Try(func() {
  323. doc := *j.Data
  324. //全局前置规则,结果覆盖doc属性
  325. //for _, v := range e.RulePres {
  326. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  327. //}
  328. tmprules := map[string][]*RuleCore{}
  329. lockrule.Lock()
  330. if j.Category == "all" || j.CategorySecond == "all" {
  331. for k, vc1 := range e.RuleCores["all_all"] {
  332. tmprules[k] = vc1
  333. }
  334. } else {
  335. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  336. tmprules[k] = vc1
  337. }
  338. }
  339. if len(tmprules) < 1 { //分类未覆盖部分
  340. for k, vc1 := range e.RuleCores["all_all"] {
  341. tmprules[k] = vc1
  342. }
  343. }
  344. lockrule.Unlock()
  345. //抽取规则
  346. for _, vc1 := range tmprules {
  347. for _, vc := range vc1 {
  348. tmp := ju.DeepCopy(doc).(map[string]interface{})
  349. //是否进入逻辑
  350. if !ju.Logic(vc.LuaLogic, tmp) {
  351. continue
  352. }
  353. ////抽取-前置规则
  354. //for _, v := range vc.RulePres {
  355. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  356. //}
  357. // log.Debug("抽取-前置规则", tmp)
  358. //抽取-规则
  359. for _, v := range vc.RuleCores {
  360. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  361. }
  362. // log.Debug("抽取-规则", tmp)
  363. //项目名称未能抽取到,标题来凑
  364. if vc.Field == "projectname" {
  365. //if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
  366. field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  367. if tmp["blocktag"] != nil {
  368. btag:= make(map[string]string)
  369. for k := range tmp["blocktag"].(map[string]bool){
  370. btag[k] = TagConfigDesc[k]
  371. }
  372. field.BlockTag = btag
  373. }
  374. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  375. //}
  376. }
  377. //抽取-后置规则
  378. for _, v := range vc.RuleBacks {
  379. ExtRegBack(j, v, e.TaskInfo)
  380. }
  381. // log.Debug("抽取-后置规则", tmp)
  382. }
  383. }
  384. //全局后置规则
  385. for _, v := range e.RuleBacks {
  386. ExtRegBack(j, v, e.TaskInfo)
  387. }
  388. //候选人加入
  389. if len(j.Winnerorder) > 0 {
  390. winner := &ju.ExtField{
  391. Field: "winner",
  392. Code: "",
  393. RuleText: "",
  394. Type: "winnerorder",
  395. MatchType: "winnerorder",
  396. ExtFrom: "",
  397. Value: j.Winnerorder[0]["entname"],
  398. Score: 0,
  399. }
  400. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  401. winner.Score = -5
  402. }
  403. winners := j.Result["winner"]
  404. if winners != nil {
  405. winners = append(winners, winner)
  406. } else {
  407. winners = []*ju.ExtField{}
  408. winners = append(winners, winner)
  409. }
  410. j.Result["winner"] = winners
  411. }
  412. //函数清理
  413. for key, val := range j.Result {
  414. for _, v := range val {
  415. lockclear.Lock()
  416. cfn := e.ClearFn[key]
  417. lockclear.Unlock()
  418. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  419. v.Value = data[0]
  420. //清理特殊符号
  421. lockclear.Lock()
  422. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  423. clear.MesField[key] != nil {
  424. text := qu.ObjToString(v.Value)
  425. text = clear.OtherClean(key, text)
  426. if text != "" {
  427. v.Value = text
  428. }
  429. }
  430. lockclear.Unlock()
  431. }
  432. }
  433. PackageDetail(j, e) //处理分包信息
  434. // bs, _ := json.Marshal(j.Result)
  435. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  436. }, func(err interface{}) {
  437. log.Debug("ExtractProcess err", err)
  438. })
  439. }
  440. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  441. qu.Try(func() {
  442. doc := *j.Data
  443. //全局前置规则,结果覆盖doc属性
  444. // for _, v := range e.RulePres {
  445. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  446. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  447. // }
  448. // }
  449. //抽取规则
  450. tmprules := map[string][]*RuleCore{}
  451. lockrule.Lock()
  452. if j.Category == "all" || j.CategorySecond == "all" {
  453. for k, vc1 := range e.RuleCores["all_all"] {
  454. tmprules[k] = vc1
  455. }
  456. } else {
  457. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  458. tmprules[k] = vc1
  459. }
  460. }
  461. lockrule.Unlock()
  462. for _, vc1 := range tmprules {
  463. for _, vc := range vc1 {
  464. tmp := ju.DeepCopy(doc).(map[string]interface{})
  465. //是否进入逻辑
  466. if !ju.Logic(vc.LuaLogic, tmp) {
  467. continue
  468. }
  469. //抽取-前置规则
  470. // for _, v := range vc.RulePres {
  471. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  472. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  473. // }
  474. // }
  475. // log.Debug("抽取-前置规则", tmp)
  476. //抽取-规则
  477. for _, v := range vc.RuleCores {
  478. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  479. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  480. }
  481. }
  482. // log.Debug("抽取-规则", tmp)
  483. //抽取-后置规则
  484. for _, v := range vc.RuleBacks {
  485. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  486. ExtRegBack(j, v, e.TaskInfo)
  487. }
  488. }
  489. // log.Debug("抽取-后置规则", tmp)
  490. }
  491. }
  492. //全局后置规则
  493. for _, v := range e.RuleBacks {
  494. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  495. ExtRegBack(j, v, e.TaskInfo)
  496. }
  497. }
  498. //候选人加入
  499. if len(j.Winnerorder) > 0 {
  500. winner := &ju.ExtField{
  501. Field: "winner",
  502. Code: "",
  503. RuleText: "",
  504. Type: "winnerorder",
  505. MatchType: "winnerorder",
  506. ExtFrom: "",
  507. Value: j.Winnerorder[0]["entname"],
  508. Score: 0,
  509. }
  510. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  511. winner.Score = -5
  512. }
  513. winners := j.Result["winner"]
  514. if winners != nil {
  515. winners = append(winners, winner)
  516. } else {
  517. winners = []*ju.ExtField{}
  518. winners = append(winners, winner)
  519. }
  520. j.Result["winner"] = winners
  521. }
  522. //函数清理
  523. for key, val := range j.Result {
  524. for _, v := range val {
  525. lockclear.Lock()
  526. cfn := e.ClearFn[key]
  527. lockclear.Unlock()
  528. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  529. v.Value = data[0]
  530. //清理特殊符号
  531. lockclear.Lock()
  532. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  533. clear.MesField[key] != nil {
  534. text := qu.ObjToString(v.Value)
  535. text = clear.OtherClean(key, text)
  536. v.Value = text
  537. }
  538. lockclear.Unlock()
  539. }
  540. }
  541. PackageDetail(j, e) //处理分包信息
  542. // bs, _ := json.Marshal(j.Result)
  543. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  544. }, func(err interface{}) {
  545. log.Debug("ExtractProcess err", err)
  546. })
  547. }
  548. //前置过滤
  549. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  550. defer qu.Catch()
  551. before := ju.DeepCopy(doc).(map[string]interface{})
  552. extinfo := map[string]interface{}{}
  553. if in.IsLua {
  554. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  555. if j != nil {
  556. lua.Block = j.Block
  557. }
  558. extinfo = lua.RunScript("pre")
  559. for k, v := range extinfo { //结果覆盖原doc
  560. doc[k] = v
  561. }
  562. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  563. } else {
  564. var key string
  565. if !j.IsFile {
  566. key = qu.If(in.Field == "", "detail", in.Field).(string)
  567. } else {
  568. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  569. }
  570. text := qu.ObjToString(doc[key])
  571. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  572. doc[key] = extinfo[key] //结果覆盖原doc
  573. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  574. }
  575. return doc
  576. }
  577. //抽取-规则
  578. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  579. defer qu.Catch()
  580. //废标、流标、ppp等跳过
  581. b := IsExtract(in.Field, j.Title, j.Content)
  582. if !b {
  583. return
  584. }
  585. if in.IsLua {
  586. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  587. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  588. lua.Block = j.Block
  589. extinfo := lua.RunScript("core")
  590. for k, v := range extinfo {
  591. if k == in.Field {
  592. if j.Result[k] == nil {
  593. j.Result[k] = [](*ju.ExtField){}
  594. }
  595. if tmps, ok := v.([]map[string]interface{}); ok {
  596. for _, tmp := range tmps {
  597. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
  598. if tmp["blocktag"] != nil {
  599. btag := make(map[string]string)
  600. for k := range tmp["blocktag"].(map[string]bool){
  601. btag[k] = TagConfigDesc[k]
  602. }
  603. field.BlockTag = btag
  604. }
  605. j.Result[k] = append(j.Result[k], field)
  606. }
  607. }
  608. }
  609. }
  610. if len(extinfo) > 0 {
  611. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  612. }
  613. } else {
  614. //全文正则
  615. //text := qu.ObjToString(doc[extfrom])
  616. //if in.Field != "" {
  617. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  618. // if len(extinfo) > 0 {
  619. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  620. // }
  621. //}
  622. //块抽取
  623. if in.Field != "" {
  624. if extfrom == "title" {
  625. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
  626. if len(extinfo) > 0 {
  627. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  628. }
  629. } else {
  630. for _, v := range j.Block {
  631. btag := make(map[string]string)
  632. for k:=range v.Classify{
  633. btag[k] = TagConfigDesc[k]
  634. }
  635. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
  636. if len(extinfo) > 0 {
  637. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  638. }
  639. }
  640. }
  641. }
  642. }
  643. }
  644. //lua脚本根据属性设置提取kv值
  645. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  646. defer qu.Catch()
  647. kvmap := map[string][]map[string]interface{}{}
  648. for fieldname, field := range in.LFields {
  649. lock.Lock()
  650. tags := t[field] //获取对应标签库
  651. lock.Unlock()
  652. if tags == nil {
  653. continue
  654. }
  655. for _, bl := range j.Block {
  656. //冒号kv
  657. if bl.ColonKV != nil {
  658. kvs := bl.ColonKV.Kvs
  659. kvs2 := bl.ColonKV.Kvs_2
  660. // log.Debug("ColonKV1", kvs)
  661. // log.Debug("ColonKV2", kvs2)
  662. for _, tag := range tags {
  663. for _, kv := range kvs {
  664. if tag.Type == "string" {
  665. if kv.Key == tag.Key {
  666. text := ju.TrimLRSpace(kv.Value, "")
  667. if text != "" {
  668. kvmap[field] = append(kvmap[field], map[string]interface{}{
  669. "field": field,
  670. "code": in.Code,
  671. "ruletext": tag.Key,
  672. "extfrom": extfrom,
  673. "sourcevalue": text,
  674. "value": text,
  675. "type": "colon1",
  676. "matchtype": "tag_string",
  677. "blocktag": bl.Classify,
  678. })
  679. }
  680. break
  681. }
  682. } else if tag.Type == "regexp" {
  683. if tag.Reg.MatchString(kv.Key) {
  684. text := ju.TrimLRSpace(kv.Value, "")
  685. if text != "" {
  686. kvmap[field] = append(kvmap[field], map[string]interface{}{
  687. "field": field,
  688. "code": in.Code,
  689. "ruletext": tag.Key,
  690. "extfrom": extfrom,
  691. "sourcevalue": text,
  692. "value": text,
  693. "type": "colon1",
  694. "matchtype": "tag_regexp",
  695. "blocktag": bl.Classify,
  696. })
  697. }
  698. break
  699. }
  700. }
  701. }
  702. for _, kv := range kvs2 {
  703. if tag.Type == "string" {
  704. if kv.Key == tag.Key {
  705. text := ju.TrimLRSpace(kv.Value, "")
  706. if text != "" {
  707. kvmap[field] = append(kvmap[field], map[string]interface{}{
  708. "field": field,
  709. "code": in.Code,
  710. "ruletext": tag.Key,
  711. "extfrom": extfrom,
  712. "sourcevalue": text,
  713. "value": text,
  714. "type": "colon2",
  715. "matchtype": "tag_string",
  716. "blocktag": bl.Classify,
  717. })
  718. }
  719. break
  720. }
  721. } else if tag.Type == "regexp" {
  722. if tag.Reg.MatchString(kv.Key) {
  723. text := ju.TrimLRSpace(kv.Value, "")
  724. if text != "" {
  725. kvmap[field] = append(kvmap[field], map[string]interface{}{
  726. "field": field,
  727. "code": in.Code,
  728. "ruletext": tag.Key,
  729. "extfrom": extfrom,
  730. "sourcevalue": text,
  731. "value": text,
  732. "type": "colon2",
  733. "matchtype": "tag_regexp",
  734. "blocktag": bl.Classify,
  735. })
  736. }
  737. break
  738. }
  739. }
  740. }
  741. }
  742. }
  743. //空格kv
  744. if bl.SpaceKV != nil {
  745. kvs := bl.SpaceKV.Kvs
  746. // log.Debug("SpaceKV", kvs)
  747. for _, tag := range tags {
  748. for _, kv := range kvs {
  749. if tag.Type == "string" {
  750. if kv.Key == tag.Key {
  751. text := ju.TrimLRSpace(kv.Value, "")
  752. if text != "" {
  753. kvmap[field] = append(kvmap[field], map[string]interface{}{
  754. "field": field,
  755. "code": in.Code,
  756. "ruletext": tag.Key,
  757. "extfrom": extfrom,
  758. "sourcevalue": text,
  759. "value": text,
  760. "type": "space",
  761. "matchtype": "tag_string",
  762. "blocktag": bl.Classify,
  763. })
  764. }
  765. break
  766. }
  767. } else if tag.Type == "regexp" {
  768. if tag.Reg.MatchString(kv.Key) {
  769. text := ju.TrimLRSpace(kv.Value, "")
  770. if text != "" {
  771. kvmap[field] = append(kvmap[field], map[string]interface{}{
  772. "field": field,
  773. "code": in.Code,
  774. "ruletext": tag.Key,
  775. "extfrom": extfrom,
  776. "sourcevalue": text,
  777. "value": text,
  778. "type": "space",
  779. "matchtype": "tag_regexp",
  780. "blocktag": bl.Classify,
  781. })
  782. }
  783. break
  784. }
  785. }
  786. }
  787. }
  788. }
  789. //表格kv
  790. if bl.TableKV != nil {
  791. tkv := bl.TableKV
  792. // log.Debug("tkv", tkv)
  793. for k, v := range tkv.Kv {
  794. if k == fieldname {
  795. if len(tags) > -tkv.KvIndex[fieldname] {
  796. ruletext := ""
  797. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  798. ruletext = "项目名称"
  799. } else {
  800. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  801. }
  802. kvmap[field] = append(kvmap[field], map[string]interface{}{
  803. "field": field,
  804. "code": in.Code,
  805. "ruletext": ruletext,
  806. "extfrom": "table",
  807. "sourcevalue": v,
  808. "value": v,
  809. "type": "table",
  810. "matchtype": "tag_string",
  811. "blocktag": bl.Classify,
  812. })
  813. } else { //涉及其他待处理
  814. // log.Debug(tags)
  815. }
  816. }
  817. }
  818. }
  819. }
  820. }
  821. return kvmap
  822. }
  823. //正则提取结果
  824. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  825. defer qu.Catch()
  826. extinfo := map[string][]map[string]interface{}{}
  827. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  828. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  829. if len(apos) > 0 {
  830. pos := apos[0]
  831. for k, p := range v.RegCore.ExtractPos {
  832. if len(pos) > p {
  833. if pos[p] == -1 || pos[p+1] == -1 {
  834. continue
  835. }
  836. val := text[pos[p]:pos[p+1]]
  837. sourcevalue := val
  838. if val == "招标公告" {
  839. return extinfo
  840. }
  841. if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
  842. val = text
  843. }
  844. tmps := []map[string]interface{}{}
  845. tmp := map[string]interface{}{
  846. "field": v.Field,
  847. "code": v.Code,
  848. "ruletext": v.RuleText,
  849. "extfrom": extfrom,
  850. "value": val,
  851. "type": "regexp",
  852. "matchtype": "regcontent",
  853. "blocktag": *tag,
  854. }
  855. tmps = append(tmps, tmp)
  856. extinfo[k] = tmps
  857. if strings.TrimSpace(val) != "" {
  858. if v.RegCore.NumSign == -1 { //正负值修正
  859. val = "-" + val
  860. }
  861. exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
  862. if tmp["blocktag"] != nil {
  863. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  864. }
  865. j.Result[k] = append(j.Result[k], &exfield)
  866. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  867. }
  868. }
  869. }
  870. if len(extinfo) == 0 {
  871. regArr := strings.Split(v.RuleText, "__")
  872. //fmt.Println(regArr[0])
  873. if len(regArr) > 0 {
  874. reg, err := regexp.Compile(regArr[0])
  875. if err == nil {
  876. datavals := reg.FindStringSubmatch(text)
  877. tmps := []map[string]interface{}{}
  878. for _, value := range datavals {
  879. if value == "" {
  880. continue
  881. }
  882. tmp := map[string]interface{}{
  883. "field": v.Field,
  884. "code": v.Code,
  885. "ruletext": regArr[0],
  886. "extfrom": extfrom,
  887. "value": value,
  888. "type": "regexp",
  889. "matchtype": "regcontent",
  890. "blocktag": *tag,
  891. }
  892. tmps = append(tmps, tmp)
  893. extinfo[v.Field] = tmps
  894. exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
  895. if tmp["blocktag"] != nil {
  896. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  897. }
  898. j.Result[v.Field] = append(j.Result[v.Field], &exfield)
  899. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  900. }
  901. }
  902. }
  903. }
  904. }
  905. } else {
  906. pos := v.RegCore.Reg.FindStringIndex(text)
  907. val := ""
  908. if len(pos) == 2 {
  909. text = text[pos[1]:]
  910. rs := regexp.MustCompile("[^\r\n\t]+")
  911. tmp := rs.FindAllString(text, -1)
  912. if len(tmp) > 0 {
  913. val = tmp[0]
  914. }
  915. }
  916. if val != "" {
  917. tmps := []map[string]interface{}{}
  918. tmp := map[string]interface{}{
  919. "field": v.Field,
  920. "code": v.Code,
  921. "ruletext": v.RuleText,
  922. "extfrom": extfrom,
  923. "value": val,
  924. "type": "regexp",
  925. "matchtype": "regcontent",
  926. "blocktag": *tag,
  927. }
  928. tmps = append(tmps, tmp)
  929. extinfo[v.Field] = tmps
  930. if j.Result[v.Field] == nil {
  931. j.Result[v.Field] = [](*ju.ExtField){}
  932. }
  933. field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
  934. if tmp["blocktag"] != nil {
  935. field.BlockTag = tmp["blocktag"].(map[string]string)
  936. }
  937. j.Result[v.Field] = append(j.Result[v.Field], field)
  938. }
  939. }
  940. return extinfo
  941. }
  942. //后置过滤
  943. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  944. defer qu.Catch()
  945. if in.IsLua {
  946. result := GetResultMapForLua(j)
  947. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  948. if j != nil {
  949. lua.Block = j.Block
  950. }
  951. extinfo := lua.RunScript("back")
  952. for k, v := range extinfo {
  953. if tmps, ok := v.([]map[string]interface{}); ok {
  954. j.Result[k] = [](*ju.ExtField){}
  955. for _, tmp := range tmps {
  956. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  957. if tmp["blocktag"] != nil {
  958. field.BlockTag = tmp["blocktag"].(map[string]string)
  959. }
  960. j.Result[k] = append(j.Result[k], field)
  961. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  962. }
  963. }
  964. }
  965. if len(extinfo) > 0 {
  966. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  967. }
  968. } else {
  969. extinfo := map[string]interface{}{}
  970. if in.Field != "" {
  971. if j.Result[in.Field] != nil {
  972. tmp := j.Result[in.Field]
  973. exts := []interface{}{}
  974. for k, v := range tmp {
  975. //table抽取到的数据不清理
  976. // if v.Type == "table" && v.Field != "projectname" {
  977. // continue
  978. // }
  979. text := qu.ObjToString(v.Value)
  980. if text != "" {
  981. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  982. }
  983. j.Result[in.Field][k].Value = text
  984. exts = append(exts, map[string]interface{}{
  985. "field": v.Field,
  986. "code": v.Code,
  987. "ruletext": v.RuleText,
  988. "type": v.Type,
  989. "matchtype": v.MatchType,
  990. "extfrom": v.ExtFrom,
  991. "value": text,
  992. })
  993. }
  994. extinfo[in.Field] = exts
  995. if len(extinfo) > 0 {
  996. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  997. }
  998. }
  999. } else {
  1000. for key, tmp := range j.Result {
  1001. exts := []interface{}{}
  1002. for k, v := range tmp {
  1003. if v.Type == "table" { //table抽取到的数据不清理
  1004. continue
  1005. }
  1006. text := qu.ObjToString(v.Value)
  1007. if text != "" {
  1008. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1009. }
  1010. j.Result[key][k].Value = text
  1011. exts = append(exts, map[string]interface{}{
  1012. "field": v.Field,
  1013. "code": v.Code,
  1014. "ruletext": v.RuleText,
  1015. "type": v.Type,
  1016. "matchtype": v.MatchType,
  1017. "extfrom": v.ExtFrom,
  1018. "value": text,
  1019. })
  1020. }
  1021. extinfo[key] = exts
  1022. }
  1023. if len(extinfo) > 0 {
  1024. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1025. }
  1026. }
  1027. }
  1028. }
  1029. //获取抽取结果map[string][]interface{},lua脚本使用
  1030. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1031. defer qu.Catch()
  1032. result := map[string][]map[string]interface{}{}
  1033. for key, val := range j.Result {
  1034. if result[key] == nil {
  1035. result[key] = []map[string]interface{}{}
  1036. }
  1037. for _, v := range val {
  1038. tmp := map[string]interface{}{
  1039. "field": v.Field,
  1040. "code": v.Code,
  1041. "ruletext": v.RuleText,
  1042. "value": v.Value,
  1043. "type": v.Type,
  1044. "matchtype": v.MatchType,
  1045. "extfrom": v.ExtFrom,
  1046. }
  1047. result[key] = append(result[key], tmp)
  1048. }
  1049. }
  1050. return result
  1051. }
  1052. //抽取日志
  1053. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1054. defer qu.Catch()
  1055. if !t.IsEtxLog {
  1056. return
  1057. }
  1058. logdata := map[string]interface{}{
  1059. "code": v.Code,
  1060. "name": v.Name,
  1061. "type": ftype,
  1062. "ruletext": v.RuleText,
  1063. "islua": v.IsLua,
  1064. "field": v.Field,
  1065. "version": t.Version,
  1066. "taskname": t.Name,
  1067. "before": before,
  1068. "extinfo": extinfo,
  1069. "sid": sid,
  1070. "comeintime": time.Now().Unix(),
  1071. }
  1072. lock.Lock()
  1073. ExtLogs[t] = append(ExtLogs[t], logdata)
  1074. lock.Unlock()
  1075. }
  1076. //保存抽取日志
  1077. func SaveExtLog() {
  1078. defer qu.Catch()
  1079. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1080. lock.Lock()
  1081. tmpLogs = ExtLogs
  1082. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1083. lock.Unlock()
  1084. for k, v := range tmpLogs {
  1085. if len(v) < saveLimit {
  1086. db.Mgo.SaveBulk(k.TrackColl, v...)
  1087. } else {
  1088. for {
  1089. if len(v) > saveLimit {
  1090. tmp := v[:saveLimit]
  1091. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1092. v = v[saveLimit:]
  1093. } else {
  1094. db.Mgo.SaveBulk(k.TrackColl, v...)
  1095. break
  1096. }
  1097. }
  1098. }
  1099. }
  1100. time.AfterFunc(10*time.Second, SaveExtLog)
  1101. }
  1102. type FieldValue struct {
  1103. Value interface{}
  1104. Count int
  1105. }
  1106. //分析抽取结果并保存
  1107. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1108. qu.Try(func() {
  1109. doc, result, _id := funcAnalysis(j)
  1110. if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
  1111. go otherNeedSave(j, result, e)
  1112. }
  1113. auxinfo := auxInfo(j)
  1114. //从排序结果中取值
  1115. tmp := map[string]interface{}{} //抽取值
  1116. tmp["fieldall"] = auxinfo
  1117. for _, val := range result {
  1118. for _, v := range val { //取第一个非负数
  1119. if v.Score > -1 {
  1120. tmp[v.Field] = v.Value
  1121. break
  1122. }
  1123. }
  1124. }
  1125. if len(j.PackageInfo) > 0 { //分包信息
  1126. tmp["package"] = j.PackageInfo
  1127. }
  1128. if len(j.Winnerorder) > 0 { //候选人信息
  1129. tmp["winnerorder"] = j.Winnerorder
  1130. }
  1131. //处理附件
  1132. var resultf map[string][]*ju.ExtField
  1133. if jf != nil {
  1134. _, resultf, _ = funcAnalysis(jf)
  1135. auxinfof := auxInfo(jf)
  1136. tmp["fieldallf"] = auxinfof
  1137. ffield := map[string]interface{}{}
  1138. for _, val := range resultf {
  1139. for _, v := range val { //取第一个非负数
  1140. if v.Score > -1 {
  1141. ffield[v.Field] = v.Value
  1142. break
  1143. }
  1144. }
  1145. }
  1146. if len(jf.PackageInfo) > 0 { //分包信息
  1147. ffield["package"] = jf.PackageInfo
  1148. }
  1149. if len(jf.Winnerorder) > 0 { //候选人信息
  1150. ffield["winnerorder"] = jf.Winnerorder
  1151. }
  1152. tmp["ffield"] = ffield
  1153. }
  1154. for k, v := range *doc {
  1155. //去重冗余字段
  1156. if delFiled(k) {
  1157. continue
  1158. }
  1159. if tmp[k] == nil {
  1160. tmp[k] = v
  1161. }
  1162. }
  1163. //质量审核
  1164. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1165. e.QualityAudit(tmp)
  1166. }
  1167. if e.IsExtractCity { //城市抽取
  1168. e.ExtractCity(j, tmp, _id)
  1169. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1170. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1171. // tmp["district"] = d
  1172. // if b {
  1173. // tmp["city"] = c
  1174. // tmp["area"] = p
  1175. // }
  1176. }
  1177. //品牌抽取
  1178. if ju.IsBrandGoods {
  1179. tmp["checkhas"] = map[string]int{
  1180. "hastable": j.HasTable,
  1181. "hasgoods": j.HasGoods,
  1182. "hasbrand": j.HasBrand,
  1183. "haskey": j.HasKey,
  1184. }
  1185. if len(j.BrandData) > 0 {
  1186. tmp["tablebrand"] = j.BrandData
  1187. }
  1188. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1189. }
  1190. //所有kv组成的字符串
  1191. var kvtext bytes.Buffer
  1192. blocks := make([]ju.BlockAndTag, 0)
  1193. for _, v := range j.Block {
  1194. //分包和标签
  1195. if ju.Config["saveblock"].(bool) {
  1196. xx, _ := json.Marshal(v)
  1197. tmpblock := new(ju.TmpBlock)
  1198. err := json.Unmarshal(xx, &tmpblock)
  1199. if err != nil {
  1200. if v.BPackage != nil {
  1201. bpb, _ := json.Marshal(v.BPackage)
  1202. tmpblock.BPackage = string(bpb)
  1203. }
  1204. tmpblock = rangeBlockToJson(v, *tmpblock)
  1205. }
  1206. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1207. }
  1208. //把所有kv组装成一个字符串,存库
  1209. if v.ColonKV != nil {
  1210. for ck, cv := range v.ColonKV.Kv {
  1211. kvtext.WriteString(ck)
  1212. kvtext.WriteString(":")
  1213. kvtext.WriteString(cv)
  1214. kvtext.WriteString(" ")
  1215. }
  1216. }
  1217. if v.SpaceKV != nil {
  1218. for sk, sv := range v.SpaceKV.Kv {
  1219. kvtext.WriteString(sk)
  1220. kvtext.WriteString(":")
  1221. kvtext.WriteString(sv)
  1222. kvtext.WriteString(" ")
  1223. }
  1224. }
  1225. if v.TableKV != nil {
  1226. for tk, tv := range v.TableKV.Kv {
  1227. kvtext.WriteString(tk)
  1228. kvtext.WriteString(":")
  1229. kvtext.WriteString(tv)
  1230. kvtext.WriteString(" ")
  1231. }
  1232. }
  1233. }
  1234. if kvtext.Len() > 0 {
  1235. tmp["kvtext"] = kvtext.String()
  1236. }
  1237. if len(blocks) > 0 {
  1238. tmp["blocks"] = blocks
  1239. }
  1240. //tmp["extract_content"] = j.Content
  1241. if e.TaskInfo.TestColl == "" {
  1242. if len(tmp) > 0 { //保存抽取结果
  1243. for field, _ := range e.Fields {
  1244. if tmp[field] == nil {
  1245. tmp[field] = "" //覆盖之前版本数据
  1246. }
  1247. }
  1248. tmp["repeat"] = 0
  1249. tmparr := []map[string]interface{}{
  1250. map[string]interface{}{
  1251. "_id": qu.StringTOBsonId(_id),
  1252. },
  1253. map[string]interface{}{"$set": tmp},
  1254. }
  1255. e.BidArr = append(e.BidArr, tmparr)
  1256. e.BidTotal++
  1257. }
  1258. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1259. id := tmp["_id"]
  1260. tmp["result"] = result
  1261. tmp["resultf"] = resultf
  1262. delete(tmp, "_id")
  1263. tmparr := []map[string]interface{}{
  1264. map[string]interface{}{
  1265. "_id": id,
  1266. },
  1267. map[string]interface{}{"$set": tmp},
  1268. }
  1269. e.ResultArr = append(e.ResultArr, tmparr)
  1270. }
  1271. } else { //测试结果
  1272. delete(tmp, "_id")
  1273. if len(j.BlockPackage) > 0 { //分包详情
  1274. bs, _ := json.Marshal(j.BlockPackage)
  1275. tmp["epackage"] = string(bs)
  1276. }
  1277. tmp["result"] = result
  1278. tmp["resultf"] = resultf
  1279. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1280. if !b {
  1281. log.Debug(e.TaskInfo.TestColl, _id)
  1282. }
  1283. }
  1284. }, func(err interface{}) {
  1285. log.Debug("AnalysisSaveResult err", err)
  1286. })
  1287. }
  1288. //保存其他
  1289. //kv、表格、块上的标签凡是新的标签都入库
  1290. //val type times firstid createtime 判定field
  1291. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1292. now := time.Now().Unix()
  1293. coll := e.TaskInfo.TestColl
  1294. if coll == "" {
  1295. coll = "extract_tag_result"
  1296. } else {
  1297. coll += "_tag"
  1298. }
  1299. datas := []map[string]interface{}{}
  1300. kv := map[string]int{}
  1301. for _, v := range j.Block {
  1302. //
  1303. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1304. if vv == nil || vv.KvTag == nil {
  1305. continue
  1306. }
  1307. for kkk, vvv := range vv.KvTag {
  1308. if vvv.Weight == ju.RetainKvWeight {
  1309. kv[kkk] = kv[kkk] + 1
  1310. }
  1311. }
  1312. }
  1313. for _, vv := range v.NotClassifyTitles {
  1314. datas = append(datas, map[string]interface{}{
  1315. "val": vv,
  1316. "times": 0,
  1317. "type": "block",
  1318. "firstid": j.SourceMid,
  1319. "createtime": now,
  1320. })
  1321. if len(datas) == 200 {
  1322. db.Mgo.SaveBulk(coll, datas...)
  1323. datas = []map[string]interface{}{}
  1324. }
  1325. }
  1326. }
  1327. for k, v := range kv {
  1328. datas = append(datas, map[string]interface{}{
  1329. "val": k,
  1330. "times": v,
  1331. "type": "kv",
  1332. "firstid": j.SourceMid,
  1333. "createtime": now,
  1334. })
  1335. if len(datas) == 200 {
  1336. db.Mgo.SaveBulk(coll, datas...)
  1337. datas = []map[string]interface{}{}
  1338. }
  1339. }
  1340. if len(datas) > 0 {
  1341. db.Mgo.SaveBulk(coll, datas...)
  1342. }
  1343. }
  1344. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1345. if j == nil {
  1346. return nil
  1347. }
  1348. if len(j.Block) > 0 {
  1349. for i, v := range j.Block {
  1350. rangetmp := new(ju.TmpBlock)
  1351. vb, _ := json.Marshal(v)
  1352. json.Unmarshal(vb, &rangetmp)
  1353. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1354. }
  1355. }
  1356. if j.ColonKV != nil {
  1357. cb, _ := json.Marshal(j.ColonKV)
  1358. tmpblock.ColonKV = string(cb)
  1359. }
  1360. if j.SpaceKV != nil {
  1361. sb, _ := json.Marshal(j.SpaceKV)
  1362. tmpblock.SpaceKV = string(sb)
  1363. }
  1364. if j.TableKV != nil {
  1365. tb, _ := json.Marshal(j.TableKV)
  1366. tmpblock.TableKV = string(tb)
  1367. }
  1368. return &tmpblock
  1369. }
  1370. //去重冗余字段
  1371. func delFiled(k string) bool {
  1372. return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1373. }
  1374. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1375. defer qu.Catch()
  1376. doc := j.Data
  1377. result := j.Result
  1378. _id := qu.BsonIdToSId((*doc)["_id"])
  1379. result = ScoreFields(j)
  1380. //结果排序
  1381. for _, val := range result {
  1382. ju.Sort(val)
  1383. }
  1384. return doc, result, _id
  1385. }
  1386. //辅助信息,如果没有排序先排序
  1387. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1388. fieldalls := map[string][]map[string]interface{}{}
  1389. for field, val := range j.Result {
  1390. //ju.Sort(val)
  1391. sfields := []map[string]interface{}{}
  1392. for _, v := range val {
  1393. standardized := false
  1394. if field == "buyer" || field == "winner" || field == "agency" {
  1395. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1396. if i > 0 {
  1397. standardized = true
  1398. }
  1399. }
  1400. sfield := map[string]interface{}{
  1401. "val": v.Value,
  1402. "type": v.Type,
  1403. "score": v.Score,
  1404. "blocktag": v.BlockTag,
  1405. "sourceval": v.SourceValue,
  1406. "standardized": standardized,
  1407. }
  1408. sfields = append(sfields, sfield)
  1409. }
  1410. fieldalls[field] = sfields
  1411. }
  1412. return fieldalls
  1413. }
  1414. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1415. defer qu.Catch()
  1416. //获取审核字段
  1417. for _, field := range e.AuditFields {
  1418. //1.分包
  1419. if resulttmp["package"] != nil {
  1420. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1421. for _, val := range packagedata {
  1422. if val[field] != nil {
  1423. fv := qu.ObjToString(val[field])
  1424. if fv != "" {
  1425. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1426. e.RedisMatch(field, fv, val) //redis匹配
  1427. } else { //除了buyer和winner,其他字段走规则匹配
  1428. e.RuleMatch(field, fv, val)
  1429. }
  1430. }
  1431. }
  1432. }
  1433. }
  1434. //2.外围
  1435. if resulttmp[field] != nil {
  1436. fv := qu.ObjToString(resulttmp[field])
  1437. if fv != "" {
  1438. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1439. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1440. } else { //除了buyer和winner,其他字段走规则匹配
  1441. e.RuleMatch(field, fv, resulttmp)
  1442. }
  1443. }
  1444. }
  1445. }
  1446. }
  1447. //Redis匹配
  1448. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1449. defer qu.Catch()
  1450. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1451. if i == 0 { //reids未找到,执行规则匹配
  1452. val[field+"_isredis"] = false
  1453. e.RuleMatch(field, fv, val) //规则匹配
  1454. } else { //redis找到,打标识存库
  1455. val[field+"_isredis"] = true
  1456. }
  1457. }
  1458. //规则匹配
  1459. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1460. defer qu.Catch()
  1461. if fieldval != "" {
  1462. SMap := e.StartMatch(field, fieldval)
  1463. //SMap.AddKey(field+"_isaudit", false)
  1464. for _, k := range SMap.Keys {
  1465. tmpMap[k] = SMap.Map[k]
  1466. }
  1467. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1468. }
  1469. }
  1470. //开始规则匹配
  1471. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1472. defer qu.Catch()
  1473. SMap := pretreated.NewSortMap()
  1474. lock.Lock()
  1475. f := e.RecogFieldMap[field]
  1476. lock.Unlock()
  1477. if len(f) > 0 {
  1478. fid := qu.BsonIdToSId(f["_id"])
  1479. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1480. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1481. if textAfterRecogFieldPrerule != "" {
  1482. lock.Lock()
  1483. classMap := e.FidClassMap[fid]
  1484. lock.Unlock()
  1485. L:
  1486. for _, c := range classMap { //class
  1487. classid := qu.BsonIdToSId(c["_id"])
  1488. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1489. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1490. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1491. if textAfterClassPrerule != "" {
  1492. lock.Lock()
  1493. ruleMap := e.CidRuleMap[classid]
  1494. lock.Unlock()
  1495. for _, r := range ruleMap { //rule
  1496. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1497. s_name := qu.ObjToString(r["s_name"])
  1498. rule := r["rule"].([]interface{})
  1499. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1500. if textAfterRulePrerule != "" {
  1501. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1502. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1503. if savefield != "" { //保存字段不为空,存储代码信息
  1504. SMap.AddKey(field+"_"+savefield, s_name)
  1505. }
  1506. break L
  1507. }
  1508. }
  1509. }
  1510. }
  1511. }
  1512. }
  1513. }
  1514. return SMap
  1515. }