extract.go 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. "log"
  11. qu "qfw/util"
  12. "qfw/util/redis"
  13. "reflect"
  14. "regexp"
  15. "strconv"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. "gopkg.in/mgo.v2/bson"
  20. )
  21. var (
  22. lock sync.RWMutex
  23. cut = ju.NewCut() //获取正文并清理
  24. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  25. TaskList map[string]*ExtractTask //任务列表
  26. ClearTaskList map[string]*ClearTask //清理任务列表
  27. saveLimit = 200 //抽取日志批量保存
  28. PageSize = 5000 //查询分页
  29. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
  30. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  31. )
  32. //启动测试抽取
  33. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  34. defer qu.Catch()
  35. ext := &ExtractTask{}
  36. ext.Id = taskId
  37. ext.IsRun = true
  38. ext.InitTestTaskInfo(resultcoll, trackcoll)
  39. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  40. ext.InitRulePres()
  41. ext.InitRuleBacks()
  42. ext.InitRuleCore()
  43. ext.InitPkgCore()
  44. ext.InitTag()
  45. ext.InitClearFn()
  46. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  47. //初始化城市DFA信息
  48. ext.InitDFA()
  49. }
  50. //质量审核
  51. ext.InitAuditFields()
  52. ext.InitAuditRule()
  53. ext.InitAuditClass()
  54. ext.InitAuditRecogField()
  55. //品牌抽取是否开启
  56. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  57. //附件抽取是否开启
  58. ext.InitFile()
  59. return RunExtractTestTask(ext, startId, num)
  60. }
  61. func IdTrans(startId string) bson.ObjectId {
  62. defer qu.Catch()
  63. return bson.ObjectIdHex(startId)
  64. }
  65. //开始测试任务抽取
  66. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  67. n, _ := strconv.Atoi(num)
  68. id := IdTrans(startId)
  69. if id.Valid() {
  70. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  71. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  72. for _, v := range *list {
  73. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  74. continue
  75. }
  76. var j, jf *ju.Job
  77. if ext.IsFileField && v["projectinfo"] != nil {
  78. v["isextFile"] = true
  79. j, jf = PreInfo(v)
  80. } else {
  81. j, _ = PreInfo(v)
  82. }
  83. ext.TaskInfo.ProcessPool <- true
  84. go ext.ExtractProcess(j, jf)
  85. }
  86. return true
  87. } else {
  88. return false
  89. }
  90. }
  91. //启动抽取
  92. func StartExtractTaskId(taskId string) bool {
  93. defer qu.Catch()
  94. isgo := false
  95. ext := TaskList[taskId]
  96. if ext == nil {
  97. ext = &ExtractTask{}
  98. ext.Id = taskId
  99. ext.InitTaskInfo()
  100. isgo = true
  101. } else {
  102. ext.Id = taskId
  103. ext.InitTaskInfo()
  104. }
  105. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  106. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  107. ext.InitRulePres()
  108. ext.InitRuleBacks()
  109. ext.InitRuleCore()
  110. ext.InitPkgCore()
  111. ext.InitTag()
  112. ext.InitClearFn()
  113. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  114. //初始化城市DFA信息
  115. ext.InitDFA()
  116. }
  117. //质量审核
  118. ext.InitAuditFields()
  119. ext.InitAuditRule()
  120. ext.InitAuditClass()
  121. ext.InitAuditRecogField()
  122. //品牌抽取是否开启
  123. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  124. //附件抽取是否开启
  125. ext.InitFile()
  126. ext.IsRun = true
  127. go ext.ResultSave()
  128. go ext.BidSave()
  129. if isgo {
  130. go RunExtractTask(taskId)
  131. }
  132. TaskList[taskId] = ext
  133. return true
  134. }
  135. //停止抽取
  136. func StopExtractTaskId(taskId string) bool {
  137. defer qu.Catch()
  138. ext := TaskList[taskId]
  139. if ext != nil {
  140. ext.IsRun = false
  141. TaskList[taskId] = ext
  142. }
  143. //更新task.s_extlastid
  144. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  145. return true
  146. }
  147. //开始抽取
  148. func RunExtractTask(taskId string) {
  149. defer qu.Catch()
  150. ext := TaskList[taskId]
  151. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  152. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  153. pageNum := (count + PageSize - 1) / PageSize
  154. limit := PageSize
  155. if count < PageSize {
  156. limit = count
  157. }
  158. log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  159. for i := 0; i < pageNum; i++ {
  160. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  161. log.Printf("page=%d,query=%v", i+1, query)
  162. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  163. for _, v := range *list {
  164. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  165. continue
  166. }
  167. _id := qu.BsonIdToSId(v["_id"])
  168. log.Println(_id)
  169. if !ext.IsRun {
  170. break
  171. }
  172. var j, jf *ju.Job
  173. if ext.IsFileField && v["projectinfo"] != nil {
  174. v["isextFile"] = true
  175. j, jf = PreInfo(v)
  176. } else {
  177. j, _ = PreInfo(v)
  178. }
  179. ext.TaskInfo.ProcessPool <- true
  180. go ext.ExtractProcess(j, jf)
  181. ext.TaskInfo.LastExtId = _id
  182. }
  183. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  184. if !ext.IsRun {
  185. break
  186. }
  187. }
  188. //更新task.s_extlastid
  189. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  190. }
  191. //信息预处理
  192. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  193. defer qu.Catch()
  194. //判断是否有附件这个字段
  195. var isextFile bool
  196. if doc["isextFile"] != nil {
  197. isextFile = doc["isextFile"].(bool)
  198. }
  199. detail := ""
  200. d1, _ := doc["detail"].(string)
  201. d2, _ := doc["contenthtml"].(string)
  202. if len(d1) >= len(d2) || d2 == "" {
  203. detail = d1
  204. } else {
  205. detail = d2
  206. }
  207. detail = ju.CutLableStr(detail)
  208. detail = cut.ClearHtml(detail)
  209. doc["detail"] = detail
  210. if isextFile {
  211. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  212. }
  213. toptype := qu.ObjToString(doc["toptype"])
  214. if qu.ObjToString(doc["type"]) == "bid" {
  215. toptype = "结果"
  216. }
  217. if toptype == "" {
  218. toptype = "*"
  219. }
  220. j = &ju.Job{
  221. SourceMid: qu.BsonIdToSId(doc["_id"]),
  222. Category: toptype,
  223. Content: qu.ObjToString(doc["detail"]),
  224. SpiderCode: qu.ObjToString(doc["spidercode"]),
  225. //Domain: qu.ObjToString(doc["domain"]),
  226. //Href: qu.ObjToString(doc["href"]),
  227. Title: qu.ObjToString(doc["title"]),
  228. Data: &doc,
  229. City: qu.ObjToString(doc["city"]),
  230. Province: qu.ObjToString(doc["area"]),
  231. Result: map[string][]*ju.ExtField{},
  232. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  233. }
  234. if isextFile {
  235. jf = &ju.Job{
  236. SourceMid: qu.BsonIdToSId(doc["_id"]),
  237. Category: toptype,
  238. Content: qu.ObjToString(doc["detailfile"]),
  239. SpiderCode: qu.ObjToString(doc["spidercode"]),
  240. Title: qu.ObjToString(doc["title"]),
  241. Data: &doc,
  242. City: qu.ObjToString(doc["city"]),
  243. Province: qu.ObjToString(doc["area"]),
  244. Result: map[string][]*ju.ExtField{},
  245. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  246. IsFile: isextFile,
  247. }
  248. }
  249. qu.Try(func() {
  250. pretreated.AnalyStart(j)
  251. if isextFile {
  252. pretreated.AnalyStart(jf)
  253. }
  254. }, func(err interface{}) {
  255. log.Println("pretreated.AnalyStart", err)
  256. })
  257. return j, jf
  258. }
  259. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  260. func file2text(doc *map[string]interface{}) {
  261. var strfileinfo bytes.Buffer
  262. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  263. if va, ok := v["attachments"].(map[string]interface{}); ok {
  264. for _, vaatt := range va {
  265. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  266. if qu.ObjToString(fileinfo["content"]) != "" {
  267. switch fileinfo["content"].(type) {
  268. case string:
  269. lock.Lock()
  270. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  271. lock.Unlock()
  272. case []map[string]interface{}:
  273. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  274. if fv["context"] != nil {
  275. lock.Lock()
  276. strfileinfo.WriteString(fv["context"].(string) + " \n")
  277. lock.Unlock()
  278. }
  279. }
  280. }
  281. }
  282. }
  283. }
  284. }
  285. }
  286. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  287. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  288. }
  289. }
  290. //抽取
  291. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  292. e.ExtractDetail(j)
  293. if jf != nil && jf.IsFile {
  294. e.ExtractFile(jf)
  295. }
  296. //分析抽取结果并保存 todo
  297. AnalysisSaveResult(j, jf, e)
  298. <-e.TaskInfo.ProcessPool
  299. }
  300. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  301. qu.Try(func() {
  302. doc := *j.Data
  303. //全局前置规则,结果覆盖doc属性
  304. for _, v := range e.RulePres {
  305. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  306. }
  307. //抽取规则
  308. for _, vc := range e.RuleCores {
  309. tmp := ju.DeepCopy(doc).(map[string]interface{})
  310. //是否进入逻辑
  311. if !ju.Logic(vc.LuaLogic, tmp) {
  312. continue
  313. }
  314. //抽取-前置规则
  315. for _, v := range vc.RulePres {
  316. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  317. }
  318. //log.Println("抽取-前置规则", tmp)
  319. //抽取-规则
  320. for _, v := range vc.RuleCores {
  321. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  322. }
  323. //log.Println("抽取-规则", tmp)
  324. //项目名称未能抽取到,标题来凑
  325. if vc.Field == "projectname" {
  326. if len(j.Result[vc.Field]) < 1 {
  327. j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  328. }
  329. }
  330. //抽取-后置规则
  331. for _, v := range vc.RuleBacks {
  332. ExtRegBack(j, v, e.TaskInfo)
  333. }
  334. //log.Println("抽取-后置规则", tmp)
  335. }
  336. //全局后置规则
  337. for _, v := range e.RuleBacks {
  338. ExtRegBack(j, v, e.TaskInfo)
  339. }
  340. //候选人加入
  341. if len(j.Winnerorder) > 0 {
  342. winner := &ju.ExtField{
  343. Field: "winner",
  344. Code: "",
  345. RuleText: "",
  346. Type: "winnerorder",
  347. MatchType: "winnerorder",
  348. ExtFrom: "",
  349. Value: j.Winnerorder[0]["entname"],
  350. Score: 0,
  351. }
  352. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  353. winner.Score = -5
  354. }
  355. winners := j.Result["winner"]
  356. if winners != nil {
  357. winners = append(winners, winner)
  358. } else {
  359. winners = []*ju.ExtField{}
  360. winners = append(winners, winner)
  361. }
  362. j.Result["winner"] = winners
  363. }
  364. //函数清理
  365. for key, val := range j.Result {
  366. for _, v := range val {
  367. lock.Lock()
  368. cfn := e.ClearFn[key]
  369. lock.Unlock()
  370. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  371. v.Value = data[0]
  372. //清理特殊符号
  373. lock.Lock()
  374. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  375. clear.MesField[key] != nil {
  376. text := qu.ObjToString(v.Value)
  377. text = clear.OtherClean(key, text)
  378. v.Value = text
  379. }
  380. lock.Unlock()
  381. }
  382. }
  383. PackageDetail(j, e) //处理分包信息
  384. // bs, _ := json.Marshal(j.Result)
  385. // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  386. }, func(err interface{}) {
  387. log.Println("ExtractProcess err", err)
  388. })
  389. }
  390. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  391. qu.Try(func() {
  392. doc := *j.Data
  393. //全局前置规则,结果覆盖doc属性
  394. for _, v := range e.RulePres {
  395. if e.FileFields[v.Field] > 0 {
  396. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  397. }
  398. }
  399. //抽取规则
  400. for _, vc := range e.RuleCores {
  401. tmp := ju.DeepCopy(doc).(map[string]interface{})
  402. //是否进入逻辑
  403. if !ju.Logic(vc.LuaLogic, tmp) {
  404. continue
  405. }
  406. //抽取-前置规则
  407. for _, v := range vc.RulePres {
  408. if e.FileFields[vc.Field] > 0 {
  409. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  410. }
  411. }
  412. //log.Println("抽取-前置规则", tmp)
  413. //抽取-规则
  414. for _, v := range vc.RuleCores {
  415. if e.FileFields[vc.Field] > 0 {
  416. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  417. }
  418. }
  419. //log.Println("抽取-规则", tmp)
  420. //抽取-后置规则
  421. for _, v := range vc.RuleBacks {
  422. if e.FileFields[vc.Field] > 0 {
  423. ExtRegBack(j, v, e.TaskInfo)
  424. }
  425. }
  426. //log.Println("抽取-后置规则", tmp)
  427. }
  428. //全局后置规则
  429. for _, v := range e.RuleBacks {
  430. if e.FileFields[v.Field] > 0 {
  431. ExtRegBack(j, v, e.TaskInfo)
  432. }
  433. }
  434. //候选人加入
  435. if len(j.Winnerorder) > 0 {
  436. winner := &ju.ExtField{
  437. Field: "winner",
  438. Code: "",
  439. RuleText: "",
  440. Type: "winnerorder",
  441. MatchType: "winnerorder",
  442. ExtFrom: "",
  443. Value: j.Winnerorder[0]["entname"],
  444. Score: 0,
  445. }
  446. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  447. winner.Score = -5
  448. }
  449. winners := j.Result["winner"]
  450. if winners != nil {
  451. winners = append(winners, winner)
  452. } else {
  453. winners = []*ju.ExtField{}
  454. winners = append(winners, winner)
  455. }
  456. j.Result["winner"] = winners
  457. }
  458. //函数清理
  459. for key, val := range j.Result {
  460. for _, v := range val {
  461. lock.Lock()
  462. cfn := e.ClearFn[key]
  463. lock.Unlock()
  464. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  465. v.Value = data[0]
  466. //清理特殊符号
  467. lock.Lock()
  468. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  469. clear.MesField[key] != nil {
  470. text := qu.ObjToString(v.Value)
  471. text = clear.OtherClean(key, text)
  472. v.Value = text
  473. }
  474. lock.Unlock()
  475. }
  476. }
  477. PackageDetail(j, e) //处理分包信息
  478. // bs, _ := json.Marshal(j.Result)
  479. // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  480. }, func(err interface{}) {
  481. log.Println("ExtractProcess err", err)
  482. })
  483. }
  484. //前置过滤
  485. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  486. defer qu.Catch()
  487. before := ju.DeepCopy(doc).(map[string]interface{})
  488. extinfo := map[string]interface{}{}
  489. if in.IsLua {
  490. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  491. if j != nil {
  492. lua.Block = j.Block
  493. }
  494. extinfo = lua.RunScript("pre")
  495. for k, v := range extinfo { //结果覆盖原doc
  496. doc[k] = v
  497. }
  498. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  499. } else {
  500. var key string
  501. if !j.IsFile {
  502. key = qu.If(in.Field == "", "detail", in.Field).(string)
  503. } else {
  504. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  505. }
  506. text := qu.ObjToString(doc[key])
  507. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  508. doc[key] = extinfo[key] //结果覆盖原doc
  509. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  510. }
  511. return doc
  512. }
  513. //抽取-规则
  514. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  515. defer qu.Catch()
  516. //废标、流标、ppp等跳过
  517. b := IsExtract(in.Field, j.Title, j.Content)
  518. if !b {
  519. return
  520. }
  521. if in.IsLua {
  522. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  523. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  524. lua.Block = j.Block
  525. extinfo := lua.RunScript("core")
  526. for k, v := range extinfo {
  527. if k == in.Field {
  528. if j.Result[k] == nil {
  529. j.Result[k] = [](*ju.ExtField){}
  530. }
  531. if tmps, ok := v.([]map[string]interface{}); ok {
  532. for _, tmp := range tmps {
  533. j.Result[k] = append(j.Result[k],
  534. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
  535. }
  536. }
  537. }
  538. }
  539. if len(extinfo) > 0 {
  540. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  541. }
  542. } else {
  543. //全文正则
  544. text := qu.ObjToString(doc[extfrom])
  545. if in.Field != "" {
  546. extinfo := extRegCoreToResult(extfrom, text, j, in)
  547. if len(extinfo) > 0 {
  548. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  549. }
  550. }
  551. }
  552. }
  553. //lua脚本根据属性设置提取kv值
  554. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  555. defer qu.Catch()
  556. kvmap := map[string][]map[string]interface{}{}
  557. for fieldname, field := range in.LFields {
  558. lock.Lock()
  559. tags := t[field] //获取对应标签库
  560. lock.Unlock()
  561. for _, bl := range j.Block {
  562. //冒号kv
  563. if bl.ColonKV != nil {
  564. kvs := bl.ColonKV.Kvs
  565. kvs2 := bl.ColonKV.Kvs_2
  566. //log.Println("ColonKV1", kvs)
  567. //log.Println("ColonKV2", kvs2)
  568. for _, tag := range tags {
  569. for _, kv := range kvs {
  570. if tag.Type == "string" {
  571. if kv.Key == tag.Key {
  572. text := ju.TrimLRSpace(kv.Value, "")
  573. if text != "" {
  574. kvmap[field] = append(kvmap[field], map[string]interface{}{
  575. "field": field,
  576. "code": in.Code,
  577. "ruletext": tag.Key,
  578. "extfrom": extfrom,
  579. "value": text,
  580. "type": "colon1",
  581. "matchtype": "tag_string",
  582. })
  583. }
  584. break
  585. }
  586. } else if tag.Type == "regexp" {
  587. if tag.Reg.MatchString(kv.Key) {
  588. text := ju.TrimLRSpace(kv.Value, "")
  589. if text != "" {
  590. kvmap[field] = append(kvmap[field], map[string]interface{}{
  591. "field": field,
  592. "code": in.Code,
  593. "ruletext": tag.Key,
  594. "extfrom": extfrom,
  595. "value": text,
  596. "type": "colon1",
  597. "matchtype": "tag_regexp",
  598. })
  599. }
  600. break
  601. }
  602. }
  603. }
  604. for _, kv := range kvs2 {
  605. if tag.Type == "string" {
  606. if kv.Key == tag.Key {
  607. text := ju.TrimLRSpace(kv.Value, "")
  608. if text != "" {
  609. kvmap[field] = append(kvmap[field], map[string]interface{}{
  610. "field": field,
  611. "code": in.Code,
  612. "ruletext": tag.Key,
  613. "extfrom": extfrom,
  614. "value": text,
  615. "type": "colon2",
  616. "matchtype": "tag_string",
  617. })
  618. }
  619. break
  620. }
  621. } else if tag.Type == "regexp" {
  622. if tag.Reg.MatchString(kv.Key) {
  623. text := ju.TrimLRSpace(kv.Value, "")
  624. if text != "" {
  625. kvmap[field] = append(kvmap[field], map[string]interface{}{
  626. "field": field,
  627. "code": in.Code,
  628. "ruletext": tag.Key,
  629. "extfrom": extfrom,
  630. "value": text,
  631. "type": "colon2",
  632. "matchtype": "tag_regexp",
  633. })
  634. }
  635. break
  636. }
  637. }
  638. }
  639. }
  640. }
  641. //空格kv
  642. if bl.SpaceKV != nil {
  643. kvs := bl.SpaceKV.Kvs
  644. //log.Println("SpaceKV", kvs)
  645. for _, tag := range tags {
  646. for _, kv := range kvs {
  647. if tag.Type == "string" {
  648. if kv.Key == tag.Key {
  649. text := ju.TrimLRSpace(kv.Value, "")
  650. if text != "" {
  651. kvmap[field] = append(kvmap[field], map[string]interface{}{
  652. "field": field,
  653. "code": in.Code,
  654. "ruletext": tag.Key,
  655. "extfrom": extfrom,
  656. "value": text,
  657. "type": "space",
  658. "matchtype": "tag_string",
  659. })
  660. }
  661. break
  662. }
  663. } else if tag.Type == "regexp" {
  664. if tag.Reg.MatchString(kv.Key) {
  665. text := ju.TrimLRSpace(kv.Value, "")
  666. if text != "" {
  667. kvmap[field] = append(kvmap[field], map[string]interface{}{
  668. "field": field,
  669. "code": in.Code,
  670. "ruletext": tag.Key,
  671. "extfrom": extfrom,
  672. "value": text,
  673. "type": "space",
  674. "matchtype": "tag_regexp",
  675. })
  676. }
  677. break
  678. }
  679. }
  680. }
  681. }
  682. }
  683. //表格kv
  684. if bl.TableKV != nil {
  685. tkv := bl.TableKV
  686. //log.Println("tkv", tkv)
  687. for k, v := range tkv.Kv {
  688. if k == fieldname {
  689. if len(tags) > -tkv.KvIndex[fieldname] {
  690. ruletext := ""
  691. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  692. ruletext = "项目名称"
  693. } else {
  694. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  695. }
  696. kvmap[field] = append(kvmap[field], map[string]interface{}{
  697. "field": field,
  698. "code": in.Code,
  699. "ruletext": ruletext,
  700. "extfrom": "table",
  701. "value": v,
  702. "type": "table",
  703. "matchtype": "tag_string",
  704. })
  705. } else { //涉及其他待处理
  706. //log.Println(tags)
  707. }
  708. }
  709. }
  710. }
  711. }
  712. }
  713. return kvmap
  714. }
  715. //正则提取结果
  716. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  717. defer qu.Catch()
  718. extinfo := map[string][]map[string]interface{}{}
  719. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  720. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  721. if len(apos) > 0 {
  722. pos := apos[0]
  723. for k, p := range v.RegCore.ExtractPos {
  724. if len(pos) > p {
  725. if pos[p] == -1 || pos[p+1] == -1 {
  726. continue
  727. }
  728. val := text[pos[p]:pos[p+1]]
  729. tmps := []map[string]interface{}{}
  730. tmp := map[string]interface{}{
  731. "field": v.Field,
  732. "code": v.Code,
  733. "ruletext": v.RuleText,
  734. "extfrom": extfrom,
  735. "value": val,
  736. "type": "regexp",
  737. "matchtype": "regcontent",
  738. }
  739. tmps = append(tmps, tmp)
  740. extinfo[k] = tmps
  741. if val != "" {
  742. if j.Result[v.Field] == nil {
  743. j.Result[k] = [](*ju.ExtField){}
  744. }
  745. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  746. }
  747. }
  748. }
  749. }
  750. } else {
  751. pos := v.RegCore.Reg.FindStringIndex(text)
  752. val := ""
  753. if len(pos) == 2 {
  754. text = text[pos[1]:]
  755. rs := regexp.MustCompile("[^\r\n\t]+")
  756. tmp := rs.FindAllString(text, -1)
  757. if len(tmp) > 0 {
  758. val = tmp[0]
  759. }
  760. }
  761. if val != "" {
  762. tmps := []map[string]interface{}{}
  763. tmp := map[string]interface{}{
  764. "field": v.Field,
  765. "code": v.Code,
  766. "ruletext": v.RuleText,
  767. "extfrom": extfrom,
  768. "value": val,
  769. "type": "regexp",
  770. "matchtype": "regcontent",
  771. }
  772. tmps = append(tmps, tmp)
  773. extinfo[v.Field] = tmps
  774. if j.Result[v.Field] == nil {
  775. j.Result[v.Field] = [](*ju.ExtField){}
  776. }
  777. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  778. }
  779. }
  780. return extinfo
  781. }
  782. //后置过滤
  783. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  784. defer qu.Catch()
  785. if in.IsLua {
  786. result := GetResultMapForLua(j)
  787. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  788. if j != nil {
  789. lua.Block = j.Block
  790. }
  791. extinfo := lua.RunScript("back")
  792. for k, v := range extinfo {
  793. if tmps, ok := v.([]map[string]interface{}); ok {
  794. j.Result[k] = [](*ju.ExtField){}
  795. for _, tmp := range tmps {
  796. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  797. }
  798. }
  799. }
  800. if len(extinfo) > 0 {
  801. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  802. }
  803. } else {
  804. extinfo := map[string]interface{}{}
  805. if in.Field != "" {
  806. if j.Result[in.Field] != nil {
  807. tmp := j.Result[in.Field]
  808. exts := []interface{}{}
  809. for k, v := range tmp {
  810. //table抽取到的数据不清理
  811. // if v.Type == "table" && v.Field != "projectname" {
  812. // continue
  813. // }
  814. text := qu.ObjToString(v.Value)
  815. if text != "" {
  816. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  817. }
  818. j.Result[in.Field][k].Value = text
  819. exts = append(exts, map[string]interface{}{
  820. "field": v.Field,
  821. "code": v.Code,
  822. "ruletext": v.RuleText,
  823. "type": v.Type,
  824. "matchtype": v.MatchType,
  825. "extfrom": v.ExtFrom,
  826. "value": text,
  827. })
  828. }
  829. extinfo[in.Field] = exts
  830. if len(extinfo) > 0 {
  831. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  832. }
  833. }
  834. } else {
  835. for key, tmp := range j.Result {
  836. exts := []interface{}{}
  837. for k, v := range tmp {
  838. if v.Type == "table" { //table抽取到的数据不清理
  839. continue
  840. }
  841. text := qu.ObjToString(v.Value)
  842. if text != "" {
  843. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  844. }
  845. j.Result[key][k].Value = text
  846. exts = append(exts, map[string]interface{}{
  847. "field": v.Field,
  848. "code": v.Code,
  849. "ruletext": v.RuleText,
  850. "type": v.Type,
  851. "matchtype": v.MatchType,
  852. "extfrom": v.ExtFrom,
  853. "value": text,
  854. })
  855. }
  856. extinfo[key] = exts
  857. }
  858. if len(extinfo) > 0 {
  859. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  860. }
  861. }
  862. }
  863. }
  864. //获取抽取结果map[string][]interface{},lua脚本使用
  865. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  866. defer qu.Catch()
  867. result := map[string][]map[string]interface{}{}
  868. for key, val := range j.Result {
  869. if result[key] == nil {
  870. result[key] = []map[string]interface{}{}
  871. }
  872. for _, v := range val {
  873. tmp := map[string]interface{}{
  874. "field": v.Field,
  875. "code": v.Code,
  876. "ruletext": v.RuleText,
  877. "value": v.Value,
  878. "type": v.Type,
  879. "matchtype": v.MatchType,
  880. "extfrom": v.ExtFrom,
  881. }
  882. result[key] = append(result[key], tmp)
  883. }
  884. }
  885. return result
  886. }
  887. //抽取日志
  888. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  889. defer qu.Catch()
  890. if !t.IsEtxLog {
  891. return
  892. }
  893. logdata := map[string]interface{}{
  894. "code": v.Code,
  895. "name": v.Name,
  896. "type": ftype,
  897. "ruletext": v.RuleText,
  898. "islua": v.IsLua,
  899. "field": v.Field,
  900. "version": t.Version,
  901. "taskname": t.Name,
  902. "before": before,
  903. "extinfo": extinfo,
  904. "sid": sid,
  905. "comeintime": time.Now().Unix(),
  906. }
  907. lock.Lock()
  908. ExtLogs[t] = append(ExtLogs[t], logdata)
  909. lock.Unlock()
  910. }
  911. //保存抽取日志
  912. func SaveExtLog() {
  913. defer qu.Catch()
  914. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  915. lock.Lock()
  916. tmpLogs = ExtLogs
  917. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  918. lock.Unlock()
  919. for k, v := range tmpLogs {
  920. if len(v) < saveLimit {
  921. db.Mgo.SaveBulk(k.TrackColl, v...)
  922. } else {
  923. for {
  924. if len(v) > saveLimit {
  925. tmp := v[:saveLimit]
  926. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  927. v = v[saveLimit:]
  928. } else {
  929. db.Mgo.SaveBulk(k.TrackColl, v...)
  930. break
  931. }
  932. }
  933. }
  934. }
  935. time.AfterFunc(10*time.Second, SaveExtLog)
  936. }
  // FieldValue couples an arbitrary value with an integer count.
  //
  // NOTE(review): the type is not referenced in the visible part of this file;
  // presumably Count records how often Value was seen — confirm against callers.
  type FieldValue struct {
  	// Value is the stored value; any type is accepted.
  	Value interface{}
  	// Count is the integer associated with Value.
  	Count int
  }
  941. //分析抽取结果并保存
  942. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  943. qu.Try(func() {
  944. doc, result, _id, values := funcAnalysis(j)
  945. //从排序结果中取值
  946. tmp := map[string]interface{}{} //抽取值
  947. for key, val := range values {
  948. for _, v := range val { //取第一个非负数
  949. if v.Key != "" && v.Value > -1 {
  950. tmp[key] = v.Object
  951. break
  952. }
  953. }
  954. }
  955. if len(j.PackageInfo) > 0 { //分包信息
  956. tmp["package"] = j.PackageInfo
  957. }
  958. if len(j.Winnerorder) > 0 { //候选人信息
  959. tmp["winnerorder"] = j.Winnerorder
  960. }
  961. //处理附件
  962. var resultf map[string][]*ju.ExtField
  963. var filevalues map[string][]*ju.SortObject
  964. if jf != nil {
  965. _, resultf, _, filevalues = funcAnalysis(jf)
  966. ffield := map[string]interface{}{}
  967. for key, val := range filevalues {
  968. for _, v := range val { //取第一个非负数
  969. if v.Key != "" && v.Value > -1 {
  970. ffield[key] = v.Object
  971. break
  972. }
  973. }
  974. }
  975. if len(jf.PackageInfo) > 0 { //分包信息
  976. ffield["package"] = jf.PackageInfo
  977. }
  978. if len(jf.Winnerorder) > 0 { //候选人信息
  979. ffield["winnerorder"] = jf.Winnerorder
  980. }
  981. tmp["ffield"] = ffield
  982. }
  983. for k, v := range *doc {
  984. //去重冗余字段
  985. if delFiled(k) {
  986. continue
  987. }
  988. if tmp[k] == nil {
  989. tmp[k] = v
  990. }
  991. }
  992. //质量审核
  993. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  994. e.QualityAudit(tmp)
  995. }
  996. if e.IsExtractCity { //城市抽取
  997. b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  998. //log.Println("省份---", p, "城市---", c, "区---", d)
  999. tmp["district"] = d
  1000. if b {
  1001. tmp["city"] = c
  1002. tmp["area"] = p
  1003. }
  1004. }
  1005. //品牌抽取
  1006. if ju.IsBrandGoods {
  1007. tmp["checkhas"] = map[string]int{
  1008. "hastable": j.HasTable,
  1009. "hasgoods": j.HasGoods,
  1010. "hasbrand": j.HasBrand,
  1011. "haskey": j.HasKey,
  1012. }
  1013. if len(j.BrandData) > 0 {
  1014. tmp["tablebrand"] = j.BrandData
  1015. }
  1016. //log.Println("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1017. }
  1018. if e.TaskInfo.TestColl == "" {
  1019. if len(tmp) > 0 { //保存抽取结果
  1020. for field, _ := range e.Fields {
  1021. if tmp[field] == nil {
  1022. tmp[field] = "" //覆盖之前版本数据
  1023. }
  1024. }
  1025. tmp["repeat"] = 0
  1026. tmparr := []map[string]interface{}{
  1027. map[string]interface{}{
  1028. "_id": qu.StringTOBsonId(_id),
  1029. },
  1030. map[string]interface{}{"$set": tmp},
  1031. }
  1032. e.BidArr = append(e.BidArr, tmparr)
  1033. }
  1034. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1035. id := tmp["_id"]
  1036. tmp["result"] = result
  1037. tmp["resultf"] = resultf
  1038. delete(tmp, "_id")
  1039. tmparr := []map[string]interface{}{
  1040. map[string]interface{}{
  1041. "_id": id,
  1042. },
  1043. map[string]interface{}{"$set": tmp},
  1044. }
  1045. e.ResultArr = append(e.ResultArr, tmparr)
  1046. }
  1047. } else { //测试结果
  1048. delete(tmp, "_id")
  1049. if len(j.BlockPackage) > 0 { //分包详情
  1050. bs, _ := json.Marshal(j.BlockPackage)
  1051. tmp["epackage"] = string(bs)
  1052. }
  1053. tmp["result"] = result
  1054. tmp["resultf"] = resultf
  1055. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1056. if !b {
  1057. log.Println(e.TaskInfo.TestColl, _id)
  1058. }
  1059. }
  1060. }, func(err interface{}) {
  1061. log.Println("AnalysisSaveResult err", err)
  1062. })
  1063. }
  // delFiled reports whether k names one of the redundant source-document fields
  // ("detail", "contenthtml", "site", "spidercode", "projectinfo"). The
  // comparison is exact and case-sensitive.
  func delFiled(k string) bool {
  	switch k {
  	case "detail", "contenthtml", "site", "spidercode", "projectinfo":
  		return true
  	}
  	return false
  }
  1068. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {
  1069. defer qu.Catch()
  1070. doc := j.Data
  1071. result := j.Result
  1072. _id := qu.BsonIdToSId((*doc)["_id"])
  1073. iscore, _ := ju.Config["fieldscore"].(bool)
  1074. if iscore { //打分
  1075. result = ScoreFields(j)
  1076. }
  1077. //结果排序
  1078. values := map[string][]*ju.SortObject{}
  1079. for key, val := range result {
  1080. fieldValue := map[string][]interface{}{}
  1081. if iscore { //走打分
  1082. for _, v := range val {
  1083. if len(fmt.Sprint(v.Value)) < 1 {
  1084. continue //去除空串
  1085. }
  1086. fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
  1087. }
  1088. } else { //不走打分,按出现频次
  1089. for _, v := range val {
  1090. if len(fmt.Sprint(v.Value)) < 1 {
  1091. continue //去除空串
  1092. }
  1093. if fieldValue[fmt.Sprint(v.Value)] == nil {
  1094. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  1095. } else {
  1096. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  1097. }
  1098. }
  1099. }
  1100. objects := []*ju.SortObject{}
  1101. for k, v := range fieldValue {
  1102. ValueStr := "" //第二排序
  1103. if reflect.TypeOf(v[1]).String() == "string" {
  1104. ValueStr = qu.ObjToString(v[1])
  1105. }
  1106. tmp := &ju.SortObject{
  1107. Key: k,
  1108. Value: qu.IntAll(v[0]),
  1109. Object: v[1],
  1110. ValueStr: ValueStr,
  1111. }
  1112. objects = append(objects, tmp)
  1113. }
  1114. values[key] = ju.ExtSort(objects)
  1115. }
  1116. return doc, result, _id, values
  1117. }
  1118. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1119. defer qu.Catch()
  1120. //获取审核字段
  1121. for _, field := range e.AuditFields {
  1122. //1.分包
  1123. if resulttmp["package"] != nil {
  1124. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1125. for _, val := range packagedata {
  1126. if val[field] != nil {
  1127. fv := qu.ObjToString(val[field])
  1128. if fv != "" {
  1129. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1130. e.RedisMatch(field, fv, val) //redis匹配
  1131. } else { //除了buyer和winner,其他字段走规则匹配
  1132. e.RuleMatch(field, fv, val)
  1133. }
  1134. }
  1135. }
  1136. }
  1137. }
  1138. //2.外围
  1139. if resulttmp[field] != nil {
  1140. fv := qu.ObjToString(resulttmp[field])
  1141. if fv != "" {
  1142. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1143. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1144. } else { //除了buyer和winner,其他字段走规则匹配
  1145. e.RuleMatch(field, fv, resulttmp)
  1146. }
  1147. }
  1148. }
  1149. }
  1150. }
  1151. //Redis匹配
  1152. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1153. defer qu.Catch()
  1154. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1155. if i == 0 { //reids未找到,执行规则匹配
  1156. val[field+"_isredis"] = false
  1157. e.RuleMatch(field, fv, val) //规则匹配
  1158. } else { //redis找到,打标识存库
  1159. val[field+"_isredis"] = true
  1160. }
  1161. }
  1162. //规则匹配
  1163. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1164. defer qu.Catch()
  1165. if fieldval != "" {
  1166. SMap := e.StartMatch(field, fieldval)
  1167. //SMap.AddKey(field+"_isaudit", false)
  1168. for _, k := range SMap.Keys {
  1169. tmpMap[k] = SMap.Map[k]
  1170. }
  1171. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1172. }
  1173. }
  1174. //开始规则匹配
  1175. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1176. defer qu.Catch()
  1177. SMap := pretreated.NewSortMap()
  1178. lock.Lock()
  1179. f := e.RecogFieldMap[field]
  1180. lock.Unlock()
  1181. if len(f) > 0 {
  1182. fid := qu.BsonIdToSId(f["_id"])
  1183. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1184. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1185. if textAfterRecogFieldPrerule != "" {
  1186. lock.Lock()
  1187. classMap := e.FidClassMap[fid]
  1188. lock.Unlock()
  1189. L:
  1190. for _, c := range classMap { //class
  1191. classid := qu.BsonIdToSId(c["_id"])
  1192. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1193. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1194. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1195. if textAfterClassPrerule != "" {
  1196. lock.Lock()
  1197. ruleMap := e.CidRuleMap[classid]
  1198. lock.Unlock()
  1199. for _, r := range ruleMap { //rule
  1200. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1201. s_name := qu.ObjToString(r["s_name"])
  1202. rule := r["rule"].([]interface{})
  1203. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1204. if textAfterRulePrerule != "" {
  1205. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1206. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1207. if savefield != "" { //保存字段不为空,存储代码信息
  1208. SMap.AddKey(field+"_"+savefield, s_name)
  1209. }
  1210. break L
  1211. }
  1212. }
  1213. }
  1214. }
  1215. }
  1216. }
  1217. }
  1218. return SMap
  1219. }