extract.go 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815
  1. package extract
  2. import (
  3. "fmt"
  4. log "github.com/donnie4w/go-logger/logger"
  5. "gopkg.in/mgo.v2/bson"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "unicode/utf8"
  16. )
  17. // 结果追踪调试
  18. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  19. defer qu.Catch()
  20. ext := TaskList[taskId]
  21. if ext == nil {
  22. ext = &ExtractTask{}
  23. ext.Id = taskId
  24. ext.InitTestTaskInfo(resultcoll, trackcoll)
  25. ext.IsRun = true
  26. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  27. }
  28. ext.InitSite()
  29. ext.InitRulePres()
  30. ext.InitRuleBacks(false)
  31. ext.InitRuleBacks(true)
  32. ext.InitRuleCore(false)
  33. ext.InitRuleCore(true)
  34. ext.InitPkgCore()
  35. ext.InitBlockRule()
  36. ext.InfoTypeList()
  37. ext.InitTag(false)
  38. ext.InitTag(true)
  39. ext.InitClearFn(false)
  40. ext.InitClearFn(true)
  41. ext.Lock()
  42. if ext.IsExtractCity && ext.ProvinceMap == nil { //版本上控制是否开始城市抽取
  43. ext.InitCityInfo()
  44. ext.InitAreaCode()
  45. ext.InitPostCode()
  46. }
  47. ext.Unlock()
  48. //质量审核
  49. ext.InitAuditFields()
  50. ext.InitAuditRule()
  51. ext.InitAuditClass()
  52. ext.InitAuditRecogField()
  53. //品牌抽取是否开启
  54. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  55. //价格个数抽取是否开启
  56. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  57. //附件抽取是否开启
  58. ext.InitFile()
  59. ext.TaskInfo.TestColl = resultcoll
  60. TaskList[taskId] = ext
  61. return RunExtractTestTask(ext, startId, num)
  62. }
  63. func IdTrans(startId string) bson.ObjectId {
  64. defer qu.Catch()
  65. return bson.ObjectIdHex(startId)
  66. }
  67. // 开始测试任务抽取~结果追踪
  68. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  69. n, _ := strconv.Atoi(num)
  70. id := IdTrans(startId)
  71. if id.Valid() {
  72. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  73. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  74. for _, v := range *list {
  75. if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
  76. continue
  77. }
  78. var j, jf *ju.Job
  79. var isSite bool
  80. j, _, isSite = ext.PreInfo(v)
  81. go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
  82. ext.TaskInfo.ProcessPool <- true
  83. }
  84. return true
  85. } else {
  86. return false
  87. }
  88. }
  89. // 启动抽取
  90. func StartExtractTaskId(taskId string) bool {
  91. defer qu.Catch()
  92. isgo := false
  93. ext := TaskList[taskId]
  94. if ext == nil {
  95. ext = &ExtractTask{}
  96. ext.Id = taskId
  97. ext.InitTaskInfo()
  98. isgo = true
  99. } else {
  100. ext.Id = taskId
  101. ext.InitTaskInfo()
  102. }
  103. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  104. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  105. ext.InitSite()
  106. ext.InitRulePres()
  107. ext.InitRuleBacks(false)
  108. ext.InitRuleBacks(true)
  109. ext.InitRuleCore(false)
  110. ext.InitRuleCore(true)
  111. ext.InitPkgCore()
  112. ext.InitBlockRule()
  113. ext.InfoTypeList()
  114. ext.InitTag(false)
  115. ext.InitTag(true)
  116. ext.InitClearFn(false)
  117. ext.InitClearFn(true)
  118. ext.Lock()
  119. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  120. ext.InitCityInfo()
  121. ext.InitAreaCode()
  122. ext.InitPostCode()
  123. }
  124. ext.Unlock()
  125. //质量审核
  126. ext.InitAuditFields()
  127. ext.InitAuditRule()
  128. ext.InitAuditClass()
  129. ext.InitAuditRecogField()
  130. //品牌抽取是否开启
  131. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  132. //价格个数抽取是否开启
  133. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  134. //附件抽取是否开启
  135. ext.InitFile()
  136. ext.IsRun = true
  137. go ext.ResultSave(true)
  138. go ext.BidSave(true)
  139. if isgo {
  140. go RunExtractTask(taskId)
  141. }
  142. TaskList[taskId] = ext
  143. return true
  144. }
  145. // 停止抽取
  146. func StopExtractTaskId(taskId string) bool {
  147. defer qu.Catch()
  148. ext := TaskList[taskId]
  149. if ext != nil {
  150. ext.IsRun = false
  151. TaskList[taskId] = ext
  152. }
  153. //更新task.s_extlastid
  154. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  155. return true
  156. }
  157. // 开始抽取
  158. func RunExtractTask(taskId string) {
  159. defer qu.Catch()
  160. ext := TaskList[taskId]
  161. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  162. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  163. pageNum := (count + PageSize - 1) / PageSize
  164. limit := PageSize
  165. if count < PageSize {
  166. limit = count
  167. }
  168. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  169. for i := 0; i < pageNum; i++ {
  170. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  171. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  172. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  173. for _, v := range *list {
  174. //根据标题判断是否抽取
  175. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  176. if !b {
  177. continue
  178. }
  179. _id := qu.BsonIdToSId(v["_id"])
  180. //log.Debug(_id)
  181. if !ext.IsRun {
  182. break
  183. }
  184. var j, jf *ju.Job
  185. var isSite bool
  186. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  187. v["isextFile"] = true
  188. j, jf, isSite = ext.PreInfo(v)
  189. } else {
  190. j, _, isSite = ext.PreInfo(v)
  191. }
  192. go ext.ExtractProcess(j, jf, isSite)
  193. ext.TaskInfo.LastExtId = _id
  194. ext.TaskInfo.ProcessPool <- true
  195. }
  196. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  197. if !ext.IsRun {
  198. break
  199. }
  200. }
  201. //更新task.s_extlastid
  202. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  203. }
  204. // 信息预处理-不和版本关联,取最新版本的配置项
  205. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  206. return (&ExtractTask{}).PreInfo(doc)
  207. }
  208. // 信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
  209. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  210. defer qu.Catch()
  211. //判断是否有附件这个字段
  212. var isextFile bool
  213. if doc["isextFile"] != nil {
  214. isextFile = doc["isextFile"].(bool)
  215. }
  216. isextFile = false
  217. detail := ""
  218. summary := qu.ObjToString(doc["summary"])
  219. detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
  220. //调整采用detail抽取
  221. if utf8.RuneCountInString(detail) > 10000 {
  222. detail = string(([]rune(detail))[:10000])
  223. }
  224. doc["detail"] = detail
  225. isClearnMoney := !clearMoneyReg.MatchString(detail)
  226. if isClearnMoney {
  227. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  228. }
  229. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  230. if isextFile {
  231. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  232. }
  233. toptype := qu.ObjToString(doc["toptype"])
  234. subtype := qu.ObjToString(doc["subtype"])
  235. if qu.ObjToString(doc["type"]) == "bid" {
  236. toptype = "结果"
  237. }
  238. if subtype == "其他" {
  239. subtype = "其它"
  240. }
  241. if toptype == "" || subtype == "" {
  242. toptype, subtype = "all", "all"
  243. }
  244. if toptype == "采购意向" || subtype == "采购意向" {
  245. toptype, subtype = "招标", "招标" //暂时按照"招标"
  246. }
  247. toMap := qu.ObjToMap(doc["jsondata"])
  248. if (*toMap) != nil {
  249. if (*toMap)["extweight"] == nil {
  250. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  251. }
  252. if (*toMap)["jsoncontent"] != nil {
  253. delete(*toMap, "jsoncontent")
  254. }
  255. for k, v := range *toMap {
  256. if _, ok := v.(float64); ok {
  257. continue
  258. } else if _, ok := v.(int64); ok {
  259. continue
  260. } else if _, ok2 := v.(string); ok2 {
  261. continue
  262. } else {
  263. delete(*toMap, k)
  264. }
  265. }
  266. }
  267. j = &ju.Job{
  268. SourceMid: qu.BsonIdToSId(doc["_id"]),
  269. Category: toptype,
  270. CategorySecond: subtype,
  271. Content: qu.ObjToString(doc["detail"]),
  272. SpiderCode: qu.ObjToString(doc["spidercode"]),
  273. Site: qu.ObjToString(doc["site"]),
  274. Title: qu.ObjToString(doc["title"]),
  275. Data: &doc,
  276. City: qu.ObjToString(doc["city"]),
  277. Province: qu.ObjToString(doc["area"]),
  278. Jsondata: toMap,
  279. Result: map[string][]*ju.ExtField{},
  280. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  281. RuleBlock: e.RuleBlock,
  282. Dataging: qu.IntAll(doc["dataging"]),
  283. IsClearnMoney: isClearnMoneystr,
  284. IsUnRulesTab: false,
  285. }
  286. if isextFile {
  287. jf = &ju.Job{
  288. SourceMid: qu.BsonIdToSId(doc["_id"]),
  289. Category: toptype,
  290. CategorySecond: subtype,
  291. Content: qu.ObjToString(doc["detailfile"]),
  292. SpiderCode: qu.ObjToString(doc["spidercode"]),
  293. Site: qu.ObjToString(doc["site"]),
  294. Title: qu.ObjToString(doc["title"]),
  295. Data: &doc,
  296. City: qu.ObjToString(doc["city"]),
  297. Province: qu.ObjToString(doc["area"]),
  298. Jsondata: toMap,
  299. Result: map[string][]*ju.ExtField{},
  300. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  301. RuleBlock: e.RuleBlock,
  302. IsFile: isextFile,
  303. Dataging: qu.IntAll(doc["dataging"]),
  304. IsClearnMoney: isClearnMoneystr,
  305. IsUnRulesTab: false,
  306. }
  307. }
  308. codeSite := j.SpiderCode
  309. //是否启用站点
  310. if value, ok := e.SiteMerge.Load(codeSite); ok {
  311. isSite = value.(bool)
  312. }
  313. if isSite {
  314. //是否配置站点
  315. exp, isSite := e.Luacodes.Load(codeSite)
  316. if isSite {
  317. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  318. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  319. }
  320. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  321. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  322. }
  323. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  324. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  325. }
  326. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  327. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  328. }
  329. }
  330. }
  331. qu.Try(func() { //不解析表格
  332. pretreated.AnalyStartNoTable(j, isSite, codeSite) //job.Block分块
  333. if isextFile && strings.TrimSpace(jf.Content) != "" {
  334. pretreated.AnalyStartNoTable(jf, isSite, codeSite)
  335. }
  336. }, func(err interface{}) {
  337. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  338. })
  339. return j, jf, isSite
  340. }
  341. // 抽取-正文
  342. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  343. e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性
  344. if jf != nil && jf.IsFile { //附件jf → j 合并
  345. e.ExtractDetail(jf, isSite, j.SpiderCode)
  346. for tmpk, xs := range jf.Result {
  347. if len(j.Result[tmpk]) == 0 {
  348. if tmpk == "budget" || tmpk == "bidamount" {
  349. for _, v := range xs {
  350. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  351. j.Result[tmpk] = append(j.Result[tmpk], v)
  352. }
  353. }
  354. } else {
  355. if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
  356. continue
  357. }
  358. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  359. }
  360. } else {
  361. if tmpk == "winner" { //均没有有效值~采用附件的
  362. isUsed := false
  363. for _, v := range j.Result[tmpk] {
  364. if v.Value != "" {
  365. isUsed = true
  366. break
  367. }
  368. }
  369. if !isUsed {
  370. if j.Category == "招标" && j.CategorySecond != "单一" {
  371. continue
  372. }
  373. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  374. }
  375. }
  376. }
  377. }
  378. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  379. if j.Category == "招标" && j.CategorySecond != "单一" {
  380. } else {
  381. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  382. }
  383. }
  384. if len(j.PackageInfo) == 0 && isUsedPackageJF(jf.PackageInfo) {
  385. j.PackageInfo = jf.PackageInfo
  386. }
  387. }
  388. if isSite {
  389. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  390. if ok && ismerge.(bool) {
  391. tmpj := &ju.Job{
  392. SourceMid: j.SourceMid,
  393. Category: j.Category,
  394. CategorySecond: j.CategorySecond,
  395. Content: j.Content,
  396. SpiderCode: j.SpiderCode,
  397. //Domain: qu.ObjToString(doc["domain"]),
  398. //Href: qu.ObjToString(doc["href"]),
  399. Title: j.Title,
  400. Data: j.Data,
  401. City: j.City,
  402. Province: j.Province,
  403. Jsondata: j.Jsondata,
  404. Result: map[string][]*ju.ExtField{},
  405. BuyerAddr: j.BuyerAddr,
  406. RuleBlock: e.RuleBlock,
  407. }
  408. qu.Try(func() {
  409. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  410. }, func(err interface{}) {
  411. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  412. })
  413. e.ExtractDetail(tmpj, false, "")
  414. //合并数据
  415. j.Block = append(j.Block, tmpj.Block...)
  416. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  417. for tmpk, _ := range j.Result {
  418. if len(tmpj.Result[tmpk]) > 0 {
  419. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  420. }
  421. }
  422. for tmpk, _ := range tmpj.Result {
  423. if len(j.Result[tmpk]) == 0 {
  424. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  425. }
  426. }
  427. }
  428. }
  429. //分析抽取结果并保存
  430. AnalysisSaveResult(j, jf, e)
  431. <-e.TaskInfo.ProcessPool
  432. }
  433. // 抽取-正文-规则等 detail
  434. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  435. qu.Try(func() {
  436. doc := *j.Data
  437. tmprules := map[string][]*RuleCore{}
  438. lockrule.Lock()
  439. //加载分类抽取配置
  440. if j.Category == "all" || j.CategorySecond == "all" {
  441. if isSite {
  442. for k, vc1 := range e.SiteRuleCores["all_all"] {
  443. tmprules[k] = vc1
  444. }
  445. } else {
  446. for k, vc1 := range e.RuleCores["all_all"] {
  447. tmprules[k] = vc1
  448. }
  449. }
  450. } else {
  451. if isSite {
  452. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  453. tmprules[k] = vc1
  454. }
  455. //找不到配置类别全抽
  456. if tmprules == nil || len(tmprules) == 0 {
  457. for k, vc1 := range e.SiteRuleCores["all_all"] {
  458. tmprules[k] = vc1
  459. }
  460. }
  461. } else {
  462. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  463. tmprules[k] = vc1
  464. }
  465. //找不到配置类别全抽
  466. if tmprules == nil || len(tmprules) == 0 {
  467. for k, vc1 := range e.RuleCores["all_all"] {
  468. tmprules[k] = vc1
  469. }
  470. }
  471. }
  472. }
  473. if len(tmprules) < 1 { //分类未覆盖部分
  474. if isSite {
  475. for k, vc1 := range e.RuleCores["all_all"] {
  476. tmprules[k] = vc1
  477. }
  478. } else {
  479. for k, vc1 := range e.SiteRuleCores["all_all"] {
  480. tmprules[k] = vc1
  481. }
  482. }
  483. }
  484. lockrule.Unlock()
  485. //抽取规则
  486. for _, vc1 := range tmprules {
  487. for _, vc := range vc1 {
  488. tmp := ju.DeepCopy(doc).(map[string]interface{})
  489. //是否进入逻辑
  490. if !ju.Logic(vc.LuaLogic, tmp) {
  491. continue
  492. }
  493. if vc.Field == "bidamount" {
  494. //log.Debug("调试抽取字段")
  495. }
  496. //抽取-前置规则
  497. //for _, v := range vc.RulePres {
  498. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  499. //}
  500. // log.Debug("抽取-前置规则", tmp)
  501. //抽取-规则
  502. ExtRuleCore(tmp, e, vc, j, isSite)
  503. // log.Debug("抽取-规则", tmp)
  504. //抽取-后置规则
  505. for _, v := range vc.RuleBacks {
  506. ExtRegBack(j, v, e.TaskInfo, vc)
  507. }
  508. //kv规则
  509. for _, v := range vc.KVRuleCores {
  510. ExtRuleKV(j, v, e.TaskInfo)
  511. }
  512. //项目名称未能抽取到,标题来凑
  513. if vc.Field == "projectname" {
  514. if vc.ExtFrom == "title" {
  515. isextitle := true
  516. for _, v := range j.Result[vc.Field] {
  517. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  518. isextitle = false
  519. break
  520. }
  521. }
  522. if isextitle { //标题加入选举
  523. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  524. if isSite {
  525. field.Score = 1
  526. }
  527. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  528. }
  529. }
  530. for i := 0; i < 3; i++ {
  531. for _, v := range vc.RuleBacks {
  532. ExtRegBack(j, v, e.TaskInfo, vc)
  533. }
  534. }
  535. }
  536. }
  537. }
  538. //全局后置规则
  539. if isSite {
  540. for _, v := range e.SiteRuleBacks {
  541. ExtRegBack(j, v, e.TaskInfo, nil)
  542. }
  543. } else {
  544. for _, v := range e.RuleBacks {
  545. ExtRegBack(j, v, e.TaskInfo, nil)
  546. }
  547. }
  548. //函数清理
  549. for key, val := range j.Result {
  550. for i, v := range val {
  551. if v.Field == "projectname" && v.Type == "table" {
  552. break
  553. }
  554. if key == "budget" || key == "bidamount" {
  555. if _, ok := v.Value.(float64); ok && !v.IsTrue {
  556. continue
  557. }
  558. }
  559. lockclear.Lock()
  560. var cfn = []string{}
  561. if isSite {
  562. cfn = e.SiteClearFn[key]
  563. if len(cfn) == 0 {
  564. cfn = e.ClearFn[key]
  565. }
  566. } else {
  567. cfn = e.ClearFn[key]
  568. }
  569. lockclear.Unlock()
  570. if len(cfn) == 0 {
  571. continue
  572. }
  573. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  574. if key == "budget" || key == "bidamount" {
  575. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  576. j.Result[key][i].IsTrue = true
  577. } else {
  578. j.Result[key][i].Value = data[0]
  579. continue
  580. }
  581. }
  582. before, _ := v.Value.(string)
  583. v.Value = data[0]
  584. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  585. //添加行数清理的日志 , 清理特殊符号
  586. lockclear.Lock()
  587. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  588. text := qu.ObjToString(v.Value)
  589. before = text
  590. //指定清理--新增-函数清理-其他清理
  591. if key == "winner" || key == "agency" || key == "buyer" {
  592. text = strings.ReplaceAll(text, "【", "")
  593. text = strings.ReplaceAll(text, "】", "")
  594. }
  595. v.Value = clear.OtherClean(key, text)
  596. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  597. }
  598. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  599. lockclear.Unlock()
  600. }
  601. }
  602. PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
  603. //bs, _ := json.Marshal(j.Result)
  604. //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  605. }, func(err interface{}) {
  606. log.Debug("ExtractProcess err", err, j.SourceMid)
  607. })
  608. }
  609. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  610. qu.Try(func() {
  611. doc := *j.Data
  612. //抽取规则
  613. tmprules := map[string][]*RuleCore{}
  614. lockrule.Lock()
  615. if j.Category == "all" || j.CategorySecond == "all" {
  616. for k, vc1 := range e.RuleCores["all_all"] {
  617. tmprules[k] = vc1
  618. }
  619. } else {
  620. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  621. tmprules[k] = vc1
  622. }
  623. }
  624. lockrule.Unlock()
  625. for _, vc1 := range tmprules {
  626. for _, vc := range vc1 {
  627. tmp := ju.DeepCopy(doc).(map[string]interface{})
  628. //是否进入逻辑
  629. if !ju.Logic(vc.LuaLogic, tmp) {
  630. continue
  631. }
  632. //抽取-前置规则
  633. //for _, v := range vc.RulePres {
  634. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  635. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  636. // }
  637. //}
  638. //抽取-规则
  639. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  640. ExtRuleCore(tmp, e, vc, j, isSite)
  641. }
  642. //抽取-后置规则
  643. for _, v := range vc.RuleBacks {
  644. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  645. ExtRegBack(j, v, e.TaskInfo, vc)
  646. }
  647. }
  648. }
  649. }
  650. //全局后置规则
  651. for _, v := range e.RuleBacks {
  652. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  653. ExtRegBack(j, v, e.TaskInfo, nil)
  654. }
  655. }
  656. //函数清理
  657. for key, val := range j.Result {
  658. for _, v := range val {
  659. lockclear.Lock()
  660. var cfn = []string{}
  661. if isSite {
  662. cfn = e.SiteClearFn[key]
  663. if len(cfn) == 0 {
  664. cfn = e.ClearFn[key]
  665. }
  666. } else {
  667. cfn = e.ClearFn[key]
  668. }
  669. lockclear.Unlock()
  670. if len(cfn) == 0 {
  671. continue
  672. }
  673. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  674. v.Value = data[0]
  675. //清理特殊符号
  676. lockclear.Lock()
  677. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  678. clear.MesField[key] != nil {
  679. text := qu.ObjToString(v.Value)
  680. text = clear.OtherClean(key, text)
  681. v.Value = text
  682. }
  683. lockclear.Unlock()
  684. }
  685. }
  686. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  687. //bs, _ := json.Marshal(j.Result)
  688. //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  689. }, func(err interface{}) {
  690. log.Debug("ExtractProcess err", err)
  691. })
  692. }
  693. // 审查
  694. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  695. defer qu.Catch()
  696. //获取审核字段
  697. for _, field := range e.AuditFields {
  698. //1.分包
  699. if resulttmp["package"] != nil {
  700. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  701. for _, val := range packagedata {
  702. if val[field] != nil {
  703. fv := qu.ObjToString(val[field])
  704. if fv != "" {
  705. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  706. e.RedisMatch(field, fv, val) //redis匹配
  707. } else { //除了buyer和winner,其他字段走规则匹配
  708. e.RuleMatch(field, fv, val)
  709. }
  710. }
  711. }
  712. }
  713. }
  714. //2.外围
  715. if resulttmp[field] != nil {
  716. fv := qu.ObjToString(resulttmp[field])
  717. if fv != "" {
  718. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  719. e.RedisMatch(field, fv, resulttmp) //redis匹配
  720. } else { //除了buyer和winner,其他字段走规则匹配
  721. e.RuleMatch(field, fv, resulttmp)
  722. }
  723. }
  724. }
  725. }
  726. }
  727. // Redis匹配
  728. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  729. defer qu.Catch()
  730. i := redis.GetInt(field, field+"_"+fv) //查找redis
  731. if i == 0 { //reids未找到,执行规则匹配
  732. val[field+"_isredis"] = false
  733. e.RuleMatch(field, fv, val) //规则匹配
  734. } else { //redis找到,打标识存库
  735. val[field+"_isredis"] = true
  736. }
  737. }
  738. // 规则匹配
  739. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  740. defer qu.Catch()
  741. if fieldval != "" {
  742. SMap := e.StartMatch(field, fieldval)
  743. //SMap.AddKey(field+"_isaudit", false)
  744. for _, k := range SMap.Keys {
  745. tmpMap[k] = SMap.Map[k]
  746. }
  747. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  748. }
  749. }
  750. // 开始规则匹配
  751. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  752. defer qu.Catch()
  753. SMap := pretreated.NewSortMap()
  754. lock.Lock()
  755. f := e.RecogFieldMap[field]
  756. lock.Unlock()
  757. if len(f) > 0 {
  758. fid := qu.BsonIdToSId(f["_id"])
  759. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  760. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  761. if textAfterRecogFieldPrerule != "" {
  762. lock.Lock()
  763. classMap := e.FidClassMap[fid]
  764. lock.Unlock()
  765. L:
  766. for _, c := range classMap { //class
  767. classid := qu.BsonIdToSId(c["_id"])
  768. classPrerule := qu.ObjToString(c["s_class_prerule"])
  769. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  770. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  771. if textAfterClassPrerule != "" {
  772. lock.Lock()
  773. ruleMap := e.CidRuleMap[classid]
  774. lock.Unlock()
  775. for _, r := range ruleMap { //rule
  776. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  777. s_name := qu.ObjToString(r["s_name"])
  778. rule := r["rule"].([]interface{})
  779. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  780. if textAfterRulePrerule != "" {
  781. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  782. if b { //匹配到一个分类下某个规则时,不再继续匹配
  783. if savefield != "" { //保存字段不为空,存储代码信息
  784. SMap.AddKey(field+"_"+savefield, s_name)
  785. }
  786. break L
  787. }
  788. }
  789. }
  790. }
  791. }
  792. }
  793. }
  794. return SMap
  795. }