extract.go 67 KB


  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. "github.com/PuerkitoBio/goquery"
  20. log "github.com/donnie4w/go-logger/logger"
  21. "gopkg.in/mgo.v2/bson"
  22. )
  23. var (
  24. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  25. cut = ju.NewCut() //获取正文并清理
  26. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  27. TaskList map[string]*ExtractTask //任务列表
  28. ClearTaskList map[string]*ClearTask //清理任务列表
  29. saveLimit = 100 //抽取日志批量保存
  30. PageSize = 5000 //查询分页
  31. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
  32. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  33. )
  34. //启动测试抽取
  35. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  36. defer qu.Catch()
  37. ext := &ExtractTask{}
  38. ext.Id = taskId
  39. ext.IsRun = true
  40. ext.InitTestTaskInfo(resultcoll, trackcoll)
  41. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  42. ext.InitSite()
  43. ext.InitRulePres()
  44. ext.InitRuleBacks(false)
  45. ext.InitRuleBacks(true)
  46. ext.InitRuleCore(false)
  47. ext.InitRuleCore(true)
  48. ext.InitPkgCore()
  49. ext.InitBlockRule()
  50. ext.InfoTypeList()
  51. ext.InitTag(false)
  52. ext.InitTag(true)
  53. ext.InitClearFn(false)
  54. ext.InitClearFn(true)
  55. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  56. //初始化城市DFA信息
  57. ext.InitCityInfo()
  58. //ext.InitCityDFA()
  59. ext.InitAreaCode()
  60. ext.InitPostCode()
  61. }
  62. //质量审核
  63. ext.InitAuditFields()
  64. ext.InitAuditRule()
  65. ext.InitAuditClass()
  66. ext.InitAuditRecogField()
  67. //品牌抽取是否开启
  68. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  69. //价格个数抽取是否开启
  70. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  71. //附件抽取是否开启
  72. ext.InitFile()
  73. return RunExtractTestTask(ext, startId, num)
  74. }
  75. func IdTrans(startId string) bson.ObjectId {
  76. defer qu.Catch()
  77. return bson.ObjectIdHex(startId)
  78. }
  79. //开始测试任务抽取
  80. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  81. n, _ := strconv.Atoi(num)
  82. id := IdTrans(startId)
  83. if id.Valid() {
  84. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  85. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  86. for _, v := range *list {
  87. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  88. continue
  89. }
  90. var j, jf *ju.Job
  91. var isSite bool
  92. if ext.IsFileField && v["projectinfo"] != nil {
  93. v["isextFile"] = true
  94. j, jf, isSite = ext.PreInfo(v)
  95. } else {
  96. j, _, isSite = ext.PreInfo(v)
  97. }
  98. go ext.ExtractProcess(j, jf, isSite)
  99. ext.TaskInfo.ProcessPool <- true
  100. }
  101. return true
  102. } else {
  103. return false
  104. }
  105. }
  106. //启动抽取
  107. func StartExtractTaskId(taskId string) bool {
  108. defer qu.Catch()
  109. isgo := false
  110. ext := TaskList[taskId]
  111. if ext == nil {
  112. ext = &ExtractTask{}
  113. ext.Id = taskId
  114. ext.InitTaskInfo()
  115. isgo = true
  116. } else {
  117. ext.Id = taskId
  118. ext.InitTaskInfo()
  119. }
  120. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  121. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  122. ext.InitSite()
  123. ext.InitRulePres()
  124. ext.InitRuleBacks(false)
  125. ext.InitRuleBacks(true)
  126. ext.InitRuleCore(false)
  127. ext.InitRuleCore(true)
  128. ext.InitPkgCore()
  129. ext.InitBlockRule()
  130. ext.InfoTypeList()
  131. ext.InitTag(false)
  132. ext.InitTag(true)
  133. ext.InitClearFn(false)
  134. ext.InitClearFn(true)
  135. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  136. //初始化城市DFA信息
  137. //ext.InitCityDFA()
  138. ext.InitCityInfo()
  139. ext.InitAreaCode()
  140. ext.InitPostCode()
  141. }
  142. //质量审核
  143. ext.InitAuditFields()
  144. ext.InitAuditRule()
  145. ext.InitAuditClass()
  146. ext.InitAuditRecogField()
  147. //品牌抽取是否开启
  148. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  149. //价格个数抽取是否开启
  150. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  151. //附件抽取是否开启
  152. ext.InitFile()
  153. ext.IsRun = true
  154. go ext.ResultSave(true)
  155. go ext.BidSave(true)
  156. if isgo {
  157. go RunExtractTask(taskId)
  158. }
  159. TaskList[taskId] = ext
  160. return true
  161. }
  162. //停止抽取
  163. func StopExtractTaskId(taskId string) bool {
  164. defer qu.Catch()
  165. ext := TaskList[taskId]
  166. if ext != nil {
  167. ext.IsRun = false
  168. TaskList[taskId] = ext
  169. }
  170. //更新task.s_extlastid
  171. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  172. return true
  173. }
  174. //开始抽取
  175. func RunExtractTask(taskId string) {
  176. defer qu.Catch()
  177. ext := TaskList[taskId]
  178. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  179. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  180. pageNum := (count + PageSize - 1) / PageSize
  181. limit := PageSize
  182. if count < PageSize {
  183. limit = count
  184. }
  185. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  186. for i := 0; i < pageNum; i++ {
  187. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  188. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  189. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  190. for _, v := range *list {
  191. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  192. continue
  193. }
  194. //根据标题判断是否抽取
  195. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  196. if !b {
  197. continue
  198. }
  199. _id := qu.BsonIdToSId(v["_id"])
  200. //log.Debug(_id)
  201. if !ext.IsRun {
  202. break
  203. }
  204. var j, jf *ju.Job
  205. var isSite bool
  206. if ext.IsFileField && v["projectinfo"] != nil {
  207. v["isextFile"] = true
  208. j, jf, isSite = ext.PreInfo(v)
  209. } else {
  210. j, _, isSite = ext.PreInfo(v)
  211. }
  212. go ext.ExtractProcess(j, jf, isSite)
  213. ext.TaskInfo.LastExtId = _id
  214. ext.TaskInfo.ProcessPool <- true
  215. }
  216. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  217. if !ext.IsRun {
  218. break
  219. }
  220. }
  221. //更新task.s_extlastid
  222. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  223. }
  224. //信息预处理-不和版本关联,取最新版本的配置项
  225. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  226. return (&ExtractTask{}).PreInfo(doc)
  227. }
  228. //信息预处理-和版本关联
  229. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  230. defer qu.Catch()
  231. //判断是否有附件这个字段
  232. var isextFile bool
  233. if doc["isextFile"] != nil {
  234. isextFile = doc["isextFile"].(bool)
  235. }
  236. detail := ""
  237. d1, _ := doc["detail"].(string)
  238. d2, _ := doc["contenthtml"].(string)
  239. if len(d1) >= len(d2) || d2 == "" {
  240. detail = d1
  241. } else {
  242. detail = d2
  243. }
  244. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  245. d3, _ := doc["summary"].(string)
  246. //全文的需要修复表格
  247. detail = pretreated.RepairCon(detail)
  248. detail = ju.CutLableStr(d3 + "\n" + detail)
  249. detail = cut.ClearHtml(d3 + "\n" + detail)
  250. doc["detail"] = detail
  251. if isextFile {
  252. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  253. }
  254. if utf8.RuneCountInString(detail) < 2000 {
  255. detail += qu.ObjToString(doc["detailfile"])
  256. doc["detail"] = detail
  257. } else {
  258. //正文小于200个字,有附件把附件内容加到正文
  259. tmpDeatil := detail
  260. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  261. if err == nil {
  262. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  263. if conlen < 2000 {
  264. if isextFile {
  265. detail += qu.ObjToString(doc["detailfile"])
  266. doc["detail"] = detail
  267. }
  268. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  269. //防止文本过长,造成抽取阻塞
  270. log.Debug("文本太长", doc["_id"], conlen)
  271. doc["detail"] = d3
  272. }
  273. }
  274. }
  275. toptype := qu.ObjToString(doc["toptype"])
  276. subtype := qu.ObjToString(doc["subtype"])
  277. if qu.ObjToString(doc["type"]) == "bid" {
  278. toptype = "结果"
  279. }
  280. if toptype == "" {
  281. toptype = "all"
  282. }
  283. if subtype == "" {
  284. subtype = "all"
  285. }
  286. if toptype == "其它" || subtype == "其它" || subtype == "其他" || subtype == "结果变更" {
  287. toptype = "all"
  288. subtype = "all"
  289. }
  290. toMap := qu.ObjToMap(doc["jsondata"])
  291. //log.Debug("toMap", toMap)
  292. if (*toMap) != nil {
  293. if (*toMap)["extweight"] == nil {
  294. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  295. }
  296. }
  297. j = &ju.Job{
  298. SourceMid: qu.BsonIdToSId(doc["_id"]),
  299. Category: toptype,
  300. CategorySecond: subtype,
  301. Content: qu.ObjToString(doc["detail"]),
  302. SpiderCode: qu.ObjToString(doc["spidercode"]),
  303. Site: qu.ObjToString(doc["site"]),
  304. //Domain: qu.ObjToString(doc["domain"]),
  305. //Href: qu.ObjToString(doc["href"]),
  306. Title: qu.ObjToString(doc["title"]),
  307. Data: &doc,
  308. City: qu.ObjToString(doc["city"]),
  309. Province: qu.ObjToString(doc["area"]),
  310. Jsondata: toMap,
  311. Result: map[string][]*ju.ExtField{},
  312. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  313. RuleBlock: e.RuleBlock,
  314. }
  315. if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
  316. delete((*j.Jsondata), "jsoncontent")
  317. }
  318. if isextFile {
  319. jf = &ju.Job{
  320. SourceMid: qu.BsonIdToSId(doc["_id"]),
  321. Category: toptype,
  322. Content: qu.ObjToString(doc["detailfile"]),
  323. SpiderCode: qu.ObjToString(doc["spidercode"]),
  324. Site: qu.ObjToString(doc["site"]),
  325. Title: qu.ObjToString(doc["title"]),
  326. Data: &doc,
  327. City: qu.ObjToString(doc["city"]),
  328. Province: qu.ObjToString(doc["area"]),
  329. Jsondata: toMap,
  330. Result: map[string][]*ju.ExtField{},
  331. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  332. RuleBlock: e.RuleBlock,
  333. IsFile: isextFile,
  334. }
  335. if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
  336. delete((*jf.Jsondata), "jsoncontent")
  337. }
  338. }
  339. codeSite := j.SpiderCode
  340. //是否启用站点
  341. if value, ok := e.SiteMerge.Load(codeSite); ok {
  342. isSite = value.(bool)
  343. }
  344. if isSite {
  345. //是否配置站点
  346. exp, isSite := e.Luacodes.Load(codeSite)
  347. if isSite {
  348. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  349. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  350. }
  351. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  352. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  353. }
  354. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  355. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  356. }
  357. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  358. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  359. }
  360. }
  361. }
  362. qu.Try(func() {
  363. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  364. if isextFile {
  365. pretreated.AnalyStart(jf, isSite, codeSite)
  366. }
  367. }, func(err interface{}) {
  368. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  369. })
  370. return j, jf, isSite
  371. }
  372. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  373. func file2text(doc *map[string]interface{}) {
  374. tmpstr := ""
  375. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  376. for _, attachs := range attach_text {
  377. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  378. for _, fileinfo := range fileinfos {
  379. if ff, ok := fileinfo.(map[string]interface{}); ok {
  380. attach_url := qu.ObjToString(ff["attach_url"])
  381. bs := ju.OssGetObject(attach_url)
  382. if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  383. tmpstr += bs + "\n"
  384. } else {
  385. break
  386. }
  387. }
  388. }
  389. }
  390. }
  391. }
  392. (*doc)["detailfile"] = tmpstr
  393. }
  394. //抽取
  395. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  396. e.ExtractDetail(j, isSite, j.SpiderCode)
  397. if jf != nil && jf.IsFile {
  398. e.ExtractFile(jf, isSite, j.SpiderCode)
  399. }
  400. if isSite {
  401. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  402. if ok && ismerge.(bool) {
  403. tmpj := &ju.Job{
  404. SourceMid: j.SourceMid,
  405. Category: j.Category,
  406. CategorySecond: j.CategorySecond,
  407. Content: j.Content,
  408. SpiderCode: j.SpiderCode,
  409. //Domain: qu.ObjToString(doc["domain"]),
  410. //Href: qu.ObjToString(doc["href"]),
  411. Title: j.Title,
  412. Data: j.Data,
  413. City: j.City,
  414. Province: j.Province,
  415. Jsondata: j.Jsondata,
  416. Result: map[string][]*ju.ExtField{},
  417. BuyerAddr: j.BuyerAddr,
  418. RuleBlock: e.RuleBlock,
  419. }
  420. qu.Try(func() {
  421. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  422. }, func(err interface{}) {
  423. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  424. })
  425. e.ExtractDetail(tmpj, false, "")
  426. //if jf != nil && jf.IsFile {
  427. // e.ExtractFile(jf, false, "")
  428. //}
  429. //合并数据
  430. j.Block = append(j.Block, tmpj.Block...)
  431. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  432. for tmpk, _ := range j.Result {
  433. if len(tmpj.Result[tmpk]) > 0 {
  434. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  435. }
  436. }
  437. for tmpk, _ := range tmpj.Result {
  438. if len(j.Result[tmpk]) == 0 {
  439. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  440. }
  441. }
  442. }
  443. }
  444. //分析抽取结果并保存
  445. AnalysisSaveResult(j, jf, e)
  446. <-e.TaskInfo.ProcessPool
  447. }
  448. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  449. qu.Try(func() {
  450. doc := *j.Data
  451. //全局前置规则,结果覆盖doc属性
  452. //for _, v := range e.RulePres {
  453. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  454. //}
  455. tmprules := map[string][]*RuleCore{}
  456. lockrule.Lock()
  457. if j.Category == "all" || j.CategorySecond == "all" {
  458. if isSite {
  459. for k, vc1 := range e.SiteRuleCores["all_all"] {
  460. tmprules[k] = vc1
  461. }
  462. } else {
  463. for k, vc1 := range e.RuleCores["all_all"] {
  464. tmprules[k] = vc1
  465. }
  466. }
  467. } else {
  468. if isSite {
  469. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  470. tmprules[k] = vc1
  471. }
  472. } else {
  473. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  474. tmprules[k] = vc1
  475. }
  476. }
  477. }
  478. if len(tmprules) < 1 { //分类未覆盖部分
  479. if isSite {
  480. for k, vc1 := range e.RuleCores["all_all"] {
  481. tmprules[k] = vc1
  482. }
  483. } else {
  484. for k, vc1 := range e.SiteRuleCores["all_all"] {
  485. tmprules[k] = vc1
  486. }
  487. }
  488. }
  489. lockrule.Unlock()
  490. //抽取规则
  491. for _, vc1 := range tmprules {
  492. for _, vc := range vc1 {
  493. tmp := ju.DeepCopy(doc).(map[string]interface{})
  494. //是否进入逻辑
  495. if !ju.Logic(vc.LuaLogic, tmp) {
  496. continue
  497. }
  498. ////抽取-前置规则
  499. //for _, v := range vc.RulePres {
  500. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  501. //}
  502. // log.Debug("抽取-前置规则", tmp)
  503. //抽取-规则
  504. ExtRuleCore(tmp, e, vc, j, isSite)
  505. // log.Debug("抽取-规则", tmp)
  506. //抽取-后置规则
  507. for _, v := range vc.RuleBacks {
  508. ExtRegBack(j, v, e.TaskInfo, vc)
  509. }
  510. //kv规则
  511. for _, v := range vc.KVRuleCores {
  512. ExtRuleKV(j, v, e.TaskInfo)
  513. }
  514. // log.Debug("抽取-后置规则", tmp)
  515. //项目名称未能抽取到,标题来凑
  516. if vc.Field == "projectname" {
  517. if vc.ExtFrom == "title" {
  518. isextitle := true
  519. for _, v := range j.Result[vc.Field] {
  520. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  521. isextitle = false
  522. break
  523. }
  524. }
  525. if isextitle { //标题加入选举
  526. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  527. if isSite {
  528. field.Score = 1
  529. }
  530. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  531. }
  532. }
  533. for i := 0; i < 3; i++ {
  534. for _, v := range vc.RuleBacks {
  535. ExtRegBack(j, v, e.TaskInfo, vc)
  536. }
  537. }
  538. }
  539. }
  540. }
  541. //全局后置规则
  542. if isSite {
  543. for _, v := range e.SiteRuleBacks {
  544. ExtRegBack(j, v, e.TaskInfo, nil)
  545. }
  546. } else {
  547. for _, v := range e.RuleBacks {
  548. ExtRegBack(j, v, e.TaskInfo, nil)
  549. }
  550. }
  551. //函数清理
  552. for key, val := range j.Result {
  553. for i, v := range val {
  554. // if v.ExtFrom == "title"&& v.Field == "buyer"{
  555. // qu.Debug("title---",v.Value)
  556. // }else if v.Field == "buyer"{
  557. // qu.Debug("text---",v.Value)
  558. // }
  559. lockclear.Lock()
  560. var cfn = []string{}
  561. if isSite {
  562. cfn = e.SiteClearFn[key]
  563. } else {
  564. cfn = e.ClearFn[key]
  565. }
  566. lockclear.Unlock()
  567. if len(cfn) == 0 {
  568. continue
  569. }
  570. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  571. if key == "budget" || key == "bidamount" {
  572. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  573. j.Result[key][i].IsTrue = true
  574. } else {
  575. j.Result[key][i].Value = data[0]
  576. continue
  577. }
  578. }
  579. before, _ := v.Value.(string)
  580. v.Value = data[0]
  581. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  582. //添加行数清理的日志
  583. //清理特殊符号
  584. lockclear.Lock()
  585. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  586. text := qu.ObjToString(v.Value)
  587. before = text
  588. v.Value = clear.OtherClean(key, text)
  589. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  590. }
  591. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  592. lockclear.Unlock()
  593. }
  594. }
  595. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  596. // bs, _ := json.Marshal(j.Result)
  597. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  598. }, func(err interface{}) {
  599. log.Debug("ExtractProcess err", err)
  600. })
  601. }
  602. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  603. qu.Try(func() {
  604. doc := *j.Data
  605. //全局前置规则,结果覆盖doc属性
  606. // for _, v := range e.RulePres {
  607. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  608. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  609. // }
  610. // }
  611. //抽取规则
  612. tmprules := map[string][]*RuleCore{}
  613. lockrule.Lock()
  614. if j.Category == "all" || j.CategorySecond == "all" {
  615. for k, vc1 := range e.RuleCores["all_all"] {
  616. tmprules[k] = vc1
  617. }
  618. } else {
  619. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  620. tmprules[k] = vc1
  621. }
  622. }
  623. lockrule.Unlock()
  624. for _, vc1 := range tmprules {
  625. for _, vc := range vc1 {
  626. tmp := ju.DeepCopy(doc).(map[string]interface{})
  627. //是否进入逻辑
  628. if !ju.Logic(vc.LuaLogic, tmp) {
  629. continue
  630. }
  631. //抽取-前置规则
  632. // for _, v := range vc.RulePres {
  633. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  634. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  635. // }
  636. // }
  637. // log.Debug("抽取-前置规则", tmp)
  638. //抽取-规则
  639. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  640. ExtRuleCore(tmp, e, vc, j, isSite)
  641. }
  642. // log.Debug("抽取-规则", tmp)
  643. //抽取-后置规则
  644. for _, v := range vc.RuleBacks {
  645. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  646. ExtRegBack(j, v, e.TaskInfo, vc)
  647. }
  648. }
  649. // log.Debug("抽取-后置规则", tmp)
  650. }
  651. }
  652. //全局后置规则
  653. for _, v := range e.RuleBacks {
  654. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  655. ExtRegBack(j, v, e.TaskInfo, nil)
  656. }
  657. }
  658. //函数清理
  659. for key, val := range j.Result {
  660. for _, v := range val {
  661. lockclear.Lock()
  662. cfn := e.ClearFn[key]
  663. lockclear.Unlock()
  664. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  665. v.Value = data[0]
  666. //清理特殊符号
  667. lockclear.Lock()
  668. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  669. clear.MesField[key] != nil {
  670. text := qu.ObjToString(v.Value)
  671. text = clear.OtherClean(key, text)
  672. v.Value = text
  673. }
  674. lockclear.Unlock()
  675. }
  676. }
  677. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  678. // bs, _ := json.Marshal(j.Result)
  679. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  680. }, func(err interface{}) {
  681. log.Debug("ExtractProcess err", err)
  682. })
  683. }
  684. //前置过滤
  685. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  686. defer qu.Catch()
  687. before := ju.DeepCopy(doc).(map[string]interface{})
  688. extinfo := map[string]interface{}{}
  689. if in.IsLua {
  690. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  691. if j != nil {
  692. lua.Block = j.Block
  693. }
  694. extinfo = lua.RunScript("pre")
  695. for k, v := range extinfo { //结果覆盖原doc
  696. doc[k] = v
  697. }
  698. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  699. } else {
  700. var key string
  701. if !j.IsFile {
  702. key = qu.If(in.Field == "", "detail", in.Field).(string)
  703. } else {
  704. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  705. }
  706. text := qu.ObjToString(doc[key])
  707. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  708. doc[key] = extinfo[key] //结果覆盖原doc
  709. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  710. }
  711. return doc
  712. }
  713. //抽取-规则
  714. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  715. //候选人加入
  716. var kvMap map[string][]map[string]interface{}
  717. extByReg := true
  718. if vc.ExtFrom != "title" {
  719. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  720. }
  721. for _, v := range vc.RuleCores {
  722. if v.IsLua {
  723. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  724. } else if extByReg {
  725. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  726. }
  727. }
  728. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  729. if vc.Field == "budget" && len(kvMap) == 0 {
  730. if len(j.BlockPackage) == 1 {
  731. for _, bp := range j.BlockPackage {
  732. for fieldname, field := range vc.LFields {
  733. if field != vc.Field {
  734. continue
  735. }
  736. tp := ""
  737. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  738. if k == 0 {
  739. tp = "colon"
  740. } else if k == 1 {
  741. tp = "space"
  742. } else if k == 2 {
  743. tp = "table"
  744. }
  745. if v == nil || v.KvTags == nil {
  746. continue
  747. }
  748. for _, vv := range v.KvTags[fieldname] {
  749. text := ju.TrimLRSpace(vv.Value, "")
  750. if text != "" {
  751. tmp := &ju.ExtField{
  752. ExtFrom: "package",
  753. Field: vc.Field,
  754. Code: "CL_分包",
  755. Type: tp,
  756. MatchType: "package",
  757. RuleText: bp.Text,
  758. SourceValue: vv.Key,
  759. Value: text,
  760. }
  761. if isSite {
  762. tmp.Score = 1
  763. }
  764. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  765. }
  766. }
  767. }
  768. }
  769. break
  770. }
  771. }
  772. } else {
  773. for k, v := range kvMap {
  774. if j.Result[k] == nil {
  775. j.Result[k] = [](*ju.ExtField){}
  776. }
  777. for _, tmp := range v {
  778. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  779. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  780. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  781. MatchType: qu.ObjToString(tmp["matchtype"]),
  782. RuleText: qu.ObjToString(tmp["ruletext"]),
  783. SourceValue: tmp["sourcevalue"],
  784. Value: tmp["value"]}
  785. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  786. field.Score = 1
  787. }
  788. if isSite {
  789. field.Score = 1
  790. }
  791. if tmp["blocktag"] != nil {
  792. btag := make(map[string]string)
  793. for k := range tmp["blocktag"].(map[string]bool) {
  794. blocktag.Lock()
  795. if TagConfigDesc[k] != "" {
  796. btag[k] = TagConfigDesc[k]
  797. }
  798. blocktag.Unlock()
  799. }
  800. field.BlockTag = btag
  801. }
  802. j.Result[k] = append(j.Result[k], field)
  803. }
  804. }
  805. }
  806. }
  807. //抽取-规则-kv
  808. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  809. defer qu.Catch()
  810. if extfrom == "title" || !in.IsLua {
  811. return
  812. }
  813. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  814. lua.KvMap = *kvMap
  815. lua.Block = j.Block
  816. extinfo := lua.RunScript("core")
  817. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  818. for _, v := range tmps {
  819. v["core"] = in.Code
  820. }
  821. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  822. }
  823. if len(extinfo) > 0 {
  824. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  825. }
  826. }
  827. //抽取-规则-正则
  828. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  829. defer qu.Catch()
  830. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  831. b := IsExtract(in.Field, j.Title, j.Content)
  832. if !b {
  833. return
  834. }
  835. //全文正则
  836. //text := qu.ObjToString(doc[extfrom])
  837. //if in.Field != "" {
  838. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  839. // if len(extinfo) > 0 {
  840. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  841. // }
  842. //}
  843. //块抽取
  844. if in.Field != "" {
  845. if extfrom == "title" {
  846. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  847. if len(extinfo) > 0 {
  848. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  849. }
  850. } else {
  851. for _, v := range j.Block {
  852. btag := make(map[string]string)
  853. for k := range v.Classify {
  854. blocktag.Lock()
  855. btag[k] = TagConfigDesc[k]
  856. blocktag.Unlock()
  857. }
  858. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  859. if len(extinfo) > 0 {
  860. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  861. }
  862. }
  863. }
  864. }
  865. }
  866. //pkg抽取-规则-正则
  867. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  868. defer qu.Catch()
  869. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  870. b := IsExtract(in.Field, j.Title, j.Content)
  871. if !b {
  872. return
  873. }
  874. //块抽取
  875. if in.Field != "" {
  876. for k, vbpkg := range j.BlockPackage {
  877. rep := map[string]string{}
  878. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  879. if in.Field == "budget" && vbpkg.Budget > 0 {
  880. continue
  881. }
  882. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  883. continue
  884. }
  885. if in.Field == "winner" && vbpkg.Winner != "" {
  886. continue
  887. }
  888. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  889. continue
  890. }
  891. if in.Field == "projectname" && vbpkg.Name != "" {
  892. continue
  893. }
  894. if in.Field == "winner" && vbpkg.Winner != "" {
  895. continue
  896. }
  897. if in.Field == "winnerperson" {
  898. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  899. continue
  900. }
  901. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  902. continue
  903. }
  904. }
  905. if in.Field == "winnertel" {
  906. if vbpkg.WinnerPerson == "" {
  907. continue
  908. }
  909. }
  910. //处理正负数修正
  911. ptmp := strings.Split(in.RuleText, "#")
  912. sign := 0
  913. if len(ptmp) == 2 {
  914. if ptmp[1] == "正" {
  915. sign = 1
  916. } else if ptmp[1] == "负" {
  917. sign = -1
  918. }
  919. }
  920. tmp := strings.Split(ptmp[0], "__")
  921. if len(tmp) == 2 {
  922. epos := strings.Split(tmp[1], ",")
  923. posm := map[string]int{}
  924. for _, v := range epos {
  925. ks := strings.Split(v, ":")
  926. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  927. posm[ks[1]] = qu.IntAll(ks[0])
  928. } else {
  929. posm[in.Field] = qu.IntAll(ks[0])
  930. }
  931. }
  932. var pattern string
  933. if strings.Contains(tmp[0], "\\u") {
  934. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  935. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  936. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  937. } else {
  938. pattern = tmp[0]
  939. }
  940. //log.Debug("pattern", pattern)
  941. //fmt.Println(text)
  942. reg := regexp.MustCompile(pattern)
  943. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  944. for i, _ := range apos {
  945. pos := apos[i]
  946. for k, p := range posm {
  947. if len(pos) > p {
  948. if pos[p] == -1 || pos[p+1] == -1 {
  949. continue
  950. }
  951. val := vbpkg.Text[pos[p]:pos[p+1]]
  952. if string(val) == "" {
  953. continue
  954. }
  955. if sign == -1 {
  956. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  957. } else {
  958. rep[k+"_"+fmt.Sprint(i)] = val
  959. }
  960. }
  961. }
  962. }
  963. //fmt.Println(text)
  964. for i := 0; i < len(apos); i++ {
  965. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  966. if in.Field == "budget" && vbpkg.Budget <= 0 {
  967. lock.Lock()
  968. cfn := e.ClearFn[in.Field]
  969. lock.Unlock()
  970. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  971. if data[len(data)-1].(bool) {
  972. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  973. j.BlockPackage[k].IsTrueBudget = true
  974. }
  975. break
  976. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  977. lock.Lock()
  978. cfn := e.ClearFn[in.Field]
  979. lock.Unlock()
  980. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  981. if data[len(data)-1].(bool) {
  982. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  983. j.BlockPackage[k].IsTrueBidamount = true
  984. }
  985. break
  986. } else if in.Field == "winner" {
  987. if j.BlockPackage[k].Winner == "" {
  988. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  989. break
  990. }
  991. } else if in.Field == "winnertel" {
  992. if j.BlockPackage[k].WinnerTel == "" {
  993. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  994. break
  995. }
  996. } else if in.Field == "winnerperson" {
  997. if j.BlockPackage[k].WinnerPerson == "" {
  998. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  999. break
  1000. }
  1001. } else if in.Field == "bidstatus" {
  1002. if j.BlockPackage[k].BidStatus == "" {
  1003. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1004. break
  1005. }
  1006. } else if in.Field == "projectname" {
  1007. if j.BlockPackage[k].Name == "" {
  1008. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1009. break
  1010. }
  1011. } else if in.Field == "winnerperson" {
  1012. if j.BlockPackage[k].WinnerPerson == "" {
  1013. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1014. break
  1015. }
  1016. } else if in.Field == "winnertel" {
  1017. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1018. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1019. break
  1020. }
  1021. }
  1022. }
  1023. }
  1024. }
  1025. } else {
  1026. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1027. val := ""
  1028. if len(pos) == 2 {
  1029. //"text" = "text"[pos[1]:]
  1030. val = "text"[pos[1]:]
  1031. rs := regexp.MustCompile("[^\r\n\t]+")
  1032. tmp := rs.FindAllString("text", -1)
  1033. if len(tmp) > 0 {
  1034. val = tmp[0]
  1035. }
  1036. }
  1037. if val != "" {
  1038. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1039. lock.Lock()
  1040. cfn := e.ClearFn[in.Field]
  1041. lock.Unlock()
  1042. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1043. if data[len(data)-1].(bool) {
  1044. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1045. j.BlockPackage[k].IsTrueBudget = true
  1046. }
  1047. break
  1048. }
  1049. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1050. lock.Lock()
  1051. cfn := e.ClearFn[in.Field]
  1052. lock.Unlock()
  1053. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1054. if data[len(data)-1].(bool) {
  1055. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1056. j.BlockPackage[k].IsTrueBidamount = true
  1057. }
  1058. break
  1059. } else if in.Field == "bidstatus" {
  1060. if j.BlockPackage[k].BidStatus == "" {
  1061. j.BlockPackage[k].BidStatus = val
  1062. break
  1063. }
  1064. } else if in.Field == "projectname" {
  1065. if j.BlockPackage[k].Name == "" {
  1066. j.BlockPackage[k].Name = val
  1067. break
  1068. }
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. }
  1075. //lua脚本根据属性设置提取kv值
  1076. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1077. kvmap := map[string][]map[string]interface{}{}
  1078. if len(j.Winnerorder) > 1 {
  1079. if vc.Field == "bidamount" {
  1080. for _, v := range j.Winnerorder {
  1081. if v["price"] == nil {
  1082. continue
  1083. }
  1084. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1085. "code": "winnerorder",
  1086. "field": vc.Field,
  1087. "ruletext": "中标候选人_" + v["sortstr"].(string),
  1088. "extfrom": v["sortstr"],
  1089. "sourcevalue": v["price"],
  1090. "value": v["price"],
  1091. "type": "winnerorder",
  1092. "matchtype": "winnerorder",
  1093. })
  1094. return kvmap, false
  1095. }
  1096. //候选人中标金额
  1097. if price := j.Winnerorder[0]["price"]; price != nil {
  1098. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1099. "code": "CL_中标候选人",
  1100. "field": vc.Field,
  1101. "ruletext": "中标候选人",
  1102. "extfrom": j.Winnerorder[0]["sortstr"],
  1103. "sourcevalue": price,
  1104. "value": price,
  1105. "type": "winnerorder",
  1106. "matchtype": "winnerorder",
  1107. })
  1108. return kvmap, false
  1109. }
  1110. }
  1111. //else if vc.Field == "winner" {
  1112. // for _, v := range j.Winnerorder {
  1113. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1114. // "code": "winnerorder",
  1115. // "field": vc.Field,
  1116. // "ruletext": "中标候选人",
  1117. // "extfrom": vc.ExtFrom,
  1118. // "sourcevalue": "中标候选人",
  1119. // "value": v["entname"],
  1120. // "type": "winnerorder",
  1121. // "matchtype": "winnerorder",
  1122. // })
  1123. // }
  1124. // //候选人中标单位
  1125. // if entname := j.Winnerorder[0]["entname"]; entname != nil {
  1126. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1127. // "code": "CL_中标候选人",
  1128. // "field": vc.Field,
  1129. // "ruletext": "中标候选人",
  1130. // "extfrom": vc.ExtFrom,
  1131. // "sourcevalue": "中标候选人",
  1132. // "value": entname,
  1133. // "type": "winnerorder",
  1134. // "matchtype": "winnerorder",
  1135. // })
  1136. // return kvmap, false
  1137. // }
  1138. //}
  1139. }
  1140. for fieldname, field := range vc.LFields {
  1141. if field != vc.Field {
  1142. continue
  1143. }
  1144. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1145. }
  1146. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1147. return kvmap, true
  1148. }
  1149. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1150. //qu.Debug("fieldname+++", fieldname)
  1151. for _, bl := range blocks {
  1152. tp := ""
  1153. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1154. if k == 0 {
  1155. tp = "colon"
  1156. // for _, vv := range v.Kvs {
  1157. // qu.Debug("colon-kvs:", vv.Key, vv.Value)
  1158. // }
  1159. // for kkk, vv := range v.KvTags {
  1160. // for _, vvv := range vv {
  1161. // qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
  1162. // }
  1163. // }
  1164. } else if k == 1 {
  1165. tp = "space"
  1166. // for _, vv := range v.Kvs {
  1167. // qu.Debug("space-kvs:", vv.Key, vv.Value)
  1168. // }
  1169. // for kkk, vv := range v.KvTags {
  1170. // for _, vvv := range vv {
  1171. // qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
  1172. // }
  1173. // }
  1174. } else if k == 2 {
  1175. tp = "table"
  1176. // for _, vv := range v.Kvs {
  1177. // qu.Debug("table-kvs:", vv.Key, vv.Value)
  1178. // }
  1179. // for kkk, vv := range v.KvTags {
  1180. // for _, vvv := range vv {
  1181. // qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
  1182. // }
  1183. // }
  1184. }
  1185. if v == nil || v.KvTags == nil {
  1186. continue
  1187. }
  1188. for _, vv := range v.KvTags[fieldname] {
  1189. text := ju.TrimLRSpace(vv.Value, "")
  1190. if text != "" {
  1191. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1192. "code": "CL_" + vv.Key,
  1193. "field": field,
  1194. "ruletext": vv.Key,
  1195. "extfrom": vc.ExtFrom,
  1196. "sourcevalue": text,
  1197. "value": text,
  1198. "type": tp,
  1199. "matchtype": "tag_string",
  1200. "blocktag": bl.Classify,
  1201. "weight": vv.Weight,
  1202. })
  1203. //if field != "winnertel" && field != "winnerperson" {
  1204. // //break //暂定取第一个
  1205. //}
  1206. }
  1207. }
  1208. }
  1209. if len(kvmap[field]) == 0 {
  1210. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1211. }
  1212. }
  1213. }
  1214. //正则提取结果
  1215. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1216. defer qu.Catch()
  1217. var score float64
  1218. score = vre.Score
  1219. if isSite {
  1220. score = score + 1.0
  1221. }
  1222. extinfo := map[string][]map[string]interface{}{}
  1223. rep := map[string]string{}
  1224. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1225. //处理正负数修正
  1226. ptmp := strings.Split(vre.RuleText, "#")
  1227. sign := 0
  1228. if len(ptmp) == 2 {
  1229. if ptmp[1] == "正" {
  1230. sign = 1
  1231. } else if ptmp[1] == "负" {
  1232. sign = -1
  1233. }
  1234. }
  1235. tmp := strings.Split(ptmp[0], "__")
  1236. if len(tmp) == 2 {
  1237. epos := strings.Split(tmp[1], ",")
  1238. posm := map[string]int{}
  1239. for _, v := range epos {
  1240. ks := strings.Split(v, ":")
  1241. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1242. posm[ks[1]] = qu.IntAll(ks[0])
  1243. } else {
  1244. posm[vre.Field] = qu.IntAll(ks[0])
  1245. }
  1246. }
  1247. var pattern string
  1248. if strings.Contains(tmp[0], "\\u") {
  1249. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1250. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1251. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1252. } else {
  1253. pattern = tmp[0]
  1254. }
  1255. //log.Debug("pattern", pattern)
  1256. //fmt.Println(text)
  1257. reg := regexp.MustCompile(pattern)
  1258. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1259. for i, _ := range apos {
  1260. pos := apos[i]
  1261. for k, p := range posm {
  1262. if len(pos) > p {
  1263. if pos[p] == -1 || pos[p+1] == -1 {
  1264. continue
  1265. }
  1266. val := text[pos[p]:pos[p+1]]
  1267. if string(val) == "" {
  1268. continue
  1269. }
  1270. if sign == -1 {
  1271. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1272. } else {
  1273. rep[k+"_"+fmt.Sprint(i)] = val
  1274. }
  1275. }
  1276. }
  1277. }
  1278. //fmt.Println(text)
  1279. tmps := []map[string]interface{}{}
  1280. for i := 0; i < len(apos); i++ {
  1281. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1282. tmp := map[string]interface{}{
  1283. "field": vre.Field,
  1284. "code": vre.Code,
  1285. "ruletext": vre.RuleText,
  1286. "extfrom": text,
  1287. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1288. "type": "regexp",
  1289. "matchtype": "regcontent",
  1290. "blocktag": *tag,
  1291. "score": score,
  1292. }
  1293. tmps = append(tmps, tmp)
  1294. exfield := ju.ExtField{
  1295. BlockTag: *tag,
  1296. Field: vre.Field,
  1297. Code: vre.Code,
  1298. RuleText: vre.RuleText,
  1299. Type: "regexp",
  1300. MatchType: "regcontent",
  1301. ExtFrom: extfrom,
  1302. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1303. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1304. Score: score}
  1305. if tmp["blocktag"] != nil {
  1306. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1307. }
  1308. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1309. }
  1310. }
  1311. if len(tmps) > 0 {
  1312. //fmt.Println(tmps)
  1313. extinfo[vre.Field] = tmps
  1314. }
  1315. }
  1316. } else {
  1317. pos := vre.RegCore.Reg.FindStringIndex(text)
  1318. val := ""
  1319. if len(pos) == 2 {
  1320. text = text[pos[1]:]
  1321. rs := regexp.MustCompile("[^\r\n\t]+")
  1322. tmp := rs.FindAllString(text, -1)
  1323. if len(tmp) > 0 {
  1324. val = tmp[0]
  1325. }
  1326. }
  1327. if val != "" {
  1328. tmps := []map[string]interface{}{}
  1329. tmp := map[string]interface{}{
  1330. "field": vre.Field,
  1331. "code": vre.Code,
  1332. "ruletext": vre.RuleText,
  1333. "extfrom": text,
  1334. "value": val,
  1335. "type": "regexp",
  1336. "matchtype": "regcontent",
  1337. "blocktag": *tag,
  1338. "score": score,
  1339. }
  1340. tmps = append(tmps, tmp)
  1341. extinfo[vre.Field] = tmps
  1342. if j.Result[vre.Field] == nil {
  1343. j.Result[vre.Field] = [](*ju.ExtField){}
  1344. }
  1345. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1346. Value: val,
  1347. Score: score}
  1348. if tmp["blocktag"] != nil {
  1349. field.BlockTag = tmp["blocktag"].(map[string]string)
  1350. }
  1351. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1352. }
  1353. }
  1354. return extinfo
  1355. }
  1356. //后置过滤
  1357. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1358. defer qu.Catch()
  1359. if in.IsLua {
  1360. result := GetResultMapForLua(j)
  1361. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1362. if j != nil {
  1363. lua.Block = j.Block
  1364. }
  1365. extinfo := lua.RunScript("back")
  1366. for k, v := range extinfo {
  1367. if tmps, ok := v.([]map[string]interface{}); ok {
  1368. j.Result[k] = [](*ju.ExtField){}
  1369. for _, tmp := range tmps {
  1370. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1371. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1372. Value: tmp["value"]}
  1373. if tmp["blocktag"] != nil {
  1374. field.BlockTag = tmp["blocktag"].(map[string]string)
  1375. }
  1376. j.Result[k] = append(j.Result[k], field)
  1377. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1378. }
  1379. }
  1380. }
  1381. if len(extinfo) > 0 {
  1382. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1383. }
  1384. } else {
  1385. extinfo := map[string]interface{}{}
  1386. if in.Field != "" {
  1387. clearByTitle := false
  1388. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1389. clearByTitle = true
  1390. }
  1391. if j.Result[in.Field] != nil {
  1392. tmp := j.Result[in.Field]
  1393. exts := []interface{}{}
  1394. for k, v := range tmp {
  1395. if clearByTitle && v.ExtFrom != "title" {
  1396. continue
  1397. }
  1398. //table抽取到的数据不清理
  1399. // if v.Type == "table" && v.Field != "projectname" {
  1400. // continue
  1401. // }
  1402. text := qu.ObjToString(v.Value)
  1403. if text != "" {
  1404. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1405. }
  1406. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1407. continue
  1408. }
  1409. j.Result[in.Field][k].Value = text
  1410. exts = append(exts, map[string]interface{}{
  1411. "field": v.Field,
  1412. "code": v.Code,
  1413. "ruletext": v.RuleText,
  1414. "type": v.Type,
  1415. "matchtype": v.MatchType,
  1416. "extfrom": v.ExtFrom,
  1417. "value": text,
  1418. })
  1419. }
  1420. if len(exts) > 0 {
  1421. extinfo[in.Field] = exts
  1422. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1423. }
  1424. }
  1425. } else {
  1426. for key, tmp := range j.Result {
  1427. exts := []interface{}{}
  1428. for k, v := range tmp {
  1429. if v.Type == "table" { //table抽取到的数据不清理
  1430. continue
  1431. }
  1432. text := qu.ObjToString(v.Value)
  1433. if text != "" {
  1434. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1435. }
  1436. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1437. continue
  1438. }
  1439. j.Result[key][k].Value = text
  1440. exts = append(exts, map[string]interface{}{
  1441. "field": v.Field,
  1442. "code": v.Code,
  1443. "ruletext": v.RuleText,
  1444. "type": v.Type,
  1445. "matchtype": v.MatchType,
  1446. "extfrom": v.ExtFrom,
  1447. "value": text,
  1448. })
  1449. }
  1450. if len(exts) > 0 {
  1451. extinfo[key] = exts
  1452. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1453. }
  1454. }
  1455. }
  1456. }
  1457. }
  1458. //后置过滤
  1459. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1460. defer qu.Catch()
  1461. for k, v := range j.BlockPackage {
  1462. if in.Field == "winner" {
  1463. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1464. } else if in.Field == "bidstatus" {
  1465. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1466. } else if in.Field == "" {
  1467. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1468. } else if in.Field == "projectname" {
  1469. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1470. } else if in.Field == "winnerperson" {
  1471. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1472. } else if in.Field == "winnertel" {
  1473. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1474. }
  1475. }
  1476. }
  1477. //KV过滤
  1478. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1479. defer qu.Catch()
  1480. extinfo := map[string]interface{}{}
  1481. if in.Field != "" {
  1482. if j.Result[in.Field] != nil {
  1483. tmp := j.Result[in.Field]
  1484. exts := []interface{}{}
  1485. for k, v := range tmp {
  1486. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1487. continue
  1488. }
  1489. text := qu.ObjToString(v.Value)
  1490. if text != "" {
  1491. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1492. }
  1493. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1494. continue
  1495. }
  1496. j.Result[in.Field][k].Value = text
  1497. exts = append(exts, map[string]interface{}{
  1498. "field": v.Field,
  1499. "code": v.Code,
  1500. "ruletext": v.RuleText,
  1501. "type": v.Type,
  1502. "matchtype": v.MatchType,
  1503. "extfrom": v.ExtFrom,
  1504. "value": text,
  1505. })
  1506. }
  1507. if len(exts) > 0 {
  1508. extinfo[in.Field] = exts
  1509. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1510. }
  1511. }
  1512. }
  1513. }
  1514. //获取抽取结果map[string][]interface{},lua脚本使用
  1515. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1516. defer qu.Catch()
  1517. result := map[string][]map[string]interface{}{}
  1518. for key, val := range j.Result {
  1519. if result[key] == nil {
  1520. result[key] = []map[string]interface{}{}
  1521. }
  1522. for _, v := range val {
  1523. tmp := map[string]interface{}{
  1524. "field": v.Field,
  1525. "code": v.Code,
  1526. "ruletext": v.RuleText,
  1527. "value": v.Value,
  1528. "type": v.Type,
  1529. "matchtype": v.MatchType,
  1530. "extfrom": v.ExtFrom,
  1531. }
  1532. result[key] = append(result[key], tmp)
  1533. }
  1534. }
  1535. return result
  1536. }
  1537. //抽取日志
  1538. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1539. defer qu.Catch()
  1540. if !t.IsEtxLog {
  1541. return
  1542. }
  1543. logdata := map[string]interface{}{
  1544. "code": qu.If(v.Code == "", "kv", v.Code),
  1545. "name": v.Name,
  1546. "type": ftype,
  1547. "ruletext": v.RuleText,
  1548. "islua": v.IsLua,
  1549. "field": v.Field,
  1550. "version": t.Version,
  1551. "taskname": t.Name,
  1552. "before": before,
  1553. "extinfo": extinfo,
  1554. "sid": sid,
  1555. "comeintime": time.Now().Unix(),
  1556. }
  1557. lock.Lock()
  1558. ExtLogs[t] = append(ExtLogs[t], logdata)
  1559. lock.Unlock()
  1560. }
  1561. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1562. exts := []map[string]interface{}{}
  1563. exts = append(exts, map[string]interface{}{
  1564. "field": ext.Field,
  1565. "code": ext.Code,
  1566. "type": ftype,
  1567. "matchtype": matchtype,
  1568. "extfrom": ext.ExtFrom,
  1569. "value": ext.Value,
  1570. })
  1571. extinfo := map[string]interface{}{
  1572. ext.Field: exts,
  1573. }
  1574. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1575. }
  1576. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1577. defer qu.Catch()
  1578. if !t.IsEtxLog {
  1579. return
  1580. }
  1581. logdata := map[string]interface{}{
  1582. "code": code,
  1583. "name": name,
  1584. "type": ftype,
  1585. "ruletext": "",
  1586. "islua": false,
  1587. "field": field,
  1588. "version": t.Version,
  1589. "taskname": t.Name,
  1590. "before": before,
  1591. "extinfo": extinfo,
  1592. "sid": sid,
  1593. "comeintime": time.Now().Unix(),
  1594. }
  1595. lock.Lock()
  1596. ExtLogs[t] = append(ExtLogs[t], logdata)
  1597. lock.Unlock()
  1598. }
  1599. //保存抽取日志
  1600. func SaveExtLog() {
  1601. defer qu.Catch()
  1602. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1603. lock.Lock()
  1604. tmpLogs = ExtLogs
  1605. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1606. lock.Unlock()
  1607. for k, v := range tmpLogs {
  1608. if len(v) < saveLimit {
  1609. db.Mgo.SaveBulk(k.TrackColl, v...)
  1610. } else {
  1611. for {
  1612. if len(v) > saveLimit {
  1613. tmp := v[:saveLimit]
  1614. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1615. v = v[saveLimit:]
  1616. } else {
  1617. db.Mgo.SaveBulk(k.TrackColl, v...)
  1618. break
  1619. }
  1620. }
  1621. }
  1622. }
  1623. time.AfterFunc(10*time.Second, SaveExtLog)
  1624. }
  // FieldValue pairs an arbitrary value with an integer count.
  // NOTE(review): not referenced in the visible part of this file; presumably
  // used to tally how often a candidate value occurs — confirm against callers.
  1625. type FieldValue struct {
  1626. Value interface{}
  1627. Count int
  1628. }
  // clearWinnerReg matches the label fragments and colons that
  // AnalysisSaveResult removes from each package winner before the names are
  // joined into s_winner.
  1629. var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1630. //AnalysisSaveResult analyses the extraction result of job j (and of the attachment job jf, when non-nil) and saves it.
  // The whole body runs inside qu.Try; a failure is reported via log.Debug.
  // Steps, in order: pick one value per field from the scored results, merge the
  // package (sub-bid) totals and winners, attach winner-order and attachment
  // data, copy over the remaining source document fields, run the optional
  // audit / city / brand / price-number steps, serialise blocks and KV text, and
  // finally either queue the update on e (BidArr / ResultArr) or, for test
  // tasks, write it directly to e.TaskInfo.TestColl.
  1631. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1632. qu.Try(func() {
  1633. //re-read the winner candidates after cleaning
  1634. resetWinnerorder(j)
  1635. doc, result, _id := funcAnalysis(j, e)
  1636. if ju.IsSaveTag {
  1637. go otherNeedSave(j, result, e)
  1638. }
  1639. auxinfo := auxInfo(j)
  1640. //pick values from the sorted results
  1641. tmp := map[string]interface{}{} //extracted values
  1642. tmp["spidercode"] = j.SpiderCode
  1643. tmp["site"] = j.Site
  1644. tmp["jsondata"] = j.Jsondata
  1645. tmp["fieldall"] = auxinfo
  1646. for _, val := range result {
  1647. for _, v := range val { //take the first candidate whose score is above -1; projectname is the exception
  1648. //budget/bidamount flagged IsTrue are taken as they are (a stored 0 may be valid)
  1649. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
  1650. tmp[v.Field] = v.Value
  1651. break
  1652. }
  1653. if v.Score > -1 {
  1654. tmp[v.Field] = v.Value
  1655. break
  1656. } else if v.Field == "projectname" {
  1657. tmp[v.Field] = v.Value
  1658. break
  1659. }
  1660. }
  1661. }
  // More than 15 packages: keep a single one. Which one is unspecified, since
  // Go does not define map iteration order.
  1662. if len(j.PackageInfo) > 15 {
  1663. for k, v := range j.PackageInfo {
  1664. j.PackageInfo = map[string]map[string]interface{}{}
  1665. j.PackageInfo[k] = v
  1666. break
  1667. }
  1668. }
  1669. if len(j.PackageInfo) > 0 { //package info
  1670. tmp["package"] = j.PackageInfo
  1671. //if the summed package budget / bid amount exceeds the extracted value, overwrite it
  1672. var tmpBidamount, tmpBudget float64
  1673. //s_winner: package winners joined with commas
  1674. var tmpstr, savewinner []string
  1675. //sort by package
  1676. for b, v := range j.PackageInfo {
  1677. if v["winner"] != nil && v["winner"] != "" {
  1678. tmpstr = append(tmpstr, b)
  1679. }
  1680. }
  1681. //if the summed package budget / bid amount exceeds the extracted value, overwrite it
  // NOTE(review): this condition is always true here because the enclosing
  // branch already requires len(j.PackageInfo) > 0, so the else branch below
  // never runs. The comments suggest "> 1" may have been intended — confirm.
  1682. if len(j.PackageInfo) >= 1 {
  1683. //more than one package: accumulate
  1684. for _, v := range j.PackageInfo {
  1685. if v["budget"] != nil {
  1686. tmpBudget += qu.Float64All(v["budget"])
  1687. }
  1688. if v["bidamount"] != nil {
  1689. tmpBidamount += qu.Float64All(v["bidamount"])
  1690. }
  1691. }
  1692. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1693. tmp["budget"] = tmpBudget
  1694. }
  1695. if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1696. tmp["bidamount"] = tmpBidamount
  1697. }
  1698. } else {
  1699. //exactly one package: when tmp has no value, take the package's value
  1700. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1701. for _, v := range j.PackageInfo {
  1702. if v["budget"] != nil {
  1703. tmp["budget"] = v["budget"]
  1704. }
  1705. }
  1706. }
  1707. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1708. for _, v := range j.PackageInfo {
  1709. if v["bidamount"] != nil {
  1710. tmp["bidamount"] = v["bidamount"]
  1711. }
  1712. }
  1713. }
  1714. }
  1715. //s_winner: package winners joined with commas
  1716. sort.Strings(tmpstr)
  1717. for _, v := range tmpstr {
  1718. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1719. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1720. if savevvv == "" {
  1721. continue
  1722. }
  1723. savewinner = append(savewinner, savevvv)
  1724. }
  1725. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1726. tmp["s_winner"] = tmp["winner"]
  1727. } else if savewinner != nil {
  1728. savewinner = RemoveReplicaSliceString(savewinner)
  1729. tmp["s_winner"] = strings.Join(savewinner, ",")
  1730. }
  1731. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1732. //no packages: use winner
  1733. tmp["s_winner"] = tmp["winner"]
  1734. }
  1735. if len(j.Winnerorder) > 0 { //candidate info
  1736. for i, v := range j.Winnerorder {
  1737. if v["price"] != nil {
  1738. j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
  1739. }
  1740. }
  1741. tmp["winnerorder"] = j.Winnerorder
  1742. }
  1743. //handle the attachment
  1744. var resultf map[string][]*ju.ExtField
  1745. if jf != nil {
  1746. _, resultf, _ = funcAnalysis(jf, e)
  1747. auxinfof := auxInfo(jf)
  1748. tmp["fieldallf"] = auxinfof
  1749. ffield := map[string]interface{}{}
  1750. for _, val := range resultf {
  1751. for _, v := range val { //take the first candidate whose score is above -1
  1752. if v.Score > -1 {
  1753. ffield[v.Field] = v.Value
  1754. break
  1755. }
  1756. }
  1757. }
  1758. if len(jf.PackageInfo) > 0 { //package info
  1759. ffield["package"] = jf.PackageInfo
  1760. }
  1761. if len(jf.Winnerorder) > 0 { //candidate info
  1762. ffield["winnerorder"] = jf.Winnerorder
  1763. }
  1764. tmp["ffield"] = ffield
  1765. }
  // Copy the remaining source-document fields; extracted values already in tmp win.
  1766. for k, v := range *doc {
  1767. //skip redundant fields
  1768. if delFiled(k) {
  1769. continue
  1770. }
  1771. if tmp[k] == nil {
  1772. tmp[k] = v
  1773. }
  1774. }
  1775. //quality audit
  1776. if ju.QualityAudit {
  1777. e.QualityAudit(tmp)
  1778. }
  1779. if e.IsExtractCity { //city extraction
  1780. //e.ExtractCity(j, tmp, _id)
  1781. e.NewExtractCity(j, tmp, _id)
  1782. // b, p, c, d := e.TransmitData(tmp, _id) //extract province and city
  1783. // // log.Debug("province---", p, "city---", c, "district---", d)
  1784. // tmp["district"] = d
  1785. // if b {
  1786. // tmp["city"] = c
  1787. // tmp["area"] = p
  1788. // }
  1789. }
  1790. //brand extraction
  1791. if ju.IsBrandGoods {
  1792. tmp["checkhas"] = map[string]int{
  1793. "hastable": j.HasTable,
  1794. "hasgoods": j.HasGoods,
  1795. "hasbrand": j.HasBrand,
  1796. "haskey": j.HasKey,
  1797. }
  1798. if len(j.BrandData) > 0 {
  1799. tmp["tablebrand"] = j.BrandData
  1800. }
  1801. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1802. }
  1803. //price and number extraction
  1804. if ju.IsPriceNumber {
  1805. priceNumberLen := len(j.PriceNumberData)
  1806. if priceNumberLen > 1 { //de-duplicate table data
  1807. tmpPriceNumberData := []map[string]interface{}{}
  1808. tableStrs := map[string]bool{}
  1809. for _, tb := range j.PriceNumberData {
  1810. has := false
  // Tables are compared by their JSON serialisation. The local name
  // "bytes" shadows the bytes package inside this loop body only.
  1811. bytes, _ := json.Marshal(tb)
  1812. str := string(bytes)
  1813. if len(tableStrs) > 0 && tableStrs[str] {
  1814. has = true
  1815. } else {
  1816. tableStrs[str] = true
  1817. }
  1818. if !has {
  1819. for _, data := range tb {
  1820. tmpPriceNumberData = append(tmpPriceNumberData, data)
  1821. }
  1822. }
  1823. }
  1824. tmp["pricenumber"] = tmpPriceNumberData
  1825. } else if priceNumberLen == 1 {
  1826. tmp["pricenumber"] = j.PriceNumberData[0]
  1827. }
  1828. }
  1829. //string assembled from all KV pairs
  1830. var kvtext bytes.Buffer
  1831. blocks := make([]ju.BlockAndTag, 0)
  1832. for _, v := range j.Block {
  1833. //packages and tags
  1834. if ju.SaveBlock {
  1835. xx, _ := json.Marshal(v)
  1836. tmpblock := new(ju.TmpBlock)
  1837. err := json.Unmarshal(xx, &tmpblock)
  // Only when the direct JSON round trip fails is the block converted field
  // by field through rangeBlockToJson.
  1838. if err != nil {
  1839. if v.BPackage != nil {
  1840. bpb, _ := json.Marshal(v.BPackage)
  1841. tmpblock.BPackage = string(bpb)
  1842. }
  1843. tmpblock = rangeBlockToJson(v, *tmpblock)
  1844. }
  1845. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1846. }
  1847. //assemble all KV pairs into a single string to store
  1848. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1849. if jv == nil {
  1850. continue
  1851. }
  1852. for jv_k, jv_v := range jv.KvTags {
  1853. for _, jv_vv := range jv_v {
  1854. kvtext.WriteString(jv_k)
  1855. kvtext.WriteString(":")
  1856. kvtext.WriteString(jv_vv.Value)
  1857. kvtext.WriteString(" ")
  1858. }
  1859. }
  1860. }
  1861. }
  1862. if kvtext.Len() > 0 {
  1863. tmp["kvtext"] = kvtext.String()
  1864. }
  1865. if len(blocks) > 0 {
  1866. if blocksBytes, err := json.Marshal(blocks); err == nil {
  // Serialised blocks of 100000 runes or more are not stored.
  1867. if utf8.RuneCount(blocksBytes) < 100000 {
  1868. tmp["blocks"] = string(blocksBytes)
  1869. }
  1870. }
  1871. }
  1872. // fmt.Println("=============extraction result================")
  1873. // for k, v := range tmp {
  1874. // qu.Debug(k, "---", v)
  1875. // }
  1876. //tmp["extract_content"] = j.Content
  1877. if e.TaskInfo.TestColl == "" {
  1878. if len(tmp) > 0 { //save the extraction result
  1879. /* if len(e.SiteFields) <= 0 {
  1880. //for field, _ := range e.Fields {
  1881. // if tmp[field] == nil && {
  1882. // tmp[field] = "" //overwrite data from earlier versions
  1883. // }
  1884. //}
  1885. } else {
  1886. //for field, _ := range e.SiteFields {
  1887. // if tmp[field] == nil &&{
  1888. // tmp[field] = "" //overwrite data from earlier versions
  1889. // }
  1890. //}
  1891. }*/
  1892. tmp["repeat"] = 0
  1893. tmparr := []map[string]interface{}{
  1894. map[string]interface{}{
  1895. "_id": qu.StringTOBsonId(_id),
  1896. },
  1897. map[string]interface{}{"$set": tmp},
  1898. }
  1899. e.RWMutex.Lock()
  1900. e.BidArr = append(e.BidArr, tmparr)
  1901. e.BidTotal++
  1902. e.RWMutex.Unlock()
  1903. }
  // NOTE(review): tmp is the same map already referenced by the "$set"
  // document appended to e.BidArr above, so the result/resultf additions and
  // the _id deletion below are visible through that entry as well — confirm
  // this sharing is intended.
  1904. if ju.SaveResult {
  1905. id := tmp["_id"]
  1906. tmp["result"] = result
  1907. tmp["resultf"] = resultf
  1908. delete(tmp, "_id")
  1909. tmparr := []map[string]interface{}{
  1910. map[string]interface{}{
  1911. "_id": id,
  1912. },
  1913. map[string]interface{}{"$set": tmp},
  1914. }
  1915. e.RWMutex.Lock()
  1916. e.ResultArr = append(e.ResultArr, tmparr)
  1917. e.RWMutex.Unlock()
  1918. }
  1919. } else { //test result
  1920. delete(tmp, "_id")
  1921. delete(tmp, "fieldall")
  1922. if len(j.BlockPackage) > 0 { //package details
  1923. if len(j.BlockPackage) > 10 {
  1924. tmp["epackage"] = "分包异常"
  1925. } else {
  1926. bs, _ := json.Marshal(j.BlockPackage)
  1927. tmp["epackage"] = string(bs)
  1928. }
  1929. }
  1930. tmp["result"] = result
  1931. tmp["resultf"] = resultf
  1932. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1933. if !b {
  1934. log.Debug(e.TaskInfo.TestColl, _id)
  1935. }
  1936. }
  1937. }, func(err interface{}) {
  1938. log.Debug("AnalysisSaveResult err", err)
  1939. })
  1940. }
  1941. //保存其他
  1942. //kv、表格、块上的标签凡是新的标签都入库
  1943. //val type times firstid createtime 判定field
  1944. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1945. now := time.Now().Unix()
  1946. coll := e.TaskInfo.TestColl
  1947. if coll == "" {
  1948. coll = "extract_tag_result"
  1949. } else {
  1950. coll += "_tag"
  1951. }
  1952. datas := []map[string]interface{}{}
  1953. kv := map[string]int{}
  1954. for _, v := range j.Block {
  1955. //
  1956. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1957. if vv == nil || vv.KvTags == nil {
  1958. continue
  1959. }
  1960. for kkk, vvv := range vv.KvTags {
  1961. for _, vvvv := range vvv {
  1962. if vvvv.IsInvalid {
  1963. kv[kkk] = kv[kkk] + 1
  1964. break
  1965. }
  1966. }
  1967. }
  1968. }
  1969. for _, vv := range v.NotClassifyTitles {
  1970. datas = append(datas, map[string]interface{}{
  1971. "val": vv,
  1972. "times": 0,
  1973. "type": "block",
  1974. "firstid": j.SourceMid,
  1975. "createtime": now,
  1976. })
  1977. if len(datas) == saveLimit {
  1978. db.Mgo.SaveBulk(coll, datas...)
  1979. datas = []map[string]interface{}{}
  1980. }
  1981. }
  1982. }
  1983. for k, v := range kv {
  1984. datas = append(datas, map[string]interface{}{
  1985. "val": k,
  1986. "times": v,
  1987. "type": "kv",
  1988. "firstid": j.SourceMid,
  1989. "createtime": now,
  1990. })
  1991. if len(datas) == saveLimit {
  1992. db.Mgo.SaveBulk(coll, datas...)
  1993. datas = []map[string]interface{}{}
  1994. }
  1995. }
  1996. if len(datas) > 0 {
  1997. db.Mgo.SaveBulk(coll, datas...)
  1998. }
  1999. }
  2000. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2001. if j == nil {
  2002. return nil
  2003. }
  2004. if len(j.Block) > 0 {
  2005. for i, v := range j.Block {
  2006. rangetmp := new(ju.TmpBlock)
  2007. vb, _ := json.Marshal(v)
  2008. json.Unmarshal(vb, &rangetmp)
  2009. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2010. }
  2011. }
  2012. if j.ColonKV != nil {
  2013. cb, _ := json.Marshal(j.ColonKV)
  2014. tmpblock.ColonKV = string(cb)
  2015. }
  2016. if j.SpaceKV != nil {
  2017. sb, _ := json.Marshal(j.SpaceKV)
  2018. tmpblock.SpaceKV = string(sb)
  2019. }
  2020. if j.TableKV != nil {
  2021. tb, _ := json.Marshal(j.TableKV)
  2022. tmpblock.TableKV = string(tb)
  2023. }
  2024. return &tmpblock
  2025. }
  2026. //去重冗余字段
  2027. func delFiled(k string) bool {
  2028. return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2029. }
  2030. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2031. defer qu.Catch()
  2032. doc := j.Data
  2033. result := j.Result
  2034. _id := qu.BsonIdToSId((*doc)["_id"])
  2035. result = ScoreFields(j, e.Tag) //正负面词打分
  2036. //结果排序
  2037. for _, val := range result {
  2038. ju.Sort(val)
  2039. }
  2040. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2041. //jsondata清理
  2042. clearJd(j.Jsondata, e)
  2043. marshalbt, _ := json.Marshal(j.Jsondata)
  2044. tmpjddata := make(map[string]interface{})
  2045. json.Unmarshal(marshalbt, &tmpjddata)
  2046. for _, jdkey := range ju.JsonData {
  2047. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2048. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2049. if jdkey == "budget" || jdkey == "bidamount" {
  2050. lockclear.Lock()
  2051. cfn := e.ClearFn[jdkey]
  2052. lockclear.Unlock()
  2053. if len(cfn) == 0 {
  2054. continue
  2055. }
  2056. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
  2057. if tmpv.Value == newNum[0] {
  2058. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2059. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2060. ju.Sort(j.Result[jdkey])
  2061. delete((*j.Jsondata), jdkey)
  2062. break
  2063. }
  2064. } else {
  2065. if (*j.Jsondata)[jdkey] == tmpv.Value {
  2066. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2067. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2068. ju.Sort(j.Result[jdkey])
  2069. delete((*j.Jsondata), jdkey)
  2070. break
  2071. }
  2072. }
  2073. }
  2074. }
  2075. }
  2076. if len(*j.Jsondata) > 0 {
  2077. j.Result = JsonDataMergeProcessing(j, e)
  2078. }
  2079. j.Jsondata = &tmpjddata
  2080. }
  2081. return doc, result, _id
  2082. }
  2083. //辅助信息,如果没有排序先排序
  2084. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2085. fieldalls := map[string][]map[string]interface{}{}
  2086. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2087. defer qykredis.Close()
  2088. db := 0
  2089. for field, val := range j.Result {
  2090. //ju.Sort(val)
  2091. if field == "buyer" {
  2092. db = ju.BuyerDB
  2093. } else if field == "winner" {
  2094. db = ju.WinnerDB
  2095. } else if field == "agency" {
  2096. db = ju.AgencyDB
  2097. }
  2098. sfields := []map[string]interface{}{}
  2099. for _, v := range val {
  2100. standardized := false
  2101. if _, err := qykredis.Do("SELECT", db); err != nil {
  2102. fmt.Println("redis select err", err)
  2103. } else {
  2104. rep, err := qykredis.Do("GET", v.Value)
  2105. if rep != nil && err == nil {
  2106. standardized = true
  2107. }
  2108. }
  2109. if field == "budget" || field == "bidamount" {
  2110. if !v.IsTrue {
  2111. continue
  2112. }
  2113. }
  2114. sfield := map[string]interface{}{
  2115. "val": v.Value,
  2116. "type": v.Type,
  2117. "score": v.Score,
  2118. "blocktag": v.BlockTag,
  2119. "sourceval": v.SourceValue,
  2120. "standardized": standardized,
  2121. }
  2122. sfields = append(sfields, sfield)
  2123. }
  2124. fieldalls[field] = sfields
  2125. }
  2126. return fieldalls
  2127. }
  2128. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2129. defer qu.Catch()
  2130. //获取审核字段
  2131. for _, field := range e.AuditFields {
  2132. //1.分包
  2133. if resulttmp["package"] != nil {
  2134. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2135. for _, val := range packagedata {
  2136. if val[field] != nil {
  2137. fv := qu.ObjToString(val[field])
  2138. if fv != "" {
  2139. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2140. e.RedisMatch(field, fv, val) //redis匹配
  2141. } else { //除了buyer和winner,其他字段走规则匹配
  2142. e.RuleMatch(field, fv, val)
  2143. }
  2144. }
  2145. }
  2146. }
  2147. }
  2148. //2.外围
  2149. if resulttmp[field] != nil {
  2150. fv := qu.ObjToString(resulttmp[field])
  2151. if fv != "" {
  2152. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2153. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2154. } else { //除了buyer和winner,其他字段走规则匹配
  2155. e.RuleMatch(field, fv, resulttmp)
  2156. }
  2157. }
  2158. }
  2159. }
  2160. }
  2161. //Redis匹配
  2162. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2163. defer qu.Catch()
  2164. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2165. if i == 0 { //reids未找到,执行规则匹配
  2166. val[field+"_isredis"] = false
  2167. e.RuleMatch(field, fv, val) //规则匹配
  2168. } else { //redis找到,打标识存库
  2169. val[field+"_isredis"] = true
  2170. }
  2171. }
  2172. //规则匹配
  2173. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2174. defer qu.Catch()
  2175. if fieldval != "" {
  2176. SMap := e.StartMatch(field, fieldval)
  2177. //SMap.AddKey(field+"_isaudit", false)
  2178. for _, k := range SMap.Keys {
  2179. tmpMap[k] = SMap.Map[k]
  2180. }
  2181. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2182. }
  2183. }
  2184. //开始规则匹配
  2185. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2186. defer qu.Catch()
  2187. SMap := pretreated.NewSortMap()
  2188. lock.Lock()
  2189. f := e.RecogFieldMap[field]
  2190. lock.Unlock()
  2191. if len(f) > 0 {
  2192. fid := qu.BsonIdToSId(f["_id"])
  2193. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2194. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2195. if textAfterRecogFieldPrerule != "" {
  2196. lock.Lock()
  2197. classMap := e.FidClassMap[fid]
  2198. lock.Unlock()
  2199. L:
  2200. for _, c := range classMap { //class
  2201. classid := qu.BsonIdToSId(c["_id"])
  2202. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2203. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2204. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2205. if textAfterClassPrerule != "" {
  2206. lock.Lock()
  2207. ruleMap := e.CidRuleMap[classid]
  2208. lock.Unlock()
  2209. for _, r := range ruleMap { //rule
  2210. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2211. s_name := qu.ObjToString(r["s_name"])
  2212. rule := r["rule"].([]interface{})
  2213. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2214. if textAfterRulePrerule != "" {
  2215. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2216. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2217. if savefield != "" { //保存字段不为空,存储代码信息
  2218. SMap.AddKey(field+"_"+savefield, s_name)
  2219. }
  2220. break L
  2221. }
  2222. }
  2223. }
  2224. }
  2225. }
  2226. }
  2227. }
  2228. return SMap
  2229. }
  2230. //中标候选人经过清理之后,重新取出赋值
  2231. func resetWinnerorder(j *ju.Job) {
  2232. if len(j.Winnerorder) == 0 {
  2233. return
  2234. }
  2235. maxlen := len(j.Winnerorder) - 1
  2236. //中标单位
  2237. //i := 0
  2238. winners := []*ju.ExtField{}
  2239. bidamounts := []*ju.ExtField{}
  2240. //for _, v := range j.Result["winner"] {
  2241. // if v.Code == "winnerorder" {
  2242. // if maxlen < i {
  2243. // continue
  2244. // }
  2245. // j.Winnerorder[i]["entname"] = v.Value
  2246. // i++
  2247. // } else {
  2248. // winners = append(winners, v)
  2249. // }
  2250. //}
  2251. if maxlen > 0 {
  2252. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2253. if j.Winnerorder[0]["price"] != nil {
  2254. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
  2255. if tmpPrice[len(tmpPrice)-1].(bool) {
  2256. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
  2257. }
  2258. }
  2259. }
  2260. if j.Result["winner"] == nil && len(winners) > 0 {
  2261. j.Result["winner"] = winners
  2262. } else if len(winners) > 0 {
  2263. j.Result["winner"] = append(j.Result["winner"], winners...)
  2264. }
  2265. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2266. j.Result["bidamount"] = bidamounts
  2267. } else if len(bidamounts) > 0 {
  2268. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2269. }
  2270. //j.Result["winner"] = winners
  2271. //中标金额
  2272. //i = 0
  2273. //bidamounts := []*ju.ExtField{}
  2274. //for _, v := range j.Result["bidamount"] {
  2275. // if v.Code == "winnerorder" {
  2276. // if maxlen < i {
  2277. // continue
  2278. // }
  2279. // j.Winnerorder[i]["price"] = v.Value
  2280. // i++
  2281. // } else {
  2282. // bidamounts = append(bidamounts, v)
  2283. // }
  2284. //}
  2285. //j.Result["bidamount"] = bidamounts
  2286. }
  2287. func RemoveReplicaSliceString(slc []string) []string {
  2288. result := make([]string, 0)
  2289. tempMap := make(map[string]bool, len(slc))
  2290. for _, e := range slc {
  2291. if tempMap[e] == false {
  2292. tempMap[e] = true
  2293. result = append(result, e)
  2294. }
  2295. }
  2296. return result
  2297. }