extract.go 42 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. log "github.com/donnie4w/go-logger/logger"
  19. "gopkg.in/mgo.v2/bson"
  20. )
  21. var (
  22. lock, lockrule, lockclear sync.RWMutex
  23. cut = ju.NewCut() //获取正文并清理
  24. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  25. TaskList map[string]*ExtractTask //任务列表
  26. ClearTaskList map[string]*ClearTask //清理任务列表
  27. saveLimit = 200 //抽取日志批量保存
  28. PageSize = 5000 //查询分页
  29. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
  30. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  31. )
  32. //启动测试抽取
  33. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  34. defer qu.Catch()
  35. ext := &ExtractTask{}
  36. ext.Id = taskId
  37. ext.IsRun = true
  38. ext.InitTestTaskInfo(resultcoll, trackcoll)
  39. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  40. ext.InitRulePres()
  41. ext.InitRuleBacks()
  42. ext.InitRuleCore()
  43. ext.InitPkgCore()
  44. ext.InitBlockRule()
  45. ext.InfoTypeList()
  46. ext.InitTag()
  47. ext.InitClearFn()
  48. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  49. //初始化城市DFA信息
  50. ext.InitCityDFA()
  51. ext.InitAreaCode()
  52. ext.InitPostCode()
  53. }
  54. //质量审核
  55. ext.InitAuditFields()
  56. ext.InitAuditRule()
  57. ext.InitAuditClass()
  58. ext.InitAuditRecogField()
  59. //品牌抽取是否开启
  60. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  61. //附件抽取是否开启
  62. ext.InitFile()
  63. return RunExtractTestTask(ext, startId, num)
  64. }
  65. func IdTrans(startId string) bson.ObjectId {
  66. defer qu.Catch()
  67. return bson.ObjectIdHex(startId)
  68. }
  69. //开始测试任务抽取
  70. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  71. n, _ := strconv.Atoi(num)
  72. id := IdTrans(startId)
  73. if id.Valid() {
  74. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  75. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  76. for _, v := range *list {
  77. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  78. continue
  79. }
  80. var j, jf *ju.Job
  81. if ext.IsFileField && v["projectinfo"] != nil {
  82. v["isextFile"] = true
  83. j, jf = ext.PreInfo(v)
  84. } else {
  85. j, _ = ext.PreInfo(v)
  86. }
  87. ext.TaskInfo.ProcessPool <- true
  88. go ext.ExtractProcess(j, jf)
  89. }
  90. return true
  91. } else {
  92. return false
  93. }
  94. }
  95. //启动抽取
  96. func StartExtractTaskId(taskId string) bool {
  97. defer qu.Catch()
  98. isgo := false
  99. ext := TaskList[taskId]
  100. if ext == nil {
  101. ext = &ExtractTask{}
  102. ext.Id = taskId
  103. ext.InitTaskInfo()
  104. isgo = true
  105. } else {
  106. ext.Id = taskId
  107. ext.InitTaskInfo()
  108. }
  109. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  110. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  111. ext.InitRulePres()
  112. ext.InitRuleBacks()
  113. ext.InitRuleCore()
  114. ext.InitPkgCore()
  115. ext.InitBlockRule()
  116. ext.InfoTypeList()
  117. ext.InitTag()
  118. ext.InitClearFn()
  119. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  120. //初始化城市DFA信息
  121. ext.InitCityDFA()
  122. ext.InitAreaCode()
  123. ext.InitPostCode()
  124. }
  125. //质量审核
  126. ext.InitAuditFields()
  127. ext.InitAuditRule()
  128. ext.InitAuditClass()
  129. ext.InitAuditRecogField()
  130. //品牌抽取是否开启
  131. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  132. //附件抽取是否开启
  133. ext.InitFile()
  134. ext.IsRun = true
  135. go ext.ResultSave(true)
  136. go ext.BidSave(true)
  137. if isgo {
  138. go RunExtractTask(taskId)
  139. }
  140. TaskList[taskId] = ext
  141. return true
  142. }
  143. //停止抽取
  144. func StopExtractTaskId(taskId string) bool {
  145. defer qu.Catch()
  146. ext := TaskList[taskId]
  147. if ext != nil {
  148. ext.IsRun = false
  149. TaskList[taskId] = ext
  150. }
  151. //更新task.s_extlastid
  152. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  153. return true
  154. }
  155. //开始抽取
  156. func RunExtractTask(taskId string) {
  157. defer qu.Catch()
  158. ext := TaskList[taskId]
  159. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  160. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  161. pageNum := (count + PageSize - 1) / PageSize
  162. limit := PageSize
  163. if count < PageSize {
  164. limit = count
  165. }
  166. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  167. for i := 0; i < pageNum; i++ {
  168. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  169. fmt.Printf("page=%d,query=%v", i+1, query)
  170. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  171. for _, v := range *list {
  172. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  173. continue
  174. }
  175. //根据标题判断是否抽取
  176. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  177. if !b {
  178. continue
  179. }
  180. _id := qu.BsonIdToSId(v["_id"])
  181. //log.Debug(_id)
  182. if !ext.IsRun {
  183. break
  184. }
  185. var j, jf *ju.Job
  186. if ext.IsFileField && v["projectinfo"] != nil {
  187. v["isextFile"] = true
  188. j, jf = ext.PreInfo(v)
  189. } else {
  190. j, _ = ext.PreInfo(v)
  191. }
  192. ext.TaskInfo.ProcessPool <- true
  193. go ext.ExtractProcess(j, jf)
  194. ext.TaskInfo.LastExtId = _id
  195. }
  196. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  197. if !ext.IsRun {
  198. break
  199. }
  200. }
  201. //更新task.s_extlastid
  202. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  203. }
  204. //信息预处理-不和版本关联,取最新版本的配置项
  205. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  206. return (&ExtractTask{}).PreInfo(doc)
  207. }
  208. //信息预处理-和版本关联
  209. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  210. defer qu.Catch()
  211. //判断是否有附件这个字段
  212. var isextFile bool
  213. if doc["isextFile"] != nil {
  214. isextFile = doc["isextFile"].(bool)
  215. }
  216. detail := ""
  217. d1, _ := doc["detail"].(string)
  218. d2, _ := doc["contenthtml"].(string)
  219. if len(d1) >= len(d2) || d2 == "" {
  220. detail = d1
  221. } else {
  222. detail = d2
  223. }
  224. d3, _ := doc["summary"].(string)
  225. detail = ju.CutLableStr(d3 + "\n" + detail)
  226. detail = cut.ClearHtml(d3 + "\n" + detail)
  227. doc["detail"] = detail
  228. if isextFile {
  229. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  230. }
  231. toptype := qu.ObjToString(doc["toptype"])
  232. subtype := qu.ObjToString(doc["subtype"])
  233. if qu.ObjToString(doc["type"]) == "bid" {
  234. toptype = "结果"
  235. }
  236. if toptype == "" {
  237. toptype = "all"
  238. }
  239. if subtype == "" {
  240. subtype = "all"
  241. }
  242. j = &ju.Job{
  243. SourceMid: qu.BsonIdToSId(doc["_id"]),
  244. Category: toptype,
  245. CategorySecond: subtype,
  246. Content: qu.ObjToString(doc["detail"]),
  247. SpiderCode: qu.ObjToString(doc["spidercode"]),
  248. //Domain: qu.ObjToString(doc["domain"]),
  249. //Href: qu.ObjToString(doc["href"]),
  250. Title: qu.ObjToString(doc["title"]),
  251. Data: &doc,
  252. City: qu.ObjToString(doc["city"]),
  253. Province: qu.ObjToString(doc["area"]),
  254. Jsondata: qu.ObjToMap(doc["jsondata"]),
  255. Result: map[string][]*ju.ExtField{},
  256. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  257. RuleBlock: e.RuleBlock,
  258. }
  259. if isextFile {
  260. jf = &ju.Job{
  261. SourceMid: qu.BsonIdToSId(doc["_id"]),
  262. Category: toptype,
  263. Content: qu.ObjToString(doc["detailfile"]),
  264. SpiderCode: qu.ObjToString(doc["spidercode"]),
  265. Title: qu.ObjToString(doc["title"]),
  266. Data: &doc,
  267. City: qu.ObjToString(doc["city"]),
  268. Province: qu.ObjToString(doc["area"]),
  269. Jsondata: qu.ObjToMap(doc["jsondata"]),
  270. Result: map[string][]*ju.ExtField{},
  271. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  272. RuleBlock: e.RuleBlock,
  273. IsFile: isextFile,
  274. }
  275. }
  276. qu.Try(func() {
  277. pretreated.AnalyStart(j) //job.Block分块
  278. if isextFile {
  279. pretreated.AnalyStart(jf)
  280. }
  281. }, func(err interface{}) {
  282. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  283. })
  284. return j, jf
  285. }
  286. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  287. func file2text(doc *map[string]interface{}) {
  288. var strfileinfo bytes.Buffer
  289. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  290. if va, ok := v["attachments"].(map[string]interface{}); ok {
  291. for _, vaatt := range va {
  292. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  293. if qu.ObjToString(fileinfo["content"]) != "" {
  294. switch fileinfo["content"].(type) {
  295. case string:
  296. lock.Lock()
  297. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  298. lock.Unlock()
  299. case []map[string]interface{}:
  300. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  301. if fv["context"] != nil {
  302. lock.Lock()
  303. strfileinfo.WriteString(fv["context"].(string) + " \n")
  304. lock.Unlock()
  305. }
  306. }
  307. }
  308. }
  309. }
  310. }
  311. }
  312. }
  313. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  314. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  315. }
  316. }
  317. //抽取
  318. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  319. e.ExtractDetail(j)
  320. if jf != nil && jf.IsFile {
  321. e.ExtractFile(jf)
  322. }
  323. //分析抽取结果并保存 todo
  324. AnalysisSaveResult(j, jf, e)
  325. <-e.TaskInfo.ProcessPool
  326. }
  327. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  328. qu.Try(func() {
  329. doc := *j.Data
  330. //全局前置规则,结果覆盖doc属性
  331. //for _, v := range e.RulePres {
  332. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  333. //}
  334. tmprules := map[string][]*RuleCore{}
  335. lockrule.Lock()
  336. if j.Category == "all" || j.CategorySecond == "all" {
  337. for k, vc1 := range e.RuleCores["all_all"] {
  338. tmprules[k] = vc1
  339. }
  340. } else {
  341. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  342. tmprules[k] = vc1
  343. }
  344. }
  345. if len(tmprules) < 1 { //分类未覆盖部分
  346. for k, vc1 := range e.RuleCores["all_all"] {
  347. tmprules[k] = vc1
  348. }
  349. }
  350. lockrule.Unlock()
  351. //抽取规则
  352. for _, vc1 := range tmprules {
  353. for _, vc := range vc1 {
  354. tmp := ju.DeepCopy(doc).(map[string]interface{})
  355. //是否进入逻辑
  356. if !ju.Logic(vc.LuaLogic, tmp) {
  357. continue
  358. }
  359. ////抽取-前置规则
  360. //for _, v := range vc.RulePres {
  361. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  362. //}
  363. // log.Debug("抽取-前置规则", tmp)
  364. //抽取-规则
  365. for _, v := range vc.RuleCores {
  366. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  367. }
  368. // log.Debug("抽取-规则", tmp)
  369. //项目名称未能抽取到,标题来凑
  370. if vc.Field == "projectname" {
  371. //if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
  372. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  373. if tmp["blocktag"] != nil {
  374. btag := make(map[string]string)
  375. for k := range tmp["blocktag"].(map[string]bool) {
  376. btag[k] = TagConfigDesc[k]
  377. }
  378. field.BlockTag = btag
  379. }
  380. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  381. //}
  382. }
  383. //抽取-后置规则
  384. for _, v := range vc.RuleBacks {
  385. ExtRegBack(j, v, e.TaskInfo)
  386. }
  387. // log.Debug("抽取-后置规则", tmp)
  388. }
  389. }
  390. //for _, vvc := range j.Result["budget"] {
  391. //log.Debug("-----", fmt.Sprintf("%+v", vvc))
  392. //}
  393. //全局后置规则
  394. for _, v := range e.RuleBacks {
  395. ExtRegBack(j, v, e.TaskInfo)
  396. }
  397. //候选人加入
  398. if len(j.Winnerorder) > 0 {
  399. winner := &ju.ExtField{
  400. Field: "winner",
  401. Code: "",
  402. RuleText: "",
  403. Type: "winnerorder",
  404. MatchType: "winnerorder",
  405. ExtFrom: "",
  406. Value: j.Winnerorder[0]["entname"],
  407. Score: 0,
  408. }
  409. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  410. winner.Score = -5
  411. }
  412. winners := j.Result["winner"]
  413. if winners != nil {
  414. winners = append(winners, winner)
  415. } else {
  416. winners = []*ju.ExtField{}
  417. winners = append(winners, winner)
  418. }
  419. j.Result["winner"] = winners
  420. }
  421. //函数清理
  422. for key, val := range j.Result {
  423. tmpExtFields := make([]*ju.ExtField, 0)
  424. tmpWeight := -999 //记录最大权重
  425. tmpIndex := -999 //记录最大权重下标
  426. for _, v := range val {
  427. lockclear.Lock()
  428. cfn := e.ClearFn[key]
  429. lockclear.Unlock()
  430. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  431. before, _ := v.Value.(string)
  432. v.Value = data[0]
  433. BeforeAddClearFnLog("clearcfn", "函数清理", j.SourceMid, before, "clear_cfn", v, e)
  434. //添加行数清理的日志
  435. //清理特殊符号
  436. lockclear.Lock()
  437. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  438. text := qu.ObjToString(v.Value)
  439. before = text
  440. text = clear.OtherClean(key, text)
  441. if text != "" {
  442. v.Value = text
  443. }
  444. BeforeAddClearFnLog("clearsymbol", "特殊符号清理", j.SourceMid, before, "clear_symbol", v, e)
  445. }
  446. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  447. lockclear.Unlock()
  448. }
  449. //项目编号,采购单位权重清理
  450. if (key == "projectcode" || key == "buyer") && len(val) > 1 {
  451. for i, v := range val {
  452. if v.Weight == 0 {
  453. tmpExtFields = append(tmpExtFields, v)
  454. continue
  455. } else if v.Weight > tmpWeight {
  456. tmpWeight = v.Weight
  457. tmpIndex = i
  458. }
  459. }
  460. if tmpIndex != -999 {
  461. tmpExtFields = append(tmpExtFields, val[tmpIndex])
  462. j.Result[key] = tmpExtFields
  463. }
  464. }
  465. }
  466. PackageDetail(j, e) //处理分包信息
  467. // bs, _ := json.Marshal(j.Result)
  468. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  469. }, func(err interface{}) {
  470. log.Debug("ExtractProcess err", err)
  471. })
  472. }
  473. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  474. qu.Try(func() {
  475. doc := *j.Data
  476. //全局前置规则,结果覆盖doc属性
  477. // for _, v := range e.RulePres {
  478. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  479. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  480. // }
  481. // }
  482. //抽取规则
  483. tmprules := map[string][]*RuleCore{}
  484. lockrule.Lock()
  485. if j.Category == "all" || j.CategorySecond == "all" {
  486. for k, vc1 := range e.RuleCores["all_all"] {
  487. tmprules[k] = vc1
  488. }
  489. } else {
  490. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  491. tmprules[k] = vc1
  492. }
  493. }
  494. lockrule.Unlock()
  495. for _, vc1 := range tmprules {
  496. for _, vc := range vc1 {
  497. tmp := ju.DeepCopy(doc).(map[string]interface{})
  498. //是否进入逻辑
  499. if !ju.Logic(vc.LuaLogic, tmp) {
  500. continue
  501. }
  502. //抽取-前置规则
  503. // for _, v := range vc.RulePres {
  504. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  505. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  506. // }
  507. // }
  508. // log.Debug("抽取-前置规则", tmp)
  509. //抽取-规则
  510. for _, v := range vc.RuleCores {
  511. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  512. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  513. }
  514. }
  515. // log.Debug("抽取-规则", tmp)
  516. //抽取-后置规则
  517. for _, v := range vc.RuleBacks {
  518. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  519. ExtRegBack(j, v, e.TaskInfo)
  520. }
  521. }
  522. // log.Debug("抽取-后置规则", tmp)
  523. }
  524. }
  525. //全局后置规则
  526. for _, v := range e.RuleBacks {
  527. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  528. ExtRegBack(j, v, e.TaskInfo)
  529. }
  530. }
  531. //候选人加入
  532. if len(j.Winnerorder) > 0 {
  533. winner := &ju.ExtField{
  534. Field: "winner",
  535. Code: "",
  536. RuleText: "",
  537. Type: "winnerorder",
  538. MatchType: "winnerorder",
  539. ExtFrom: "",
  540. Value: j.Winnerorder[0]["entname"],
  541. Score: 0,
  542. }
  543. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  544. winner.Score = -5
  545. }
  546. winners := j.Result["winner"]
  547. if winners != nil {
  548. winners = append(winners, winner)
  549. } else {
  550. winners = []*ju.ExtField{}
  551. winners = append(winners, winner)
  552. }
  553. j.Result["winner"] = winners
  554. }
  555. //函数清理
  556. for key, val := range j.Result {
  557. for _, v := range val {
  558. lockclear.Lock()
  559. cfn := e.ClearFn[key]
  560. lockclear.Unlock()
  561. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  562. v.Value = data[0]
  563. //清理特殊符号
  564. lockclear.Lock()
  565. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  566. clear.MesField[key] != nil {
  567. text := qu.ObjToString(v.Value)
  568. text = clear.OtherClean(key, text)
  569. v.Value = text
  570. }
  571. lockclear.Unlock()
  572. }
  573. }
  574. PackageDetail(j, e) //处理分包信息
  575. // bs, _ := json.Marshal(j.Result)
  576. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  577. }, func(err interface{}) {
  578. log.Debug("ExtractProcess err", err)
  579. })
  580. }
  581. //前置过滤
  582. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  583. defer qu.Catch()
  584. before := ju.DeepCopy(doc).(map[string]interface{})
  585. extinfo := map[string]interface{}{}
  586. if in.IsLua {
  587. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  588. if j != nil {
  589. lua.Block = j.Block
  590. }
  591. extinfo = lua.RunScript("pre")
  592. for k, v := range extinfo { //结果覆盖原doc
  593. doc[k] = v
  594. }
  595. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  596. } else {
  597. var key string
  598. if !j.IsFile {
  599. key = qu.If(in.Field == "", "detail", in.Field).(string)
  600. } else {
  601. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  602. }
  603. text := qu.ObjToString(doc[key])
  604. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  605. doc[key] = extinfo[key] //结果覆盖原doc
  606. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  607. }
  608. return doc
  609. }
  610. //抽取-规则
  611. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  612. defer qu.Catch()
  613. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  614. b := IsExtract(in.Field, j.Title, j.Content)
  615. if !b {
  616. return
  617. }
  618. if in.IsLua {
  619. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  620. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  621. lua.Block = j.Block
  622. extinfo := lua.RunScript("core")
  623. for k, v := range extinfo {
  624. if k == in.Field {
  625. if j.Result[k] == nil {
  626. j.Result[k] = [](*ju.ExtField){}
  627. }
  628. if tmps, ok := v.([]map[string]interface{}); ok {
  629. for _, tmp := range tmps {
  630. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
  631. if tmp["blocktag"] != nil {
  632. btag := make(map[string]string)
  633. for k := range tmp["blocktag"].(map[string]bool) {
  634. if TagConfigDesc[k] != "" {
  635. btag[k] = TagConfigDesc[k]
  636. }
  637. }
  638. field.BlockTag = btag
  639. }
  640. j.Result[k] = append(j.Result[k], field)
  641. }
  642. }
  643. }
  644. }
  645. if len(extinfo) > 0 {
  646. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  647. }
  648. } else {
  649. //全文正则
  650. //text := qu.ObjToString(doc[extfrom])
  651. //if in.Field != "" {
  652. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  653. // if len(extinfo) > 0 {
  654. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  655. // }
  656. //}
  657. //块抽取
  658. if in.Field != "" {
  659. if extfrom == "title" {
  660. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
  661. if len(extinfo) > 0 {
  662. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  663. }
  664. } else {
  665. for _, v := range j.Block {
  666. btag := make(map[string]string)
  667. for k := range v.Classify {
  668. btag[k] = TagConfigDesc[k]
  669. }
  670. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
  671. if len(extinfo) > 0 {
  672. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  673. }
  674. }
  675. }
  676. }
  677. }
  678. }
  679. //lua脚本根据属性设置提取kv值
  680. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  681. kvmap := map[string][]map[string]interface{}{}
  682. blocks := []*ju.Block{}
  683. for _, bl := range j.Block {
  684. if len(bl.Block) > 0 {
  685. blocks = append(blocks, bl.Block...)
  686. } else {
  687. blocks = append(blocks, bl)
  688. }
  689. }
  690. for fieldname, field := range in.LFields {
  691. if field != in.Field {
  692. continue
  693. }
  694. for _, bl := range blocks {
  695. tp := ""
  696. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  697. if k == 0 {
  698. tp = "colon"
  699. } else if k == 1 {
  700. tp = "space"
  701. } else if k == 2 {
  702. tp = "table"
  703. }
  704. if v == nil || v.KvTags == nil {
  705. continue
  706. }
  707. for _, vv := range v.KvTags[fieldname] {
  708. text := ju.TrimLRSpace(vv.Value, "")
  709. if text != "" {
  710. kvmap[field] = append(kvmap[field], map[string]interface{}{
  711. "field": field,
  712. "code": in.Code,
  713. "ruletext": vv.Key,
  714. "extfrom": extfrom,
  715. "sourcevalue": text,
  716. "value": text,
  717. "type": tp,
  718. "matchtype": "tag_string",
  719. "blocktag": bl.Classify,
  720. "weight": vv.Weight,
  721. })
  722. }
  723. }
  724. }
  725. }
  726. }
  727. return kvmap
  728. }
  729. //正则提取结果
  730. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  731. defer qu.Catch()
  732. extinfo := map[string][]map[string]interface{}{}
  733. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  734. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  735. if len(apos) > 0 {
  736. pos := apos[0]
  737. for k, p := range v.RegCore.ExtractPos {
  738. if len(pos) > p {
  739. if pos[p] == -1 || pos[p+1] == -1 {
  740. continue
  741. }
  742. val := text[pos[p]:pos[p+1]]
  743. sourcevalue := val
  744. if val == "招标公告" {
  745. return extinfo
  746. }
  747. if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
  748. val = text
  749. }
  750. tmps := []map[string]interface{}{}
  751. tmp := map[string]interface{}{
  752. "field": v.Field,
  753. "code": v.Code,
  754. "ruletext": v.RuleText,
  755. "extfrom": extfrom,
  756. "value": val,
  757. "type": "regexp",
  758. "matchtype": "regcontent",
  759. "blocktag": *tag,
  760. }
  761. tmps = append(tmps, tmp)
  762. extinfo[k] = tmps
  763. if strings.TrimSpace(val) != "" {
  764. if v.RegCore.NumSign == -1 { //正负值修正
  765. val = "-" + val
  766. }
  767. exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
  768. if tmp["blocktag"] != nil {
  769. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  770. }
  771. j.Result[k] = append(j.Result[k], &exfield)
  772. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  773. }
  774. }
  775. }
  776. if len(extinfo) == 0 {
  777. regArr := strings.Split(v.RuleText, "__")
  778. //fmt.Println(regArr[0])
  779. if len(regArr) > 0 {
  780. reg, err := regexp.Compile(regArr[0])
  781. if err == nil {
  782. datavals := reg.FindStringSubmatch(text)
  783. tmps := []map[string]interface{}{}
  784. for _, value := range datavals {
  785. if value == "" {
  786. continue
  787. }
  788. tmp := map[string]interface{}{
  789. "field": v.Field,
  790. "code": v.Code,
  791. "ruletext": regArr[0],
  792. "extfrom": extfrom,
  793. "value": value,
  794. "type": "regexp",
  795. "matchtype": "regcontent",
  796. "blocktag": *tag,
  797. }
  798. tmps = append(tmps, tmp)
  799. extinfo[v.Field] = tmps
  800. exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
  801. if tmp["blocktag"] != nil {
  802. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  803. }
  804. j.Result[v.Field] = append(j.Result[v.Field], &exfield)
  805. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  806. }
  807. }
  808. }
  809. }
  810. }
  811. } else {
  812. pos := v.RegCore.Reg.FindStringIndex(text)
  813. val := ""
  814. if len(pos) == 2 {
  815. text = text[pos[1]:]
  816. rs := regexp.MustCompile("[^\r\n\t]+")
  817. tmp := rs.FindAllString(text, -1)
  818. if len(tmp) > 0 {
  819. val = tmp[0]
  820. }
  821. }
  822. if val != "" {
  823. tmps := []map[string]interface{}{}
  824. tmp := map[string]interface{}{
  825. "field": v.Field,
  826. "code": v.Code,
  827. "ruletext": v.RuleText,
  828. "extfrom": extfrom,
  829. "value": val,
  830. "type": "regexp",
  831. "matchtype": "regcontent",
  832. "blocktag": *tag,
  833. }
  834. tmps = append(tmps, tmp)
  835. extinfo[v.Field] = tmps
  836. if j.Result[v.Field] == nil {
  837. j.Result[v.Field] = [](*ju.ExtField){}
  838. }
  839. field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
  840. if tmp["blocktag"] != nil {
  841. field.BlockTag = tmp["blocktag"].(map[string]string)
  842. }
  843. j.Result[v.Field] = append(j.Result[v.Field], field)
  844. }
  845. }
  846. return extinfo
  847. }
  848. //后置过滤
  849. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  850. defer qu.Catch()
  851. if in.IsLua {
  852. result := GetResultMapForLua(j)
  853. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  854. if j != nil {
  855. lua.Block = j.Block
  856. }
  857. extinfo := lua.RunScript("back")
  858. for k, v := range extinfo {
  859. if tmps, ok := v.([]map[string]interface{}); ok {
  860. j.Result[k] = [](*ju.ExtField){}
  861. for _, tmp := range tmps {
  862. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  863. if tmp["blocktag"] != nil {
  864. field.BlockTag = tmp["blocktag"].(map[string]string)
  865. }
  866. j.Result[k] = append(j.Result[k], field)
  867. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  868. }
  869. }
  870. }
  871. if len(extinfo) > 0 {
  872. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  873. }
  874. } else {
  875. extinfo := map[string]interface{}{}
  876. if in.Field != "" {
  877. if j.Result[in.Field] != nil {
  878. tmp := j.Result[in.Field]
  879. exts := []interface{}{}
  880. for k, v := range tmp {
  881. //table抽取到的数据不清理
  882. // if v.Type == "table" && v.Field != "projectname" {
  883. // continue
  884. // }
  885. text := qu.ObjToString(v.Value)
  886. if text != "" {
  887. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  888. }
  889. j.Result[in.Field][k].Value = text
  890. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  891. continue
  892. }
  893. exts = append(exts, map[string]interface{}{
  894. "field": v.Field,
  895. "code": v.Code,
  896. "ruletext": v.RuleText,
  897. "type": v.Type,
  898. "matchtype": v.MatchType,
  899. "extfrom": v.ExtFrom,
  900. "value": text,
  901. })
  902. }
  903. extinfo[in.Field] = exts
  904. if len(extinfo) > 0 {
  905. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  906. }
  907. }
  908. } else {
  909. for key, tmp := range j.Result {
  910. exts := []interface{}{}
  911. for k, v := range tmp {
  912. if v.Type == "table" { //table抽取到的数据不清理
  913. continue
  914. }
  915. text := qu.ObjToString(v.Value)
  916. if text != "" {
  917. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  918. }
  919. j.Result[key][k].Value = text
  920. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  921. continue
  922. }
  923. exts = append(exts, map[string]interface{}{
  924. "field": v.Field,
  925. "code": v.Code,
  926. "ruletext": v.RuleText,
  927. "type": v.Type,
  928. "matchtype": v.MatchType,
  929. "extfrom": v.ExtFrom,
  930. "value": text,
  931. })
  932. }
  933. extinfo[key] = exts
  934. }
  935. if len(extinfo) > 0 {
  936. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  937. }
  938. }
  939. }
  940. }
  941. //获取抽取结果map[string][]interface{},lua脚本使用
  942. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  943. defer qu.Catch()
  944. result := map[string][]map[string]interface{}{}
  945. for key, val := range j.Result {
  946. if result[key] == nil {
  947. result[key] = []map[string]interface{}{}
  948. }
  949. for _, v := range val {
  950. tmp := map[string]interface{}{
  951. "field": v.Field,
  952. "code": v.Code,
  953. "ruletext": v.RuleText,
  954. "value": v.Value,
  955. "type": v.Type,
  956. "matchtype": v.MatchType,
  957. "extfrom": v.ExtFrom,
  958. }
  959. result[key] = append(result[key], tmp)
  960. }
  961. }
  962. return result
  963. }
  964. //抽取日志
  965. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  966. defer qu.Catch()
  967. if !t.IsEtxLog {
  968. return
  969. }
  970. logdata := map[string]interface{}{
  971. "code": v.Code,
  972. "name": v.Name,
  973. "type": ftype,
  974. "ruletext": v.RuleText,
  975. "islua": v.IsLua,
  976. "field": v.Field,
  977. "version": t.Version,
  978. "taskname": t.Name,
  979. "before": before,
  980. "extinfo": extinfo,
  981. "sid": sid,
  982. "comeintime": time.Now().Unix(),
  983. }
  984. lock.Lock()
  985. ExtLogs[t] = append(ExtLogs[t], logdata)
  986. lock.Unlock()
  987. }
  988. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  989. exts := []map[string]interface{}{}
  990. exts = append(exts, map[string]interface{}{
  991. "field": ext.Field,
  992. "code": ext.Code,
  993. "type": ftype,
  994. "matchtype": matchtype,
  995. "extfrom": ext.ExtFrom,
  996. "value": ext.Value,
  997. })
  998. extinfo := map[string]interface{}{
  999. ext.Field: exts,
  1000. }
  1001. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1002. }
  1003. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1004. defer qu.Catch()
  1005. if !t.IsEtxLog {
  1006. return
  1007. }
  1008. logdata := map[string]interface{}{
  1009. "code": code,
  1010. "name": name,
  1011. "type": ftype,
  1012. "ruletext": "",
  1013. "islua": false,
  1014. "field": field,
  1015. "version": t.Version,
  1016. "taskname": t.Name,
  1017. "before": before,
  1018. "extinfo": extinfo,
  1019. "sid": sid,
  1020. "comeintime": time.Now().Unix(),
  1021. }
  1022. lock.Lock()
  1023. ExtLogs[t] = append(ExtLogs[t], logdata)
  1024. lock.Unlock()
  1025. }
  1026. //保存抽取日志
  1027. func SaveExtLog() {
  1028. defer qu.Catch()
  1029. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1030. lock.Lock()
  1031. tmpLogs = ExtLogs
  1032. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1033. lock.Unlock()
  1034. for k, v := range tmpLogs {
  1035. if len(v) < saveLimit {
  1036. db.Mgo.SaveBulk(k.TrackColl, v...)
  1037. } else {
  1038. for {
  1039. if len(v) > saveLimit {
  1040. tmp := v[:saveLimit]
  1041. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1042. v = v[saveLimit:]
  1043. } else {
  1044. db.Mgo.SaveBulk(k.TrackColl, v...)
  1045. break
  1046. }
  1047. }
  1048. }
  1049. }
  1050. time.AfterFunc(10*time.Second, SaveExtLog)
  1051. }
  1052. type FieldValue struct {
  1053. Value interface{}
  1054. Count int
  1055. }
  1056. //分析抽取结果并保存
  1057. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1058. qu.Try(func() {
  1059. doc, result, _id := funcAnalysis(j)
  1060. if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
  1061. go otherNeedSave(j, result, e)
  1062. }
  1063. auxinfo := auxInfo(j)
  1064. //从排序结果中取值
  1065. tmp := map[string]interface{}{} //抽取值
  1066. tmp["fieldall"] = auxinfo
  1067. for _, val := range result {
  1068. for _, v := range val { //取第一个非负数
  1069. if v.Score > -1 {
  1070. tmp[v.Field] = v.Value
  1071. break
  1072. }
  1073. }
  1074. }
  1075. if len(j.PackageInfo) > 0 { //分包信息
  1076. tmp["package"] = j.PackageInfo
  1077. }
  1078. if len(j.Winnerorder) > 0 { //候选人信息
  1079. tmp["winnerorder"] = j.Winnerorder
  1080. }
  1081. //处理附件
  1082. var resultf map[string][]*ju.ExtField
  1083. if jf != nil {
  1084. _, resultf, _ = funcAnalysis(jf)
  1085. auxinfof := auxInfo(jf)
  1086. tmp["fieldallf"] = auxinfof
  1087. ffield := map[string]interface{}{}
  1088. for _, val := range resultf {
  1089. for _, v := range val { //取第一个非负数
  1090. if v.Score > -1 {
  1091. ffield[v.Field] = v.Value
  1092. break
  1093. }
  1094. }
  1095. }
  1096. if len(jf.PackageInfo) > 0 { //分包信息
  1097. ffield["package"] = jf.PackageInfo
  1098. }
  1099. if len(jf.Winnerorder) > 0 { //候选人信息
  1100. ffield["winnerorder"] = jf.Winnerorder
  1101. }
  1102. tmp["ffield"] = ffield
  1103. }
  1104. for k, v := range *doc {
  1105. //去重冗余字段
  1106. if delFiled(k) {
  1107. continue
  1108. }
  1109. if tmp[k] == nil {
  1110. tmp[k] = v
  1111. }
  1112. }
  1113. //质量审核
  1114. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1115. e.QualityAudit(tmp)
  1116. }
  1117. if e.IsExtractCity { //城市抽取
  1118. e.ExtractCity(j, tmp, _id)
  1119. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1120. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1121. // tmp["district"] = d
  1122. // if b {
  1123. // tmp["city"] = c
  1124. // tmp["area"] = p
  1125. // }
  1126. }
  1127. //品牌抽取
  1128. if ju.IsBrandGoods {
  1129. tmp["checkhas"] = map[string]int{
  1130. "hastable": j.HasTable,
  1131. "hasgoods": j.HasGoods,
  1132. "hasbrand": j.HasBrand,
  1133. "haskey": j.HasKey,
  1134. }
  1135. if len(j.BrandData) > 0 {
  1136. tmp["tablebrand"] = j.BrandData
  1137. }
  1138. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1139. }
  1140. //所有kv组成的字符串
  1141. var kvtext bytes.Buffer
  1142. blocks := make([]ju.BlockAndTag, 0)
  1143. for _, v := range j.Block {
  1144. //分包和标签
  1145. if ju.Config["saveblock"].(bool) {
  1146. xx, _ := json.Marshal(v)
  1147. tmpblock := new(ju.TmpBlock)
  1148. err := json.Unmarshal(xx, &tmpblock)
  1149. if err != nil {
  1150. if v.BPackage != nil {
  1151. bpb, _ := json.Marshal(v.BPackage)
  1152. tmpblock.BPackage = string(bpb)
  1153. }
  1154. tmpblock = rangeBlockToJson(v, *tmpblock)
  1155. }
  1156. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1157. }
  1158. //把所有kv组装成一个字符串,存库
  1159. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1160. if jv == nil {
  1161. continue
  1162. }
  1163. for jv_k, jv_v := range jv.KvTags {
  1164. for _, jv_vv := range jv_v {
  1165. kvtext.WriteString(jv_k)
  1166. kvtext.WriteString(":")
  1167. kvtext.WriteString(jv_vv.Value)
  1168. kvtext.WriteString(" ")
  1169. }
  1170. }
  1171. }
  1172. }
  1173. if kvtext.Len() > 0 {
  1174. tmp["kvtext"] = kvtext.String()
  1175. }
  1176. if len(blocks) > 0 {
  1177. tmp["blocks"] = blocks
  1178. }
  1179. //tmp["extract_content"] = j.Content
  1180. if e.TaskInfo.TestColl == "" {
  1181. if len(tmp) > 0 { //保存抽取结果
  1182. for field, _ := range e.Fields {
  1183. if tmp[field] == nil {
  1184. tmp[field] = "" //覆盖之前版本数据
  1185. }
  1186. }
  1187. tmp["repeat"] = 0
  1188. tmparr := []map[string]interface{}{
  1189. map[string]interface{}{
  1190. "_id": qu.StringTOBsonId(_id),
  1191. },
  1192. map[string]interface{}{"$set": tmp},
  1193. }
  1194. e.BidArr = append(e.BidArr, tmparr)
  1195. e.BidTotal++
  1196. }
  1197. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1198. id := tmp["_id"]
  1199. tmp["result"] = result
  1200. tmp["resultf"] = resultf
  1201. delete(tmp, "_id")
  1202. tmparr := []map[string]interface{}{
  1203. map[string]interface{}{
  1204. "_id": id,
  1205. },
  1206. map[string]interface{}{"$set": tmp},
  1207. }
  1208. e.ResultArr = append(e.ResultArr, tmparr)
  1209. }
  1210. } else { //测试结果
  1211. delete(tmp, "_id")
  1212. if len(j.BlockPackage) > 0 { //分包详情
  1213. bs, _ := json.Marshal(j.BlockPackage)
  1214. tmp["epackage"] = string(bs)
  1215. }
  1216. tmp["result"] = result
  1217. tmp["resultf"] = resultf
  1218. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1219. if !b {
  1220. log.Debug(e.TaskInfo.TestColl, _id)
  1221. }
  1222. }
  1223. }, func(err interface{}) {
  1224. log.Debug("AnalysisSaveResult err", err)
  1225. })
  1226. }
  1227. //保存其他
  1228. //kv、表格、块上的标签凡是新的标签都入库
  1229. //val type times firstid createtime 判定field
  1230. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1231. now := time.Now().Unix()
  1232. coll := e.TaskInfo.TestColl
  1233. if coll == "" {
  1234. coll = "extract_tag_result"
  1235. } else {
  1236. coll += "_tag"
  1237. }
  1238. datas := []map[string]interface{}{}
  1239. kv := map[string]int{}
  1240. for _, v := range j.Block {
  1241. //
  1242. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1243. if vv == nil || vv.KvTags == nil {
  1244. continue
  1245. }
  1246. for kkk, vvv := range vv.KvTags {
  1247. for _, vvvv := range vvv {
  1248. if vvvv.IsInvalid {
  1249. kv[kkk] = kv[kkk] + 1
  1250. break
  1251. }
  1252. }
  1253. }
  1254. }
  1255. for _, vv := range v.NotClassifyTitles {
  1256. datas = append(datas, map[string]interface{}{
  1257. "val": vv,
  1258. "times": 0,
  1259. "type": "block",
  1260. "firstid": j.SourceMid,
  1261. "createtime": now,
  1262. })
  1263. if len(datas) == 200 {
  1264. db.Mgo.SaveBulk(coll, datas...)
  1265. datas = []map[string]interface{}{}
  1266. }
  1267. }
  1268. }
  1269. for k, v := range kv {
  1270. datas = append(datas, map[string]interface{}{
  1271. "val": k,
  1272. "times": v,
  1273. "type": "kv",
  1274. "firstid": j.SourceMid,
  1275. "createtime": now,
  1276. })
  1277. if len(datas) == 200 {
  1278. db.Mgo.SaveBulk(coll, datas...)
  1279. datas = []map[string]interface{}{}
  1280. }
  1281. }
  1282. if len(datas) > 0 {
  1283. db.Mgo.SaveBulk(coll, datas...)
  1284. }
  1285. }
  1286. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1287. if j == nil {
  1288. return nil
  1289. }
  1290. if len(j.Block) > 0 {
  1291. for i, v := range j.Block {
  1292. rangetmp := new(ju.TmpBlock)
  1293. vb, _ := json.Marshal(v)
  1294. json.Unmarshal(vb, &rangetmp)
  1295. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1296. }
  1297. }
  1298. if j.ColonKV != nil {
  1299. cb, _ := json.Marshal(j.ColonKV)
  1300. tmpblock.ColonKV = string(cb)
  1301. }
  1302. if j.SpaceKV != nil {
  1303. sb, _ := json.Marshal(j.SpaceKV)
  1304. tmpblock.SpaceKV = string(sb)
  1305. }
  1306. if j.TableKV != nil {
  1307. tb, _ := json.Marshal(j.TableKV)
  1308. tmpblock.TableKV = string(tb)
  1309. }
  1310. return &tmpblock
  1311. }
  1312. //去重冗余字段
  1313. func delFiled(k string) bool {
  1314. return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1315. }
  1316. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1317. defer qu.Catch()
  1318. doc := j.Data
  1319. result := j.Result
  1320. _id := qu.BsonIdToSId((*doc)["_id"])
  1321. result = ScoreFields(j)
  1322. //结果排序
  1323. for _, val := range result {
  1324. ju.Sort(val)
  1325. }
  1326. return doc, result, _id
  1327. }
  1328. //辅助信息,如果没有排序先排序
  1329. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1330. fieldalls := map[string][]map[string]interface{}{}
  1331. for field, val := range j.Result {
  1332. //ju.Sort(val)
  1333. sfields := []map[string]interface{}{}
  1334. for _, v := range val {
  1335. standardized := false
  1336. if field == "buyer" || field == "winner" || field == "agency" {
  1337. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1338. if i > 0 {
  1339. standardized = true
  1340. }
  1341. }
  1342. sfield := map[string]interface{}{
  1343. "val": v.Value,
  1344. "type": v.Type,
  1345. "score": v.Score,
  1346. "blocktag": v.BlockTag,
  1347. "sourceval": v.SourceValue,
  1348. "standardized": standardized,
  1349. }
  1350. sfields = append(sfields, sfield)
  1351. }
  1352. fieldalls[field] = sfields
  1353. }
  1354. return fieldalls
  1355. }
  1356. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1357. defer qu.Catch()
  1358. //获取审核字段
  1359. for _, field := range e.AuditFields {
  1360. //1.分包
  1361. if resulttmp["package"] != nil {
  1362. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1363. for _, val := range packagedata {
  1364. if val[field] != nil {
  1365. fv := qu.ObjToString(val[field])
  1366. if fv != "" {
  1367. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1368. e.RedisMatch(field, fv, val) //redis匹配
  1369. } else { //除了buyer和winner,其他字段走规则匹配
  1370. e.RuleMatch(field, fv, val)
  1371. }
  1372. }
  1373. }
  1374. }
  1375. }
  1376. //2.外围
  1377. if resulttmp[field] != nil {
  1378. fv := qu.ObjToString(resulttmp[field])
  1379. if fv != "" {
  1380. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1381. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1382. } else { //除了buyer和winner,其他字段走规则匹配
  1383. e.RuleMatch(field, fv, resulttmp)
  1384. }
  1385. }
  1386. }
  1387. }
  1388. }
  1389. //Redis匹配
  1390. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1391. defer qu.Catch()
  1392. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1393. if i == 0 { //reids未找到,执行规则匹配
  1394. val[field+"_isredis"] = false
  1395. e.RuleMatch(field, fv, val) //规则匹配
  1396. } else { //redis找到,打标识存库
  1397. val[field+"_isredis"] = true
  1398. }
  1399. }
  1400. //规则匹配
  1401. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1402. defer qu.Catch()
  1403. if fieldval != "" {
  1404. SMap := e.StartMatch(field, fieldval)
  1405. //SMap.AddKey(field+"_isaudit", false)
  1406. for _, k := range SMap.Keys {
  1407. tmpMap[k] = SMap.Map[k]
  1408. }
  1409. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1410. }
  1411. }
  1412. //开始规则匹配
  1413. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1414. defer qu.Catch()
  1415. SMap := pretreated.NewSortMap()
  1416. lock.Lock()
  1417. f := e.RecogFieldMap[field]
  1418. lock.Unlock()
  1419. if len(f) > 0 {
  1420. fid := qu.BsonIdToSId(f["_id"])
  1421. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1422. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1423. if textAfterRecogFieldPrerule != "" {
  1424. lock.Lock()
  1425. classMap := e.FidClassMap[fid]
  1426. lock.Unlock()
  1427. L:
  1428. for _, c := range classMap { //class
  1429. classid := qu.BsonIdToSId(c["_id"])
  1430. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1431. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1432. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1433. if textAfterClassPrerule != "" {
  1434. lock.Lock()
  1435. ruleMap := e.CidRuleMap[classid]
  1436. lock.Unlock()
  1437. for _, r := range ruleMap { //rule
  1438. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1439. s_name := qu.ObjToString(r["s_name"])
  1440. rule := r["rule"].([]interface{})
  1441. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1442. if textAfterRulePrerule != "" {
  1443. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1444. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1445. if savefield != "" { //保存字段不为空,存储代码信息
  1446. SMap.AddKey(field+"_"+savefield, s_name)
  1447. }
  1448. break L
  1449. }
  1450. }
  1451. }
  1452. }
  1453. }
  1454. }
  1455. }
  1456. return SMap
  1457. }