extract.go 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. log "github.com/donnie4w/go-logger/logger"
  19. "gopkg.in/mgo.v2/bson"
  20. )
// Package-level state shared by the extraction code in this file.
//
// NOTE(review): ExtLogs, TaskList and ClearTaskList are declared without an
// initial value here, so they are nil unless initialised elsewhere; writing to
// a nil map panics. Confirm they are made before first use.
var (
	// lock is taken around the shared rule, tag and clean-function lookups
	// performed by the functions in this file.
	lock sync.RWMutex
	// cut is used by PreInfo (cut.ClearHtml) to fetch and clean the body text.
	cut = ju.NewCut() // fetch the body text and clean it
	// ExtLogs holds the extraction logs per task info.
	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
	// TaskList holds the extraction tasks, keyed by task id.
	TaskList map[string]*ExtractTask // task list
	// ClearTaskList holds the clean-up tasks.
	ClearTaskList map[string]*ClearTask // clean-up task list
	// saveLimit is the batch size for saving extraction logs.
	saveLimit = 200 // extraction logs are saved in batches
	// PageSize is the page size RunExtractTask uses when querying the source collection.
	PageSize = 5000 // query page size
	// Fields is passed as the fourth argument of FDB.Find when loading source
	// documents; presumably the field projection - confirm against Find.
	Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
	// Fields2 is a second field list; it is not referenced in the visible part of this file.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
)
  32. //启动测试抽取
  33. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  34. defer qu.Catch()
  35. ext := &ExtractTask{}
  36. ext.Id = taskId
  37. ext.IsRun = true
  38. ext.InitTestTaskInfo(resultcoll, trackcoll)
  39. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  40. ext.InitRulePres()
  41. ext.InitRuleBacks()
  42. ext.InitRuleCore()
  43. ext.InitPkgCore()
  44. ext.InitBlockRule()
  45. ext.InfoTypeList()
  46. ext.InitTag()
  47. ext.InitClearFn()
  48. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  49. //初始化城市DFA信息
  50. ext.InitCityDFA()
  51. ext.InitAreaCode()
  52. ext.InitPostCode()
  53. }
  54. //质量审核
  55. ext.InitAuditFields()
  56. ext.InitAuditRule()
  57. ext.InitAuditClass()
  58. ext.InitAuditRecogField()
  59. //品牌抽取是否开启
  60. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  61. //附件抽取是否开启
  62. ext.InitFile()
  63. return RunExtractTestTask(ext, startId, num)
  64. }
  65. func IdTrans(startId string) bson.ObjectId {
  66. defer qu.Catch()
  67. return bson.ObjectIdHex(startId)
  68. }
  69. //开始测试任务抽取
  70. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  71. n, _ := strconv.Atoi(num)
  72. id := IdTrans(startId)
  73. if id.Valid() {
  74. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  75. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  76. for _, v := range *list {
  77. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  78. continue
  79. }
  80. var j, jf *ju.Job
  81. if ext.IsFileField && v["projectinfo"] != nil {
  82. v["isextFile"] = true
  83. j, jf = ext.PreInfo(v)
  84. } else {
  85. j, _ = ext.PreInfo(v)
  86. }
  87. ext.TaskInfo.ProcessPool <- true
  88. go ext.ExtractProcess(j, jf)
  89. }
  90. return true
  91. } else {
  92. return false
  93. }
  94. }
  95. //启动抽取
  96. func StartExtractTaskId(taskId string) bool {
  97. defer qu.Catch()
  98. isgo := false
  99. ext := TaskList[taskId]
  100. if ext == nil {
  101. ext = &ExtractTask{}
  102. ext.Id = taskId
  103. ext.InitTaskInfo()
  104. isgo = true
  105. } else {
  106. ext.Id = taskId
  107. ext.InitTaskInfo()
  108. }
  109. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  110. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  111. ext.InitRulePres()
  112. ext.InitRuleBacks()
  113. ext.InitRuleCore()
  114. ext.InitPkgCore()
  115. ext.InitBlockRule()
  116. ext.InfoTypeList()
  117. ext.InitTag()
  118. ext.InitClearFn()
  119. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  120. //初始化城市DFA信息
  121. ext.InitCityDFA()
  122. ext.InitAreaCode()
  123. ext.InitPostCode()
  124. }
  125. //质量审核
  126. ext.InitAuditFields()
  127. ext.InitAuditRule()
  128. ext.InitAuditClass()
  129. ext.InitAuditRecogField()
  130. //品牌抽取是否开启
  131. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  132. //附件抽取是否开启
  133. ext.InitFile()
  134. ext.IsRun = true
  135. go ext.ResultSave(true)
  136. go ext.BidSave(true)
  137. if isgo {
  138. go RunExtractTask(taskId)
  139. }
  140. TaskList[taskId] = ext
  141. return true
  142. }
  143. //停止抽取
  144. func StopExtractTaskId(taskId string) bool {
  145. defer qu.Catch()
  146. ext := TaskList[taskId]
  147. if ext != nil {
  148. ext.IsRun = false
  149. TaskList[taskId] = ext
  150. }
  151. //更新task.s_extlastid
  152. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  153. return true
  154. }
  155. //开始抽取
  156. func RunExtractTask(taskId string) {
  157. defer qu.Catch()
  158. ext := TaskList[taskId]
  159. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  160. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  161. pageNum := (count + PageSize - 1) / PageSize
  162. limit := PageSize
  163. if count < PageSize {
  164. limit = count
  165. }
  166. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  167. for i := 0; i < pageNum; i++ {
  168. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  169. fmt.Printf("page=%d,query=%v", i+1, query)
  170. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  171. for _, v := range *list {
  172. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  173. continue
  174. }
  175. _id := qu.BsonIdToSId(v["_id"])
  176. //log.Debug(_id)
  177. if !ext.IsRun {
  178. break
  179. }
  180. var j, jf *ju.Job
  181. if ext.IsFileField && v["projectinfo"] != nil {
  182. v["isextFile"] = true
  183. j, jf = ext.PreInfo(v)
  184. } else {
  185. j, _ = ext.PreInfo(v)
  186. }
  187. ext.TaskInfo.ProcessPool <- true
  188. go ext.ExtractProcess(j, jf)
  189. ext.TaskInfo.LastExtId = _id
  190. }
  191. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  192. if !ext.IsRun {
  193. break
  194. }
  195. }
  196. //更新task.s_extlastid
  197. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  198. }
  199. //信息预处理-不和版本关联,取最新版本的配置项
  200. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  201. return (&ExtractTask{}).PreInfo(doc)
  202. }
  203. //信息预处理-和版本关联
  204. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  205. defer qu.Catch()
  206. //判断是否有附件这个字段
  207. var isextFile bool
  208. if doc["isextFile"] != nil {
  209. isextFile = doc["isextFile"].(bool)
  210. }
  211. detail := ""
  212. d1, _ := doc["detail"].(string)
  213. d2, _ := doc["contenthtml"].(string)
  214. if len(d1) >= len(d2) || d2 == "" {
  215. detail = d1
  216. } else {
  217. detail = d2
  218. }
  219. detail = ju.CutLableStr(detail)
  220. detail = cut.ClearHtml(detail)
  221. doc["detail"] = detail
  222. if isextFile {
  223. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  224. }
  225. toptype := qu.ObjToString(doc["toptype"])
  226. subtype := qu.ObjToString(doc["subtype"])
  227. if qu.ObjToString(doc["type"]) == "bid" {
  228. toptype = "结果"
  229. }
  230. if toptype == "" {
  231. toptype = "*"
  232. }
  233. j = &ju.Job{
  234. SourceMid: qu.BsonIdToSId(doc["_id"]),
  235. Category: toptype,
  236. CategorySecond: subtype,
  237. Content: qu.ObjToString(doc["detail"]),
  238. SpiderCode: qu.ObjToString(doc["spidercode"]),
  239. //Domain: qu.ObjToString(doc["domain"]),
  240. //Href: qu.ObjToString(doc["href"]),
  241. Title: qu.ObjToString(doc["title"]),
  242. Data: &doc,
  243. City: qu.ObjToString(doc["city"]),
  244. Province: qu.ObjToString(doc["area"]),
  245. Jsondata: qu.ObjToMap(doc["jsondata"]),
  246. Result: map[string][]*ju.ExtField{},
  247. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  248. RuleBlock: e.RuleBlock,
  249. }
  250. if isextFile {
  251. jf = &ju.Job{
  252. SourceMid: qu.BsonIdToSId(doc["_id"]),
  253. Category: toptype,
  254. Content: qu.ObjToString(doc["detailfile"]),
  255. SpiderCode: qu.ObjToString(doc["spidercode"]),
  256. Title: qu.ObjToString(doc["title"]),
  257. Data: &doc,
  258. City: qu.ObjToString(doc["city"]),
  259. Province: qu.ObjToString(doc["area"]),
  260. Jsondata: qu.ObjToMap(doc["jsondata"]),
  261. Result: map[string][]*ju.ExtField{},
  262. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  263. RuleBlock: e.RuleBlock,
  264. IsFile: isextFile,
  265. }
  266. }
  267. qu.Try(func() {
  268. pretreated.AnalyStart(j) //job.Block分块
  269. if isextFile {
  270. pretreated.AnalyStart(jf)
  271. }
  272. }, func(err interface{}) {
  273. log.Debug("pretreated.AnalyStart", err)
  274. })
  275. return j, jf
  276. }
  277. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  278. func file2text(doc *map[string]interface{}) {
  279. var strfileinfo bytes.Buffer
  280. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  281. if va, ok := v["attachments"].(map[string]interface{}); ok {
  282. for _, vaatt := range va {
  283. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  284. if qu.ObjToString(fileinfo["content"]) != "" {
  285. switch fileinfo["content"].(type) {
  286. case string:
  287. lock.Lock()
  288. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  289. lock.Unlock()
  290. case []map[string]interface{}:
  291. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  292. if fv["context"] != nil {
  293. lock.Lock()
  294. strfileinfo.WriteString(fv["context"].(string) + " \n")
  295. lock.Unlock()
  296. }
  297. }
  298. }
  299. }
  300. }
  301. }
  302. }
  303. }
  304. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  305. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  306. }
  307. }
  308. //抽取
  309. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  310. e.ExtractDetail(j)
  311. if jf != nil && jf.IsFile {
  312. e.ExtractFile(jf)
  313. }
  314. //分析抽取结果并保存 todo
  315. AnalysisSaveResult(j, jf, e)
  316. <-e.TaskInfo.ProcessPool
  317. }
  318. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  319. qu.Try(func() {
  320. doc := *j.Data
  321. //全局前置规则,结果覆盖doc属性
  322. //for _, v := range e.RulePres {
  323. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  324. //}
  325. if j.CategorySecond == "" {
  326. //抽取规则
  327. tmprules := map[string][]*RuleCore{}
  328. lock.Lock()
  329. if e.RuleCores[j.Category] == nil {
  330. j.Category = "*_其他"
  331. }
  332. for k, vc1 := range e.RuleCores[j.Category] {
  333. tmprules[k] = vc1
  334. }
  335. lock.Unlock()
  336. for _, vc1 := range tmprules {
  337. for _, vc := range vc1 {
  338. tmp := ju.DeepCopy(doc).(map[string]interface{})
  339. //是否进入逻辑
  340. if !ju.Logic(vc.LuaLogic, tmp) {
  341. continue
  342. }
  343. ////抽取-前置规则
  344. //for _, v := range vc.RulePres {
  345. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  346. //}
  347. // log.Debug("抽取-前置规则", tmp)
  348. //抽取-规则
  349. for _, v := range vc.RuleCores {
  350. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  351. }
  352. // log.Debug("抽取-规则", tmp)
  353. //项目名称未能抽取到,标题来凑
  354. if vc.Field == "projectname" {
  355. if len(j.Result[vc.Field]) < 1 {
  356. items := make([]*ju.ScoreItem, 1)
  357. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  358. field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
  359. if tmp["blocktag"] != nil {
  360. field.BlockTag = tmp["blocktag"].(map[string]bool)
  361. }
  362. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  363. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  364. }
  365. }
  366. //抽取-后置规则
  367. for _, v := range vc.RuleBacks {
  368. ExtRegBack(j, v, e.TaskInfo)
  369. }
  370. // log.Debug("抽取-后置规则", tmp)
  371. }
  372. }
  373. } else {
  374. var cores map[string][]*RuleCore
  375. if e.RuleCores[j.Category+"_"+j.CategorySecond] == nil {
  376. cores = e.RuleCores["*_其他"]
  377. } else {
  378. cores = e.RuleCores[j.Category+"_"+j.CategorySecond]
  379. }
  380. for _, vc1 := range cores {
  381. for _, vc := range vc1 {
  382. tmp := ju.DeepCopy(doc).(map[string]interface{})
  383. //是否进入逻辑
  384. if !ju.Logic(vc.LuaLogic, tmp) {
  385. continue
  386. }
  387. //抽取-前置规则
  388. for _, v := range vc.RulePres {
  389. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  390. }
  391. // log.Debug("抽取-前置规则", tmp)
  392. //抽取-规则
  393. for _, v := range vc.RuleCores {
  394. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  395. }
  396. // log.Debug("抽取-规则", tmp)
  397. //项目名称未能抽取到,标题来凑
  398. if vc.Field == "projectname" {
  399. items := make([]*ju.ScoreItem, 1)
  400. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  401. field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
  402. if len(j.Result[vc.Field]) < 1 {
  403. if tmp["blocktag"] != nil {
  404. field.BlockTag = tmp["blocktag"].(map[string]bool)
  405. }
  406. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  407. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  408. }
  409. }
  410. //抽取-后置规则
  411. for _, v := range vc.RuleBacks {
  412. ExtRegBack(j, v, e.TaskInfo)
  413. }
  414. // log.Debug("抽取-后置规则", tmp)
  415. }
  416. }
  417. }
  418. //全局后置规则
  419. for _, v := range e.RuleBacks {
  420. ExtRegBack(j, v, e.TaskInfo)
  421. }
  422. //候选人加入
  423. if len(j.Winnerorder) > 0 {
  424. winner := &ju.ExtField{
  425. Field: "winner",
  426. Code: "",
  427. RuleText: "",
  428. Type: "winnerorder",
  429. MatchType: "winnerorder",
  430. ExtFrom: "",
  431. Value: j.Winnerorder[0]["entname"],
  432. Score: 0,
  433. }
  434. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  435. winner.Score = -5
  436. }
  437. winners := j.Result["winner"]
  438. if winners != nil {
  439. winners = append(winners, winner)
  440. } else {
  441. winners = []*ju.ExtField{}
  442. winners = append(winners, winner)
  443. }
  444. j.Result["winner"] = winners
  445. }
  446. //函数清理
  447. for key, val := range j.Result {
  448. for _, v := range val {
  449. lock.Lock()
  450. cfn := e.ClearFn[key]
  451. lock.Unlock()
  452. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  453. v.Value = data[0]
  454. //清理特殊符号
  455. lock.Lock()
  456. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  457. clear.MesField[key] != nil {
  458. text := qu.ObjToString(v.Value)
  459. text = clear.OtherClean(key, text)
  460. if text != "" {
  461. v.Value = text
  462. }
  463. }
  464. lock.Unlock()
  465. }
  466. }
  467. PackageDetail(j, e) //处理分包信息
  468. // bs, _ := json.Marshal(j.Result)
  469. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  470. }, func(err interface{}) {
  471. log.Debug("ExtractProcess err", err)
  472. })
  473. }
  474. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  475. qu.Try(func() {
  476. doc := *j.Data
  477. //全局前置规则,结果覆盖doc属性
  478. for _, v := range e.RulePres {
  479. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  480. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  481. }
  482. }
  483. //抽取规则
  484. if j.CategorySecond == "" {
  485. for _, vc1 := range e.RuleCores[j.Category] {
  486. for _, vc := range vc1 {
  487. tmp := ju.DeepCopy(doc).(map[string]interface{})
  488. //是否进入逻辑
  489. if !ju.Logic(vc.LuaLogic, tmp) {
  490. continue
  491. }
  492. //抽取-前置规则
  493. for _, v := range vc.RulePres {
  494. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  495. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  496. }
  497. }
  498. // log.Debug("抽取-前置规则", tmp)
  499. //抽取-规则
  500. for _, v := range vc.RuleCores {
  501. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  502. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  503. }
  504. }
  505. // log.Debug("抽取-规则", tmp)
  506. //抽取-后置规则
  507. for _, v := range vc.RuleBacks {
  508. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  509. ExtRegBack(j, v, e.TaskInfo)
  510. }
  511. }
  512. // log.Debug("抽取-后置规则", tmp)
  513. }
  514. }
  515. } else {
  516. for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  517. for _, vc := range vc1 {
  518. tmp := ju.DeepCopy(doc).(map[string]interface{})
  519. //是否进入逻辑
  520. if !ju.Logic(vc.LuaLogic, tmp) {
  521. continue
  522. }
  523. //抽取-前置规则
  524. for _, v := range vc.RulePres {
  525. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  526. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  527. }
  528. }
  529. // log.Debug("抽取-前置规则", tmp)
  530. //抽取-规则
  531. for _, v := range vc.RuleCores {
  532. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  533. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  534. }
  535. }
  536. // log.Debug("抽取-规则", tmp)
  537. //抽取-后置规则
  538. for _, v := range vc.RuleBacks {
  539. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  540. ExtRegBack(j, v, e.TaskInfo)
  541. }
  542. }
  543. // log.Debug("抽取-后置规则", tmp)
  544. }
  545. }
  546. }
  547. //全局后置规则
  548. for _, v := range e.RuleBacks {
  549. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  550. ExtRegBack(j, v, e.TaskInfo)
  551. }
  552. }
  553. //候选人加入
  554. if len(j.Winnerorder) > 0 {
  555. winner := &ju.ExtField{
  556. Field: "winner",
  557. Code: "",
  558. RuleText: "",
  559. Type: "winnerorder",
  560. MatchType: "winnerorder",
  561. ExtFrom: "",
  562. Value: j.Winnerorder[0]["entname"],
  563. Score: 0,
  564. }
  565. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  566. winner.Score = -5
  567. }
  568. winners := j.Result["winner"]
  569. if winners != nil {
  570. winners = append(winners, winner)
  571. } else {
  572. winners = []*ju.ExtField{}
  573. winners = append(winners, winner)
  574. }
  575. j.Result["winner"] = winners
  576. }
  577. //函数清理
  578. for key, val := range j.Result {
  579. for _, v := range val {
  580. lock.Lock()
  581. cfn := e.ClearFn[key]
  582. lock.Unlock()
  583. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  584. v.Value = data[0]
  585. //清理特殊符号
  586. lock.Lock()
  587. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  588. clear.MesField[key] != nil {
  589. text := qu.ObjToString(v.Value)
  590. text = clear.OtherClean(key, text)
  591. v.Value = text
  592. }
  593. lock.Unlock()
  594. }
  595. }
  596. PackageDetail(j, e) //处理分包信息
  597. // bs, _ := json.Marshal(j.Result)
  598. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  599. }, func(err interface{}) {
  600. log.Debug("ExtractProcess err", err)
  601. })
  602. }
  603. //前置过滤
  604. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  605. defer qu.Catch()
  606. before := ju.DeepCopy(doc).(map[string]interface{})
  607. extinfo := map[string]interface{}{}
  608. if in.IsLua {
  609. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  610. if j != nil {
  611. lua.Block = j.Block
  612. }
  613. extinfo = lua.RunScript("pre")
  614. for k, v := range extinfo { //结果覆盖原doc
  615. doc[k] = v
  616. }
  617. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  618. } else {
  619. var key string
  620. if !j.IsFile {
  621. key = qu.If(in.Field == "", "detail", in.Field).(string)
  622. } else {
  623. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  624. }
  625. text := qu.ObjToString(doc[key])
  626. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  627. doc[key] = extinfo[key] //结果覆盖原doc
  628. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  629. }
  630. return doc
  631. }
  632. //抽取-规则
  633. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  634. defer qu.Catch()
  635. //废标、流标、ppp等跳过
  636. b := IsExtract(in.Field, j.Title, j.Content)
  637. if !b {
  638. return
  639. }
  640. if in.IsLua {
  641. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  642. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  643. lua.Block = j.Block
  644. extinfo := lua.RunScript("core")
  645. for k, v := range extinfo {
  646. if k == in.Field {
  647. if j.Result[k] == nil {
  648. j.Result[k] = [](*ju.ExtField){}
  649. }
  650. if tmps, ok := v.([]map[string]interface{}); ok {
  651. for _, tmp := range tmps {
  652. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
  653. if extfrom == "title" {
  654. field.Score = 4
  655. }
  656. if tmp["blocktag"] != nil {
  657. field.BlockTag = tmp["blocktag"].(map[string]bool)
  658. }
  659. item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
  660. if extfrom == "title" {
  661. item.Score = 4
  662. }
  663. if tmp["scoreitem"] == nil {
  664. scoreItems := make([]*ju.ScoreItem, 0)
  665. scoreItems = append(scoreItems, item)
  666. field.ScoreItem = scoreItems
  667. } else {
  668. field.ScoreItem = append(field.ScoreItem, item)
  669. }
  670. j.Result[k] = append(j.Result[k], field)
  671. }
  672. }
  673. }
  674. }
  675. if len(extinfo) > 0 {
  676. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  677. }
  678. } else {
  679. //全文正则
  680. //text := qu.ObjToString(doc[extfrom])
  681. //if in.Field != "" {
  682. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  683. // if len(extinfo) > 0 {
  684. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  685. // }
  686. //}
  687. //块抽取
  688. if in.Field != "" {
  689. if extfrom == "title" {
  690. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
  691. if len(extinfo) > 0 {
  692. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  693. }
  694. } else {
  695. for _, v := range j.Block {
  696. extinfo := extRegCoreToResult(extfrom, v.Text, &v.Classify, j, in)
  697. if len(extinfo) > 0 {
  698. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  699. }
  700. }
  701. }
  702. }
  703. }
  704. }
  705. //lua脚本根据属性设置提取kv值
  706. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  707. defer qu.Catch()
  708. kvmap := map[string][]map[string]interface{}{}
  709. for fieldname, field := range in.LFields {
  710. lock.Lock()
  711. tags := t[field] //获取对应标签库
  712. lock.Unlock()
  713. if tags == nil {
  714. continue
  715. }
  716. for _, bl := range j.Block {
  717. //冒号kv
  718. if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
  719. kvs := bl.ColonKV.Kvs
  720. kvs2 := bl.ColonKV.Kvs_2
  721. // log.Debug("ColonKV1", kvs)
  722. // log.Debug("ColonKV2", kvs2)
  723. for _, tag := range tags {
  724. for _, kv := range kvs {
  725. if tag.Type == "string" {
  726. if kv.Key == tag.Key {
  727. text := ju.TrimLRSpace(kv.Value, "")
  728. if text != "" {
  729. kvmap[field] = append(kvmap[field], map[string]interface{}{
  730. "field": field,
  731. "code": in.Code,
  732. "ruletext": tag.Key,
  733. "extfrom": extfrom,
  734. "value": text,
  735. "type": "colon1",
  736. "matchtype": "tag_string",
  737. "blocktag": bl.Tag,
  738. })
  739. }
  740. break
  741. }
  742. } else if tag.Type == "regexp" {
  743. if tag.Reg.MatchString(kv.Key) {
  744. text := ju.TrimLRSpace(kv.Value, "")
  745. if text != "" {
  746. kvmap[field] = append(kvmap[field], map[string]interface{}{
  747. "field": field,
  748. "code": in.Code,
  749. "ruletext": tag.Key,
  750. "extfrom": extfrom,
  751. "value": text,
  752. "type": "colon1",
  753. "matchtype": "tag_regexp",
  754. "blocktag": bl.Tag,
  755. })
  756. }
  757. break
  758. }
  759. }
  760. }
  761. for _, kv := range kvs2 {
  762. if tag.Type == "string" {
  763. if kv.Key == tag.Key {
  764. text := ju.TrimLRSpace(kv.Value, "")
  765. if text != "" {
  766. kvmap[field] = append(kvmap[field], map[string]interface{}{
  767. "field": field,
  768. "code": in.Code,
  769. "ruletext": tag.Key,
  770. "extfrom": extfrom,
  771. "value": text,
  772. "type": "colon2",
  773. "matchtype": "tag_string",
  774. "blocktag": bl.Tag,
  775. })
  776. }
  777. break
  778. }
  779. } else if tag.Type == "regexp" {
  780. if tag.Reg.MatchString(kv.Key) {
  781. text := ju.TrimLRSpace(kv.Value, "")
  782. if text != "" {
  783. kvmap[field] = append(kvmap[field], map[string]interface{}{
  784. "field": field,
  785. "code": in.Code,
  786. "ruletext": tag.Key,
  787. "extfrom": extfrom,
  788. "value": text,
  789. "type": "colon2",
  790. "matchtype": "tag_regexp",
  791. "blocktag": bl.Tag,
  792. })
  793. }
  794. break
  795. }
  796. }
  797. }
  798. }
  799. }
  800. //空格kv
  801. if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
  802. kvs := bl.SpaceKV.Kvs
  803. // log.Debug("SpaceKV", kvs)
  804. for _, tag := range tags {
  805. for _, kv := range kvs {
  806. if tag.Type == "string" {
  807. if kv.Key == tag.Key {
  808. text := ju.TrimLRSpace(kv.Value, "")
  809. if text != "" {
  810. kvmap[field] = append(kvmap[field], map[string]interface{}{
  811. "field": field,
  812. "code": in.Code,
  813. "ruletext": tag.Key,
  814. "extfrom": extfrom,
  815. "value": text,
  816. "type": "space",
  817. "matchtype": "tag_string",
  818. "blocktag": bl.Tag,
  819. })
  820. }
  821. break
  822. }
  823. } else if tag.Type == "regexp" {
  824. if tag.Reg.MatchString(kv.Key) {
  825. text := ju.TrimLRSpace(kv.Value, "")
  826. if text != "" {
  827. kvmap[field] = append(kvmap[field], map[string]interface{}{
  828. "field": field,
  829. "code": in.Code,
  830. "ruletext": tag.Key,
  831. "extfrom": extfrom,
  832. "value": text,
  833. "type": "space",
  834. "matchtype": "tag_regexp",
  835. "blocktag": bl.Tag,
  836. })
  837. }
  838. break
  839. }
  840. }
  841. }
  842. }
  843. }
  844. //表格kv
  845. if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
  846. tkv := bl.TableKV
  847. // log.Debug("tkv", tkv)
  848. for k, v := range tkv.Kv {
  849. if k == fieldname {
  850. if len(tags) > -tkv.KvIndex[fieldname] {
  851. ruletext := ""
  852. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  853. ruletext = "项目名称"
  854. } else {
  855. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  856. }
  857. kvmap[field] = append(kvmap[field], map[string]interface{}{
  858. "field": field,
  859. "code": in.Code,
  860. "ruletext": ruletext,
  861. "extfrom": "table",
  862. "value": v,
  863. "type": "table",
  864. "matchtype": "tag_string",
  865. "blocktag": bl.Tag,
  866. })
  867. } else { //涉及其他待处理
  868. // log.Debug(tags)
  869. }
  870. }
  871. }
  872. }
  873. }
  874. }
  875. return kvmap
  876. }
  877. //正则提取结果
  878. func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  879. defer qu.Catch()
  880. extinfo := map[string][]map[string]interface{}{}
  881. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  882. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  883. if len(apos) > 0 {
  884. pos := apos[0]
  885. for k, p := range v.RegCore.ExtractPos {
  886. if len(pos) > p {
  887. if pos[p] == -1 || pos[p+1] == -1 {
  888. continue
  889. }
  890. val := text[pos[p]:pos[p+1]]
  891. if val == "招标公告" {
  892. return extinfo
  893. }
  894. if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
  895. val = text
  896. }
  897. tmps := []map[string]interface{}{}
  898. tmp := map[string]interface{}{
  899. "field": v.Field,
  900. "code": v.Code,
  901. "ruletext": v.RuleText,
  902. "extfrom": extfrom,
  903. "value": val,
  904. "type": "regexp",
  905. "matchtype": "regcontent",
  906. "blocktag": *tag,
  907. }
  908. tmps = append(tmps, tmp)
  909. extinfo[k] = tmps
  910. if strings.TrimSpace(val) != "" {
  911. if v.RegCore.NumSign == -1 { //正负值修正
  912. val = "-" + val
  913. }
  914. exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  915. if extfrom == "title" {
  916. exfield.Score = 4
  917. }
  918. if tmp["blocktag"] != nil {
  919. exfield.BlockTag = tmp["blocktag"].(map[string]bool)
  920. }
  921. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  922. if extfrom == "title" {
  923. item.Score = 4
  924. }
  925. if strings.Contains(val,"\n") {
  926. item.Score -=1
  927. exfield.Score-=1
  928. }
  929. if tmp["scoreitem"] == nil {
  930. sitems := make([]*ju.ScoreItem, 0)
  931. sitems = append(sitems, &item)
  932. exfield.ScoreItem = sitems
  933. } else {
  934. exfield.ScoreItem = append(exfield.ScoreItem, &item)
  935. }
  936. j.Result[k] = append(j.Result[k], &exfield)
  937. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  938. }
  939. }
  940. }
  941. }
  942. } else {
  943. pos := v.RegCore.Reg.FindStringIndex(text)
  944. val := ""
  945. if len(pos) == 2 {
  946. text = text[pos[1]:]
  947. rs := regexp.MustCompile("[^\r\n\t]+")
  948. tmp := rs.FindAllString(text, -1)
  949. if len(tmp) > 0 {
  950. val = tmp[0]
  951. }
  952. }
  953. if val != "" {
  954. tmps := []map[string]interface{}{}
  955. tmp := map[string]interface{}{
  956. "field": v.Field,
  957. "code": v.Code,
  958. "ruletext": v.RuleText,
  959. "extfrom": extfrom,
  960. "value": val,
  961. "type": "regexp",
  962. "matchtype": "regcontent",
  963. "blocktag": *tag,
  964. }
  965. tmps = append(tmps, tmp)
  966. extinfo[v.Field] = tmps
  967. if j.Result[v.Field] == nil {
  968. j.Result[v.Field] = [](*ju.ExtField){}
  969. }
  970. field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  971. if extfrom == "title" {
  972. field.Score = 4
  973. }
  974. if tmp["blocktag"] != nil {
  975. field.BlockTag = tmp["blocktag"].(map[string]bool)
  976. }
  977. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  978. if extfrom == "title" {
  979. item.Score = 4
  980. }
  981. if tmp["scoreitem"] == nil {
  982. sitems := make([]*ju.ScoreItem, 0)
  983. sitems = append(sitems, &item)
  984. field.ScoreItem = sitems
  985. } else {
  986. field.ScoreItem = append(field.ScoreItem, &item)
  987. }
  988. j.Result[v.Field] = append(j.Result[v.Field], field)
  989. }
  990. }
  991. return extinfo
  992. }
  993. //后置过滤
  994. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  995. defer qu.Catch()
  996. if in.IsLua {
  997. result := GetResultMapForLua(j)
  998. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  999. if j != nil {
  1000. lua.Block = j.Block
  1001. }
  1002. extinfo := lua.RunScript("back")
  1003. for k, v := range extinfo {
  1004. if tmps, ok := v.([]map[string]interface{}); ok {
  1005. j.Result[k] = [](*ju.ExtField){}
  1006. for _, tmp := range tmps {
  1007. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  1008. if tmp["blocktag"] != nil {
  1009. field.BlockTag = tmp["blocktag"].(map[string]bool)
  1010. }
  1011. item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  1012. if tmp["scoreitem"] == nil {
  1013. scoreItems := make([]*ju.ScoreItem, 0)
  1014. scoreItems = append(scoreItems, &item)
  1015. field.ScoreItem = scoreItems
  1016. } else {
  1017. field.ScoreItem = append(field.ScoreItem, &item)
  1018. }
  1019. j.Result[k] = append(j.Result[k], field)
  1020. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1021. }
  1022. }
  1023. }
  1024. if len(extinfo) > 0 {
  1025. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1026. }
  1027. } else {
  1028. extinfo := map[string]interface{}{}
  1029. if in.Field != "" {
  1030. if j.Result[in.Field] != nil {
  1031. tmp := j.Result[in.Field]
  1032. exts := []interface{}{}
  1033. for k, v := range tmp {
  1034. //table抽取到的数据不清理
  1035. // if v.Type == "table" && v.Field != "projectname" {
  1036. // continue
  1037. // }
  1038. text := qu.ObjToString(v.Value)
  1039. if text != "" && v.ExtFrom != "title" {
  1040. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1041. }
  1042. j.Result[in.Field][k].Value = text
  1043. exts = append(exts, map[string]interface{}{
  1044. "field": v.Field,
  1045. "code": v.Code,
  1046. "ruletext": v.RuleText,
  1047. "type": v.Type,
  1048. "matchtype": v.MatchType,
  1049. "extfrom": v.ExtFrom,
  1050. "value": text,
  1051. })
  1052. }
  1053. extinfo[in.Field] = exts
  1054. if len(extinfo) > 0 {
  1055. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1056. }
  1057. }
  1058. } else {
  1059. for key, tmp := range j.Result {
  1060. exts := []interface{}{}
  1061. for k, v := range tmp {
  1062. if v.Type == "table" { //table抽取到的数据不清理
  1063. continue
  1064. }
  1065. text := qu.ObjToString(v.Value)
  1066. if text != "" {
  1067. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1068. }
  1069. j.Result[key][k].Value = text
  1070. exts = append(exts, map[string]interface{}{
  1071. "field": v.Field,
  1072. "code": v.Code,
  1073. "ruletext": v.RuleText,
  1074. "type": v.Type,
  1075. "matchtype": v.MatchType,
  1076. "extfrom": v.ExtFrom,
  1077. "value": text,
  1078. })
  1079. }
  1080. extinfo[key] = exts
  1081. }
  1082. if len(extinfo) > 0 {
  1083. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1084. }
  1085. }
  1086. }
  1087. }
  1088. //获取抽取结果map[string][]interface{},lua脚本使用
  1089. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1090. defer qu.Catch()
  1091. result := map[string][]map[string]interface{}{}
  1092. for key, val := range j.Result {
  1093. if result[key] == nil {
  1094. result[key] = []map[string]interface{}{}
  1095. }
  1096. for _, v := range val {
  1097. tmp := map[string]interface{}{
  1098. "field": v.Field,
  1099. "code": v.Code,
  1100. "ruletext": v.RuleText,
  1101. "value": v.Value,
  1102. "type": v.Type,
  1103. "matchtype": v.MatchType,
  1104. "extfrom": v.ExtFrom,
  1105. }
  1106. result[key] = append(result[key], tmp)
  1107. }
  1108. }
  1109. return result
  1110. }
  1111. //抽取日志
  1112. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1113. defer qu.Catch()
  1114. if !t.IsEtxLog {
  1115. return
  1116. }
  1117. logdata := map[string]interface{}{
  1118. "code": v.Code,
  1119. "name": v.Name,
  1120. "type": ftype,
  1121. "ruletext": v.RuleText,
  1122. "islua": v.IsLua,
  1123. "field": v.Field,
  1124. "version": t.Version,
  1125. "taskname": t.Name,
  1126. "before": before,
  1127. "extinfo": extinfo,
  1128. "sid": sid,
  1129. "comeintime": time.Now().Unix(),
  1130. }
  1131. lock.Lock()
  1132. ExtLogs[t] = append(ExtLogs[t], logdata)
  1133. lock.Unlock()
  1134. }
  1135. //保存抽取日志
  1136. func SaveExtLog() {
  1137. defer qu.Catch()
  1138. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1139. lock.Lock()
  1140. tmpLogs = ExtLogs
  1141. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1142. lock.Unlock()
  1143. for k, v := range tmpLogs {
  1144. if len(v) < saveLimit {
  1145. db.Mgo.SaveBulk(k.TrackColl, v...)
  1146. } else {
  1147. for {
  1148. if len(v) > saveLimit {
  1149. tmp := v[:saveLimit]
  1150. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1151. v = v[saveLimit:]
  1152. } else {
  1153. db.Mgo.SaveBulk(k.TrackColl, v...)
  1154. break
  1155. }
  1156. }
  1157. }
  1158. }
  1159. time.AfterFunc(10*time.Second, SaveExtLog)
  1160. }
  1161. type FieldValue struct {
  1162. Value interface{}
  1163. Count int
  1164. }
  1165. //分析抽取结果并保存
  1166. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1167. qu.Try(func() {
  1168. doc, result, _id := funcAnalysis(j)
  1169. auxinfo := auxInfo(j)
  1170. //从排序结果中取值
  1171. tmp := map[string]interface{}{} //抽取值
  1172. tmp["fieldall"] = auxinfo
  1173. for _, val := range result {
  1174. for _, v := range val { //取第一个非负数
  1175. if v.Score > -1 {
  1176. tmp[v.Field] = v.Value
  1177. break
  1178. }
  1179. }
  1180. }
  1181. if len(j.PackageInfo) > 0 { //分包信息
  1182. tmp["package"] = j.PackageInfo
  1183. }
  1184. if len(j.Winnerorder) > 0 { //候选人信息
  1185. tmp["winnerorder"] = j.Winnerorder
  1186. }
  1187. //处理附件
  1188. var resultf map[string][]*ju.ExtField
  1189. if jf != nil {
  1190. _, resultf, _ = funcAnalysis(jf)
  1191. auxinfof := auxInfo(jf)
  1192. tmp["fieldallf"] = auxinfof
  1193. ffield := map[string]interface{}{}
  1194. for _, val := range resultf {
  1195. for _, v := range val { //取第一个非负数
  1196. if v.Score > -1 {
  1197. ffield[v.Field] = v.Value
  1198. break
  1199. }
  1200. }
  1201. }
  1202. if len(jf.PackageInfo) > 0 { //分包信息
  1203. ffield["package"] = jf.PackageInfo
  1204. }
  1205. if len(jf.Winnerorder) > 0 { //候选人信息
  1206. ffield["winnerorder"] = jf.Winnerorder
  1207. }
  1208. tmp["ffield"] = ffield
  1209. }
  1210. for k, v := range *doc {
  1211. //去重冗余字段
  1212. if delFiled(k) {
  1213. continue
  1214. }
  1215. if tmp[k] == nil {
  1216. tmp[k] = v
  1217. }
  1218. }
  1219. //质量审核
  1220. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1221. e.QualityAudit(tmp)
  1222. }
  1223. if e.IsExtractCity { //城市抽取
  1224. e.ExtractCity(j, tmp, _id)
  1225. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1226. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1227. // tmp["district"] = d
  1228. // if b {
  1229. // tmp["city"] = c
  1230. // tmp["area"] = p
  1231. // }
  1232. }
  1233. //品牌抽取
  1234. if ju.IsBrandGoods {
  1235. tmp["checkhas"] = map[string]int{
  1236. "hastable": j.HasTable,
  1237. "hasgoods": j.HasGoods,
  1238. "hasbrand": j.HasBrand,
  1239. "haskey": j.HasKey,
  1240. }
  1241. if len(j.BrandData) > 0 {
  1242. tmp["tablebrand"] = j.BrandData
  1243. }
  1244. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1245. }
  1246. //分包和标签
  1247. if ju.Config["saveblock"].(bool) {
  1248. blocks := make([]ju.BlockAndTag, 0)
  1249. for _, v := range j.Block {
  1250. xx, _ := json.Marshal(v)
  1251. tmpblock := new(ju.TmpBlock)
  1252. err := json.Unmarshal(xx, &tmpblock)
  1253. if err != nil {
  1254. if v.BPackage != nil {
  1255. bpb, _ := json.Marshal(v.BPackage)
  1256. tmpblock.BPackage = string(bpb)
  1257. }
  1258. tmpblock = rangeBlockToJson(v, *tmpblock)
  1259. }
  1260. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1261. }
  1262. tmp["blocks"] = blocks
  1263. }
  1264. if e.TaskInfo.TestColl == "" {
  1265. if len(tmp) > 0 { //保存抽取结果
  1266. for field, _ := range e.Fields {
  1267. if tmp[field] == nil {
  1268. tmp[field] = "" //覆盖之前版本数据
  1269. }
  1270. }
  1271. tmp["repeat"] = 0
  1272. tmparr := []map[string]interface{}{
  1273. map[string]interface{}{
  1274. "_id": qu.StringTOBsonId(_id),
  1275. },
  1276. map[string]interface{}{"$set": tmp},
  1277. }
  1278. e.BidArr = append(e.BidArr, tmparr)
  1279. e.BidTotal++
  1280. }
  1281. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1282. id := tmp["_id"]
  1283. tmp["result"] = result
  1284. tmp["resultf"] = resultf
  1285. delete(tmp, "_id")
  1286. tmparr := []map[string]interface{}{
  1287. map[string]interface{}{
  1288. "_id": id,
  1289. },
  1290. map[string]interface{}{"$set": tmp},
  1291. }
  1292. e.ResultArr = append(e.ResultArr, tmparr)
  1293. }
  1294. } else { //测试结果
  1295. delete(tmp, "_id")
  1296. if len(j.BlockPackage) > 0 { //分包详情
  1297. bs, _ := json.Marshal(j.BlockPackage)
  1298. tmp["epackage"] = string(bs)
  1299. }
  1300. tmp["result"] = result
  1301. tmp["resultf"] = resultf
  1302. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1303. if !b {
  1304. log.Debug(e.TaskInfo.TestColl, _id)
  1305. }
  1306. }
  1307. }, func(err interface{}) {
  1308. log.Debug("AnalysisSaveResult err", err)
  1309. })
  1310. }
  1311. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1312. if j == nil {
  1313. return nil
  1314. }
  1315. if len(j.Block) > 0 {
  1316. for i, v := range j.Block {
  1317. rangetmp := new(ju.TmpBlock)
  1318. vb, _ := json.Marshal(v)
  1319. json.Unmarshal(vb, &rangetmp)
  1320. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1321. }
  1322. }
  1323. if j.ColonKV != nil {
  1324. cb, _ := json.Marshal(j.ColonKV)
  1325. tmpblock.ColonKV = string(cb)
  1326. }
  1327. if j.SpaceKV != nil {
  1328. sb, _ := json.Marshal(j.SpaceKV)
  1329. tmpblock.SpaceKV = string(sb)
  1330. }
  1331. if j.TableKV != nil {
  1332. tb, _ := json.Marshal(j.TableKV)
  1333. tmpblock.TableKV = string(tb)
  1334. }
  1335. return &tmpblock
  1336. }
  1337. //去重冗余字段
  1338. func delFiled(k string) bool {
  1339. return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1340. }
  1341. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1342. defer qu.Catch()
  1343. doc := j.Data
  1344. result := j.Result
  1345. _id := qu.BsonIdToSId((*doc)["_id"])
  1346. result = ScoreFields(j)
  1347. //结果排序
  1348. for _, val := range result {
  1349. ju.Sort(val)
  1350. }
  1351. return doc, result, _id
  1352. }
  1353. //辅助信息,如果没有排序先排序
  1354. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1355. fieldalls := map[string][]map[string]interface{}{}
  1356. for field, val := range j.Result {
  1357. //ju.Sort(val)
  1358. sfields := []map[string]interface{}{}
  1359. for _, v := range val {
  1360. standardized := false
  1361. if field == "buyer" || field == "winner" || field == "agency" {
  1362. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1363. if i > 0 {
  1364. standardized = true
  1365. }
  1366. }
  1367. sfield := map[string]interface{}{
  1368. "val": v.Value,
  1369. "type": v.Type,
  1370. "score": v.Score,
  1371. "blocktag": v.BlockTag,
  1372. "sourceval": v.SourceValue,
  1373. "standardized": standardized,
  1374. }
  1375. sfields = append(sfields, sfield)
  1376. }
  1377. fieldalls[field] = sfields
  1378. }
  1379. return fieldalls
  1380. }
  1381. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1382. defer qu.Catch()
  1383. //获取审核字段
  1384. for _, field := range e.AuditFields {
  1385. //1.分包
  1386. if resulttmp["package"] != nil {
  1387. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1388. for _, val := range packagedata {
  1389. if val[field] != nil {
  1390. fv := qu.ObjToString(val[field])
  1391. if fv != "" {
  1392. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1393. e.RedisMatch(field, fv, val) //redis匹配
  1394. } else { //除了buyer和winner,其他字段走规则匹配
  1395. e.RuleMatch(field, fv, val)
  1396. }
  1397. }
  1398. }
  1399. }
  1400. }
  1401. //2.外围
  1402. if resulttmp[field] != nil {
  1403. fv := qu.ObjToString(resulttmp[field])
  1404. if fv != "" {
  1405. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1406. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1407. } else { //除了buyer和winner,其他字段走规则匹配
  1408. e.RuleMatch(field, fv, resulttmp)
  1409. }
  1410. }
  1411. }
  1412. }
  1413. }
  1414. //Redis匹配
  1415. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1416. defer qu.Catch()
  1417. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1418. if i == 0 { //reids未找到,执行规则匹配
  1419. val[field+"_isredis"] = false
  1420. e.RuleMatch(field, fv, val) //规则匹配
  1421. } else { //redis找到,打标识存库
  1422. val[field+"_isredis"] = true
  1423. }
  1424. }
  1425. //规则匹配
  1426. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1427. defer qu.Catch()
  1428. if fieldval != "" {
  1429. SMap := e.StartMatch(field, fieldval)
  1430. //SMap.AddKey(field+"_isaudit", false)
  1431. for _, k := range SMap.Keys {
  1432. tmpMap[k] = SMap.Map[k]
  1433. }
  1434. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1435. }
  1436. }
  1437. //开始规则匹配
  1438. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1439. defer qu.Catch()
  1440. SMap := pretreated.NewSortMap()
  1441. lock.Lock()
  1442. f := e.RecogFieldMap[field]
  1443. lock.Unlock()
  1444. if len(f) > 0 {
  1445. fid := qu.BsonIdToSId(f["_id"])
  1446. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1447. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1448. if textAfterRecogFieldPrerule != "" {
  1449. lock.Lock()
  1450. classMap := e.FidClassMap[fid]
  1451. lock.Unlock()
  1452. L:
  1453. for _, c := range classMap { //class
  1454. classid := qu.BsonIdToSId(c["_id"])
  1455. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1456. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1457. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1458. if textAfterClassPrerule != "" {
  1459. lock.Lock()
  1460. ruleMap := e.CidRuleMap[classid]
  1461. lock.Unlock()
  1462. for _, r := range ruleMap { //rule
  1463. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1464. s_name := qu.ObjToString(r["s_name"])
  1465. rule := r["rule"].([]interface{})
  1466. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1467. if textAfterRulePrerule != "" {
  1468. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1469. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1470. if savefield != "" { //保存字段不为空,存储代码信息
  1471. SMap.AddKey(field+"_"+savefield, s_name)
  1472. }
  1473. break L
  1474. }
  1475. }
  1476. }
  1477. }
  1478. }
  1479. }
  1480. }
  1481. return SMap
  1482. }