extract.go 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "reflect"
  13. "regexp"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
// Package-level state shared by the extraction tasks in this file.
var (
	lock sync.RWMutex // package-wide mutex; taken around shared map reads/cleaning calls in this file
	cut = ju.NewCut() // used to fetch the body text and clean it
	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by task info
	TaskList map[string]*ExtractTask // extraction tasks, keyed by task id
	ClearTaskList map[string]*ClearTask // cleaning tasks
	saveLimit = 200 // batch size when saving extraction logs
	PageSize = 5000 // query page size
	// Fields is the projection used when loading source documents for extraction.
	Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
	// Fields2 is a narrower projection; it is not referenced in the visible part of this file.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
)
  33. //启动测试抽取
  34. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  35. defer qu.Catch()
  36. ext := &ExtractTask{}
  37. ext.Id = taskId
  38. ext.IsRun = true
  39. ext.InitTestTaskInfo(resultcoll, trackcoll)
  40. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  41. ext.InitRulePres()
  42. ext.InitRuleBacks()
  43. ext.InitRuleCore()
  44. ext.InitPkgCore()
  45. ext.InitBlockRule()
  46. ext.InfoTypeList()
  47. ext.InitTag()
  48. ext.InitClearFn()
  49. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  50. //初始化城市DFA信息
  51. ext.InitCityDFA()
  52. ext.InitAreaCode()
  53. ext.InitPostCode()
  54. }
  55. //质量审核
  56. ext.InitAuditFields()
  57. ext.InitAuditRule()
  58. ext.InitAuditClass()
  59. ext.InitAuditRecogField()
  60. //品牌抽取是否开启
  61. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  62. //附件抽取是否开启
  63. ext.InitFile()
  64. return RunExtractTestTask(ext, startId, num)
  65. }
  66. func IdTrans(startId string) bson.ObjectId {
  67. defer qu.Catch()
  68. return bson.ObjectIdHex(startId)
  69. }
  70. //开始测试任务抽取
  71. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  72. n, _ := strconv.Atoi(num)
  73. id := IdTrans(startId)
  74. if id.Valid() {
  75. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  76. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  77. for _, v := range *list {
  78. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  79. continue
  80. }
  81. var j, jf *ju.Job
  82. if ext.IsFileField && v["projectinfo"] != nil {
  83. v["isextFile"] = true
  84. j, jf = ext.PreInfo(v)
  85. } else {
  86. j, _ = ext.PreInfo(v)
  87. }
  88. ext.TaskInfo.ProcessPool <- true
  89. go ext.ExtractProcess(j, jf)
  90. }
  91. return true
  92. } else {
  93. return false
  94. }
  95. }
  96. //启动抽取
  97. func StartExtractTaskId(taskId string) bool {
  98. defer qu.Catch()
  99. isgo := false
  100. ext := TaskList[taskId]
  101. if ext == nil {
  102. ext = &ExtractTask{}
  103. ext.Id = taskId
  104. ext.InitTaskInfo()
  105. isgo = true
  106. } else {
  107. ext.Id = taskId
  108. ext.InitTaskInfo()
  109. }
  110. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  111. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  112. ext.InitRulePres()
  113. ext.InitRuleBacks()
  114. ext.InitRuleCore()
  115. ext.InitPkgCore()
  116. ext.InitBlockRule()
  117. ext.InfoTypeList()
  118. ext.InitTag()
  119. ext.InitClearFn()
  120. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  121. //初始化城市DFA信息
  122. ext.InitCityDFA()
  123. ext.InitAreaCode()
  124. ext.InitPostCode()
  125. }
  126. //质量审核
  127. ext.InitAuditFields()
  128. ext.InitAuditRule()
  129. ext.InitAuditClass()
  130. ext.InitAuditRecogField()
  131. //品牌抽取是否开启
  132. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  133. //附件抽取是否开启
  134. ext.InitFile()
  135. ext.IsRun = true
  136. go ext.ResultSave(true)
  137. go ext.BidSave(true)
  138. if isgo {
  139. go RunExtractTask(taskId)
  140. }
  141. TaskList[taskId] = ext
  142. return true
  143. }
  144. //停止抽取
  145. func StopExtractTaskId(taskId string) bool {
  146. defer qu.Catch()
  147. ext := TaskList[taskId]
  148. if ext != nil {
  149. ext.IsRun = false
  150. TaskList[taskId] = ext
  151. }
  152. //更新task.s_extlastid
  153. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  154. return true
  155. }
// RunExtractTask runs one extraction pass for the task registered under taskId
// in TaskList and then schedules itself to run again one minute later.
//
// A pass counts the documents whose _id is >= TaskInfo.LastExtId, walks them in
// pages of at most PageSize, launches ExtractProcess in a goroutine for each
// non-sensitive document, advances TaskInfo.LastExtId and stores it in
// task.s_extlastid after every page.
//
// NOTE(review): the cursor query uses $gte, so the document whose id equals
// LastExtId appears to be fetched again on the next page / next pass — confirm
// whether that is intended.
// NOTE(review): the Find result is dereferenced without a nil check.
func RunExtractTask(taskId string) {
	defer qu.Catch()
	ext := TaskList[taskId]
	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
	count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
	// Number of pages, rounded up.
	pageNum := (count + PageSize - 1) / PageSize
	limit := PageSize
	if count < PageSize {
		limit = count
	}
	fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
	for i := 0; i < pageNum; i++ {
		// Rebuild the query from the cursor, which the previous page advanced.
		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
		fmt.Printf("page=%d,query=%v", i+1, query)
		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
		for _, v := range *list {
			if qu.ObjToString(v["sensitive"]) != "" { // drop documents flagged with sensitive words
				continue
			}
			_id := qu.BsonIdToSId(v["_id"])
			//log.Debug(_id)
			// Stop handing out work once the task has been stopped.
			if !ext.IsRun {
				break
			}
			var j, jf *ju.Job
			if ext.IsFileField && v["projectinfo"] != nil {
				// Ask PreInfo to also build the attachment job.
				v["isextFile"] = true
				j, jf = ext.PreInfo(v)
			} else {
				j, _ = ext.PreInfo(v)
			}
			ext.TaskInfo.ProcessPool <- true
			go ext.ExtractProcess(j, jf)
			// Advance the cursor to the document just dispatched.
			ext.TaskInfo.LastExtId = _id
		}
		// update task.s_extlastid
		db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
		if !ext.IsRun {
			break
		}
	}
	// Schedule the next pass in one minute; this happens whether or not the task is running.
	time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
}
  200. //信息预处理-不和版本关联,取最新版本的配置项
  201. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  202. return (&ExtractTask{}).PreInfo(doc)
  203. }
  204. //信息预处理-和版本关联
  205. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
  206. defer qu.Catch()
  207. //判断是否有附件这个字段
  208. var isextFile bool
  209. if doc["isextFile"] != nil {
  210. isextFile = doc["isextFile"].(bool)
  211. }
  212. detail := ""
  213. d1, _ := doc["detail"].(string)
  214. d2, _ := doc["contenthtml"].(string)
  215. if len(d1) >= len(d2) || d2 == "" {
  216. detail = d1
  217. } else {
  218. detail = d2
  219. }
  220. detail = ju.CutLableStr(detail)
  221. detail = cut.ClearHtml(detail)
  222. doc["detail"] = detail
  223. if isextFile {
  224. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  225. }
  226. toptype := qu.ObjToString(doc["toptype"])
  227. subtype := qu.ObjToString(doc["subtype"])
  228. if qu.ObjToString(doc["type"]) == "bid" {
  229. toptype = "结果"
  230. }
  231. if toptype == "" {
  232. toptype = "*"
  233. }
  234. j = &ju.Job{
  235. SourceMid: qu.BsonIdToSId(doc["_id"]),
  236. Category: toptype,
  237. CategorySecond: subtype,
  238. Content: qu.ObjToString(doc["detail"]),
  239. SpiderCode: qu.ObjToString(doc["spidercode"]),
  240. //Domain: qu.ObjToString(doc["domain"]),
  241. //Href: qu.ObjToString(doc["href"]),
  242. Title: qu.ObjToString(doc["title"]),
  243. Data: &doc,
  244. City: qu.ObjToString(doc["city"]),
  245. Province: qu.ObjToString(doc["area"]),
  246. Jsondata: qu.ObjToMap(doc["jsondata"]),
  247. Result: map[string][]*ju.ExtField{},
  248. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  249. RuleBlock: e.RuleBlock,
  250. }
  251. if isextFile {
  252. jf = &ju.Job{
  253. SourceMid: qu.BsonIdToSId(doc["_id"]),
  254. Category: toptype,
  255. Content: qu.ObjToString(doc["detailfile"]),
  256. SpiderCode: qu.ObjToString(doc["spidercode"]),
  257. Title: qu.ObjToString(doc["title"]),
  258. Data: &doc,
  259. City: qu.ObjToString(doc["city"]),
  260. Province: qu.ObjToString(doc["area"]),
  261. Jsondata: qu.ObjToMap(doc["jsondata"]),
  262. Result: map[string][]*ju.ExtField{},
  263. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  264. RuleBlock: e.RuleBlock,
  265. IsFile: isextFile,
  266. }
  267. }
  268. qu.Try(func() {
  269. pretreated.AnalyStart(j) //job.Block分块
  270. if isextFile {
  271. pretreated.AnalyStart(jf)
  272. }
  273. }, func(err interface{}) {
  274. log.Debug("pretreated.AnalyStart", err)
  275. })
  276. return j, jf
  277. }
  278. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  279. func file2text(doc *map[string]interface{}) {
  280. var strfileinfo bytes.Buffer
  281. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  282. if va, ok := v["attachments"].(map[string]interface{}); ok {
  283. for _, vaatt := range va {
  284. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  285. if qu.ObjToString(fileinfo["content"]) != "" {
  286. switch fileinfo["content"].(type) {
  287. case string:
  288. lock.Lock()
  289. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  290. lock.Unlock()
  291. case []map[string]interface{}:
  292. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  293. if fv["context"] != nil {
  294. lock.Lock()
  295. strfileinfo.WriteString(fv["context"].(string) + " \n")
  296. lock.Unlock()
  297. }
  298. }
  299. }
  300. }
  301. }
  302. }
  303. }
  304. }
  305. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  306. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  307. }
  308. }
  309. //抽取
  310. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
  311. e.ExtractDetail(j)
  312. if jf != nil && jf.IsFile {
  313. e.ExtractFile(jf)
  314. }
  315. //分析抽取结果并保存 todo
  316. AnalysisSaveResult(j, jf, e)
  317. <-e.TaskInfo.ProcessPool
  318. }
  319. func (e *ExtractTask) ExtractDetail(j *ju.Job) {
  320. qu.Try(func() {
  321. doc := *j.Data
  322. //全局前置规则,结果覆盖doc属性
  323. //for _, v := range e.RulePres {
  324. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  325. //}
  326. if j.CategorySecond == "" {
  327. //抽取规则
  328. tmprules := map[string][]*RuleCore{}
  329. lock.Lock()
  330. if j.Category == "*"{
  331. j.Category = "*_其他"
  332. }
  333. for k, vc1 := range e.RuleCores[j.Category] {
  334. tmprules[k] = vc1
  335. }
  336. lock.Unlock()
  337. for _, vc1 := range tmprules {
  338. for _, vc := range vc1 {
  339. tmp := ju.DeepCopy(doc).(map[string]interface{})
  340. //是否进入逻辑
  341. if !ju.Logic(vc.LuaLogic, tmp) {
  342. continue
  343. }
  344. ////抽取-前置规则
  345. //for _, v := range vc.RulePres {
  346. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  347. //}
  348. // log.Debug("抽取-前置规则", tmp)
  349. //抽取-规则
  350. for _, v := range vc.RuleCores {
  351. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  352. }
  353. // log.Debug("抽取-规则", tmp)
  354. //项目名称未能抽取到,标题来凑
  355. if vc.Field == "projectname" {
  356. if len(j.Result[vc.Field]) < 1 {
  357. items := make([]*ju.ScoreItem, 1)
  358. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  359. field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
  360. if tmp["blocktag"] != nil {
  361. field.BlockTag = tmp["blocktag"].(map[string]bool)
  362. }
  363. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  364. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  365. }
  366. }
  367. //抽取-后置规则
  368. for _, v := range vc.RuleBacks {
  369. ExtRegBack(j, v, e.TaskInfo)
  370. }
  371. // log.Debug("抽取-后置规则", tmp)
  372. }
  373. }
  374. } else {
  375. for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  376. for _, vc := range vc1 {
  377. tmp := ju.DeepCopy(doc).(map[string]interface{})
  378. //是否进入逻辑
  379. if !ju.Logic(vc.LuaLogic, tmp) {
  380. continue
  381. }
  382. //抽取-前置规则
  383. for _, v := range vc.RulePres {
  384. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  385. }
  386. // log.Debug("抽取-前置规则", tmp)
  387. //抽取-规则
  388. for _, v := range vc.RuleCores {
  389. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  390. }
  391. // log.Debug("抽取-规则", tmp)
  392. //项目名称未能抽取到,标题来凑
  393. if vc.Field == "projectname" {
  394. items := make([]*ju.ScoreItem, 1)
  395. items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
  396. field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
  397. if len(j.Result[vc.Field]) < 1 {
  398. if tmp["blocktag"] != nil {
  399. field.BlockTag = tmp["blocktag"].(map[string]bool)
  400. }
  401. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  402. //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  403. }
  404. }
  405. //抽取-后置规则
  406. for _, v := range vc.RuleBacks {
  407. ExtRegBack(j, v, e.TaskInfo)
  408. }
  409. // log.Debug("抽取-后置规则", tmp)
  410. }
  411. }
  412. }
  413. //全局后置规则
  414. for _, v := range e.RuleBacks {
  415. ExtRegBack(j, v, e.TaskInfo)
  416. }
  417. //候选人加入
  418. if len(j.Winnerorder) > 0 {
  419. winner := &ju.ExtField{
  420. Field: "winner",
  421. Code: "",
  422. RuleText: "",
  423. Type: "winnerorder",
  424. MatchType: "winnerorder",
  425. ExtFrom: "",
  426. Value: j.Winnerorder[0]["entname"],
  427. Score: 0,
  428. }
  429. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  430. winner.Score = -5
  431. }
  432. winners := j.Result["winner"]
  433. if winners != nil {
  434. winners = append(winners, winner)
  435. } else {
  436. winners = []*ju.ExtField{}
  437. winners = append(winners, winner)
  438. }
  439. j.Result["winner"] = winners
  440. }
  441. //函数清理
  442. for key, val := range j.Result {
  443. for _, v := range val {
  444. lock.Lock()
  445. cfn := e.ClearFn[key]
  446. lock.Unlock()
  447. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  448. v.Value = data[0]
  449. //清理特殊符号
  450. lock.Lock()
  451. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  452. clear.MesField[key] != nil {
  453. text := qu.ObjToString(v.Value)
  454. text = clear.OtherClean(key, text)
  455. v.Value = text
  456. }
  457. lock.Unlock()
  458. }
  459. }
  460. PackageDetail(j, e) //处理分包信息
  461. // bs, _ := json.Marshal(j.Result)
  462. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  463. }, func(err interface{}) {
  464. log.Debug("ExtractProcess err", err)
  465. })
  466. }
  467. func (e *ExtractTask) ExtractFile(j *ju.Job) {
  468. qu.Try(func() {
  469. doc := *j.Data
  470. //全局前置规则,结果覆盖doc属性
  471. for _, v := range e.RulePres {
  472. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  473. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  474. }
  475. }
  476. //抽取规则
  477. if j.CategorySecond == "" {
  478. for _, vc1 := range e.RuleCores[j.Category] {
  479. for _, vc := range vc1 {
  480. tmp := ju.DeepCopy(doc).(map[string]interface{})
  481. //是否进入逻辑
  482. if !ju.Logic(vc.LuaLogic, tmp) {
  483. continue
  484. }
  485. //抽取-前置规则
  486. for _, v := range vc.RulePres {
  487. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  488. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  489. }
  490. }
  491. // log.Debug("抽取-前置规则", tmp)
  492. //抽取-规则
  493. for _, v := range vc.RuleCores {
  494. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  495. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  496. }
  497. }
  498. // log.Debug("抽取-规则", tmp)
  499. //抽取-后置规则
  500. for _, v := range vc.RuleBacks {
  501. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  502. ExtRegBack(j, v, e.TaskInfo)
  503. }
  504. }
  505. // log.Debug("抽取-后置规则", tmp)
  506. }
  507. }
  508. } else {
  509. for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  510. for _, vc := range vc1 {
  511. tmp := ju.DeepCopy(doc).(map[string]interface{})
  512. //是否进入逻辑
  513. if !ju.Logic(vc.LuaLogic, tmp) {
  514. continue
  515. }
  516. //抽取-前置规则
  517. for _, v := range vc.RulePres {
  518. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  519. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  520. }
  521. }
  522. // log.Debug("抽取-前置规则", tmp)
  523. //抽取-规则
  524. for _, v := range vc.RuleCores {
  525. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  526. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  527. }
  528. }
  529. // log.Debug("抽取-规则", tmp)
  530. //抽取-后置规则
  531. for _, v := range vc.RuleBacks {
  532. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  533. ExtRegBack(j, v, e.TaskInfo)
  534. }
  535. }
  536. // log.Debug("抽取-后置规则", tmp)
  537. }
  538. }
  539. }
  540. //全局后置规则
  541. for _, v := range e.RuleBacks {
  542. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  543. ExtRegBack(j, v, e.TaskInfo)
  544. }
  545. }
  546. //候选人加入
  547. if len(j.Winnerorder) > 0 {
  548. winner := &ju.ExtField{
  549. Field: "winner",
  550. Code: "",
  551. RuleText: "",
  552. Type: "winnerorder",
  553. MatchType: "winnerorder",
  554. ExtFrom: "",
  555. Value: j.Winnerorder[0]["entname"],
  556. Score: 0,
  557. }
  558. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  559. winner.Score = -5
  560. }
  561. winners := j.Result["winner"]
  562. if winners != nil {
  563. winners = append(winners, winner)
  564. } else {
  565. winners = []*ju.ExtField{}
  566. winners = append(winners, winner)
  567. }
  568. j.Result["winner"] = winners
  569. }
  570. //函数清理
  571. for key, val := range j.Result {
  572. for _, v := range val {
  573. lock.Lock()
  574. cfn := e.ClearFn[key]
  575. lock.Unlock()
  576. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  577. v.Value = data[0]
  578. //清理特殊符号
  579. lock.Lock()
  580. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  581. clear.MesField[key] != nil {
  582. text := qu.ObjToString(v.Value)
  583. text = clear.OtherClean(key, text)
  584. v.Value = text
  585. }
  586. lock.Unlock()
  587. }
  588. }
  589. PackageDetail(j, e) //处理分包信息
  590. // bs, _ := json.Marshal(j.Result)
  591. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  592. }, func(err interface{}) {
  593. log.Debug("ExtractProcess err", err)
  594. })
  595. }
  596. //前置过滤
  597. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  598. defer qu.Catch()
  599. before := ju.DeepCopy(doc).(map[string]interface{})
  600. extinfo := map[string]interface{}{}
  601. if in.IsLua {
  602. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  603. if j != nil {
  604. lua.Block = j.Block
  605. }
  606. extinfo = lua.RunScript("pre")
  607. for k, v := range extinfo { //结果覆盖原doc
  608. doc[k] = v
  609. }
  610. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  611. } else {
  612. var key string
  613. if !j.IsFile {
  614. key = qu.If(in.Field == "", "detail", in.Field).(string)
  615. } else {
  616. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  617. }
  618. text := qu.ObjToString(doc[key])
  619. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  620. doc[key] = extinfo[key] //结果覆盖原doc
  621. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  622. }
  623. return doc
  624. }
  625. //抽取-规则
  626. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  627. defer qu.Catch()
  628. //废标、流标、ppp等跳过
  629. b := IsExtract(in.Field, j.Title, j.Content)
  630. if !b {
  631. return
  632. }
  633. if in.IsLua {
  634. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  635. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  636. lua.Block = j.Block
  637. extinfo := lua.RunScript("core")
  638. for k, v := range extinfo {
  639. if k == in.Field {
  640. if j.Result[k] == nil {
  641. j.Result[k] = [](*ju.ExtField){}
  642. }
  643. if tmps, ok := v.([]map[string]interface{}); ok {
  644. for _, tmp := range tmps {
  645. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], }
  646. if extfrom == "title"{
  647. field.Score = 4
  648. }
  649. if tmp["blocktag"] != nil {
  650. field.BlockTag = tmp["blocktag"].(map[string]bool)
  651. }
  652. item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
  653. if extfrom =="title"{
  654. item.Score = 4
  655. }
  656. if tmp["scoreitem"] == nil {
  657. scoreItems := make([]*ju.ScoreItem, 0)
  658. scoreItems = append(scoreItems, item)
  659. field.ScoreItem = scoreItems
  660. } else {
  661. field.ScoreItem = append(field.ScoreItem, item)
  662. }
  663. j.Result[k] = append(j.Result[k], field)
  664. }
  665. }
  666. }
  667. }
  668. if len(extinfo) > 0 {
  669. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  670. }
  671. } else {
  672. //全文正则
  673. //text := qu.ObjToString(doc[extfrom])
  674. //if in.Field != "" {
  675. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  676. // if len(extinfo) > 0 {
  677. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  678. // }
  679. //}
  680. //块抽取
  681. if in.Field != "" {
  682. if extfrom == "title" {
  683. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
  684. if len(extinfo) > 0 {
  685. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  686. }
  687. } else {
  688. for _, v := range j.Block {
  689. extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
  690. if len(extinfo) > 0 {
  691. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  692. }
  693. }
  694. }
  695. }
  696. }
  697. }
  698. //lua脚本根据属性设置提取kv值
  699. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  700. defer qu.Catch()
  701. kvmap := map[string][]map[string]interface{}{}
  702. for fieldname, field := range in.LFields {
  703. lock.Lock()
  704. tags := t[field] //获取对应标签库
  705. lock.Unlock()
  706. if tags == nil {
  707. continue
  708. }
  709. for _, bl := range j.Block {
  710. //冒号kv
  711. if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
  712. kvs := bl.ColonKV.Kvs
  713. kvs2 := bl.ColonKV.Kvs_2
  714. // log.Debug("ColonKV1", kvs)
  715. // log.Debug("ColonKV2", kvs2)
  716. for _, tag := range tags {
  717. for _, kv := range kvs {
  718. if tag.Type == "string" {
  719. if kv.Key == tag.Key {
  720. text := ju.TrimLRSpace(kv.Value, "")
  721. if text != "" {
  722. kvmap[field] = append(kvmap[field], map[string]interface{}{
  723. "field": field,
  724. "code": in.Code,
  725. "ruletext": tag.Key,
  726. "extfrom": extfrom,
  727. "value": text,
  728. "type": "colon1",
  729. "matchtype": "tag_string",
  730. "blocktag": bl.Tag,
  731. })
  732. }
  733. break
  734. }
  735. } else if tag.Type == "regexp" {
  736. if tag.Reg.MatchString(kv.Key) {
  737. text := ju.TrimLRSpace(kv.Value, "")
  738. if text != "" {
  739. kvmap[field] = append(kvmap[field], map[string]interface{}{
  740. "field": field,
  741. "code": in.Code,
  742. "ruletext": tag.Key,
  743. "extfrom": extfrom,
  744. "value": text,
  745. "type": "colon1",
  746. "matchtype": "tag_regexp",
  747. "blocktag": bl.Tag,
  748. })
  749. }
  750. break
  751. }
  752. }
  753. }
  754. for _, kv := range kvs2 {
  755. if tag.Type == "string" {
  756. if kv.Key == tag.Key {
  757. text := ju.TrimLRSpace(kv.Value, "")
  758. if text != "" {
  759. kvmap[field] = append(kvmap[field], map[string]interface{}{
  760. "field": field,
  761. "code": in.Code,
  762. "ruletext": tag.Key,
  763. "extfrom": extfrom,
  764. "value": text,
  765. "type": "colon2",
  766. "matchtype": "tag_string",
  767. "blocktag": bl.Tag,
  768. })
  769. }
  770. break
  771. }
  772. } else if tag.Type == "regexp" {
  773. if tag.Reg.MatchString(kv.Key) {
  774. text := ju.TrimLRSpace(kv.Value, "")
  775. if text != "" {
  776. kvmap[field] = append(kvmap[field], map[string]interface{}{
  777. "field": field,
  778. "code": in.Code,
  779. "ruletext": tag.Key,
  780. "extfrom": extfrom,
  781. "value": text,
  782. "type": "colon2",
  783. "matchtype": "tag_regexp",
  784. "blocktag": bl.Tag,
  785. })
  786. }
  787. break
  788. }
  789. }
  790. }
  791. }
  792. }
  793. //空格kv
  794. if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
  795. kvs := bl.SpaceKV.Kvs
  796. // log.Debug("SpaceKV", kvs)
  797. for _, tag := range tags {
  798. for _, kv := range kvs {
  799. if tag.Type == "string" {
  800. if kv.Key == tag.Key {
  801. text := ju.TrimLRSpace(kv.Value, "")
  802. if text != "" {
  803. kvmap[field] = append(kvmap[field], map[string]interface{}{
  804. "field": field,
  805. "code": in.Code,
  806. "ruletext": tag.Key,
  807. "extfrom": extfrom,
  808. "value": text,
  809. "type": "space",
  810. "matchtype": "tag_string",
  811. "blocktag": bl.Tag,
  812. })
  813. }
  814. break
  815. }
  816. } else if tag.Type == "regexp" {
  817. if tag.Reg.MatchString(kv.Key) {
  818. text := ju.TrimLRSpace(kv.Value, "")
  819. if text != "" {
  820. kvmap[field] = append(kvmap[field], map[string]interface{}{
  821. "field": field,
  822. "code": in.Code,
  823. "ruletext": tag.Key,
  824. "extfrom": extfrom,
  825. "value": text,
  826. "type": "space",
  827. "matchtype": "tag_regexp",
  828. "blocktag": bl.Tag,
  829. })
  830. }
  831. break
  832. }
  833. }
  834. }
  835. }
  836. }
  837. //表格kv
  838. if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
  839. tkv := bl.TableKV
  840. // log.Debug("tkv", tkv)
  841. for k, v := range tkv.Kv {
  842. if k == fieldname {
  843. if len(tags) > -tkv.KvIndex[fieldname] {
  844. ruletext := ""
  845. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  846. ruletext = "项目名称"
  847. } else {
  848. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  849. }
  850. kvmap[field] = append(kvmap[field], map[string]interface{}{
  851. "field": field,
  852. "code": in.Code,
  853. "ruletext": ruletext,
  854. "extfrom": "table",
  855. "value": v,
  856. "type": "table",
  857. "matchtype": "tag_string",
  858. "blocktag": bl.Tag,
  859. })
  860. } else { //涉及其他待处理
  861. // log.Debug(tags)
  862. }
  863. }
  864. }
  865. }
  866. }
  867. }
  868. return kvmap
  869. }
  870. //正则提取结果
  871. func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  872. defer qu.Catch()
  873. extinfo := map[string][]map[string]interface{}{}
  874. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  875. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  876. if len(apos) > 0 {
  877. pos := apos[0]
  878. for k, p := range v.RegCore.ExtractPos {
  879. if len(pos) > p {
  880. if pos[p] == -1 || pos[p+1] == -1 {
  881. continue
  882. }
  883. val := text[pos[p]:pos[p+1]]
  884. tmps := []map[string]interface{}{}
  885. tmp := map[string]interface{}{
  886. "field": v.Field,
  887. "code": v.Code,
  888. "ruletext": v.RuleText,
  889. "extfrom": extfrom,
  890. "value": val,
  891. "type": "regexp",
  892. "matchtype": "regcontent",
  893. "blocktag": *tag,
  894. }
  895. tmps = append(tmps, tmp)
  896. extinfo[k] = tmps
  897. if strings.TrimSpace(val) != "" {
  898. exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  899. if extfrom == "title"{
  900. exfield.Score = 4
  901. }
  902. if tmp["blocktag"] != nil {
  903. exfield.BlockTag = tmp["blocktag"].(map[string]bool)
  904. }
  905. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  906. if extfrom =="title"{
  907. item.Score = 4
  908. }
  909. if tmp["scoreitem"] == nil {
  910. sitems := make([]*ju.ScoreItem, 0)
  911. sitems = append(sitems, &item)
  912. exfield.ScoreItem = sitems
  913. } else {
  914. exfield.ScoreItem = append(exfield.ScoreItem, &item)
  915. }
  916. j.Result[k] = append(j.Result[k], &exfield)
  917. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  918. }
  919. }
  920. }
  921. }
  922. } else {
  923. pos := v.RegCore.Reg.FindStringIndex(text)
  924. val := ""
  925. if len(pos) == 2 {
  926. text = text[pos[1]:]
  927. rs := regexp.MustCompile("[^\r\n\t]+")
  928. tmp := rs.FindAllString(text, -1)
  929. if len(tmp) > 0 {
  930. val = tmp[0]
  931. }
  932. }
  933. if val != "" {
  934. tmps := []map[string]interface{}{}
  935. tmp := map[string]interface{}{
  936. "field": v.Field,
  937. "code": v.Code,
  938. "ruletext": v.RuleText,
  939. "extfrom": extfrom,
  940. "value": val,
  941. "type": "regexp",
  942. "matchtype": "regcontent",
  943. "blocktag": *tag,
  944. }
  945. tmps = append(tmps, tmp)
  946. extinfo[v.Field] = tmps
  947. if j.Result[v.Field] == nil {
  948. j.Result[v.Field] = [](*ju.ExtField){}
  949. }
  950. field := &ju.ExtField{Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  951. if extfrom == "title"{
  952. field.Score = 4
  953. }
  954. if tmp["blocktag"] != nil {
  955. field.BlockTag = tmp["blocktag"].(map[string]bool)
  956. }
  957. item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
  958. if extfrom =="title"{
  959. item.Score = 4
  960. }
  961. if tmp["scoreitem"] == nil {
  962. sitems := make([]*ju.ScoreItem, 0)
  963. sitems = append(sitems, &item)
  964. field.ScoreItem = sitems
  965. } else {
  966. field.ScoreItem = append(field.ScoreItem, &item)
  967. }
  968. j.Result[v.Field] = append(j.Result[v.Field], field)
  969. }
  970. }
  971. return extinfo
  972. }
  973. //后置过滤
  974. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  975. defer qu.Catch()
  976. if in.IsLua {
  977. result := GetResultMapForLua(j)
  978. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  979. if j != nil {
  980. lua.Block = j.Block
  981. }
  982. extinfo := lua.RunScript("back")
  983. for k, v := range extinfo {
  984. if tmps, ok := v.([]map[string]interface{}); ok {
  985. j.Result[k] = [](*ju.ExtField){}
  986. for _, tmp := range tmps {
  987. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  988. if tmp["blocktag"] != nil {
  989. field.BlockTag = tmp["blocktag"].(map[string]bool)
  990. }
  991. item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
  992. if tmp["scoreitem"] == nil {
  993. scoreItems := make([]*ju.ScoreItem, 0)
  994. scoreItems = append(scoreItems, &item)
  995. field.ScoreItem = scoreItems
  996. } else {
  997. field.ScoreItem = append(field.ScoreItem, &item)
  998. }
  999. j.Result[k] = append(j.Result[k], field)
  1000. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1001. }
  1002. }
  1003. }
  1004. if len(extinfo) > 0 {
  1005. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1006. }
  1007. } else {
  1008. extinfo := map[string]interface{}{}
  1009. if in.Field != "" {
  1010. if j.Result[in.Field] != nil {
  1011. tmp := j.Result[in.Field]
  1012. exts := []interface{}{}
  1013. for k, v := range tmp {
  1014. //table抽取到的数据不清理
  1015. // if v.Type == "table" && v.Field != "projectname" {
  1016. // continue
  1017. // }
  1018. text := qu.ObjToString(v.Value)
  1019. if text != "" {
  1020. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1021. }
  1022. j.Result[in.Field][k].Value = text
  1023. exts = append(exts, map[string]interface{}{
  1024. "field": v.Field,
  1025. "code": v.Code,
  1026. "ruletext": v.RuleText,
  1027. "type": v.Type,
  1028. "matchtype": v.MatchType,
  1029. "extfrom": v.ExtFrom,
  1030. "value": text,
  1031. })
  1032. }
  1033. extinfo[in.Field] = exts
  1034. if len(extinfo) > 0 {
  1035. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1036. }
  1037. }
  1038. } else {
  1039. for key, tmp := range j.Result {
  1040. exts := []interface{}{}
  1041. for k, v := range tmp {
  1042. if v.Type == "table" { //table抽取到的数据不清理
  1043. continue
  1044. }
  1045. text := qu.ObjToString(v.Value)
  1046. if text != "" {
  1047. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1048. }
  1049. j.Result[key][k].Value = text
  1050. exts = append(exts, map[string]interface{}{
  1051. "field": v.Field,
  1052. "code": v.Code,
  1053. "ruletext": v.RuleText,
  1054. "type": v.Type,
  1055. "matchtype": v.MatchType,
  1056. "extfrom": v.ExtFrom,
  1057. "value": text,
  1058. })
  1059. }
  1060. extinfo[key] = exts
  1061. }
  1062. if len(extinfo) > 0 {
  1063. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1064. }
  1065. }
  1066. }
  1067. }
  1068. //获取抽取结果map[string][]interface{},lua脚本使用
  1069. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1070. defer qu.Catch()
  1071. result := map[string][]map[string]interface{}{}
  1072. for key, val := range j.Result {
  1073. if result[key] == nil {
  1074. result[key] = []map[string]interface{}{}
  1075. }
  1076. for _, v := range val {
  1077. tmp := map[string]interface{}{
  1078. "field": v.Field,
  1079. "code": v.Code,
  1080. "ruletext": v.RuleText,
  1081. "value": v.Value,
  1082. "type": v.Type,
  1083. "matchtype": v.MatchType,
  1084. "extfrom": v.ExtFrom,
  1085. }
  1086. result[key] = append(result[key], tmp)
  1087. }
  1088. }
  1089. return result
  1090. }
  1091. //抽取日志
  1092. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1093. defer qu.Catch()
  1094. if !t.IsEtxLog {
  1095. return
  1096. }
  1097. logdata := map[string]interface{}{
  1098. "code": v.Code,
  1099. "name": v.Name,
  1100. "type": ftype,
  1101. "ruletext": v.RuleText,
  1102. "islua": v.IsLua,
  1103. "field": v.Field,
  1104. "version": t.Version,
  1105. "taskname": t.Name,
  1106. "before": before,
  1107. "extinfo": extinfo,
  1108. "sid": sid,
  1109. "comeintime": time.Now().Unix(),
  1110. }
  1111. lock.Lock()
  1112. ExtLogs[t] = append(ExtLogs[t], logdata)
  1113. lock.Unlock()
  1114. }
  1115. //保存抽取日志
  1116. func SaveExtLog() {
  1117. defer qu.Catch()
  1118. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1119. lock.Lock()
  1120. tmpLogs = ExtLogs
  1121. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1122. lock.Unlock()
  1123. for k, v := range tmpLogs {
  1124. if len(v) < saveLimit {
  1125. db.Mgo.SaveBulk(k.TrackColl, v...)
  1126. } else {
  1127. for {
  1128. if len(v) > saveLimit {
  1129. tmp := v[:saveLimit]
  1130. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1131. v = v[saveLimit:]
  1132. } else {
  1133. db.Mgo.SaveBulk(k.TrackColl, v...)
  1134. break
  1135. }
  1136. }
  1137. }
  1138. }
  1139. time.AfterFunc(10*time.Second, SaveExtLog)
  1140. }
  // FieldValue pairs a value with an integer count.
type FieldValue struct {
	// Value is the stored value; any type is accepted.
	Value interface{}
	// Count is the counter associated with Value.
	Count int
}
  1145. //分析抽取结果并保存
  1146. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1147. qu.Try(func() {
  1148. doc, result, _id, values := funcAnalysis(j)
  1149. //从排序结果中取值
  1150. tmp := map[string]interface{}{} //抽取值
  1151. for key, val := range values {
  1152. for _, v := range val { //取第一个非负数
  1153. if v.Key != "" && v.Value > -1 {
  1154. tmp[key] = v.Object
  1155. break
  1156. }
  1157. }
  1158. }
  1159. if len(j.PackageInfo) > 0 { //分包信息
  1160. tmp["package"] = j.PackageInfo
  1161. }
  1162. if len(j.Winnerorder) > 0 { //候选人信息
  1163. tmp["winnerorder"] = j.Winnerorder
  1164. }
  1165. //处理附件
  1166. var resultf map[string][]*ju.ExtField
  1167. var filevalues map[string][]*ju.SortObject
  1168. if jf != nil {
  1169. _, resultf, _, filevalues = funcAnalysis(jf)
  1170. ffield := map[string]interface{}{}
  1171. for key, val := range filevalues {
  1172. for _, v := range val { //取第一个非负数
  1173. if v.Key != "" && v.Value > -1 {
  1174. ffield[key] = v.Object
  1175. break
  1176. }
  1177. }
  1178. }
  1179. if len(jf.PackageInfo) > 0 { //分包信息
  1180. ffield["package"] = jf.PackageInfo
  1181. }
  1182. if len(jf.Winnerorder) > 0 { //候选人信息
  1183. ffield["winnerorder"] = jf.Winnerorder
  1184. }
  1185. tmp["ffield"] = ffield
  1186. }
  1187. for k, v := range *doc {
  1188. //去重冗余字段
  1189. if delFiled(k) {
  1190. continue
  1191. }
  1192. if tmp[k] == nil {
  1193. tmp[k] = v
  1194. }
  1195. }
  1196. //质量审核
  1197. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1198. e.QualityAudit(tmp)
  1199. }
  1200. if e.IsExtractCity { //城市抽取
  1201. e.ExtractCity(j, tmp, _id)
  1202. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1203. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1204. // tmp["district"] = d
  1205. // if b {
  1206. // tmp["city"] = c
  1207. // tmp["area"] = p
  1208. // }
  1209. }
  1210. //品牌抽取
  1211. if ju.IsBrandGoods {
  1212. tmp["checkhas"] = map[string]int{
  1213. "hastable": j.HasTable,
  1214. "hasgoods": j.HasGoods,
  1215. "hasbrand": j.HasBrand,
  1216. "haskey": j.HasKey,
  1217. }
  1218. if len(j.BrandData) > 0 {
  1219. tmp["tablebrand"] = j.BrandData
  1220. }
  1221. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1222. }
  1223. //分包和标签
  1224. if ju.Config["saveblock"].(bool) {
  1225. blocks := make([]ju.BlockAndTag, 0)
  1226. for _, v := range j.Block {
  1227. xx, _ := json.Marshal(v)
  1228. tmpblock := new(ju.TmpBlock)
  1229. err := json.Unmarshal(xx, &tmpblock)
  1230. if err != nil {
  1231. if v.BPackage != nil {
  1232. bpb, _ := json.Marshal(v.BPackage)
  1233. tmpblock.BPackage = string(bpb)
  1234. }
  1235. tmpblock = rangeBlockToJson(v, *tmpblock)
  1236. }
  1237. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1238. }
  1239. tmp["blocks"] = blocks
  1240. }
  1241. if e.TaskInfo.TestColl == "" {
  1242. if len(tmp) > 0 { //保存抽取结果
  1243. for field, _ := range e.Fields {
  1244. if tmp[field] == nil {
  1245. tmp[field] = "" //覆盖之前版本数据
  1246. }
  1247. }
  1248. tmp["repeat"] = 0
  1249. tmparr := []map[string]interface{}{
  1250. map[string]interface{}{
  1251. "_id": qu.StringTOBsonId(_id),
  1252. },
  1253. map[string]interface{}{"$set": tmp},
  1254. }
  1255. e.BidArr = append(e.BidArr, tmparr)
  1256. }
  1257. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1258. id := tmp["_id"]
  1259. tmp["result"] = result
  1260. tmp["resultf"] = resultf
  1261. delete(tmp, "_id")
  1262. tmparr := []map[string]interface{}{
  1263. map[string]interface{}{
  1264. "_id": id,
  1265. },
  1266. map[string]interface{}{"$set": tmp},
  1267. }
  1268. e.ResultArr = append(e.ResultArr, tmparr)
  1269. }
  1270. } else { //测试结果
  1271. delete(tmp, "_id")
  1272. if len(j.BlockPackage) > 0 { //分包详情
  1273. bs, _ := json.Marshal(j.BlockPackage)
  1274. tmp["epackage"] = string(bs)
  1275. }
  1276. tmp["result"] = result
  1277. tmp["resultf"] = resultf
  1278. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1279. if !b {
  1280. log.Debug(e.TaskInfo.TestColl, _id)
  1281. }
  1282. }
  1283. }, func(err interface{}) {
  1284. log.Debug("AnalysisSaveResult err", err)
  1285. })
  1286. }
  1287. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1288. if j == nil {
  1289. return nil
  1290. }
  1291. if len(j.Block) > 0 {
  1292. for i, v := range j.Block {
  1293. rangetmp := new(ju.TmpBlock)
  1294. vb, _ := json.Marshal(v)
  1295. json.Unmarshal(vb, &rangetmp)
  1296. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1297. }
  1298. }
  1299. if j.ColonKV != nil {
  1300. cb, _ := json.Marshal(j.ColonKV)
  1301. tmpblock.ColonKV = string(cb)
  1302. }
  1303. if j.SpaceKV != nil {
  1304. sb, _ := json.Marshal(j.SpaceKV)
  1305. tmpblock.SpaceKV = string(sb)
  1306. }
  1307. if j.TableKV != nil {
  1308. tb, _ := json.Marshal(j.TableKV)
  1309. tmpblock.TableKV = string(tb)
  1310. }
  1311. return &tmpblock
  1312. }
  // delFiled reports whether k names one of the redundant document fields
// ("detail", "contenthtml", "site", "spidercode", "projectinfo", "jsondata").
func delFiled(k string) bool {
	switch k {
	case "detail", "contenthtml", "site", "spidercode", "projectinfo", "jsondata":
		return true
	default:
		return false
	}
}
  1317. func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {
  1318. defer qu.Catch()
  1319. doc := j.Data
  1320. result := j.Result
  1321. _id := qu.BsonIdToSId((*doc)["_id"])
  1322. iscore, _ := ju.Config["fieldscore"].(bool)
  1323. if iscore { //打分
  1324. result = ScoreFields(j)
  1325. }
  1326. //结果排序
  1327. values := map[string][]*ju.SortObject{}
  1328. for key, val := range result {
  1329. fieldValue := map[string][]interface{}{}
  1330. if iscore { //走打分
  1331. for _, v := range val {
  1332. if len(fmt.Sprint(v.Value)) < 1 {
  1333. continue //去除空串
  1334. }
  1335. fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
  1336. }
  1337. } else { //不走打分,按出现频次
  1338. for _, v := range val {
  1339. if len(fmt.Sprint(v.Value)) < 1 {
  1340. continue //去除空串
  1341. }
  1342. if fieldValue[fmt.Sprint(v.Value)] == nil {
  1343. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  1344. } else {
  1345. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  1346. }
  1347. }
  1348. }
  1349. objects := []*ju.SortObject{}
  1350. for k, v := range fieldValue {
  1351. ValueStr := "" //第二排序
  1352. if reflect.TypeOf(v[1]).String() == "string" {
  1353. ValueStr = qu.ObjToString(v[1])
  1354. }
  1355. tmp := &ju.SortObject{
  1356. Key: k,
  1357. Value: qu.IntAll(v[0]),
  1358. Object: v[1],
  1359. ValueStr: ValueStr,
  1360. }
  1361. objects = append(objects, tmp)
  1362. }
  1363. values[key] = ju.ExtSort(objects)
  1364. }
  1365. return doc, result, _id, values
  1366. }
  1367. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1368. defer qu.Catch()
  1369. //获取审核字段
  1370. for _, field := range e.AuditFields {
  1371. //1.分包
  1372. if resulttmp["package"] != nil {
  1373. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1374. for _, val := range packagedata {
  1375. if val[field] != nil {
  1376. fv := qu.ObjToString(val[field])
  1377. if fv != "" {
  1378. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1379. e.RedisMatch(field, fv, val) //redis匹配
  1380. } else { //除了buyer和winner,其他字段走规则匹配
  1381. e.RuleMatch(field, fv, val)
  1382. }
  1383. }
  1384. }
  1385. }
  1386. }
  1387. //2.外围
  1388. if resulttmp[field] != nil {
  1389. fv := qu.ObjToString(resulttmp[field])
  1390. if fv != "" {
  1391. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1392. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1393. } else { //除了buyer和winner,其他字段走规则匹配
  1394. e.RuleMatch(field, fv, resulttmp)
  1395. }
  1396. }
  1397. }
  1398. }
  1399. }
  1400. //Redis匹配
  1401. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1402. defer qu.Catch()
  1403. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1404. if i == 0 { //reids未找到,执行规则匹配
  1405. val[field+"_isredis"] = false
  1406. e.RuleMatch(field, fv, val) //规则匹配
  1407. } else { //redis找到,打标识存库
  1408. val[field+"_isredis"] = true
  1409. }
  1410. }
  1411. //规则匹配
  1412. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1413. defer qu.Catch()
  1414. if fieldval != "" {
  1415. SMap := e.StartMatch(field, fieldval)
  1416. //SMap.AddKey(field+"_isaudit", false)
  1417. for _, k := range SMap.Keys {
  1418. tmpMap[k] = SMap.Map[k]
  1419. }
  1420. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1421. }
  1422. }
  1423. //开始规则匹配
  1424. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1425. defer qu.Catch()
  1426. SMap := pretreated.NewSortMap()
  1427. lock.Lock()
  1428. f := e.RecogFieldMap[field]
  1429. lock.Unlock()
  1430. if len(f) > 0 {
  1431. fid := qu.BsonIdToSId(f["_id"])
  1432. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1433. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1434. if textAfterRecogFieldPrerule != "" {
  1435. lock.Lock()
  1436. classMap := e.FidClassMap[fid]
  1437. lock.Unlock()
  1438. L:
  1439. for _, c := range classMap { //class
  1440. classid := qu.BsonIdToSId(c["_id"])
  1441. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1442. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1443. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1444. if textAfterClassPrerule != "" {
  1445. lock.Lock()
  1446. ruleMap := e.CidRuleMap[classid]
  1447. lock.Unlock()
  1448. for _, r := range ruleMap { //rule
  1449. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1450. s_name := qu.ObjToString(r["s_name"])
  1451. rule := r["rule"].([]interface{})
  1452. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1453. if textAfterRulePrerule != "" {
  1454. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1455. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1456. if savefield != "" { //保存字段不为空,存储代码信息
  1457. SMap.AddKey(field+"_"+savefield, s_name)
  1458. }
  1459. break L
  1460. }
  1461. }
  1462. }
  1463. }
  1464. }
  1465. }
  1466. }
  1467. return SMap
  1468. }