// File: extract.go

package extract

import (
	"encoding/json"
	"fmt"
	"log"
	"reflect"
	"regexp"
	"strconv"
	"sync"
	"time"

	"jy/clear"
	db "jy/mongodbutil"
	"jy/pretreated"
	ju "jy/util"
	qu "qfw/util"
	redis "qfw/util/redis"

	"gopkg.in/mgo.v2/bson"
)
  19. var (
  20. lock sync.RWMutex
  21. cut = ju.NewCut() //获取正文并清理
  22. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  23. TaskList map[string]*ExtractTask //任务列表
  24. ClearTaskList map[string]*ClearTask //清理任务列表
  25. saveLimit = 200 //抽取日志批量保存
  26. PageSize = 5000 //查询分页
  27. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1}`
  28. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  29. )
  30. //启动测试抽取
  31. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  32. defer qu.Catch()
  33. ext := &ExtractTask{}
  34. ext.Id = taskId
  35. ext.IsRun = true
  36. ext.InitTestTaskInfo(resultcoll, trackcoll)
  37. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  38. ext.InitRulePres()
  39. ext.InitRuleBacks()
  40. ext.InitRuleCore()
  41. ext.InitPkgCore()
  42. ext.InitTag()
  43. ext.InitClearFn()
  44. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  45. //初始化城市DFA信息
  46. ext.InitDFA()
  47. }
  48. //质量审核
  49. ext.InitAuditFields()
  50. ext.InitAuditRule()
  51. ext.InitAuditClass()
  52. ext.InitAuditRecogField()
  53. //品牌抽取是否开启
  54. ju.IsBrandGoods = ju.Config["brandgoods"].(bool)
  55. return RunExtractTestTask(ext, startId, num)
  56. }
  57. func IdTrans(startId string) bson.ObjectId {
  58. defer qu.Catch()
  59. return bson.ObjectIdHex(startId)
  60. }
  61. //开始测试任务抽取
  62. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  63. n, _ := strconv.Atoi(num)
  64. id := IdTrans(startId)
  65. if id.Valid() {
  66. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  67. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  68. for _, v := range *list {
  69. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  70. continue
  71. }
  72. //log.Println(v["_id"])
  73. j := PreInfo(v)
  74. ext.TaskInfo.ProcessPool <- true
  75. go ext.ExtractProcess(j)
  76. }
  77. return true
  78. } else {
  79. return false
  80. }
  81. }
  82. //启动抽取
  83. func StartExtractTaskId(taskId string) bool {
  84. defer qu.Catch()
  85. isgo := false
  86. ext := TaskList[taskId]
  87. if ext == nil {
  88. ext = &ExtractTask{}
  89. ext.Id = taskId
  90. ext.InitTaskInfo()
  91. isgo = true
  92. } else {
  93. ext.Id = taskId
  94. ext.InitTaskInfo()
  95. }
  96. ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  97. ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  98. ext.InitRulePres()
  99. ext.InitRuleBacks()
  100. ext.InitRuleCore()
  101. ext.InitPkgCore()
  102. ext.InitTag()
  103. ext.InitClearFn()
  104. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  105. //初始化城市DFA信息
  106. ext.InitDFA()
  107. }
  108. //质量审核
  109. ext.InitAuditFields()
  110. ext.InitAuditRule()
  111. ext.InitAuditClass()
  112. ext.InitAuditRecogField()
  113. //品牌抽取是否开启
  114. ju.IsBrandGoods = ju.Config["brandgoods"].(bool)
  115. ext.IsRun = true
  116. go ext.ResultSave()
  117. go ext.BidSave()
  118. if isgo {
  119. go RunExtractTask(taskId)
  120. }
  121. TaskList[taskId] = ext
  122. return true
  123. }
  124. //停止抽取
  125. func StopExtractTaskId(taskId string) bool {
  126. defer qu.Catch()
  127. ext := TaskList[taskId]
  128. if ext != nil {
  129. ext.IsRun = false
  130. TaskList[taskId] = ext
  131. }
  132. //更新task.s_extlastid
  133. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  134. return true
  135. }
  136. //开始抽取
  137. func RunExtractTask(taskId string) {
  138. defer qu.Catch()
  139. ext := TaskList[taskId]
  140. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  141. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  142. pageNum := (count + PageSize - 1) / PageSize
  143. limit := PageSize
  144. if count < PageSize {
  145. limit = count
  146. }
  147. log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  148. for i := 0; i < pageNum; i++ {
  149. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  150. log.Printf("page=%d,query=%v", i+1, query)
  151. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  152. for _, v := range *list {
  153. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  154. continue
  155. }
  156. //log.Println(v["_id"])
  157. if !ext.IsRun {
  158. break
  159. }
  160. j := PreInfo(v)
  161. ext.TaskInfo.ProcessPool <- true
  162. go ext.ExtractProcess(j)
  163. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  164. }
  165. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  166. if !ext.IsRun {
  167. break
  168. }
  169. }
  170. //更新task.s_extlastid
  171. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  172. }
  173. //信息预处理
  174. func PreInfo(doc map[string]interface{}) *ju.Job {
  175. defer qu.Catch()
  176. detail := ""
  177. d1, _ := doc["detail"].(string)
  178. d2, _ := doc["contenthtml"].(string)
  179. if len(d1) >= len(d2) || d2 == "" {
  180. detail = d1
  181. } else {
  182. detail = d2
  183. }
  184. detail = ju.CutLableStr(detail)
  185. detail = cut.ClearHtml(detail)
  186. doc["detail"] = detail
  187. toptype := qu.ObjToString(doc["toptype"])
  188. if qu.ObjToString(doc["type"]) == "bid" {
  189. toptype = "结果"
  190. }
  191. if toptype == "" {
  192. toptype = "*"
  193. }
  194. j := &ju.Job{
  195. SourceMid: qu.BsonIdToSId(doc["_id"]),
  196. Category: toptype,
  197. Content: qu.ObjToString(doc["detail"]),
  198. SpiderCode: qu.ObjToString(doc["spidercode"]),
  199. //Domain: qu.ObjToString(doc["domain"]),
  200. //Href: qu.ObjToString(doc["href"]),
  201. Title: qu.ObjToString(doc["title"]),
  202. Data: &doc,
  203. City: qu.ObjToString(doc["city"]),
  204. Province: qu.ObjToString(doc["area"]),
  205. Result: map[string][]*ju.ExtField{},
  206. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  207. }
  208. qu.Try(func() {
  209. pretreated.AnalyStart(j)
  210. }, func(err interface{}) {
  211. log.Println("pretreated.AnalyStart", err)
  212. })
  213. return j
  214. }
  215. //抽取
  216. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  217. qu.Try(func() {
  218. doc := *j.Data
  219. //全局前置规则,结果覆盖doc属性
  220. for _, v := range e.RulePres {
  221. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  222. }
  223. //抽取规则
  224. for _, vc := range e.RuleCores {
  225. tmp := ju.DeepCopy(doc).(map[string]interface{})
  226. //是否进入逻辑
  227. if !ju.Logic(vc.LuaLogic, tmp) {
  228. continue
  229. }
  230. //抽取-前置规则
  231. for _, v := range vc.RulePres {
  232. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  233. }
  234. //log.Println("抽取-前置规则", tmp)
  235. //抽取-规则
  236. for _, v := range vc.RuleCores {
  237. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  238. }
  239. //log.Println("抽取-规则", tmp)
  240. //项目名称未能抽取到,标题来凑
  241. if vc.Field == "projectname" {
  242. if len(j.Result[vc.Field]) < 1 {
  243. j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  244. }
  245. }
  246. //抽取-后置规则
  247. for _, v := range vc.RuleBacks {
  248. ExtRegBack(j, v, e.TaskInfo)
  249. }
  250. //log.Println("抽取-后置规则", tmp)
  251. }
  252. //全局后置规则
  253. for _, v := range e.RuleBacks {
  254. ExtRegBack(j, v, e.TaskInfo)
  255. }
  256. //候选人加入
  257. if len(j.Winnerorder) > 0 {
  258. winner := &ju.ExtField{
  259. Field: "winner",
  260. Code: "",
  261. RuleText: "",
  262. Type: "winnerorder",
  263. MatchType: "winnerorder",
  264. ExtFrom: "",
  265. Value: j.Winnerorder[0]["entname"],
  266. Score: 0,
  267. }
  268. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  269. winner.Score = -5
  270. }
  271. winners := j.Result["winner"]
  272. if winners != nil {
  273. winners = append(winners, winner)
  274. } else {
  275. winners = []*ju.ExtField{}
  276. winners = append(winners, winner)
  277. }
  278. j.Result["winner"] = winners
  279. }
  280. //函数清理
  281. for key, val := range j.Result {
  282. for _, v := range val {
  283. lock.Lock()
  284. cfn := e.ClearFn[key]
  285. lock.Unlock()
  286. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  287. v.Value = data[0]
  288. //清理特殊符号
  289. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  290. clear.MesField[key] != nil {
  291. text := qu.ObjToString(v.Value)
  292. text = clear.OtherClean(key, text)
  293. v.Value = text
  294. }
  295. }
  296. }
  297. PackageDetail(j, e) //处理分包信息
  298. // bs, _ := json.Marshal(j.Result)
  299. // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  300. //分析抽取结果并保存 todo
  301. AnalysisSaveResult(j, e)
  302. <-e.TaskInfo.ProcessPool
  303. }, func(err interface{}) {
  304. log.Println("ExtractProcess err", err, (*j.Data)["_id"])
  305. <-e.TaskInfo.ProcessPool
  306. })
  307. }
  308. //前置过滤
  309. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  310. defer qu.Catch()
  311. before := ju.DeepCopy(doc).(map[string]interface{})
  312. extinfo := map[string]interface{}{}
  313. if in.IsLua {
  314. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  315. if j != nil {
  316. lua.Block = j.Block
  317. }
  318. extinfo = lua.RunScript("pre")
  319. for k, v := range extinfo { //结果覆盖原doc
  320. doc[k] = v
  321. }
  322. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  323. } else {
  324. key := qu.If(in.Field == "", "detail", in.Field).(string)
  325. text := qu.ObjToString(doc[key])
  326. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  327. doc[key] = extinfo[key] //结果覆盖原doc
  328. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  329. }
  330. return doc
  331. }
  332. //抽取-规则
  333. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  334. defer qu.Catch()
  335. //废标、流标、ppp等跳过
  336. b := IsExtract(in.Field, j.Title, j.Content)
  337. if !b {
  338. return
  339. }
  340. if in.IsLua {
  341. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  342. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  343. lua.Block = j.Block
  344. extinfo := lua.RunScript("core")
  345. for k, v := range extinfo {
  346. if k == in.Field {
  347. if j.Result[k] == nil {
  348. j.Result[k] = [](*ju.ExtField){}
  349. }
  350. if tmps, ok := v.([]map[string]interface{}); ok {
  351. for _, tmp := range tmps {
  352. j.Result[k] = append(j.Result[k],
  353. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
  354. }
  355. }
  356. }
  357. }
  358. if len(extinfo) > 0 {
  359. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  360. }
  361. } else {
  362. //全文正则
  363. text := qu.ObjToString(doc[extfrom])
  364. if in.Field != "" {
  365. extinfo := extRegCoreToResult(extfrom, text, j, in)
  366. if len(extinfo) > 0 {
  367. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  368. }
  369. }
  370. }
  371. }
  372. //lua脚本根据属性设置提取kv值
  373. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  374. defer qu.Catch()
  375. kvmap := map[string][]map[string]interface{}{}
  376. for fieldname, field := range in.LFields {
  377. lock.Lock()
  378. tags := t[field] //获取对应标签库
  379. lock.Unlock()
  380. for _, bl := range j.Block {
  381. //冒号kv
  382. if bl.ColonKV != nil {
  383. kvs := bl.ColonKV.Kvs
  384. kvs2 := bl.ColonKV.Kvs_2
  385. //log.Println("ColonKV1", kvs)
  386. //log.Println("ColonKV2", kvs2)
  387. for _, tag := range tags {
  388. for _, kv := range kvs {
  389. if tag.Type == "string" {
  390. if kv.Key == tag.Key {
  391. text := ju.TrimLRSpace(kv.Value, "")
  392. if text != "" {
  393. kvmap[field] = append(kvmap[field], map[string]interface{}{
  394. "field": field,
  395. "code": in.Code,
  396. "ruletext": tag.Key,
  397. "extfrom": extfrom,
  398. "value": text,
  399. "type": "colon1",
  400. "matchtype": "tag_string",
  401. })
  402. }
  403. break
  404. }
  405. } else if tag.Type == "regexp" {
  406. if tag.Reg.MatchString(kv.Key) {
  407. text := ju.TrimLRSpace(kv.Value, "")
  408. if text != "" {
  409. kvmap[field] = append(kvmap[field], map[string]interface{}{
  410. "field": field,
  411. "code": in.Code,
  412. "ruletext": tag.Key,
  413. "extfrom": extfrom,
  414. "value": text,
  415. "type": "colon1",
  416. "matchtype": "tag_regexp",
  417. })
  418. }
  419. break
  420. }
  421. }
  422. }
  423. for _, kv := range kvs2 {
  424. if tag.Type == "string" {
  425. if kv.Key == tag.Key {
  426. text := ju.TrimLRSpace(kv.Value, "")
  427. if text != "" {
  428. kvmap[field] = append(kvmap[field], map[string]interface{}{
  429. "field": field,
  430. "code": in.Code,
  431. "ruletext": tag.Key,
  432. "extfrom": extfrom,
  433. "value": text,
  434. "type": "colon2",
  435. "matchtype": "tag_string",
  436. })
  437. }
  438. break
  439. }
  440. } else if tag.Type == "regexp" {
  441. if tag.Reg.MatchString(kv.Key) {
  442. text := ju.TrimLRSpace(kv.Value, "")
  443. if text != "" {
  444. kvmap[field] = append(kvmap[field], map[string]interface{}{
  445. "field": field,
  446. "code": in.Code,
  447. "ruletext": tag.Key,
  448. "extfrom": extfrom,
  449. "value": text,
  450. "type": "colon2",
  451. "matchtype": "tag_regexp",
  452. })
  453. }
  454. break
  455. }
  456. }
  457. }
  458. }
  459. }
  460. //空格kv
  461. if bl.SpaceKV != nil {
  462. kvs := bl.SpaceKV.Kvs
  463. //log.Println("SpaceKV", kvs)
  464. for _, tag := range tags {
  465. for _, kv := range kvs {
  466. if tag.Type == "string" {
  467. if kv.Key == tag.Key {
  468. text := ju.TrimLRSpace(kv.Value, "")
  469. if text != "" {
  470. kvmap[field] = append(kvmap[field], map[string]interface{}{
  471. "field": field,
  472. "code": in.Code,
  473. "ruletext": tag.Key,
  474. "extfrom": extfrom,
  475. "value": text,
  476. "type": "space",
  477. "matchtype": "tag_string",
  478. })
  479. }
  480. break
  481. }
  482. } else if tag.Type == "regexp" {
  483. if tag.Reg.MatchString(kv.Key) {
  484. text := ju.TrimLRSpace(kv.Value, "")
  485. if text != "" {
  486. kvmap[field] = append(kvmap[field], map[string]interface{}{
  487. "field": field,
  488. "code": in.Code,
  489. "ruletext": tag.Key,
  490. "extfrom": extfrom,
  491. "value": text,
  492. "type": "space",
  493. "matchtype": "tag_regexp",
  494. })
  495. }
  496. break
  497. }
  498. }
  499. }
  500. }
  501. }
  502. //表格kv
  503. if bl.TableKV != nil {
  504. tkv := bl.TableKV
  505. //log.Println("tkv", tkv)
  506. for k, v := range tkv.Kv {
  507. if k == fieldname {
  508. if len(tags) > -tkv.KvIndex[fieldname] {
  509. ruletext := ""
  510. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  511. ruletext = "项目名称"
  512. } else {
  513. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  514. }
  515. kvmap[field] = append(kvmap[field], map[string]interface{}{
  516. "field": field,
  517. "code": in.Code,
  518. "ruletext": ruletext,
  519. "extfrom": "table",
  520. "value": v,
  521. "type": "table",
  522. "matchtype": "tag_string",
  523. })
  524. } else { //涉及其他待处理
  525. //log.Println(tags)
  526. }
  527. }
  528. }
  529. }
  530. }
  531. }
  532. return kvmap
  533. }
  534. //正则提取结果
  535. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  536. defer qu.Catch()
  537. extinfo := map[string][]map[string]interface{}{}
  538. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  539. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  540. if len(apos) > 0 {
  541. pos := apos[0]
  542. for k, p := range v.RegCore.ExtractPos {
  543. if len(pos) > p {
  544. if pos[p] == -1 || pos[p+1] == -1 {
  545. continue
  546. }
  547. val := text[pos[p]:pos[p+1]]
  548. tmps := []map[string]interface{}{}
  549. tmp := map[string]interface{}{
  550. "field": v.Field,
  551. "code": v.Code,
  552. "ruletext": v.RuleText,
  553. "extfrom": extfrom,
  554. "value": val,
  555. "type": "regexp",
  556. "matchtype": "regcontent",
  557. }
  558. tmps = append(tmps, tmp)
  559. extinfo[k] = tmps
  560. if val != "" {
  561. if j.Result[v.Field] == nil {
  562. j.Result[k] = [](*ju.ExtField){}
  563. }
  564. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  565. }
  566. }
  567. }
  568. }
  569. } else {
  570. pos := v.RegCore.Reg.FindStringIndex(text)
  571. val := ""
  572. if len(pos) == 2 {
  573. text = text[pos[1]:]
  574. rs := regexp.MustCompile("[^\r\n\t]+")
  575. tmp := rs.FindAllString(text, -1)
  576. if len(tmp) > 0 {
  577. val = tmp[0]
  578. }
  579. }
  580. if val != "" {
  581. tmps := []map[string]interface{}{}
  582. tmp := map[string]interface{}{
  583. "field": v.Field,
  584. "code": v.Code,
  585. "ruletext": v.RuleText,
  586. "extfrom": extfrom,
  587. "value": val,
  588. "type": "regexp",
  589. "matchtype": "regcontent",
  590. }
  591. tmps = append(tmps, tmp)
  592. extinfo[v.Field] = tmps
  593. if j.Result[v.Field] == nil {
  594. j.Result[v.Field] = [](*ju.ExtField){}
  595. }
  596. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  597. }
  598. }
  599. return extinfo
  600. }
  601. //后置过滤
  602. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  603. defer qu.Catch()
  604. if in.IsLua {
  605. result := GetResultMapForLua(j)
  606. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  607. if j != nil {
  608. lua.Block = j.Block
  609. }
  610. extinfo := lua.RunScript("back")
  611. for k, v := range extinfo {
  612. if tmps, ok := v.([]map[string]interface{}); ok {
  613. j.Result[k] = [](*ju.ExtField){}
  614. for _, tmp := range tmps {
  615. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  616. }
  617. }
  618. }
  619. if len(extinfo) > 0 {
  620. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  621. }
  622. } else {
  623. extinfo := map[string]interface{}{}
  624. if in.Field != "" {
  625. if j.Result[in.Field] != nil {
  626. tmp := j.Result[in.Field]
  627. exts := []interface{}{}
  628. for k, v := range tmp {
  629. if v.Type == "table" { //table抽取到的数据不清理
  630. continue
  631. }
  632. text := qu.ObjToString(v.Value)
  633. if text != "" {
  634. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  635. }
  636. j.Result[in.Field][k].Value = text
  637. exts = append(exts, map[string]interface{}{
  638. "field": v.Field,
  639. "code": v.Code,
  640. "ruletext": v.RuleText,
  641. "type": v.Type,
  642. "matchtype": v.MatchType,
  643. "extfrom": v.ExtFrom,
  644. "value": text,
  645. })
  646. }
  647. extinfo[in.Field] = exts
  648. if len(extinfo) > 0 {
  649. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  650. }
  651. }
  652. } else {
  653. for key, tmp := range j.Result {
  654. exts := []interface{}{}
  655. for k, v := range tmp {
  656. if v.Type == "table" { //table抽取到的数据不清理
  657. continue
  658. }
  659. text := qu.ObjToString(v.Value)
  660. if text != "" {
  661. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  662. }
  663. j.Result[key][k].Value = text
  664. exts = append(exts, map[string]interface{}{
  665. "field": v.Field,
  666. "code": v.Code,
  667. "ruletext": v.RuleText,
  668. "type": v.Type,
  669. "matchtype": v.MatchType,
  670. "extfrom": v.ExtFrom,
  671. "value": text,
  672. })
  673. }
  674. extinfo[key] = exts
  675. }
  676. if len(extinfo) > 0 {
  677. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  678. }
  679. }
  680. }
  681. }
  682. //获取抽取结果map[string][]interface{},lua脚本使用
  683. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  684. defer qu.Catch()
  685. result := map[string][]map[string]interface{}{}
  686. for key, val := range j.Result {
  687. if result[key] == nil {
  688. result[key] = []map[string]interface{}{}
  689. }
  690. for _, v := range val {
  691. tmp := map[string]interface{}{
  692. "field": v.Field,
  693. "code": v.Code,
  694. "ruletext": v.RuleText,
  695. "value": v.Value,
  696. "type": v.Type,
  697. "matchtype": v.MatchType,
  698. "extfrom": v.ExtFrom,
  699. }
  700. result[key] = append(result[key], tmp)
  701. }
  702. }
  703. return result
  704. }
  705. //抽取日志
  706. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  707. defer qu.Catch()
  708. if !t.IsEtxLog {
  709. return
  710. }
  711. logdata := map[string]interface{}{
  712. "code": v.Code,
  713. "name": v.Name,
  714. "type": ftype,
  715. "ruletext": v.RuleText,
  716. "islua": v.IsLua,
  717. "field": v.Field,
  718. "version": t.Version,
  719. "taskname": t.Name,
  720. "before": before,
  721. "extinfo": extinfo,
  722. "sid": sid,
  723. "comeintime": time.Now().Unix(),
  724. }
  725. lock.Lock()
  726. ExtLogs[t] = append(ExtLogs[t], logdata)
  727. lock.Unlock()
  728. }
  729. //保存抽取日志
  730. func SaveExtLog() {
  731. defer qu.Catch()
  732. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  733. lock.Lock()
  734. tmpLogs = ExtLogs
  735. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  736. lock.Unlock()
  737. for k, v := range tmpLogs {
  738. if len(v) < saveLimit {
  739. db.Mgo.SaveBulk(k.TrackColl, v...)
  740. } else {
  741. for {
  742. if len(v) > saveLimit {
  743. tmp := v[:saveLimit]
  744. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  745. v = v[saveLimit:]
  746. } else {
  747. db.Mgo.SaveBulk(k.TrackColl, v...)
  748. break
  749. }
  750. }
  751. }
  752. }
  753. time.AfterFunc(10*time.Second, SaveExtLog)
  754. }
  755. type FieldValue struct {
  756. Value interface{}
  757. Count int
  758. }
  759. //分析抽取结果并保存
  760. func AnalysisSaveResult(j *ju.Job, e *ExtractTask) {
  761. qu.Try(func() {
  762. doc := j.Data
  763. result := j.Result
  764. _id := qu.BsonIdToSId((*doc)["_id"])
  765. iscore, _ := ju.Config["fieldscore"].(bool)
  766. if iscore { //打分
  767. result = ScoreFields(j)
  768. }
  769. //结果排序
  770. values := map[string][]*ju.SortObject{}
  771. for key, val := range result {
  772. fieldValue := map[string][]interface{}{}
  773. if iscore { //走打分
  774. for _, v := range val {
  775. if len(fmt.Sprint(v.Value)) < 1 {
  776. continue //去除空串
  777. }
  778. fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
  779. }
  780. } else { //不走打分,按出现频次
  781. for _, v := range val {
  782. if len(fmt.Sprint(v.Value)) < 1 {
  783. continue //去除空串
  784. }
  785. if fieldValue[fmt.Sprint(v.Value)] == nil {
  786. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  787. } else {
  788. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  789. }
  790. }
  791. }
  792. objects := []*ju.SortObject{}
  793. for k, v := range fieldValue {
  794. ValueStr := "" //第二排序
  795. if reflect.TypeOf(v[1]).String() == "string" {
  796. ValueStr = qu.ObjToString(v[1])
  797. }
  798. tmp := &ju.SortObject{
  799. Key: k,
  800. Value: qu.IntAll(v[0]),
  801. Object: v[1],
  802. ValueStr: ValueStr,
  803. }
  804. objects = append(objects, tmp)
  805. }
  806. values[key] = ju.ExtSort(objects)
  807. }
  808. //从排序结果中取值
  809. tmp := map[string]interface{}{} //抽取值
  810. for key, val := range values {
  811. for _, v := range val { //取第一个非负数
  812. if v.Key != "" && v.Value > -1 {
  813. tmp[key] = v.Object
  814. break
  815. }
  816. }
  817. }
  818. if len(j.PackageInfo) > 0 { //分包信息
  819. tmp["package"] = j.PackageInfo
  820. }
  821. if len(j.Winnerorder) > 0 { //候选人信息
  822. tmp["winnerorder"] = j.Winnerorder
  823. }
  824. for k, v := range *doc {
  825. //去重冗余字段
  826. if k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" {
  827. continue
  828. }
  829. if tmp[k] == nil {
  830. tmp[k] = v
  831. }
  832. }
  833. //质量审核
  834. if ju.Config["qualityaudit"].(bool) {
  835. e.QualityAudit(tmp)
  836. }
  837. if e.IsExtractCity { //城市抽取
  838. b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  839. //log.Println("省份---", p, "城市---", c, "区---", d)
  840. tmp["district"] = d
  841. if b {
  842. tmp["city"] = c
  843. tmp["area"] = p
  844. }
  845. }
  846. //品牌抽取
  847. if ju.IsBrandGoods {
  848. tmp["checkhas"] = map[string]int{
  849. "hastable": j.HasTable,
  850. "hasgoods": j.HasGoods,
  851. "hasbrand": j.HasBrand,
  852. "haskey": j.HasKey,
  853. }
  854. if len(j.BrandData) > 0 {
  855. tmp["brand"] = j.BrandData
  856. }
  857. //log.Println("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  858. }
  859. if e.TaskInfo.TestColl == "" {
  860. if len(tmp) > 0 { //保存抽取结果
  861. tmparr := []map[string]interface{}{
  862. map[string]interface{}{
  863. "_id": qu.StringTOBsonId(_id),
  864. },
  865. map[string]interface{}{"$set": tmp},
  866. }
  867. e.BidArr = append(e.BidArr, tmparr)
  868. }
  869. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  870. id := tmp["_id"]
  871. tmp["result"] = result
  872. delete(tmp, "_id")
  873. tmparr := []map[string]interface{}{
  874. map[string]interface{}{
  875. "_id": id,
  876. },
  877. map[string]interface{}{"$set": tmp},
  878. }
  879. e.ResultArr = append(e.ResultArr, tmparr)
  880. }
  881. } else { //测试结果
  882. delete(tmp, "_id")
  883. if len(j.BlockPackage) > 0 { //分包详情
  884. bs, _ := json.Marshal(j.BlockPackage)
  885. tmp["epackage"] = string(bs)
  886. }
  887. tmp["result"] = result
  888. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  889. if !b {
  890. log.Println(e.TaskInfo.TestColl, _id)
  891. }
  892. }
  893. }, func(err interface{}) {
  894. log.Println("AnalysisSaveResult err", err)
  895. })
  896. }
  897. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  898. defer qu.Catch()
  899. //获取审核字段
  900. for _, field := range e.AuditFields {
  901. //1.分包
  902. if resulttmp["package"] != nil {
  903. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  904. for _, val := range packagedata {
  905. if val[field] != nil {
  906. fv := qu.ObjToString(val[field])
  907. if fv != "" {
  908. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  909. e.RedisMatch(field, fv, val) //redis匹配
  910. } else { //除了buyer和winner,其他字段走规则匹配
  911. e.RuleMatch(field, fv, val)
  912. }
  913. }
  914. }
  915. }
  916. }
  917. //2.外围
  918. if resulttmp[field] != nil {
  919. fv := qu.ObjToString(resulttmp[field])
  920. if fv != "" {
  921. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  922. e.RedisMatch(field, fv, resulttmp) //redis匹配
  923. } else { //除了buyer和winner,其他字段走规则匹配
  924. e.RuleMatch(field, fv, resulttmp)
  925. }
  926. }
  927. }
  928. }
  929. }
  930. //Redis匹配
  931. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  932. defer qu.Catch()
  933. i := redis.GetInt(field, field+"_"+fv) //查找redis
  934. if i == 0 { //reids未找到,执行规则匹配
  935. val[field+"_isredis"] = false
  936. e.RuleMatch(field, fv, val) //规则匹配
  937. } else { //redis找到,打标识存库
  938. val[field+"_isredis"] = true
  939. }
  940. }
  941. //规则匹配
  942. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  943. defer qu.Catch()
  944. if fieldval != "" {
  945. SMap := e.StartMatch(field, fieldval)
  946. //SMap.AddKey(field+"_isaudit", false)
  947. for _, k := range SMap.Keys {
  948. tmpMap[k] = SMap.Map[k]
  949. }
  950. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  951. }
  952. }
  953. //开始规则匹配
  954. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  955. defer qu.Catch()
  956. SMap := pretreated.NewSortMap()
  957. lock.Lock()
  958. f := e.RecogFieldMap[field]
  959. lock.Unlock()
  960. if len(f) > 0 {
  961. fid := qu.BsonIdToSId(f["_id"])
  962. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  963. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  964. if textAfterRecogFieldPrerule != "" {
  965. lock.Lock()
  966. classMap := e.FidClassMap[fid]
  967. lock.Unlock()
  968. L:
  969. for _, c := range classMap { //class
  970. classid := qu.BsonIdToSId(c["_id"])
  971. classPrerule := qu.ObjToString(c["s_class_prerule"])
  972. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  973. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  974. if textAfterClassPrerule != "" {
  975. lock.Lock()
  976. ruleMap := e.CidRuleMap[classid]
  977. lock.Unlock()
  978. for _, r := range ruleMap { //rule
  979. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  980. s_name := qu.ObjToString(r["s_name"])
  981. rule := r["rule"].([]interface{})
  982. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  983. if textAfterRulePrerule != "" {
  984. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  985. if b { //匹配到一个分类下某个规则时,不再继续匹配
  986. if savefield != "" { //保存字段不为空,存储代码信息
  987. SMap.AddKey(field+"_"+savefield, s_name)
  988. }
  989. break L
  990. }
  991. }
  992. }
  993. }
  994. }
  995. }
  996. }
  997. return SMap
  998. }