extract.go 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993
  1. package extract
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "jy/clear"
  6. db "jy/mongodbutil"
  7. "jy/pretreated"
  8. ju "jy/util"
  9. "log"
  10. qu "qfw/util"
  11. redis "qfw/util/redis"
  12. "reflect"
  13. "regexp"
  14. "strconv"
  15. "sync"
  16. "time"
  17. "gopkg.in/mgo.v2/bson"
  18. )
  19. var (
  20. lock sync.RWMutex
  21. cut = ju.NewCut() //获取正文并清理
  22. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  23. TaskList map[string]*ExtractTask //任务列表
  24. saveLimit = 200 //抽取日志批量保存
  25. PageSize = 5000 //查询分页
  26. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"check_sensitive2":1}`
  27. )
  28. //启动测试抽取
  29. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  30. defer qu.Catch()
  31. ext := &ExtractTask{}
  32. ext.Id = taskId
  33. ext.IsRun = true
  34. ext.InitTestTaskInfo(resultcoll, trackcoll)
  35. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  36. ext.InitRulePres()
  37. ext.InitRuleBacks()
  38. ext.InitRuleCore()
  39. ext.InitPkgCore()
  40. ext.InitTag()
  41. ext.InitClearFn()
  42. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  43. //初始化城市DFA信息
  44. ext.InitDFA()
  45. }
  46. //质量审核
  47. ext.InitAuditFields()
  48. ext.InitAuditRule()
  49. ext.InitAuditClass()
  50. ext.InitAuditRecogField()
  51. return RunExtractTestTask(ext, startId, num)
  52. }
  53. func IdTrans(startId string) bson.ObjectId {
  54. defer qu.Catch()
  55. return bson.ObjectIdHex(startId)
  56. }
  57. //开始测试任务抽取
  58. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  59. n, _ := strconv.Atoi(num)
  60. id := IdTrans(startId)
  61. if id.Valid() {
  62. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  63. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  64. for _, v := range *list {
  65. //log.Println(v["_id"])
  66. j := PreInfo(v)
  67. ext.TaskInfo.ProcessPool <- true
  68. go ext.ExtractProcess(j)
  69. }
  70. return true
  71. } else {
  72. return false
  73. }
  74. }
  75. //启动抽取
  76. func StartExtractTaskId(taskId string) bool {
  77. isgo := false
  78. ext := TaskList[taskId]
  79. if ext == nil {
  80. ext = &ExtractTask{}
  81. ext.Id = taskId
  82. ext.InitTaskInfo()
  83. isgo = true
  84. } else {
  85. ext.Id = taskId
  86. ext.InitTaskInfo()
  87. }
  88. ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  89. ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  90. ext.InitRulePres()
  91. ext.InitRuleBacks()
  92. ext.InitRuleCore()
  93. ext.InitPkgCore()
  94. ext.InitTag()
  95. ext.InitClearFn()
  96. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  97. //初始化城市DFA信息
  98. ext.InitDFA()
  99. }
  100. //质量审核
  101. ext.InitAuditFields()
  102. ext.InitAuditRule()
  103. ext.InitAuditClass()
  104. ext.InitAuditRecogField()
  105. ext.IsRun = true
  106. go ext.ResultSave()
  107. go ext.BidSave()
  108. if isgo {
  109. go RunExtractTask(taskId)
  110. }
  111. TaskList[taskId] = ext
  112. return true
  113. }
  114. //停止抽取
  115. func StopExtractTaskId(taskId string) bool {
  116. ext := TaskList[taskId]
  117. if ext != nil {
  118. ext.IsRun = false
  119. TaskList[taskId] = ext
  120. }
  121. //更新task.s_extlastid
  122. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  123. return true
  124. }
  125. //开始抽取
  126. func RunExtractTask(taskId string) {
  127. ext := TaskList[taskId]
  128. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  129. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  130. pageNum := (count + PageSize - 1) / PageSize
  131. limit := PageSize
  132. if count < PageSize {
  133. limit = count
  134. }
  135. log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  136. for i := 0; i < pageNum; i++ {
  137. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  138. log.Printf("page=%d,query=%v", i+1, query)
  139. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  140. for _, v := range *list {
  141. if v["check_sensitive2"] != nil { //去除含敏感词数据
  142. if v["check_sensitive2"].(string) != "" {
  143. continue
  144. }
  145. }
  146. //log.Println(v["_id"])
  147. if !ext.IsRun {
  148. break
  149. }
  150. j := PreInfo(v)
  151. ext.TaskInfo.ProcessPool <- true
  152. go ext.ExtractProcess(j)
  153. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  154. }
  155. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  156. if !ext.IsRun {
  157. break
  158. }
  159. }
  160. //更新task.s_extlastid
  161. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  162. }
  163. //信息预处理
  164. func PreInfo(doc map[string]interface{}) *ju.Job {
  165. detail := ""
  166. d1, _ := doc["detail"].(string)
  167. d2, _ := doc["contenthtml"].(string)
  168. if len(d1) >= len(d2) || d2 == "" {
  169. detail = d1
  170. } else {
  171. detail = d2
  172. }
  173. detail = ju.CutLableStr(detail)
  174. detail = cut.ClearHtml(detail)
  175. doc["detail"] = detail
  176. toptype := qu.ObjToString(doc["toptype"])
  177. if qu.ObjToString(doc["type"]) == "bid" {
  178. toptype = "结果"
  179. }
  180. if toptype == "" {
  181. toptype = "*"
  182. }
  183. j := &ju.Job{
  184. SourceMid: qu.BsonIdToSId(doc["_id"]),
  185. Category: toptype,
  186. Content: qu.ObjToString(doc["detail"]),
  187. SpiderCode: qu.ObjToString(doc["spidercode"]),
  188. //Domain: qu.ObjToString(doc["domain"]),
  189. //Href: qu.ObjToString(doc["href"]),
  190. Title: qu.ObjToString(doc["title"]),
  191. Data: &doc,
  192. City: qu.ObjToString(doc["city"]),
  193. Province: qu.ObjToString(doc["area"]),
  194. Result: map[string][]*ju.ExtField{},
  195. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  196. }
  197. qu.Try(func() {
  198. pretreated.AnalyStart(j)
  199. }, func(err interface{}) {
  200. log.Println("pretreated.AnalyStart", err)
  201. })
  202. return j
  203. }
  204. //抽取
  205. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  206. qu.Try(func() {
  207. doc := *j.Data
  208. //全局前置规则,结果覆盖doc属性
  209. for _, v := range e.RulePres {
  210. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  211. }
  212. //抽取规则
  213. for _, vc := range e.RuleCores {
  214. tmp := ju.DeepCopy(doc).(map[string]interface{})
  215. //是否进入逻辑
  216. if !ju.Logic(vc.LuaLogic, tmp) {
  217. continue
  218. }
  219. //抽取-前置规则
  220. for _, v := range vc.RulePres {
  221. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  222. }
  223. //log.Println("抽取-前置规则", tmp)
  224. //抽取-规则
  225. for _, v := range vc.RuleCores {
  226. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  227. }
  228. //log.Println("抽取-规则", tmp)
  229. //项目名称未能抽取到,标题来凑
  230. if vc.Field == "projectname" {
  231. if len(j.Result[vc.Field]) < 1 {
  232. j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  233. }
  234. }
  235. //抽取-后置规则
  236. for _, v := range vc.RuleBacks {
  237. ExtRegBack(j, v, e.TaskInfo)
  238. }
  239. //log.Println("抽取-后置规则", tmp)
  240. }
  241. //全局后置规则
  242. for _, v := range e.RuleBacks {
  243. ExtRegBack(j, v, e.TaskInfo)
  244. }
  245. //候选人加入
  246. if len(j.Winnerorder) > 0 {
  247. winner := &ju.ExtField{
  248. Field: "winner",
  249. Code: "",
  250. RuleText: "",
  251. Type: "winnerorder",
  252. MatchType: "winnerorder",
  253. ExtFrom: "",
  254. Value: j.Winnerorder[0]["entname"],
  255. Score: 0,
  256. }
  257. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  258. winner.Score = -5
  259. }
  260. winners := j.Result["winner"]
  261. if winners != nil {
  262. winners = append(winners, winner)
  263. } else {
  264. winners = []*ju.ExtField{}
  265. winners = append(winners, winner)
  266. }
  267. j.Result["winner"] = winners
  268. }
  269. //函数清理
  270. for key, val := range j.Result {
  271. for _, v := range val {
  272. lock.Lock()
  273. cfn := e.ClearFn[key]
  274. lock.Unlock()
  275. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  276. v.Value = data[0]
  277. //清理特殊符号
  278. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  279. clear.MesField[key] != nil {
  280. text := qu.ObjToString(v.Value)
  281. text = clear.OtherClean(key, text)
  282. v.Value = text
  283. }
  284. }
  285. }
  286. PackageDetail(j, e) //处理分包信息
  287. // bs, _ := json.Marshal(j.Result)
  288. // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  289. //分析抽取结果并保存 todo
  290. AnalysisSaveResult(j, e)
  291. }, func(err interface{}) {
  292. log.Println((*j.Data)["_id"], err)
  293. <-e.TaskInfo.ProcessPool
  294. })
  295. <-e.TaskInfo.ProcessPool
  296. }
  297. //前置过滤
  298. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  299. before := ju.DeepCopy(doc).(map[string]interface{})
  300. extinfo := map[string]interface{}{}
  301. if in.IsLua {
  302. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  303. if j != nil {
  304. lua.Block = j.Block
  305. }
  306. extinfo = lua.RunScript("pre")
  307. for k, v := range extinfo { //结果覆盖原doc
  308. doc[k] = v
  309. }
  310. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  311. } else {
  312. key := qu.If(in.Field == "", "detail", in.Field).(string)
  313. text := qu.ObjToString(doc[key])
  314. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  315. doc[key] = extinfo[key] //结果覆盖原doc
  316. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  317. }
  318. return doc
  319. }
  320. //抽取-规则
  321. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  322. //废标、流标、ppp等跳过
  323. b := IsExtract(in.Field, j.Title, j.Content)
  324. if !b {
  325. return
  326. }
  327. if in.IsLua {
  328. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  329. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  330. lua.Block = j.Block
  331. extinfo := lua.RunScript("core")
  332. for k, v := range extinfo {
  333. if k == in.Field {
  334. if j.Result[k] == nil {
  335. j.Result[k] = [](*ju.ExtField){}
  336. }
  337. if tmps, ok := v.([]map[string]interface{}); ok {
  338. for _, tmp := range tmps {
  339. j.Result[k] = append(j.Result[k],
  340. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
  341. }
  342. }
  343. }
  344. }
  345. if len(extinfo) > 0 {
  346. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  347. }
  348. } else {
  349. //全文正则
  350. text := qu.ObjToString(doc[extfrom])
  351. if in.Field != "" {
  352. extinfo := extRegCoreToResult(extfrom, text, j, in)
  353. if len(extinfo) > 0 {
  354. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  355. }
  356. }
  357. }
  358. }
  359. //lua脚本根据属性设置提取kv值
  360. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  361. kvmap := map[string][]map[string]interface{}{}
  362. for fieldname, field := range in.LFields {
  363. lock.Lock()
  364. tags := t[field] //获取对应标签库
  365. lock.Unlock()
  366. for _, bl := range j.Block {
  367. //冒号kv
  368. if bl.ColonKV != nil {
  369. kvs := bl.ColonKV.Kvs
  370. kvs2 := bl.ColonKV.Kvs_2
  371. //log.Println("ColonKV1", kvs)
  372. //log.Println("ColonKV2", kvs2)
  373. for _, tag := range tags {
  374. for _, kv := range kvs {
  375. if tag.Type == "string" {
  376. if kv.Key == tag.Key {
  377. text := ju.TrimLRSpace(kv.Value, "")
  378. if text != "" {
  379. kvmap[field] = append(kvmap[field], map[string]interface{}{
  380. "field": field,
  381. "code": in.Code,
  382. "ruletext": tag.Key,
  383. "extfrom": extfrom,
  384. "value": text,
  385. "type": "colon1",
  386. "matchtype": "tag_string",
  387. })
  388. }
  389. break
  390. }
  391. } else if tag.Type == "regexp" {
  392. if tag.Reg.MatchString(kv.Key) {
  393. text := ju.TrimLRSpace(kv.Value, "")
  394. if text != "" {
  395. kvmap[field] = append(kvmap[field], map[string]interface{}{
  396. "field": field,
  397. "code": in.Code,
  398. "ruletext": tag.Key,
  399. "extfrom": extfrom,
  400. "value": text,
  401. "type": "colon1",
  402. "matchtype": "tag_regexp",
  403. })
  404. }
  405. break
  406. }
  407. }
  408. }
  409. for _, kv := range kvs2 {
  410. if tag.Type == "string" {
  411. if kv.Key == tag.Key {
  412. text := ju.TrimLRSpace(kv.Value, "")
  413. if text != "" {
  414. kvmap[field] = append(kvmap[field], map[string]interface{}{
  415. "field": field,
  416. "code": in.Code,
  417. "ruletext": tag.Key,
  418. "extfrom": extfrom,
  419. "value": text,
  420. "type": "colon2",
  421. "matchtype": "tag_string",
  422. })
  423. }
  424. break
  425. }
  426. } else if tag.Type == "regexp" {
  427. if tag.Reg.MatchString(kv.Key) {
  428. text := ju.TrimLRSpace(kv.Value, "")
  429. if text != "" {
  430. kvmap[field] = append(kvmap[field], map[string]interface{}{
  431. "field": field,
  432. "code": in.Code,
  433. "ruletext": tag.Key,
  434. "extfrom": extfrom,
  435. "value": text,
  436. "type": "colon2",
  437. "matchtype": "tag_regexp",
  438. })
  439. }
  440. break
  441. }
  442. }
  443. }
  444. }
  445. }
  446. //空格kv
  447. if bl.SpaceKV != nil {
  448. kvs := bl.SpaceKV.Kvs
  449. //log.Println("SpaceKV", kvs)
  450. for _, tag := range tags {
  451. for _, kv := range kvs {
  452. if tag.Type == "string" {
  453. if kv.Key == tag.Key {
  454. text := ju.TrimLRSpace(kv.Value, "")
  455. if text != "" {
  456. kvmap[field] = append(kvmap[field], map[string]interface{}{
  457. "field": field,
  458. "code": in.Code,
  459. "ruletext": tag.Key,
  460. "extfrom": extfrom,
  461. "value": text,
  462. "type": "space",
  463. "matchtype": "tag_string",
  464. })
  465. }
  466. break
  467. }
  468. } else if tag.Type == "regexp" {
  469. if tag.Reg.MatchString(kv.Key) {
  470. text := ju.TrimLRSpace(kv.Value, "")
  471. if text != "" {
  472. kvmap[field] = append(kvmap[field], map[string]interface{}{
  473. "field": field,
  474. "code": in.Code,
  475. "ruletext": tag.Key,
  476. "extfrom": extfrom,
  477. "value": text,
  478. "type": "space",
  479. "matchtype": "tag_regexp",
  480. })
  481. }
  482. break
  483. }
  484. }
  485. }
  486. }
  487. }
  488. //表格kv
  489. if bl.TableKV != nil {
  490. tkv := bl.TableKV
  491. //log.Println("tkv", tkv)
  492. for k, v := range tkv.Kv {
  493. if k == fieldname {
  494. if len(tags) > -tkv.KvIndex[fieldname] {
  495. ruletext := ""
  496. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  497. ruletext = "项目名称"
  498. } else {
  499. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  500. }
  501. kvmap[field] = append(kvmap[field], map[string]interface{}{
  502. "field": field,
  503. "code": in.Code,
  504. "ruletext": ruletext,
  505. "extfrom": "table",
  506. "value": v,
  507. "type": "table",
  508. "matchtype": "tag_string",
  509. })
  510. } else { //涉及其他待处理
  511. //log.Println(tags)
  512. }
  513. }
  514. }
  515. }
  516. }
  517. }
  518. return kvmap
  519. }
  520. //正则提取结果
  521. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  522. extinfo := map[string][]map[string]interface{}{}
  523. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  524. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  525. if len(apos) > 0 {
  526. pos := apos[0]
  527. for k, p := range v.RegCore.ExtractPos {
  528. if len(pos) > p {
  529. if pos[p] == -1 || pos[p+1] == -1 {
  530. continue
  531. }
  532. val := text[pos[p]:pos[p+1]]
  533. tmps := []map[string]interface{}{}
  534. tmp := map[string]interface{}{
  535. "field": v.Field,
  536. "code": v.Code,
  537. "ruletext": v.RuleText,
  538. "extfrom": extfrom,
  539. "value": val,
  540. "type": "regexp",
  541. "matchtype": "regcontent",
  542. }
  543. tmps = append(tmps, tmp)
  544. extinfo[k] = tmps
  545. if val != "" {
  546. if j.Result[v.Field] == nil {
  547. j.Result[k] = [](*ju.ExtField){}
  548. }
  549. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  550. }
  551. }
  552. }
  553. }
  554. } else {
  555. pos := v.RegCore.Reg.FindStringIndex(text)
  556. val := ""
  557. if len(pos) == 2 {
  558. text = text[pos[1]:]
  559. rs := regexp.MustCompile("[^\r\n\t]+")
  560. tmp := rs.FindAllString(text, -1)
  561. if len(tmp) > 0 {
  562. val = tmp[0]
  563. }
  564. }
  565. if val != "" {
  566. tmps := []map[string]interface{}{}
  567. tmp := map[string]interface{}{
  568. "field": v.Field,
  569. "code": v.Code,
  570. "ruletext": v.RuleText,
  571. "extfrom": extfrom,
  572. "value": val,
  573. "type": "regexp",
  574. "matchtype": "regcontent",
  575. }
  576. tmps = append(tmps, tmp)
  577. extinfo[v.Field] = tmps
  578. if j.Result[v.Field] == nil {
  579. j.Result[v.Field] = [](*ju.ExtField){}
  580. }
  581. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  582. }
  583. }
  584. return extinfo
  585. }
  586. //后置过滤
  587. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  588. if in.IsLua {
  589. result := GetResultMapForLua(j)
  590. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  591. if j != nil {
  592. lua.Block = j.Block
  593. }
  594. extinfo := lua.RunScript("back")
  595. for k, v := range extinfo {
  596. if tmps, ok := v.([]map[string]interface{}); ok {
  597. j.Result[k] = [](*ju.ExtField){}
  598. for _, tmp := range tmps {
  599. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  600. }
  601. }
  602. }
  603. if len(extinfo) > 0 {
  604. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  605. }
  606. } else {
  607. extinfo := map[string]interface{}{}
  608. if in.Field != "" {
  609. if j.Result[in.Field] != nil {
  610. tmp := j.Result[in.Field]
  611. exts := []interface{}{}
  612. for k, v := range tmp {
  613. if v.Type == "table" { //table抽取到的数据不清理
  614. continue
  615. }
  616. text := qu.ObjToString(v.Value)
  617. if text != "" {
  618. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  619. }
  620. j.Result[in.Field][k].Value = text
  621. exts = append(exts, map[string]interface{}{
  622. "field": v.Field,
  623. "code": v.Code,
  624. "ruletext": v.RuleText,
  625. "type": v.Type,
  626. "matchtype": v.MatchType,
  627. "extfrom": v.ExtFrom,
  628. "value": text,
  629. })
  630. }
  631. extinfo[in.Field] = exts
  632. if len(extinfo) > 0 {
  633. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  634. }
  635. }
  636. } else {
  637. for key, tmp := range j.Result {
  638. exts := []interface{}{}
  639. for k, v := range tmp {
  640. if v.Type == "table" { //table抽取到的数据不清理
  641. continue
  642. }
  643. text := qu.ObjToString(v.Value)
  644. if text != "" {
  645. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  646. }
  647. j.Result[key][k].Value = text
  648. exts = append(exts, map[string]interface{}{
  649. "field": v.Field,
  650. "code": v.Code,
  651. "ruletext": v.RuleText,
  652. "type": v.Type,
  653. "matchtype": v.MatchType,
  654. "extfrom": v.ExtFrom,
  655. "value": text,
  656. })
  657. }
  658. extinfo[key] = exts
  659. }
  660. if len(extinfo) > 0 {
  661. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  662. }
  663. }
  664. }
  665. }
  666. //获取抽取结果map[string][]interface{},lua脚本使用
  667. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  668. result := map[string][]map[string]interface{}{}
  669. for key, val := range j.Result {
  670. if result[key] == nil {
  671. result[key] = []map[string]interface{}{}
  672. }
  673. for _, v := range val {
  674. tmp := map[string]interface{}{
  675. "field": v.Field,
  676. "code": v.Code,
  677. "ruletext": v.RuleText,
  678. "value": v.Value,
  679. "type": v.Type,
  680. "matchtype": v.MatchType,
  681. "extfrom": v.ExtFrom,
  682. }
  683. result[key] = append(result[key], tmp)
  684. }
  685. }
  686. return result
  687. }
  688. //抽取日志
  689. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  690. if !t.IsEtxLog {
  691. return
  692. }
  693. logdata := map[string]interface{}{
  694. "code": v.Code,
  695. "name": v.Name,
  696. "type": ftype,
  697. "ruletext": v.RuleText,
  698. "islua": v.IsLua,
  699. "field": v.Field,
  700. "version": t.Version,
  701. "taskname": t.Name,
  702. "before": before,
  703. "extinfo": extinfo,
  704. "sid": sid,
  705. "comeintime": time.Now().Unix(),
  706. }
  707. lock.Lock()
  708. ExtLogs[t] = append(ExtLogs[t], logdata)
  709. lock.Unlock()
  710. }
  711. //保存抽取日志
  712. func SaveExtLog() {
  713. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  714. lock.Lock()
  715. tmpLogs = ExtLogs
  716. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  717. lock.Unlock()
  718. for k, v := range tmpLogs {
  719. if len(v) < saveLimit {
  720. db.Mgo.SaveBulk(k.TrackColl, v...)
  721. } else {
  722. for {
  723. if len(v) > saveLimit {
  724. tmp := v[:saveLimit]
  725. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  726. v = v[saveLimit:]
  727. } else {
  728. db.Mgo.SaveBulk(k.TrackColl, v...)
  729. break
  730. }
  731. }
  732. }
  733. }
  734. time.AfterFunc(10*time.Second, SaveExtLog)
  735. }
  736. type FieldValue struct {
  737. Value interface{}
  738. Count int
  739. }
  740. //分析抽取结果并保存
  741. func AnalysisSaveResult(j *ju.Job, e *ExtractTask) {
  742. doc := j.Data
  743. result := j.Result
  744. _id := qu.BsonIdToSId((*doc)["_id"])
  745. iscore, _ := ju.Config["fieldscore"].(bool)
  746. if iscore { //打分
  747. result = ScoreFields(j)
  748. }
  749. //结果排序
  750. values := map[string][]*ju.SortObject{}
  751. for key, val := range result {
  752. fieldValue := map[string][]interface{}{}
  753. if iscore { //走打分
  754. for _, v := range val {
  755. if len(fmt.Sprint(v.Value)) < 1 {
  756. continue //去除空串
  757. }
  758. fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
  759. }
  760. } else { //不走打分,按出现频次
  761. for _, v := range val {
  762. if len(fmt.Sprint(v.Value)) < 1 {
  763. continue //去除空串
  764. }
  765. if fieldValue[fmt.Sprint(v.Value)] == nil {
  766. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  767. } else {
  768. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  769. }
  770. }
  771. }
  772. objects := []*ju.SortObject{}
  773. for k, v := range fieldValue {
  774. ValueStr := "" //第二排序
  775. if reflect.TypeOf(v[1]).String() == "string" {
  776. ValueStr = qu.ObjToString(v[1])
  777. }
  778. tmp := &ju.SortObject{
  779. Key: k,
  780. Value: qu.IntAll(v[0]),
  781. Object: v[1],
  782. ValueStr: ValueStr,
  783. }
  784. objects = append(objects, tmp)
  785. }
  786. values[key] = ju.ExtSort(objects)
  787. }
  788. //从排序结果中取值
  789. tmp := map[string]interface{}{} //抽取值
  790. for key, val := range values {
  791. for _, v := range val { //取第一个非负数
  792. if v.Key != "" && v.Value > -1 {
  793. tmp[key] = v.Object
  794. break
  795. }
  796. }
  797. }
  798. if len(j.PackageInfo) > 0 { //分包信息
  799. tmp["package"] = j.PackageInfo
  800. }
  801. if len(j.Winnerorder) > 0 { //候选人信息
  802. tmp["winnerorder"] = j.Winnerorder
  803. }
  804. for k, v := range *doc {
  805. //去重冗余字段
  806. if k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" {
  807. continue
  808. }
  809. if tmp[k] == nil {
  810. tmp[k] = v
  811. }
  812. }
  813. //质量审核
  814. if ju.Config["qualityaudit"].(bool) {
  815. e.QualityAudit(tmp)
  816. }
  817. if e.IsExtractCity { //城市抽取
  818. b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  819. //log.Println("省份---", p, "城市---", c, "区---", d)
  820. tmp["district"] = d
  821. if b {
  822. tmp["city"] = c
  823. tmp["area"] = p
  824. }
  825. }
  826. if e.TaskInfo.TestColl == "" {
  827. if len(tmp) > 0 { //保存抽取结果
  828. tmparr := []map[string]interface{}{
  829. map[string]interface{}{
  830. "_id": qu.StringTOBsonId(_id),
  831. },
  832. map[string]interface{}{"$set": tmp},
  833. }
  834. e.BidArr = append(e.BidArr, tmparr)
  835. }
  836. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  837. id := tmp["_id"]
  838. tmp["result"] = result
  839. delete(tmp, "_id")
  840. tmparr := []map[string]interface{}{
  841. map[string]interface{}{
  842. "_id": id,
  843. },
  844. map[string]interface{}{"$set": tmp},
  845. }
  846. e.ResultArr = append(e.ResultArr, tmparr)
  847. }
  848. } else { //测试结果
  849. delete(tmp, "_id")
  850. if len(j.BlockPackage) > 0 { //分包详情
  851. bs, _ := json.Marshal(j.BlockPackage)
  852. tmp["epackage"] = string(bs)
  853. }
  854. tmp["result"] = result
  855. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  856. if !b {
  857. log.Println(e.TaskInfo.TestColl, _id)
  858. }
  859. }
  860. }
  861. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  862. defer qu.Catch()
  863. //获取审核字段
  864. for _, field := range e.AuditFields {
  865. //1.分包
  866. if resulttmp["package"] != nil {
  867. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  868. for _, val := range packagedata {
  869. if val[field] != nil {
  870. fv := qu.ObjToString(val[field])
  871. if fv != "" {
  872. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  873. e.RedisMatch(field, fv, val) //redis匹配
  874. } else { //除了buyer和winner,其他字段走规则匹配
  875. e.RuleMatch(field, fv, val)
  876. }
  877. }
  878. }
  879. }
  880. }
  881. //2.外围
  882. if resulttmp[field] != nil {
  883. fv := qu.ObjToString(resulttmp[field])
  884. if fv != "" {
  885. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  886. e.RedisMatch(field, fv, resulttmp) //redis匹配
  887. } else { //除了buyer和winner,其他字段走规则匹配
  888. e.RuleMatch(field, fv, resulttmp)
  889. }
  890. }
  891. }
  892. }
  893. }
  894. //Redis匹配
  895. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  896. defer qu.Catch()
  897. i := redis.GetInt(field, field+"_"+fv) //查找redis
  898. if i == 0 { //reids未找到,执行规则匹配
  899. val[field+"_isredis"] = false
  900. e.RuleMatch(field, fv, val) //规则匹配
  901. } else { //redis找到,打标识存库
  902. val[field+"_isredis"] = true
  903. }
  904. }
  905. //规则匹配
  906. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  907. defer qu.Catch()
  908. if fieldval != "" {
  909. SMap := e.StartMatch(field, fieldval)
  910. //SMap.AddKey(field+"_isaudit", false)
  911. for _, k := range SMap.Keys {
  912. tmpMap[k] = SMap.Map[k]
  913. }
  914. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  915. }
  916. }
  917. //开始规则匹配
  918. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  919. defer qu.Catch()
  920. SMap := pretreated.NewSortMap()
  921. lock.Lock()
  922. f := e.RecogFieldMap[field]
  923. lock.Unlock()
  924. if len(f) > 0 {
  925. fid := qu.BsonIdToSId(f["_id"])
  926. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  927. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  928. if textAfterRecogFieldPrerule != "" {
  929. lock.Lock()
  930. classMap := e.FidClassMap[fid]
  931. lock.Unlock()
  932. L:
  933. for _, c := range classMap { //class
  934. classid := qu.BsonIdToSId(c["_id"])
  935. classPrerule := qu.ObjToString(c["s_class_prerule"])
  936. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  937. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  938. if textAfterClassPrerule != "" {
  939. lock.Lock()
  940. ruleMap := e.CidRuleMap[classid]
  941. lock.Unlock()
  942. for _, r := range ruleMap { //rule
  943. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  944. s_code := qu.ObjToString(r["s_code"])
  945. rule := r["rule"].([]interface{})
  946. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  947. if textAfterRulePrerule != "" {
  948. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  949. if b { //匹配到一个分类下某个规则时,不再继续匹配
  950. if savefield != "" { //保存字段不为空,存储代码信息
  951. SMap.AddKey(field+"_"+savefield, s_code)
  952. }
  953. break L
  954. }
  955. }
  956. }
  957. }
  958. }
  959. }
  960. }
  961. return SMap
  962. }