extract.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007
  1. package extract
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "jy/clear"
  6. db "jy/mongodbutil"
  7. "jy/pretreated"
  8. ju "jy/util"
  9. "log"
  10. qu "qfw/util"
  11. redis "qfw/util/redis"
  12. "reflect"
  13. "regexp"
  14. "strconv"
  15. "sync"
  16. "time"
  17. "gopkg.in/mgo.v2/bson"
  18. )
  19. var (
  20. lock sync.RWMutex
  21. cut = ju.NewCut() //获取正文并清理
  22. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  23. TaskList map[string]*ExtractTask //任务列表
  24. ClearTaskList map[string]*ClearTask //清理任务列表
  25. saveLimit = 200 //抽取日志批量保存
  26. PageSize = 5000 //查询分页
  27. Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1}`
  28. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  29. )
  30. //启动测试抽取
  31. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  32. defer qu.Catch()
  33. ext := &ExtractTask{}
  34. ext.Id = taskId
  35. ext.IsRun = true
  36. ext.InitTestTaskInfo(resultcoll, trackcoll)
  37. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  38. ext.InitRulePres()
  39. ext.InitRuleBacks()
  40. ext.InitRuleCore()
  41. ext.InitPkgCore()
  42. ext.InitTag()
  43. ext.InitClearFn()
  44. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  45. //初始化城市DFA信息
  46. ext.InitDFA()
  47. }
  48. //质量审核
  49. ext.InitAuditFields()
  50. ext.InitAuditRule()
  51. ext.InitAuditClass()
  52. ext.InitAuditRecogField()
  53. //品牌抽取是否开启
  54. ju.IsBrandGoods = ju.Config["brandgoods"].(bool)
  55. return RunExtractTestTask(ext, startId, num)
  56. }
  57. func IdTrans(startId string) bson.ObjectId {
  58. defer qu.Catch()
  59. return bson.ObjectIdHex(startId)
  60. }
  61. //开始测试任务抽取
  62. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  63. n, _ := strconv.Atoi(num)
  64. id := IdTrans(startId)
  65. if id.Valid() {
  66. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  67. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  68. for _, v := range *list {
  69. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  70. continue
  71. }
  72. //log.Println(v["_id"])
  73. j := PreInfo(v)
  74. //fmt.Println(j.HasTable, j.HasGoods, j.HasBrand, j.HasKey, "j-------", j.BrandData)
  75. ext.TaskInfo.ProcessPool <- true
  76. go ext.ExtractProcess(j)
  77. }
  78. return true
  79. } else {
  80. return false
  81. }
  82. }
  83. //启动抽取
  84. func StartExtractTaskId(taskId string) bool {
  85. isgo := false
  86. ext := TaskList[taskId]
  87. if ext == nil {
  88. ext = &ExtractTask{}
  89. ext.Id = taskId
  90. ext.InitTaskInfo()
  91. isgo = true
  92. } else {
  93. ext.Id = taskId
  94. ext.InitTaskInfo()
  95. }
  96. ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  97. ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  98. ext.InitRulePres()
  99. ext.InitRuleBacks()
  100. ext.InitRuleCore()
  101. ext.InitPkgCore()
  102. ext.InitTag()
  103. ext.InitClearFn()
  104. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  105. //初始化城市DFA信息
  106. ext.InitDFA()
  107. }
  108. //质量审核
  109. ext.InitAuditFields()
  110. ext.InitAuditRule()
  111. ext.InitAuditClass()
  112. ext.InitAuditRecogField()
  113. ext.IsRun = true
  114. go ext.ResultSave()
  115. go ext.BidSave()
  116. if isgo {
  117. go RunExtractTask(taskId)
  118. }
  119. TaskList[taskId] = ext
  120. return true
  121. }
  122. //停止抽取
  123. func StopExtractTaskId(taskId string) bool {
  124. ext := TaskList[taskId]
  125. if ext != nil {
  126. ext.IsRun = false
  127. TaskList[taskId] = ext
  128. }
  129. //更新task.s_extlastid
  130. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  131. return true
  132. }
  133. //开始抽取
  134. func RunExtractTask(taskId string) {
  135. ext := TaskList[taskId]
  136. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  137. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  138. pageNum := (count + PageSize - 1) / PageSize
  139. limit := PageSize
  140. if count < PageSize {
  141. limit = count
  142. }
  143. log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  144. for i := 0; i < pageNum; i++ {
  145. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  146. log.Printf("page=%d,query=%v", i+1, query)
  147. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  148. for _, v := range *list {
  149. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  150. continue
  151. }
  152. //log.Println(v["_id"])
  153. if !ext.IsRun {
  154. break
  155. }
  156. j := PreInfo(v)
  157. ext.TaskInfo.ProcessPool <- true
  158. go ext.ExtractProcess(j)
  159. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  160. }
  161. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  162. if !ext.IsRun {
  163. break
  164. }
  165. }
  166. //更新task.s_extlastid
  167. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  168. }
  169. //信息预处理
  170. func PreInfo(doc map[string]interface{}) *ju.Job {
  171. detail := ""
  172. d1, _ := doc["detail"].(string)
  173. d2, _ := doc["contenthtml"].(string)
  174. if len(d1) >= len(d2) || d2 == "" {
  175. detail = d1
  176. } else {
  177. detail = d2
  178. }
  179. detail = ju.CutLableStr(detail)
  180. detail = cut.ClearHtml(detail)
  181. doc["detail"] = detail
  182. toptype := qu.ObjToString(doc["toptype"])
  183. if qu.ObjToString(doc["type"]) == "bid" {
  184. toptype = "结果"
  185. }
  186. if toptype == "" {
  187. toptype = "*"
  188. }
  189. j := &ju.Job{
  190. SourceMid: qu.BsonIdToSId(doc["_id"]),
  191. Category: toptype,
  192. Content: qu.ObjToString(doc["detail"]),
  193. SpiderCode: qu.ObjToString(doc["spidercode"]),
  194. //Domain: qu.ObjToString(doc["domain"]),
  195. //Href: qu.ObjToString(doc["href"]),
  196. Title: qu.ObjToString(doc["title"]),
  197. Data: &doc,
  198. City: qu.ObjToString(doc["city"]),
  199. Province: qu.ObjToString(doc["area"]),
  200. Result: map[string][]*ju.ExtField{},
  201. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  202. }
  203. qu.Try(func() {
  204. pretreated.AnalyStart(j)
  205. }, func(err interface{}) {
  206. log.Println("pretreated.AnalyStart", err)
  207. })
  208. return j
  209. }
  210. //抽取
  211. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  212. qu.Try(func() {
  213. doc := *j.Data
  214. //全局前置规则,结果覆盖doc属性
  215. for _, v := range e.RulePres {
  216. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  217. }
  218. //抽取规则
  219. for _, vc := range e.RuleCores {
  220. tmp := ju.DeepCopy(doc).(map[string]interface{})
  221. //是否进入逻辑
  222. if !ju.Logic(vc.LuaLogic, tmp) {
  223. continue
  224. }
  225. //抽取-前置规则
  226. for _, v := range vc.RulePres {
  227. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  228. }
  229. //log.Println("抽取-前置规则", tmp)
  230. //抽取-规则
  231. for _, v := range vc.RuleCores {
  232. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  233. }
  234. //log.Println("抽取-规则", tmp)
  235. //项目名称未能抽取到,标题来凑
  236. if vc.Field == "projectname" {
  237. if len(j.Result[vc.Field]) < 1 {
  238. j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
  239. }
  240. }
  241. //抽取-后置规则
  242. for _, v := range vc.RuleBacks {
  243. ExtRegBack(j, v, e.TaskInfo)
  244. }
  245. //log.Println("抽取-后置规则", tmp)
  246. }
  247. //全局后置规则
  248. for _, v := range e.RuleBacks {
  249. ExtRegBack(j, v, e.TaskInfo)
  250. }
  251. //候选人加入
  252. if len(j.Winnerorder) > 0 {
  253. winner := &ju.ExtField{
  254. Field: "winner",
  255. Code: "",
  256. RuleText: "",
  257. Type: "winnerorder",
  258. MatchType: "winnerorder",
  259. ExtFrom: "",
  260. Value: j.Winnerorder[0]["entname"],
  261. Score: 0,
  262. }
  263. if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 {
  264. winner.Score = -5
  265. }
  266. winners := j.Result["winner"]
  267. if winners != nil {
  268. winners = append(winners, winner)
  269. } else {
  270. winners = []*ju.ExtField{}
  271. winners = append(winners, winner)
  272. }
  273. j.Result["winner"] = winners
  274. }
  275. //函数清理
  276. for key, val := range j.Result {
  277. for _, v := range val {
  278. lock.Lock()
  279. cfn := e.ClearFn[key]
  280. lock.Unlock()
  281. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  282. v.Value = data[0]
  283. //清理特殊符号
  284. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  285. clear.MesField[key] != nil {
  286. text := qu.ObjToString(v.Value)
  287. if key == "projectname" {
  288. fmt.Println("1===========", text)
  289. }
  290. text = clear.OtherClean(key, text)
  291. if key == "projectname" {
  292. fmt.Println("2===========", text)
  293. }
  294. v.Value = text
  295. }
  296. }
  297. }
  298. PackageDetail(j, e) //处理分包信息
  299. // bs, _ := json.Marshal(j.Result)
  300. // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  301. //分析抽取结果并保存 todo
  302. AnalysisSaveResult(j, e)
  303. }, func(err interface{}) {
  304. log.Println((*j.Data)["_id"], err)
  305. <-e.TaskInfo.ProcessPool
  306. })
  307. <-e.TaskInfo.ProcessPool
  308. }
  309. //前置过滤
  310. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  311. before := ju.DeepCopy(doc).(map[string]interface{})
  312. extinfo := map[string]interface{}{}
  313. if in.IsLua {
  314. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  315. if j != nil {
  316. lua.Block = j.Block
  317. }
  318. extinfo = lua.RunScript("pre")
  319. for k, v := range extinfo { //结果覆盖原doc
  320. doc[k] = v
  321. }
  322. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  323. } else {
  324. key := qu.If(in.Field == "", "detail", in.Field).(string)
  325. text := qu.ObjToString(doc[key])
  326. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  327. doc[key] = extinfo[key] //结果覆盖原doc
  328. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  329. }
  330. return doc
  331. }
  332. //抽取-规则
  333. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  334. //废标、流标、ppp等跳过
  335. b := IsExtract(in.Field, j.Title, j.Content)
  336. if !b {
  337. return
  338. }
  339. if in.IsLua {
  340. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  341. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  342. lua.Block = j.Block
  343. extinfo := lua.RunScript("core")
  344. for k, v := range extinfo {
  345. if k == in.Field {
  346. if j.Result[k] == nil {
  347. j.Result[k] = [](*ju.ExtField){}
  348. }
  349. if tmps, ok := v.([]map[string]interface{}); ok {
  350. for _, tmp := range tmps {
  351. j.Result[k] = append(j.Result[k],
  352. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
  353. }
  354. }
  355. }
  356. }
  357. if len(extinfo) > 0 {
  358. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  359. }
  360. } else {
  361. //全文正则
  362. text := qu.ObjToString(doc[extfrom])
  363. if in.Field != "" {
  364. extinfo := extRegCoreToResult(extfrom, text, j, in)
  365. if len(extinfo) > 0 {
  366. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  367. }
  368. }
  369. }
  370. }
  371. //lua脚本根据属性设置提取kv值
  372. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  373. kvmap := map[string][]map[string]interface{}{}
  374. for fieldname, field := range in.LFields {
  375. lock.Lock()
  376. tags := t[field] //获取对应标签库
  377. lock.Unlock()
  378. for _, bl := range j.Block {
  379. //冒号kv
  380. if bl.ColonKV != nil {
  381. kvs := bl.ColonKV.Kvs
  382. kvs2 := bl.ColonKV.Kvs_2
  383. //log.Println("ColonKV1", kvs)
  384. //log.Println("ColonKV2", kvs2)
  385. for _, tag := range tags {
  386. for _, kv := range kvs {
  387. if tag.Type == "string" {
  388. if kv.Key == tag.Key {
  389. text := ju.TrimLRSpace(kv.Value, "")
  390. if text != "" {
  391. kvmap[field] = append(kvmap[field], map[string]interface{}{
  392. "field": field,
  393. "code": in.Code,
  394. "ruletext": tag.Key,
  395. "extfrom": extfrom,
  396. "value": text,
  397. "type": "colon1",
  398. "matchtype": "tag_string",
  399. })
  400. }
  401. break
  402. }
  403. } else if tag.Type == "regexp" {
  404. if tag.Reg.MatchString(kv.Key) {
  405. text := ju.TrimLRSpace(kv.Value, "")
  406. if text != "" {
  407. kvmap[field] = append(kvmap[field], map[string]interface{}{
  408. "field": field,
  409. "code": in.Code,
  410. "ruletext": tag.Key,
  411. "extfrom": extfrom,
  412. "value": text,
  413. "type": "colon1",
  414. "matchtype": "tag_regexp",
  415. })
  416. }
  417. break
  418. }
  419. }
  420. }
  421. for _, kv := range kvs2 {
  422. if tag.Type == "string" {
  423. if kv.Key == tag.Key {
  424. text := ju.TrimLRSpace(kv.Value, "")
  425. if text != "" {
  426. kvmap[field] = append(kvmap[field], map[string]interface{}{
  427. "field": field,
  428. "code": in.Code,
  429. "ruletext": tag.Key,
  430. "extfrom": extfrom,
  431. "value": text,
  432. "type": "colon2",
  433. "matchtype": "tag_string",
  434. })
  435. }
  436. break
  437. }
  438. } else if tag.Type == "regexp" {
  439. if tag.Reg.MatchString(kv.Key) {
  440. text := ju.TrimLRSpace(kv.Value, "")
  441. if text != "" {
  442. kvmap[field] = append(kvmap[field], map[string]interface{}{
  443. "field": field,
  444. "code": in.Code,
  445. "ruletext": tag.Key,
  446. "extfrom": extfrom,
  447. "value": text,
  448. "type": "colon2",
  449. "matchtype": "tag_regexp",
  450. })
  451. }
  452. break
  453. }
  454. }
  455. }
  456. }
  457. }
  458. //空格kv
  459. if bl.SpaceKV != nil {
  460. kvs := bl.SpaceKV.Kvs
  461. //log.Println("SpaceKV", kvs)
  462. for _, tag := range tags {
  463. for _, kv := range kvs {
  464. if tag.Type == "string" {
  465. if kv.Key == tag.Key {
  466. text := ju.TrimLRSpace(kv.Value, "")
  467. if text != "" {
  468. kvmap[field] = append(kvmap[field], map[string]interface{}{
  469. "field": field,
  470. "code": in.Code,
  471. "ruletext": tag.Key,
  472. "extfrom": extfrom,
  473. "value": text,
  474. "type": "space",
  475. "matchtype": "tag_string",
  476. })
  477. }
  478. break
  479. }
  480. } else if tag.Type == "regexp" {
  481. if tag.Reg.MatchString(kv.Key) {
  482. text := ju.TrimLRSpace(kv.Value, "")
  483. if text != "" {
  484. kvmap[field] = append(kvmap[field], map[string]interface{}{
  485. "field": field,
  486. "code": in.Code,
  487. "ruletext": tag.Key,
  488. "extfrom": extfrom,
  489. "value": text,
  490. "type": "space",
  491. "matchtype": "tag_regexp",
  492. })
  493. }
  494. break
  495. }
  496. }
  497. }
  498. }
  499. }
  500. //表格kv
  501. if bl.TableKV != nil {
  502. tkv := bl.TableKV
  503. //log.Println("tkv", tkv)
  504. for k, v := range tkv.Kv {
  505. if k == fieldname {
  506. if len(tags) > -tkv.KvIndex[fieldname] {
  507. ruletext := ""
  508. if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 {
  509. ruletext = "项目名称"
  510. } else {
  511. ruletext = tags[-tkv.KvIndex[fieldname]].Key
  512. }
  513. kvmap[field] = append(kvmap[field], map[string]interface{}{
  514. "field": field,
  515. "code": in.Code,
  516. "ruletext": ruletext,
  517. "extfrom": "table",
  518. "value": v,
  519. "type": "table",
  520. "matchtype": "tag_string",
  521. })
  522. } else { //涉及其他待处理
  523. //log.Println(tags)
  524. }
  525. }
  526. }
  527. }
  528. }
  529. }
  530. return kvmap
  531. }
  532. //正则提取结果
  533. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  534. extinfo := map[string][]map[string]interface{}{}
  535. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  536. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  537. if len(apos) > 0 {
  538. pos := apos[0]
  539. for k, p := range v.RegCore.ExtractPos {
  540. if len(pos) > p {
  541. if pos[p] == -1 || pos[p+1] == -1 {
  542. continue
  543. }
  544. val := text[pos[p]:pos[p+1]]
  545. tmps := []map[string]interface{}{}
  546. tmp := map[string]interface{}{
  547. "field": v.Field,
  548. "code": v.Code,
  549. "ruletext": v.RuleText,
  550. "extfrom": extfrom,
  551. "value": val,
  552. "type": "regexp",
  553. "matchtype": "regcontent",
  554. }
  555. tmps = append(tmps, tmp)
  556. extinfo[k] = tmps
  557. if val != "" {
  558. if j.Result[v.Field] == nil {
  559. j.Result[k] = [](*ju.ExtField){}
  560. }
  561. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  562. }
  563. }
  564. }
  565. }
  566. } else {
  567. pos := v.RegCore.Reg.FindStringIndex(text)
  568. val := ""
  569. if len(pos) == 2 {
  570. text = text[pos[1]:]
  571. rs := regexp.MustCompile("[^\r\n\t]+")
  572. tmp := rs.FindAllString(text, -1)
  573. if len(tmp) > 0 {
  574. val = tmp[0]
  575. }
  576. }
  577. if val != "" {
  578. tmps := []map[string]interface{}{}
  579. tmp := map[string]interface{}{
  580. "field": v.Field,
  581. "code": v.Code,
  582. "ruletext": v.RuleText,
  583. "extfrom": extfrom,
  584. "value": val,
  585. "type": "regexp",
  586. "matchtype": "regcontent",
  587. }
  588. tmps = append(tmps, tmp)
  589. extinfo[v.Field] = tmps
  590. if j.Result[v.Field] == nil {
  591. j.Result[v.Field] = [](*ju.ExtField){}
  592. }
  593. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
  594. }
  595. }
  596. return extinfo
  597. }
  598. //后置过滤
  599. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  600. if in.IsLua {
  601. result := GetResultMapForLua(j)
  602. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  603. if j != nil {
  604. lua.Block = j.Block
  605. }
  606. extinfo := lua.RunScript("back")
  607. for k, v := range extinfo {
  608. if tmps, ok := v.([]map[string]interface{}); ok {
  609. j.Result[k] = [](*ju.ExtField){}
  610. for _, tmp := range tmps {
  611. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  612. }
  613. }
  614. }
  615. if len(extinfo) > 0 {
  616. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  617. }
  618. } else {
  619. extinfo := map[string]interface{}{}
  620. if in.Field != "" {
  621. if j.Result[in.Field] != nil {
  622. tmp := j.Result[in.Field]
  623. exts := []interface{}{}
  624. for k, v := range tmp {
  625. if v.Type == "table" { //table抽取到的数据不清理
  626. continue
  627. }
  628. text := qu.ObjToString(v.Value)
  629. if text != "" {
  630. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  631. }
  632. j.Result[in.Field][k].Value = text
  633. exts = append(exts, map[string]interface{}{
  634. "field": v.Field,
  635. "code": v.Code,
  636. "ruletext": v.RuleText,
  637. "type": v.Type,
  638. "matchtype": v.MatchType,
  639. "extfrom": v.ExtFrom,
  640. "value": text,
  641. })
  642. }
  643. extinfo[in.Field] = exts
  644. if len(extinfo) > 0 {
  645. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  646. }
  647. }
  648. } else {
  649. for key, tmp := range j.Result {
  650. exts := []interface{}{}
  651. for k, v := range tmp {
  652. if v.Type == "table" { //table抽取到的数据不清理
  653. continue
  654. }
  655. text := qu.ObjToString(v.Value)
  656. if text != "" {
  657. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  658. }
  659. j.Result[key][k].Value = text
  660. exts = append(exts, map[string]interface{}{
  661. "field": v.Field,
  662. "code": v.Code,
  663. "ruletext": v.RuleText,
  664. "type": v.Type,
  665. "matchtype": v.MatchType,
  666. "extfrom": v.ExtFrom,
  667. "value": text,
  668. })
  669. }
  670. extinfo[key] = exts
  671. }
  672. if len(extinfo) > 0 {
  673. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  674. }
  675. }
  676. }
  677. }
  678. //获取抽取结果map[string][]interface{},lua脚本使用
  679. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  680. result := map[string][]map[string]interface{}{}
  681. for key, val := range j.Result {
  682. if result[key] == nil {
  683. result[key] = []map[string]interface{}{}
  684. }
  685. for _, v := range val {
  686. tmp := map[string]interface{}{
  687. "field": v.Field,
  688. "code": v.Code,
  689. "ruletext": v.RuleText,
  690. "value": v.Value,
  691. "type": v.Type,
  692. "matchtype": v.MatchType,
  693. "extfrom": v.ExtFrom,
  694. }
  695. result[key] = append(result[key], tmp)
  696. }
  697. }
  698. return result
  699. }
  700. //抽取日志
  701. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  702. if !t.IsEtxLog {
  703. return
  704. }
  705. logdata := map[string]interface{}{
  706. "code": v.Code,
  707. "name": v.Name,
  708. "type": ftype,
  709. "ruletext": v.RuleText,
  710. "islua": v.IsLua,
  711. "field": v.Field,
  712. "version": t.Version,
  713. "taskname": t.Name,
  714. "before": before,
  715. "extinfo": extinfo,
  716. "sid": sid,
  717. "comeintime": time.Now().Unix(),
  718. }
  719. lock.Lock()
  720. ExtLogs[t] = append(ExtLogs[t], logdata)
  721. lock.Unlock()
  722. }
  723. //保存抽取日志
  724. func SaveExtLog() {
  725. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  726. lock.Lock()
  727. tmpLogs = ExtLogs
  728. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  729. lock.Unlock()
  730. for k, v := range tmpLogs {
  731. if len(v) < saveLimit {
  732. db.Mgo.SaveBulk(k.TrackColl, v...)
  733. } else {
  734. for {
  735. if len(v) > saveLimit {
  736. tmp := v[:saveLimit]
  737. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  738. v = v[saveLimit:]
  739. } else {
  740. db.Mgo.SaveBulk(k.TrackColl, v...)
  741. break
  742. }
  743. }
  744. }
  745. }
  746. time.AfterFunc(10*time.Second, SaveExtLog)
  747. }
  748. type FieldValue struct {
  749. Value interface{}
  750. Count int
  751. }
  752. //分析抽取结果并保存
  753. func AnalysisSaveResult(j *ju.Job, e *ExtractTask) {
  754. log.Println("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  755. doc := j.Data
  756. result := j.Result
  757. _id := qu.BsonIdToSId((*doc)["_id"])
  758. iscore, _ := ju.Config["fieldscore"].(bool)
  759. if iscore { //打分
  760. result = ScoreFields(j)
  761. }
  762. //结果排序
  763. values := map[string][]*ju.SortObject{}
  764. for key, val := range result {
  765. fieldValue := map[string][]interface{}{}
  766. if iscore { //走打分
  767. for _, v := range val {
  768. if len(fmt.Sprint(v.Value)) < 1 {
  769. continue //去除空串
  770. }
  771. fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
  772. }
  773. } else { //不走打分,按出现频次
  774. for _, v := range val {
  775. if len(fmt.Sprint(v.Value)) < 1 {
  776. continue //去除空串
  777. }
  778. if fieldValue[fmt.Sprint(v.Value)] == nil {
  779. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  780. } else {
  781. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  782. }
  783. }
  784. }
  785. objects := []*ju.SortObject{}
  786. for k, v := range fieldValue {
  787. ValueStr := "" //第二排序
  788. if reflect.TypeOf(v[1]).String() == "string" {
  789. ValueStr = qu.ObjToString(v[1])
  790. }
  791. tmp := &ju.SortObject{
  792. Key: k,
  793. Value: qu.IntAll(v[0]),
  794. Object: v[1],
  795. ValueStr: ValueStr,
  796. }
  797. objects = append(objects, tmp)
  798. }
  799. values[key] = ju.ExtSort(objects)
  800. }
  801. //从排序结果中取值
  802. tmp := map[string]interface{}{} //抽取值
  803. for key, val := range values {
  804. for _, v := range val { //取第一个非负数
  805. if v.Key != "" && v.Value > -1 {
  806. tmp[key] = v.Object
  807. break
  808. }
  809. }
  810. }
  811. if len(j.PackageInfo) > 0 { //分包信息
  812. tmp["package"] = j.PackageInfo
  813. }
  814. if len(j.Winnerorder) > 0 { //候选人信息
  815. tmp["winnerorder"] = j.Winnerorder
  816. }
  817. for k, v := range *doc {
  818. //去重冗余字段
  819. if k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" {
  820. continue
  821. }
  822. if tmp[k] == nil {
  823. tmp[k] = v
  824. }
  825. }
  826. //质量审核
  827. if ju.Config["qualityaudit"].(bool) {
  828. e.QualityAudit(tmp)
  829. }
  830. if e.IsExtractCity { //城市抽取
  831. b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  832. //log.Println("省份---", p, "城市---", c, "区---", d)
  833. tmp["district"] = d
  834. if b {
  835. tmp["city"] = c
  836. tmp["area"] = p
  837. }
  838. }
  839. if e.TaskInfo.TestColl == "" {
  840. if len(tmp) > 0 { //保存抽取结果
  841. tmparr := []map[string]interface{}{
  842. map[string]interface{}{
  843. "_id": qu.StringTOBsonId(_id),
  844. },
  845. map[string]interface{}{"$set": tmp},
  846. }
  847. e.BidArr = append(e.BidArr, tmparr)
  848. }
  849. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  850. id := tmp["_id"]
  851. tmp["result"] = result
  852. delete(tmp, "_id")
  853. tmparr := []map[string]interface{}{
  854. map[string]interface{}{
  855. "_id": id,
  856. },
  857. map[string]interface{}{"$set": tmp},
  858. }
  859. e.ResultArr = append(e.ResultArr, tmparr)
  860. }
  861. } else { //测试结果
  862. delete(tmp, "_id")
  863. if len(j.BlockPackage) > 0 { //分包详情
  864. bs, _ := json.Marshal(j.BlockPackage)
  865. tmp["epackage"] = string(bs)
  866. }
  867. tmp["result"] = result
  868. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  869. if !b {
  870. log.Println(e.TaskInfo.TestColl, _id)
  871. }
  872. }
  873. }
  874. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  875. defer qu.Catch()
  876. //获取审核字段
  877. for _, field := range e.AuditFields {
  878. //1.分包
  879. if resulttmp["package"] != nil {
  880. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  881. for _, val := range packagedata {
  882. if val[field] != nil {
  883. fv := qu.ObjToString(val[field])
  884. if fv != "" {
  885. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  886. e.RedisMatch(field, fv, val) //redis匹配
  887. } else { //除了buyer和winner,其他字段走规则匹配
  888. e.RuleMatch(field, fv, val)
  889. }
  890. }
  891. }
  892. }
  893. }
  894. //2.外围
  895. if resulttmp[field] != nil {
  896. fv := qu.ObjToString(resulttmp[field])
  897. if fv != "" {
  898. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  899. e.RedisMatch(field, fv, resulttmp) //redis匹配
  900. } else { //除了buyer和winner,其他字段走规则匹配
  901. e.RuleMatch(field, fv, resulttmp)
  902. }
  903. }
  904. }
  905. }
  906. }
  907. //Redis匹配
  908. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  909. defer qu.Catch()
  910. i := redis.GetInt(field, field+"_"+fv) //查找redis
  911. if i == 0 { //reids未找到,执行规则匹配
  912. val[field+"_isredis"] = false
  913. e.RuleMatch(field, fv, val) //规则匹配
  914. } else { //redis找到,打标识存库
  915. val[field+"_isredis"] = true
  916. }
  917. }
  918. //规则匹配
  919. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  920. defer qu.Catch()
  921. if fieldval != "" {
  922. SMap := e.StartMatch(field, fieldval)
  923. //SMap.AddKey(field+"_isaudit", false)
  924. for _, k := range SMap.Keys {
  925. tmpMap[k] = SMap.Map[k]
  926. }
  927. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  928. }
  929. }
  930. //开始规则匹配
  931. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  932. defer qu.Catch()
  933. SMap := pretreated.NewSortMap()
  934. lock.Lock()
  935. f := e.RecogFieldMap[field]
  936. lock.Unlock()
  937. if len(f) > 0 {
  938. fid := qu.BsonIdToSId(f["_id"])
  939. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  940. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  941. if textAfterRecogFieldPrerule != "" {
  942. lock.Lock()
  943. classMap := e.FidClassMap[fid]
  944. lock.Unlock()
  945. L:
  946. for _, c := range classMap { //class
  947. classid := qu.BsonIdToSId(c["_id"])
  948. classPrerule := qu.ObjToString(c["s_class_prerule"])
  949. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  950. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  951. if textAfterClassPrerule != "" {
  952. lock.Lock()
  953. ruleMap := e.CidRuleMap[classid]
  954. lock.Unlock()
  955. for _, r := range ruleMap { //rule
  956. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  957. s_code := qu.ObjToString(r["s_code"])
  958. rule := r["rule"].([]interface{})
  959. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  960. if textAfterRulePrerule != "" {
  961. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  962. if b { //匹配到一个分类下某个规则时,不再继续匹配
  963. if savefield != "" { //保存字段不为空,存储代码信息
  964. SMap.AddKey(field+"_"+savefield, s_code)
  965. }
  966. break L
  967. }
  968. }
  969. }
  970. }
  971. }
  972. }
  973. }
  974. return SMap
  975. }