// Package extract runs rule-based (regexp and Lua) field extraction tasks over stored documents.
  1. package extract
  2. import (
  3. "encoding/json"
  4. //"encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. "log"
  11. qu "qfw/util"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "gopkg.in/mgo.v2/bson"
  18. )
  19. var (
  20. lock sync.RWMutex
  21. cut = ju.NewCut() //获取正文并清理
  22. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  23. TaskList map[string]*ExtractTask //任务列表
  24. saveLimit = 200 //抽取日志批量保存
  25. Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
  26. )
  27. //启动测试抽取
  28. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  29. defer qu.Catch()
  30. ext := &ExtractTask{}
  31. ext.Id = taskId
  32. ext.IsRun = true
  33. ext.InitTestTaskInfo(resultcoll, trackcoll)
  34. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  35. ext.InitRulePres()
  36. ext.InitRuleBacks()
  37. ext.InitRuleCore()
  38. ext.InitTag()
  39. ext.InitClearFn()
  40. ext.InitProvince()
  41. ext.InitCityAll()
  42. ext.InitCitySim()
  43. InitDFA()
  44. return RunExtractTestTask(ext, startId, num)
  45. }
  46. func IdTrans(startId string) bson.ObjectId {
  47. defer qu.Catch()
  48. return bson.ObjectIdHex(startId)
  49. }
  50. //开始测试任务抽取
  51. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  52. n, _ := strconv.Atoi(num)
  53. id := IdTrans(startId)
  54. if id.Valid() {
  55. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  56. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  57. for _, v := range *list {
  58. j := PreInfo(v)
  59. ext.TaskInfo.ProcessPool <- true
  60. go ext.ExtractProcess(j)
  61. }
  62. return true
  63. } else {
  64. return false
  65. }
  66. }
  67. //启动抽取
  68. func StartExtractTaskId(taskId string) bool {
  69. isgo := false
  70. ext := TaskList[taskId]
  71. if ext == nil {
  72. ext = &ExtractTask{}
  73. ext.Id = taskId
  74. ext.InitTaskInfo()
  75. isgo = true
  76. } else {
  77. ext.Id = taskId
  78. ext.InitTaskInfo()
  79. }
  80. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  81. ext.InitRulePres()
  82. ext.InitRuleBacks()
  83. ext.InitRuleCore()
  84. ext.InitTag()
  85. ext.InitClearFn()
  86. ext.InitProvince()
  87. ext.InitCityAll()
  88. ext.InitCitySim()
  89. InitDFA()
  90. ext.IsRun = true
  91. if isgo {
  92. go RunExtractTask(taskId)
  93. }
  94. TaskList[taskId] = ext
  95. return true
  96. }
  97. //停止抽取
  98. func StopExtractTaskId(taskId string) bool {
  99. ext := TaskList[taskId]
  100. if ext != nil {
  101. ext.IsRun = false
  102. TaskList[taskId] = ext
  103. }
  104. //更新task.s_extlastid
  105. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  106. return true
  107. }
  108. //开始抽取
  109. func RunExtractTask(taskId string) {
  110. ext := TaskList[taskId]
  111. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  112. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
  113. for k, v := range *list {
  114. log.Println(k, v["_id"])
  115. if !ext.IsRun {
  116. break
  117. }
  118. j := PreInfo(v)
  119. ext.TaskInfo.ProcessPool <- true
  120. go ext.ExtractProcess(j)
  121. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  122. }
  123. //更新task.s_extlastid
  124. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  125. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  126. }
  127. //信息预处理
  128. func PreInfo(doc map[string]interface{}) *ju.Job {
  129. detail := ""
  130. d1, _ := doc["detail"].(string)
  131. d2, _ := doc["contenthtml"].(string)
  132. if len(d1) >= len(d2) || d2 == "" {
  133. detail = d1
  134. } else {
  135. detail = d2
  136. }
  137. detail = ju.CutLableStr(detail)
  138. detail = cut.ClearHtml(detail)
  139. doc["detail"] = detail
  140. href := qu.ObjToString(doc["href"])
  141. if strings.HasPrefix(href, "http://") {
  142. href = href[7:]
  143. } else if strings.HasPrefix(href, "https://") {
  144. href = href[8:]
  145. }
  146. pos := strings.Index(href, "/")
  147. if pos > 0 {
  148. href = href[:pos]
  149. }
  150. doc["domain"] = href
  151. toptype := qu.ObjToString(doc["toptype"])
  152. if qu.ObjToString(doc["type"]) == "bid" {
  153. toptype = "结果"
  154. }
  155. if toptype == "" {
  156. toptype = "*"
  157. }
  158. j := &ju.Job{
  159. SourceMid: qu.BsonIdToSId(doc["_id"]),
  160. Category: toptype,
  161. Content: qu.ObjToString(doc["detail"]),
  162. SpiderCode: qu.ObjToString(doc["spidercode"]),
  163. Domain: qu.ObjToString(doc["domain"]),
  164. Href: qu.ObjToString(doc["href"]),
  165. Title: qu.ObjToString(doc["title"]),
  166. Data: &doc,
  167. City: qu.ObjToString(doc["city"]),
  168. Province: qu.ObjToString(doc["area"]),
  169. Result: map[string][]*ju.ExtField{},
  170. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  171. }
  172. pretreated.AnalyStart(j)
  173. return j
  174. }
  175. //抽取
  176. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  177. qu.Catch()
  178. qu.Try(func() {
  179. doc := *j.Data
  180. //全局前置规则,结果覆盖doc属性
  181. for _, v := range e.RulePres {
  182. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  183. }
  184. //log.Println("全局前置规则", doc)
  185. //抽取规则
  186. for _, vc := range e.RuleCores {
  187. tmp := ju.DeepCopy(doc).(map[string]interface{})
  188. //是否进入逻辑
  189. if !ju.Logic(vc.LuaLogic, tmp) {
  190. continue
  191. }
  192. //抽取-前置规则
  193. for _, v := range vc.RulePres {
  194. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  195. }
  196. //log.Println("抽取-前置规则", tmp)
  197. //抽取-规则
  198. for _, v := range vc.RuleCores {
  199. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  200. }
  201. //log.Println("抽取-规则", tmp)
  202. //抽取-后置规则
  203. for _, v := range vc.RuleBacks {
  204. ExtRegBack(j, v, e.TaskInfo)
  205. }
  206. //log.Println("抽取-后置规则", tmp)
  207. }
  208. //全局后置规则
  209. for _, v := range e.RuleBacks {
  210. ExtRegBack(j, v, e.TaskInfo)
  211. }
  212. //函数清理
  213. for key, val := range j.Result {
  214. for _, v := range val {
  215. data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
  216. v.Value = data[0]
  217. }
  218. }
  219. bs, _ := json.Marshal(j.Result)
  220. log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  221. //分析抽取结果并保存 todo
  222. AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
  223. }, func(err interface{}) {
  224. log.Println(err)
  225. <-e.TaskInfo.ProcessPool
  226. })
  227. <-e.TaskInfo.ProcessPool
  228. }
  229. //前置过滤
  230. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  231. before := ju.DeepCopy(doc).(map[string]interface{})
  232. extinfo := map[string]interface{}{}
  233. if in.IsLua {
  234. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  235. if j != nil {
  236. lua.Block = j.Block
  237. }
  238. extinfo = lua.RunScript("pre")
  239. for k, v := range extinfo { //结果覆盖原doc
  240. doc[k] = v
  241. }
  242. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  243. } else {
  244. key := qu.If(in.Field == "", "detail", in.Field).(string)
  245. text := qu.ObjToString(doc[key])
  246. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  247. doc[key] = extinfo[key] //结果覆盖原doc
  248. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  249. }
  250. return doc
  251. }
  252. //抽取-规则
  253. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  254. if in.IsLua {
  255. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  256. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  257. lua.Block = j.Block
  258. extinfo := lua.RunScript("core")
  259. for k, v := range extinfo {
  260. if k == in.Field {
  261. if j.Result[k] == nil {
  262. j.Result[k] = [](*ju.ExtField){}
  263. }
  264. if tmps, ok := v.([]map[string]interface{}); ok {
  265. for _, tmp := range tmps {
  266. j.Result[k] = append(j.Result[k],
  267. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
  268. }
  269. }
  270. }
  271. }
  272. if len(extinfo) > 0 {
  273. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  274. }
  275. } else {
  276. //全文正则
  277. text := qu.ObjToString(doc[extfrom])
  278. if in.Field != "" {
  279. extinfo := extRegCoreToResult(extfrom, text, j, in)
  280. if len(extinfo) > 0 {
  281. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  282. }
  283. }
  284. }
  285. }
  286. //lua脚本根据属性设置提取kv值
  287. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  288. kvmap := map[string][]map[string]interface{}{}
  289. for fieldname, field := range in.LFields {
  290. tags := t[field] //获取对应标签库
  291. for _, bl := range j.Block {
  292. //冒号kv
  293. if bl.ColonKV != nil {
  294. kvs := bl.ColonKV.Kvs
  295. kvs2 := bl.ColonKV.Kvs_2
  296. //log.Println("ColonKV1", kvs)
  297. //log.Println("ColonKV2", kvs2)
  298. for _, tag := range tags {
  299. for _, kv := range kvs {
  300. if tag.Type == "string" {
  301. if kv.Key == tag.Key {
  302. text := ju.TrimLRSpace(kv.Value, "")
  303. if text != "" {
  304. kvmap[field] = append(kvmap[field], map[string]interface{}{
  305. "field": field,
  306. "code": in.Code,
  307. "ruletext": tag.Key,
  308. "extfrom": extfrom,
  309. "value": text,
  310. "type": "colon1",
  311. "matchtype": "tag_string",
  312. })
  313. }
  314. break
  315. }
  316. } else if tag.Type == "regexp" {
  317. if tag.Reg.MatchString(kv.Key) {
  318. text := ju.TrimLRSpace(kv.Value, "")
  319. if text != "" {
  320. kvmap[field] = append(kvmap[field], map[string]interface{}{
  321. "field": field,
  322. "code": in.Code,
  323. "ruletext": tag.Key,
  324. "extfrom": extfrom,
  325. "value": text,
  326. "type": "colon1",
  327. "matchtype": "tag_regexp",
  328. })
  329. }
  330. break
  331. }
  332. }
  333. }
  334. for _, kv := range kvs2 {
  335. if tag.Type == "string" {
  336. if kv.Key == tag.Key {
  337. text := ju.TrimLRSpace(kv.Value, "")
  338. if text != "" {
  339. kvmap[field] = append(kvmap[field], map[string]interface{}{
  340. "field": field,
  341. "code": in.Code,
  342. "ruletext": tag.Key,
  343. "extfrom": extfrom,
  344. "value": text,
  345. "type": "colon2",
  346. "matchtype": "tag_string",
  347. })
  348. }
  349. break
  350. }
  351. } else if tag.Type == "regexp" {
  352. if tag.Reg.MatchString(kv.Key) {
  353. text := ju.TrimLRSpace(kv.Value, "")
  354. if text != "" {
  355. kvmap[field] = append(kvmap[field], map[string]interface{}{
  356. "field": field,
  357. "code": in.Code,
  358. "ruletext": tag.Key,
  359. "extfrom": extfrom,
  360. "value": text,
  361. "type": "colon2",
  362. "matchtype": "tag_regexp",
  363. })
  364. }
  365. break
  366. }
  367. }
  368. }
  369. }
  370. }
  371. //空格kv
  372. if bl.SpaceKV != nil {
  373. kvs := bl.SpaceKV.Kvs
  374. //log.Println("SpaceKV", kvs)
  375. for _, tag := range tags {
  376. for _, kv := range kvs {
  377. if tag.Type == "string" {
  378. if kv.Key == tag.Key {
  379. text := ju.TrimLRSpace(kv.Value, "")
  380. if text != "" {
  381. kvmap[field] = append(kvmap[field], map[string]interface{}{
  382. "field": field,
  383. "code": in.Code,
  384. "ruletext": tag.Key,
  385. "extfrom": extfrom,
  386. "value": text,
  387. "type": "space",
  388. "matchtype": "tag_string",
  389. })
  390. }
  391. break
  392. }
  393. } else if tag.Type == "regexp" {
  394. if tag.Reg.MatchString(kv.Key) {
  395. text := ju.TrimLRSpace(kv.Value, "")
  396. if text != "" {
  397. kvmap[field] = append(kvmap[field], map[string]interface{}{
  398. "field": field,
  399. "code": in.Code,
  400. "ruletext": tag.Key,
  401. "extfrom": extfrom,
  402. "value": text,
  403. "type": "space",
  404. "matchtype": "tag_regexp",
  405. })
  406. }
  407. break
  408. }
  409. }
  410. }
  411. }
  412. }
  413. //表格kv
  414. if bl.TableKV != nil {
  415. tkv := bl.TableKV
  416. //log.Println("tkv", tkv)
  417. for k, v := range tkv.Kv {
  418. if k == fieldname {
  419. if len(tags) > -tkv.KvIndex[fieldname] {
  420. kvmap[field] = append(kvmap[field], map[string]interface{}{
  421. "field": field,
  422. "code": in.Code,
  423. "ruletext": tags[-tkv.KvIndex[fieldname]].Key,
  424. "extfrom": "table",
  425. "value": v,
  426. "type": "table",
  427. "matchtype": "tag_string",
  428. })
  429. } else { //涉及其他待处理
  430. //log.Println(tags)
  431. }
  432. }
  433. }
  434. }
  435. }
  436. }
  437. return kvmap
  438. }
  439. //正则提取结果
  440. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  441. extinfo := map[string][]map[string]interface{}{}
  442. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  443. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  444. if len(apos) > 0 {
  445. pos := apos[0]
  446. for k, p := range v.RegCore.ExtractPos {
  447. if len(pos) > p {
  448. if pos[p] == -1 || pos[p+1] == -1 {
  449. continue
  450. }
  451. val := text[pos[p]:pos[p+1]]
  452. tmps := []map[string]interface{}{}
  453. tmp := map[string]interface{}{
  454. "field": v.Field,
  455. "code": v.Code,
  456. "ruletext": v.RuleText,
  457. "extfrom": extfrom,
  458. "value": val,
  459. "type": "regexp",
  460. "matchtype": "regcontent",
  461. }
  462. tmps = append(tmps, tmp)
  463. extinfo[k] = tmps
  464. if val != "" {
  465. if j.Result[v.Field] == nil {
  466. j.Result[k] = [](*ju.ExtField){}
  467. }
  468. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  469. }
  470. }
  471. }
  472. }
  473. } else {
  474. pos := v.RegCore.Reg.FindStringIndex(text)
  475. val := ""
  476. if len(pos) == 2 {
  477. text = text[pos[1]:]
  478. rs := regexp.MustCompile("[^\r\n\t]+")
  479. tmp := rs.FindAllString(text, -1)
  480. if len(tmp) > 0 {
  481. val = tmp[0]
  482. }
  483. }
  484. if val != "" {
  485. tmps := []map[string]interface{}{}
  486. tmp := map[string]interface{}{
  487. "field": v.Field,
  488. "code": v.Code,
  489. "ruletext": v.RuleText,
  490. "extfrom": extfrom,
  491. "value": val,
  492. "type": "regexp",
  493. "matchtype": "regcontent",
  494. }
  495. tmps = append(tmps, tmp)
  496. extinfo[v.Field] = tmps
  497. if j.Result[v.Field] == nil {
  498. j.Result[v.Field] = [](*ju.ExtField){}
  499. }
  500. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  501. }
  502. }
  503. return extinfo
  504. }
  505. //后置过滤
  506. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  507. if in.IsLua {
  508. result := GetResultMapForLua(j)
  509. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  510. if j != nil {
  511. lua.Block = j.Block
  512. }
  513. extinfo := lua.RunScript("back")
  514. for k, v := range extinfo {
  515. if tmps, ok := v.([]map[string]interface{}); ok {
  516. j.Result[k] = [](*ju.ExtField){}
  517. for _, tmp := range tmps {
  518. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
  519. }
  520. }
  521. }
  522. if len(extinfo) > 0 {
  523. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  524. }
  525. } else {
  526. extinfo := map[string]interface{}{}
  527. if in.Field != "" {
  528. if j.Result[in.Field] != nil {
  529. tmp := j.Result[in.Field]
  530. exts := []interface{}{}
  531. for k, v := range tmp {
  532. text := qu.ObjToString(v.Value)
  533. if text != "" {
  534. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  535. }
  536. j.Result[in.Field][k].Value = text
  537. exts = append(exts, map[string]interface{}{
  538. "field": v.Field,
  539. "code": v.Code,
  540. "ruletext": v.RuleText,
  541. "type": v.Type,
  542. "matchtype": v.MatchType,
  543. "extfrom": v.ExtFrom,
  544. "value": text,
  545. })
  546. }
  547. extinfo[in.Field] = exts
  548. if len(extinfo) > 0 {
  549. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  550. }
  551. }
  552. } else {
  553. for key, tmp := range j.Result {
  554. exts := []interface{}{}
  555. for k, v := range tmp {
  556. text := qu.ObjToString(v.Value)
  557. if text != "" {
  558. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  559. }
  560. j.Result[key][k].Value = text
  561. exts = append(exts, map[string]interface{}{
  562. "field": v.Field,
  563. "code": v.Code,
  564. "ruletext": v.RuleText,
  565. "type": v.Type,
  566. "matchtype": v.MatchType,
  567. "extfrom": v.ExtFrom,
  568. "value": text,
  569. })
  570. }
  571. extinfo[key] = exts
  572. }
  573. if len(extinfo) > 0 {
  574. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  575. }
  576. }
  577. }
  578. }
  579. //获取抽取结果map[string][]interface{},lua脚本使用
  580. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  581. result := map[string][]map[string]interface{}{}
  582. for key, val := range j.Result {
  583. if result[key] == nil {
  584. result[key] = []map[string]interface{}{}
  585. }
  586. for _, v := range val {
  587. tmp := map[string]interface{}{
  588. "field": v.Field,
  589. "code": v.Code,
  590. "ruletext": v.RuleText,
  591. "value": v.Value,
  592. "type": v.Type,
  593. "matchtype": v.MatchType,
  594. "extfrom": v.ExtFrom,
  595. }
  596. result[key] = append(result[key], tmp)
  597. }
  598. }
  599. return result
  600. }
  601. //抽取日志
  602. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  603. if !t.IsEtxLog {
  604. return
  605. }
  606. logdata := map[string]interface{}{
  607. "code": v.Code,
  608. "name": v.Name,
  609. "type": ftype,
  610. "ruletext": v.RuleText,
  611. "islua": v.IsLua,
  612. "field": v.Field,
  613. "version": t.Version,
  614. "taskname": t.Name,
  615. "before": before,
  616. "extinfo": extinfo,
  617. "sid": sid,
  618. "comeintime": time.Now().Unix(),
  619. }
  620. lock.Lock()
  621. ExtLogs[t] = append(ExtLogs[t], logdata)
  622. lock.Unlock()
  623. }
  624. //保存抽取日志
  625. func SaveExtLog() {
  626. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  627. lock.Lock()
  628. tmpLogs = ExtLogs
  629. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  630. lock.Unlock()
  631. for k, v := range tmpLogs {
  632. if len(v) < saveLimit {
  633. db.Mgo.SaveBulk(k.TrackColl, v...)
  634. } else {
  635. for {
  636. if len(v) > saveLimit {
  637. tmp := v[:saveLimit]
  638. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  639. v = v[saveLimit:]
  640. } else {
  641. db.Mgo.SaveBulk(k.TrackColl, v...)
  642. break
  643. }
  644. }
  645. }
  646. }
  647. time.AfterFunc(10*time.Second, SaveExtLog)
  648. }
  649. type FieldValue struct {
  650. Value interface{}
  651. Count int
  652. }
  653. //分析抽取结果并保存
  654. func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
  655. _id := qu.BsonIdToSId((*doc)["_id"])
  656. //结果排序
  657. values := map[string][]*ju.SortObject{}
  658. for key, val := range result {
  659. fieldValue := map[string][]interface{}{}
  660. for _, v := range val {
  661. if fieldValue[fmt.Sprint(v.Value)] == nil {
  662. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  663. } else {
  664. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  665. }
  666. }
  667. objects := []*ju.SortObject{}
  668. for k, v := range fieldValue {
  669. tmp := &ju.SortObject{
  670. Key: k,
  671. Value: qu.IntAll(v[0]),
  672. Object: v[1],
  673. }
  674. objects = append(objects, tmp)
  675. }
  676. values[key] = ju.ExtSort(objects)
  677. }
  678. //从排序结果中取值
  679. tmp := map[string]interface{}{} //抽取值
  680. resulttmp := tmp //保存结果
  681. for key, val := range values {
  682. for _, v := range val { //取第一个
  683. if v.Key != "" {
  684. tmp[key] = v.Object
  685. break
  686. }
  687. }
  688. }
  689. resulttmp["result"] = result
  690. for k, v := range *doc {
  691. if resulttmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
  692. resulttmp[k] = v
  693. }
  694. }
  695. b, p, c, d := TransmitData(resulttmp, _id) //抽取省份城市
  696. //log.Println("抽取省份,城市,县结果=====", b, p, c, d)
  697. resulttmp["district"] = d
  698. if b {
  699. resulttmp["city"] = c
  700. resulttmp["area"] = p
  701. }
  702. if task.TestColl == "" {
  703. if len(tmp) > 0 { //保存抽取结果
  704. task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  705. }
  706. db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
  707. } else { //测试结果
  708. db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
  709. }
  710. }