extract.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729
  1. package extract
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "jy/clear"
  6. db "jy/mongodbutil"
  7. "jy/pretreated"
  8. ju "jy/util"
  9. "log"
  10. qu "qfw/util"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "time"
  15. "gopkg.in/mgo.v2/bson"
  16. )
  17. var (
  18. lock sync.RWMutex
  19. cut = ju.NewCut() //获取正文并清理
  20. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  21. TaskList map[string]*ExtractTask //任务列表
  22. saveLimit = 200 //抽取日志批量保存
  23. Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
  24. )
  25. //启动测试抽取
  26. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  27. defer qu.Catch()
  28. ext := &ExtractTask{}
  29. ext.Id = taskId
  30. ext.IsRun = true
  31. ext.InitTestTaskInfo(resultcoll, trackcoll)
  32. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  33. ext.InitRulePres()
  34. ext.InitRuleBacks()
  35. ext.InitRuleCore()
  36. ext.InitTag()
  37. ext.InitClearFn()
  38. return RunExtractTestTask(ext, startId, num)
  39. }
  40. func IdTrans(startId string) bson.ObjectId {
  41. defer qu.Catch()
  42. return bson.ObjectIdHex(startId)
  43. }
  44. //开始测试任务抽取
  45. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  46. n, _ := strconv.Atoi(num)
  47. id := IdTrans(startId)
  48. if id.Valid() {
  49. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  50. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  51. for _, v := range *list {
  52. j := PreInfo(v)
  53. ext.TaskInfo.ProcessPool <- true
  54. go ext.ExtractProcess(j)
  55. }
  56. return true
  57. } else {
  58. return false
  59. }
  60. }
  61. //启动抽取
  62. func StartExtractTaskId(taskId string) bool {
  63. ext := TaskList[taskId]
  64. if ext == nil {
  65. ext = &ExtractTask{}
  66. ext.Id = taskId
  67. ext.IsRun = true
  68. ext.InitTaskInfo()
  69. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  70. ext.InitRulePres()
  71. ext.InitRuleBacks()
  72. ext.InitRuleCore()
  73. ext.InitTag()
  74. ext.InitClearFn()
  75. //只启动一次taskId
  76. go RunExtractTask(ext)
  77. }
  78. ext.IsRun = true
  79. TaskList[taskId] = ext
  80. return true
  81. }
  82. //停止抽取
  83. func StopExtractTaskId(taskId string) bool {
  84. ext := TaskList[taskId]
  85. if ext != nil {
  86. ext.IsRun = false
  87. TaskList[taskId] = ext
  88. }
  89. //更新task.s_extlastid
  90. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  91. return true
  92. }
  93. //开始抽取
  94. func RunExtractTask(ext *ExtractTask) {
  95. if !ext.IsRun {
  96. return
  97. }
  98. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  99. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
  100. for _, v := range *list {
  101. if !ext.IsRun {
  102. break
  103. }
  104. j := PreInfo(v)
  105. ext.TaskInfo.ProcessPool <- true
  106. go ext.ExtractProcess(j)
  107. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  108. time.Sleep(1 * time.Second)
  109. }
  110. //更新task.s_extlastid
  111. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  112. time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
  113. }
  114. //信息预处理
  115. func PreInfo(doc map[string]interface{}) *ju.Job {
  116. detail := ""
  117. d1, _ := doc["detail"].(string)
  118. d2, _ := doc["contenthtml"].(string)
  119. if len(d1) >= len(d2) || d2 == "" {
  120. detail = d1
  121. } else {
  122. detail = d2
  123. }
  124. detail = ju.CutLableStr(detail)
  125. detail = cut.ClearHtml(detail)
  126. doc["detail"] = detail
  127. href := qu.ObjToString(doc["href"])
  128. if strings.HasPrefix(href, "http://") {
  129. href = href[7:]
  130. } else if strings.HasPrefix(href, "https://") {
  131. href = href[8:]
  132. }
  133. pos := strings.Index(href, "/")
  134. if pos > 0 {
  135. href = href[:pos]
  136. }
  137. doc["domain"] = href
  138. toptype := qu.ObjToString(doc["toptype"])
  139. if qu.ObjToString(doc["type"]) == "bid" {
  140. toptype = "结果"
  141. }
  142. if toptype == "" {
  143. toptype = "*"
  144. }
  145. j := &ju.Job{
  146. SourceMid: qu.BsonIdToSId(doc["_id"]),
  147. Category: toptype,
  148. Content: qu.ObjToString(doc["detail"]),
  149. SpiderCode: qu.ObjToString(doc["spidercode"]),
  150. Domain: qu.ObjToString(doc["domain"]),
  151. Href: qu.ObjToString(doc["href"]),
  152. Title: qu.ObjToString(doc["title"]),
  153. Data: &doc,
  154. City: qu.ObjToString(doc["city"]),
  155. Province: qu.ObjToString(doc["area"]),
  156. Result: map[string][]*ju.ExtField{},
  157. }
  158. pretreated.AnalyStart(j)
  159. return j
  160. }
  161. //抽取
  162. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  163. qu.Catch()
  164. qu.Try(func() {
  165. doc := *j.Data
  166. //全局前置规则,结果覆盖doc属性
  167. for _, v := range e.RulePres {
  168. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  169. }
  170. //log.Println("全局前置规则", doc)
  171. //抽取规则
  172. for _, vc := range e.RuleCores {
  173. tmp := ju.DeepCopy(doc).(map[string]interface{})
  174. //是否进入逻辑
  175. if !ju.Logic(vc.LuaLogic, tmp) {
  176. continue
  177. }
  178. //抽取-前置规则
  179. for _, v := range vc.RulePres {
  180. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  181. }
  182. //log.Println("抽取-前置规则", tmp)
  183. //抽取-规则
  184. for _, v := range vc.RuleCores {
  185. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  186. }
  187. //log.Println("抽取-规则", tmp)
  188. //抽取-后置规则
  189. for _, v := range vc.RuleBacks {
  190. ExtRegBack(j, v, e.TaskInfo)
  191. }
  192. //log.Println("抽取-后置规则", tmp)
  193. }
  194. //全局后置规则
  195. for _, v := range e.RuleBacks {
  196. ExtRegBack(j, v, e.TaskInfo)
  197. }
  198. //函数清理
  199. for key, val := range j.Result {
  200. for _, v := range val {
  201. data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
  202. v.Value = data[0]
  203. }
  204. }
  205. bs, _ := json.Marshal(j.Result)
  206. log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  207. //分析抽取结果并保存 todo
  208. AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
  209. }, func(err interface{}) {
  210. log.Println(err)
  211. <-e.TaskInfo.ProcessPool
  212. })
  213. <-e.TaskInfo.ProcessPool
  214. }
  215. //前置过滤
  216. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  217. before := ju.DeepCopy(doc).(map[string]interface{})
  218. extinfo := map[string]interface{}{}
  219. if in.IsLua {
  220. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  221. if j != nil {
  222. lua.Block = j.Block
  223. }
  224. extinfo = lua.RunScript("pre")
  225. for k, v := range extinfo { //结果覆盖原doc
  226. doc[k] = v
  227. }
  228. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  229. } else {
  230. key := qu.If(in.Field == "", "detail", in.Field).(string)
  231. text := qu.ObjToString(doc[key])
  232. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  233. doc[key] = extinfo[key] //结果覆盖原doc
  234. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  235. }
  236. return doc
  237. }
  238. //抽取-规则
  239. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  240. if in.IsLua {
  241. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  242. if in.IsHasFields { //lua脚本配置有属性字段
  243. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  244. } else {
  245. lua.KvMap = map[string][]map[string]interface{}{}
  246. }
  247. lua.Block = j.Block
  248. extinfo := lua.RunScript("core")
  249. for k, v := range extinfo {
  250. if j.Result[k] == nil {
  251. j.Result[k] = [](*ju.ExtField){}
  252. }
  253. if tmps, ok := v.([]map[string]interface{}); ok {
  254. for _, tmp := range tmps {
  255. j.Result[k] = append(j.Result[k],
  256. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
  257. }
  258. }
  259. }
  260. if len(extinfo) > 0 {
  261. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  262. }
  263. } else {
  264. //全文正则
  265. text := qu.ObjToString(doc[extfrom])
  266. if in.Field != "" {
  267. extinfo := extRegCoreToResult(extfrom, text, j, in)
  268. if len(extinfo) > 0 {
  269. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  270. }
  271. }
  272. }
  273. }
  274. //lua脚本根据属性设置提取kv值
  275. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  276. kvmap := map[string][]map[string]interface{}{}
  277. for _, vv := range in.LFields {
  278. field := qu.ObjToString(vv)
  279. tags := t[qu.ObjToString(vv)] //获取对应标签库
  280. for _, bl := range j.Block {
  281. //冒号kv
  282. if bl.ColonKV != nil {
  283. kvs := bl.ColonKV.Kvs
  284. kvs2 := bl.ColonKV.Kvs_2
  285. for _, tag := range tags {
  286. for _, kv := range kvs {
  287. if tag.Type == "string" {
  288. if kv.Key == tag.Key {
  289. text := ju.TrimLRSpace(kv.Value, "")
  290. if text != "" {
  291. kvmap[field] = append(kvmap[field], map[string]interface{}{
  292. "field": field,
  293. "code": in.Code,
  294. "ruletext": tag.Key,
  295. "extfrom": extfrom,
  296. "value": text,
  297. "type": "colon1",
  298. "matchtype": "tag_string",
  299. })
  300. }
  301. break
  302. }
  303. } else if tag.Type == "regexp" {
  304. if tag.Reg.MatchString(kv.Key) {
  305. text := ju.TrimLRSpace(kv.Value, "")
  306. if text != "" {
  307. kvmap[field] = append(kvmap[field], map[string]interface{}{
  308. "field": field,
  309. "code": in.Code,
  310. "ruletext": tag.Key,
  311. "extfrom": extfrom,
  312. "value": text,
  313. "type": "colon1",
  314. "matchtype": "tag_regexp",
  315. })
  316. }
  317. break
  318. }
  319. }
  320. }
  321. for _, kv := range kvs2 {
  322. if tag.Type == "string" {
  323. if kv.Key == tag.Key {
  324. text := ju.TrimLRSpace(kv.Value, "")
  325. if text != "" {
  326. kvmap[field] = append(kvmap[field], map[string]interface{}{
  327. "field": field,
  328. "code": in.Code,
  329. "ruletext": tag.Key,
  330. "extfrom": extfrom,
  331. "value": text,
  332. "type": "colon2",
  333. "matchtype": "tag_string",
  334. })
  335. }
  336. break
  337. }
  338. } else if tag.Type == "regexp" {
  339. if tag.Reg.MatchString(kv.Key) {
  340. text := ju.TrimLRSpace(kv.Value, "")
  341. if text != "" {
  342. kvmap[field] = append(kvmap[field], map[string]interface{}{
  343. "field": field,
  344. "code": in.Code,
  345. "ruletext": tag.Key,
  346. "extfrom": extfrom,
  347. "value": text,
  348. "type": "colon2",
  349. "matchtype": "tag_regexp",
  350. })
  351. }
  352. break
  353. }
  354. }
  355. }
  356. }
  357. }
  358. //空格kv
  359. if bl.SpaceKV != nil {
  360. kvs := bl.SpaceKV.Kvs
  361. for _, tag := range tags {
  362. for _, kv := range kvs {
  363. if tag.Type == "string" {
  364. if kv.Key == tag.Key {
  365. text := ju.TrimLRSpace(kv.Value, "")
  366. if text != "" {
  367. kvmap[field] = append(kvmap[field], map[string]interface{}{
  368. "field": field,
  369. "code": in.Code,
  370. "ruletext": tag.Key,
  371. "extfrom": extfrom,
  372. "value": text,
  373. "type": "space",
  374. "matchtype": "tag_string",
  375. })
  376. }
  377. break
  378. }
  379. } else if tag.Type == "regexp" {
  380. if tag.Reg.MatchString(kv.Key) {
  381. text := ju.TrimLRSpace(kv.Value, "")
  382. if text != "" {
  383. kvmap[field] = append(kvmap[field], map[string]interface{}{
  384. "field": field,
  385. "code": in.Code,
  386. "ruletext": tag.Key,
  387. "extfrom": extfrom,
  388. "value": text,
  389. "type": "space",
  390. "matchtype": "tag_regexp",
  391. })
  392. }
  393. break
  394. }
  395. }
  396. }
  397. }
  398. }
  399. //表格kv
  400. if bl.TableKV != nil {
  401. kv := bl.TableKV.Kv
  402. for _, tag := range tags {
  403. for k, val := range kv {
  404. if tag.Type == "string" {
  405. if k == tag.Key {
  406. text := ju.TrimLRSpace(val, "")
  407. if text != "" {
  408. kvmap[field] = append(kvmap[field], map[string]interface{}{
  409. "field": field,
  410. "code": in.Code,
  411. "ruletext": tag.Key,
  412. "extfrom": extfrom,
  413. "value": text,
  414. "type": "table",
  415. "matchtype": "tag_string",
  416. })
  417. }
  418. break
  419. }
  420. } else if tag.Type == "regexp" {
  421. if tag.Reg.MatchString(k) {
  422. text := ju.TrimLRSpace(val, "")
  423. if text != "" {
  424. kvmap[field] = append(kvmap[field], map[string]interface{}{
  425. "field": field,
  426. "code": in.Code,
  427. "ruletext": tag.Key,
  428. "extfrom": extfrom,
  429. "value": text,
  430. "type": "table",
  431. "matchtype": "tag_regexp",
  432. })
  433. }
  434. break
  435. }
  436. }
  437. }
  438. }
  439. }
  440. }
  441. }
  442. return kvmap
  443. }
  444. //正则提取结果
  445. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  446. extinfo := map[string][]map[string]interface{}{}
  447. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  448. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  449. if len(apos) > 0 {
  450. pos := apos[0]
  451. for k, p := range v.RegCore.ExtractPos {
  452. if len(pos) > p {
  453. if pos[p] == -1 || pos[p+1] == -1 {
  454. continue
  455. }
  456. val := text[pos[p]:pos[p+1]]
  457. tmps := []map[string]interface{}{}
  458. tmp := map[string]interface{}{
  459. "field": v.Field,
  460. "code": v.Code,
  461. "ruletext": v.RuleText,
  462. "extfrom": extfrom,
  463. "value": val,
  464. "type": "regexp",
  465. "matchtype": "regcontent",
  466. }
  467. tmps = append(tmps, tmp)
  468. extinfo[k] = tmps
  469. if val != "" {
  470. if j.Result[v.Field] == nil {
  471. j.Result[k] = [](*ju.ExtField){}
  472. }
  473. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  474. }
  475. }
  476. }
  477. }
  478. } else {
  479. val := v.RegCore.Reg.ReplaceAllString(text, "")
  480. if val != "" {
  481. tmps := []map[string]interface{}{}
  482. tmp := map[string]interface{}{
  483. "field": v.Field,
  484. "code": v.Code,
  485. "ruletext": v.RuleText,
  486. "extfrom": extfrom,
  487. "value": val,
  488. "type": "regexp",
  489. "matchtype": "regcontent",
  490. }
  491. tmps = append(tmps, tmp)
  492. extinfo[v.Field] = tmps
  493. if j.Result[v.Field] == nil {
  494. j.Result[v.Field] = [](*ju.ExtField){}
  495. }
  496. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  497. }
  498. }
  499. return extinfo
  500. }
  501. //后置过滤
  502. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  503. if in.IsLua {
  504. result := getResultMapForLua(j)
  505. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  506. if j != nil {
  507. lua.Block = j.Block
  508. }
  509. extinfo := lua.RunScript("back")
  510. for k, v := range extinfo {
  511. if tmps, ok := v.([]map[string]interface{}); ok {
  512. j.Result[k] = [](*ju.ExtField){}
  513. for _, tmp := range tmps {
  514. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
  515. }
  516. }
  517. }
  518. if len(extinfo) > 0 {
  519. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  520. }
  521. } else {
  522. extinfo := map[string]interface{}{}
  523. if in.Field != "" && j.Result[in.Field] != nil {
  524. tmp := j.Result[in.Field]
  525. exts := []interface{}{}
  526. for k, v := range tmp {
  527. text := qu.ObjToString(v.Value)
  528. if text != "" {
  529. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  530. }
  531. j.Result[in.Field][k].Value = text
  532. exts = append(exts, map[string]interface{}{
  533. "field": v.Field,
  534. "code": v.Code,
  535. "ruletext": v.RuleText,
  536. "type": v.Type,
  537. "matchtype": v.MatchType,
  538. "extfrom": v.ExtFrom,
  539. "value": text,
  540. })
  541. }
  542. extinfo[in.Field] = exts
  543. if len(extinfo) > 0 {
  544. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  545. }
  546. } else {
  547. for key, tmp := range j.Result {
  548. exts := []interface{}{}
  549. for k, v := range tmp {
  550. text := qu.ObjToString(v.Value)
  551. if text != "" {
  552. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  553. }
  554. j.Result[key][k].Value = text
  555. exts = append(exts, map[string]interface{}{
  556. "field": v.Field,
  557. "code": v.Code,
  558. "ruletext": v.RuleText,
  559. "type": v.Type,
  560. "matchtype": v.MatchType,
  561. "extfrom": v.ExtFrom,
  562. "value": text,
  563. })
  564. }
  565. extinfo[key] = exts
  566. }
  567. if len(extinfo) > 0 {
  568. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  569. }
  570. }
  571. }
  572. }
  573. //获取抽取结果map[string][]interface{},lua脚本使用
  574. func getResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  575. result := map[string][]map[string]interface{}{}
  576. for key, val := range j.Result {
  577. if result[key] == nil {
  578. result[key] = []map[string]interface{}{}
  579. }
  580. for _, v := range val {
  581. tmp := map[string]interface{}{
  582. "field": v.Field,
  583. "code": v.Code,
  584. "ruletext": v.RuleText,
  585. "value": v.Value,
  586. "type": v.Type,
  587. "matchtype": v.MatchType,
  588. "extfrom": v.ExtFrom,
  589. }
  590. result[key] = append(result[key], tmp)
  591. }
  592. }
  593. return result
  594. }
  595. //抽取日志
  596. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  597. if !t.IsEtxLog {
  598. return
  599. }
  600. logdata := map[string]interface{}{
  601. "code": v.Code,
  602. "name": v.Name,
  603. "type": ftype,
  604. "ruletext": v.RuleText,
  605. "islua": v.IsLua,
  606. "field": v.Field,
  607. "version": t.Version,
  608. "taskname": t.Name,
  609. "before": before,
  610. "extinfo": extinfo,
  611. "sid": sid,
  612. "comeintime": time.Now().Unix(),
  613. }
  614. lock.Lock()
  615. ExtLogs[t] = append(ExtLogs[t], logdata)
  616. lock.Unlock()
  617. }
  618. //保存抽取日志
  619. func SaveExtLog() {
  620. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  621. lock.Lock()
  622. tmpLogs = ExtLogs
  623. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  624. lock.Unlock()
  625. for k, v := range tmpLogs {
  626. if len(v) < saveLimit {
  627. db.Mgo.SaveBulk(k.TrackColl, v...)
  628. } else {
  629. for {
  630. if len(v) > saveLimit {
  631. tmp := v[:saveLimit]
  632. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  633. v = v[saveLimit:]
  634. } else {
  635. db.Mgo.SaveBulk(k.TrackColl, v...)
  636. break
  637. }
  638. }
  639. }
  640. }
  641. time.AfterFunc(10*time.Second, SaveExtLog)
  642. }
  643. type FieldValue struct {
  644. Value interface{}
  645. Count int
  646. }
  647. //分析抽取结果并保存
  648. func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
  649. _id := qu.BsonIdToSId((*doc)["_id"])
  650. //结果排序
  651. values := map[string][]*ju.SortObject{}
  652. for key, val := range result {
  653. fieldValue := map[string][]interface{}{}
  654. for _, v := range val {
  655. if fieldValue[fmt.Sprint(v.Value)] == nil {
  656. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  657. } else {
  658. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  659. }
  660. }
  661. objects := []*ju.SortObject{}
  662. for k, v := range fieldValue {
  663. tmp := &ju.SortObject{
  664. Key: k,
  665. Value: qu.IntAll(v[0]),
  666. Object: v[1],
  667. }
  668. objects = append(objects, tmp)
  669. }
  670. values[key] = ju.ExtSort(objects)
  671. }
  672. //从排序结果中取值
  673. tmp := map[string]interface{}{}
  674. for key, val := range values {
  675. for _, v := range val { //取第一个
  676. if v.Key != "" {
  677. tmp[key] = v.Object
  678. break
  679. }
  680. }
  681. }
  682. if task.TestColl == "" {
  683. //保存抽取结果
  684. task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  685. //保存抽取详情
  686. tmp["result"] = result
  687. for k, v := range *doc {
  688. if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
  689. tmp[k] = v
  690. }
  691. }
  692. db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  693. } else { //测试结果
  694. //保存抽取详情
  695. tmp["result"] = result
  696. for k, v := range *doc {
  697. if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
  698. tmp[k] = v
  699. }
  700. }
  701. db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  702. }
  703. }