extract.go 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946
  1. package extract
  2. import (
  3. //"encoding/json"
  4. "fmt"
  5. "jy/clear"
  6. db "jy/mongodbutil"
  7. "jy/pretreated"
  8. ju "jy/util"
  9. "log"
  10. qu "qfw/util"
  11. "regexp"
  12. "strconv"
  13. "strings"
  14. "sync"
  15. "time"
  16. "gopkg.in/mgo.v2/bson"
  17. )
  18. var (
  19. lock sync.RWMutex
  20. cut = ju.NewCut() //获取正文并清理
  21. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  22. TaskList map[string]*ExtractTask //任务列表
  23. saveLimit = 200 //抽取日志批量保存
  24. AreaGet DFA //敏感词
  25. AreaProvinceGet DFA //敏感词
  26. AreaSimGet DFA //敏感词
  27. Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
  28. )
  29. var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
  30. var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
  31. var ProviceConfig map[string]interface{} = make(map[string]interface{}) //省份
  32. var ProvinceMap map[string]string = make(map[string]string)
  33. var CityBrief map[string]*City = make(map[string]*City) //只加载一次即可
  34. var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
  35. var AreaToCity map[string][]*City = make(map[string][]*City) //两个文件共用
  36. //启动测试抽取
  37. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  38. defer qu.Catch()
  39. ext := &ExtractTask{}
  40. ext.Id = taskId
  41. ext.IsRun = true
  42. ext.InitTestTaskInfo(resultcoll, trackcoll)
  43. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  44. ext.InitRulePres()
  45. ext.InitRuleBacks()
  46. ext.InitRuleCore()
  47. ext.InitTag()
  48. ext.InitClearFn()
  49. return RunExtractTestTask(ext, startId, num)
  50. }
  51. func IdTrans(startId string) bson.ObjectId {
  52. defer qu.Catch()
  53. return bson.ObjectIdHex(startId)
  54. }
  55. //开始测试任务抽取
  56. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  57. n, _ := strconv.Atoi(num)
  58. id := IdTrans(startId)
  59. if id.Valid() {
  60. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  61. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  62. for _, v := range *list {
  63. j := PreInfo(v)
  64. ext.TaskInfo.ProcessPool <- true
  65. go ext.ExtractProcess(j)
  66. }
  67. return true
  68. } else {
  69. return false
  70. }
  71. }
  72. //启动抽取
  73. func StartExtractTaskId(taskId string) bool {
  74. isgo := false
  75. ext := TaskList[taskId]
  76. if ext == nil {
  77. ext = &ExtractTask{}
  78. ext.Id = taskId
  79. ext.InitTaskInfo()
  80. isgo = true
  81. } else {
  82. ext.Id = taskId
  83. ext.InitTaskInfo()
  84. }
  85. ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  86. ext.InitRulePres()
  87. ext.InitRuleBacks()
  88. ext.InitRuleCore()
  89. ext.InitTag()
  90. ext.InitClearFn()
  91. // ext.InitProvince()
  92. // ext.InitCityAll()
  93. // ext.InitCitySim()
  94. ext.IsRun = true
  95. if isgo {
  96. go RunExtractTask(taskId)
  97. }
  98. TaskList[taskId] = ext
  99. return true
  100. }
  101. //停止抽取
  102. func StopExtractTaskId(taskId string) bool {
  103. ext := TaskList[taskId]
  104. if ext != nil {
  105. ext.IsRun = false
  106. TaskList[taskId] = ext
  107. }
  108. //更新task.s_extlastid
  109. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  110. return true
  111. }
  112. //开始抽取
  113. func RunExtractTask(taskId string) {
  114. ext := TaskList[taskId]
  115. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  116. list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
  117. for k, v := range *list {
  118. log.Println(k, v["_id"])
  119. if !ext.IsRun {
  120. break
  121. }
  122. j := PreInfo(v)
  123. ext.TaskInfo.ProcessPool <- true
  124. go ext.ExtractProcess(j)
  125. ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
  126. }
  127. //更新task.s_extlastid
  128. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  129. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  130. }
  131. //信息预处理
  132. func PreInfo(doc map[string]interface{}) *ju.Job {
  133. detail := ""
  134. d1, _ := doc["detail"].(string)
  135. d2, _ := doc["contenthtml"].(string)
  136. if len(d1) >= len(d2) || d2 == "" {
  137. detail = d1
  138. } else {
  139. detail = d2
  140. }
  141. detail = ju.CutLableStr(detail)
  142. detail = cut.ClearHtml(detail)
  143. doc["detail"] = detail
  144. href := qu.ObjToString(doc["href"])
  145. if strings.HasPrefix(href, "http://") {
  146. href = href[7:]
  147. } else if strings.HasPrefix(href, "https://") {
  148. href = href[8:]
  149. }
  150. pos := strings.Index(href, "/")
  151. if pos > 0 {
  152. href = href[:pos]
  153. }
  154. doc["domain"] = href
  155. toptype := qu.ObjToString(doc["toptype"])
  156. if qu.ObjToString(doc["type"]) == "bid" {
  157. toptype = "结果"
  158. }
  159. if toptype == "" {
  160. toptype = "*"
  161. }
  162. j := &ju.Job{
  163. SourceMid: qu.BsonIdToSId(doc["_id"]),
  164. Category: toptype,
  165. Content: qu.ObjToString(doc["detail"]),
  166. SpiderCode: qu.ObjToString(doc["spidercode"]),
  167. Domain: qu.ObjToString(doc["domain"]),
  168. Href: qu.ObjToString(doc["href"]),
  169. Title: qu.ObjToString(doc["title"]),
  170. Data: &doc,
  171. City: qu.ObjToString(doc["city"]),
  172. Province: qu.ObjToString(doc["area"]),
  173. Result: map[string][]*ju.ExtField{},
  174. //BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  175. }
  176. pretreated.AnalyStart(j)
  177. return j
  178. }
  179. //抽取
  180. func (e *ExtractTask) ExtractProcess(j *ju.Job) {
  181. qu.Catch()
  182. qu.Try(func() {
  183. doc := *j.Data
  184. //全局前置规则,结果覆盖doc属性
  185. for _, v := range e.RulePres {
  186. doc = ExtRegPre(doc, j, v, e.TaskInfo)
  187. }
  188. //log.Println("全局前置规则", doc)
  189. //抽取规则
  190. for _, vc := range e.RuleCores {
  191. tmp := ju.DeepCopy(doc).(map[string]interface{})
  192. //是否进入逻辑
  193. if !ju.Logic(vc.LuaLogic, tmp) {
  194. continue
  195. }
  196. //抽取-前置规则
  197. for _, v := range vc.RulePres {
  198. tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  199. }
  200. //log.Println("抽取-前置规则", tmp)
  201. //抽取-规则
  202. for _, v := range vc.RuleCores {
  203. ExtRegCore(vc.ExtFrom, tmp, j, v, e)
  204. }
  205. //log.Println("抽取-规则", tmp)
  206. //抽取-后置规则
  207. for _, v := range vc.RuleBacks {
  208. ExtRegBack(j, v, e.TaskInfo)
  209. }
  210. //log.Println("抽取-后置规则", tmp)
  211. }
  212. //全局后置规则
  213. for _, v := range e.RuleBacks {
  214. ExtRegBack(j, v, e.TaskInfo)
  215. }
  216. //函数清理
  217. for key, val := range j.Result {
  218. for _, v := range val {
  219. data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
  220. v.Value = data[0]
  221. }
  222. }
  223. //bs, _ := json.Marshal(j.Result)
  224. //log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
  225. //抽取省份城市县
  226. //fmt.Println("-----------", j.Province, j.City, j.BuyerAddr, j.Title) //j.Address
  227. //ExtractPC(j.Result, j.Province, j.City, j.Title, j.BuyerAddr, j.SourceMid) //j.Address
  228. ExtractPC2(j.Result, "Province", "City", "Title", "Addr", j.SourceMid)
  229. //分析抽取结果并保存 todo
  230. AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
  231. }, func(err interface{}) {
  232. log.Println(err)
  233. <-e.TaskInfo.ProcessPool
  234. })
  235. <-e.TaskInfo.ProcessPool
  236. }
  237. //前置过滤
  238. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  239. before := ju.DeepCopy(doc).(map[string]interface{})
  240. extinfo := map[string]interface{}{}
  241. if in.IsLua {
  242. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  243. if j != nil {
  244. lua.Block = j.Block
  245. }
  246. extinfo = lua.RunScript("pre")
  247. for k, v := range extinfo { //结果覆盖原doc
  248. doc[k] = v
  249. }
  250. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  251. } else {
  252. key := qu.If(in.Field == "", "detail", in.Field).(string)
  253. text := qu.ObjToString(doc[key])
  254. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  255. doc[key] = extinfo[key] //结果覆盖原doc
  256. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  257. }
  258. return doc
  259. }
  260. //抽取-规则
  261. func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
  262. if in.IsLua {
  263. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  264. if in.IsHasFields { //lua脚本配置有属性字段
  265. lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
  266. } else {
  267. lua.KvMap = map[string][]map[string]interface{}{}
  268. }
  269. lua.Block = j.Block
  270. extinfo := lua.RunScript("core")
  271. for k, v := range extinfo {
  272. if j.Result[k] == nil {
  273. j.Result[k] = [](*ju.ExtField){}
  274. }
  275. if tmps, ok := v.([]map[string]interface{}); ok {
  276. for _, tmp := range tmps {
  277. j.Result[k] = append(j.Result[k],
  278. &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
  279. }
  280. }
  281. }
  282. if len(extinfo) > 0 {
  283. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  284. }
  285. } else {
  286. //全文正则
  287. text := qu.ObjToString(doc[extfrom])
  288. if in.Field != "" {
  289. extinfo := extRegCoreToResult(extfrom, text, j, in)
  290. if len(extinfo) > 0 {
  291. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  292. }
  293. }
  294. }
  295. }
  296. //lua脚本根据属性设置提取kv值
  297. func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
  298. kvmap := map[string][]map[string]interface{}{}
  299. for _, vv := range in.LFields {
  300. field := qu.ObjToString(vv)
  301. tags := t[qu.ObjToString(vv)] //获取对应标签库
  302. for _, bl := range j.Block {
  303. //冒号kv
  304. if bl.ColonKV != nil {
  305. kvs := bl.ColonKV.Kvs
  306. kvs2 := bl.ColonKV.Kvs_2
  307. for _, tag := range tags {
  308. for _, kv := range kvs {
  309. if tag.Type == "string" {
  310. if kv.Key == tag.Key {
  311. text := ju.TrimLRSpace(kv.Value, "")
  312. if text != "" {
  313. kvmap[field] = append(kvmap[field], map[string]interface{}{
  314. "field": field,
  315. "code": in.Code,
  316. "ruletext": tag.Key,
  317. "extfrom": extfrom,
  318. "value": text,
  319. "type": "colon1",
  320. "matchtype": "tag_string",
  321. })
  322. }
  323. break
  324. }
  325. } else if tag.Type == "regexp" {
  326. if tag.Reg.MatchString(kv.Key) {
  327. text := ju.TrimLRSpace(kv.Value, "")
  328. if text != "" {
  329. kvmap[field] = append(kvmap[field], map[string]interface{}{
  330. "field": field,
  331. "code": in.Code,
  332. "ruletext": tag.Key,
  333. "extfrom": extfrom,
  334. "value": text,
  335. "type": "colon1",
  336. "matchtype": "tag_regexp",
  337. })
  338. }
  339. break
  340. }
  341. }
  342. }
  343. for _, kv := range kvs2 {
  344. if tag.Type == "string" {
  345. if kv.Key == tag.Key {
  346. text := ju.TrimLRSpace(kv.Value, "")
  347. if text != "" {
  348. kvmap[field] = append(kvmap[field], map[string]interface{}{
  349. "field": field,
  350. "code": in.Code,
  351. "ruletext": tag.Key,
  352. "extfrom": extfrom,
  353. "value": text,
  354. "type": "colon2",
  355. "matchtype": "tag_string",
  356. })
  357. }
  358. break
  359. }
  360. } else if tag.Type == "regexp" {
  361. if tag.Reg.MatchString(kv.Key) {
  362. text := ju.TrimLRSpace(kv.Value, "")
  363. if text != "" {
  364. kvmap[field] = append(kvmap[field], map[string]interface{}{
  365. "field": field,
  366. "code": in.Code,
  367. "ruletext": tag.Key,
  368. "extfrom": extfrom,
  369. "value": text,
  370. "type": "colon2",
  371. "matchtype": "tag_regexp",
  372. })
  373. }
  374. break
  375. }
  376. }
  377. }
  378. }
  379. }
  380. //空格kv
  381. if bl.SpaceKV != nil {
  382. kvs := bl.SpaceKV.Kvs
  383. for _, tag := range tags {
  384. for _, kv := range kvs {
  385. if tag.Type == "string" {
  386. if kv.Key == tag.Key {
  387. text := ju.TrimLRSpace(kv.Value, "")
  388. if text != "" {
  389. kvmap[field] = append(kvmap[field], map[string]interface{}{
  390. "field": field,
  391. "code": in.Code,
  392. "ruletext": tag.Key,
  393. "extfrom": extfrom,
  394. "value": text,
  395. "type": "space",
  396. "matchtype": "tag_string",
  397. })
  398. }
  399. break
  400. }
  401. } else if tag.Type == "regexp" {
  402. if tag.Reg.MatchString(kv.Key) {
  403. text := ju.TrimLRSpace(kv.Value, "")
  404. if text != "" {
  405. kvmap[field] = append(kvmap[field], map[string]interface{}{
  406. "field": field,
  407. "code": in.Code,
  408. "ruletext": tag.Key,
  409. "extfrom": extfrom,
  410. "value": text,
  411. "type": "space",
  412. "matchtype": "tag_regexp",
  413. })
  414. }
  415. break
  416. }
  417. }
  418. }
  419. }
  420. }
  421. //表格kv
  422. if bl.TableKV != nil {
  423. kv := bl.TableKV.Kv
  424. for _, tag := range tags {
  425. for k, val := range kv {
  426. if tag.Type == "string" {
  427. if k == tag.Key {
  428. text := ju.TrimLRSpace(val, "")
  429. if text != "" {
  430. kvmap[field] = append(kvmap[field], map[string]interface{}{
  431. "field": field,
  432. "code": in.Code,
  433. "ruletext": tag.Key,
  434. "extfrom": extfrom,
  435. "value": text,
  436. "type": "table",
  437. "matchtype": "tag_string",
  438. })
  439. }
  440. break
  441. }
  442. } else if tag.Type == "regexp" {
  443. if tag.Reg.MatchString(k) {
  444. text := ju.TrimLRSpace(val, "")
  445. if text != "" {
  446. kvmap[field] = append(kvmap[field], map[string]interface{}{
  447. "field": field,
  448. "code": in.Code,
  449. "ruletext": tag.Key,
  450. "extfrom": extfrom,
  451. "value": text,
  452. "type": "table",
  453. "matchtype": "tag_regexp",
  454. })
  455. }
  456. break
  457. }
  458. }
  459. }
  460. }
  461. }
  462. }
  463. }
  464. return kvmap
  465. }
  466. //正则提取结果
  467. func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
  468. extinfo := map[string][]map[string]interface{}{}
  469. if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  470. apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
  471. if len(apos) > 0 {
  472. pos := apos[0]
  473. for k, p := range v.RegCore.ExtractPos {
  474. if len(pos) > p {
  475. if pos[p] == -1 || pos[p+1] == -1 {
  476. continue
  477. }
  478. val := text[pos[p]:pos[p+1]]
  479. tmps := []map[string]interface{}{}
  480. tmp := map[string]interface{}{
  481. "field": v.Field,
  482. "code": v.Code,
  483. "ruletext": v.RuleText,
  484. "extfrom": extfrom,
  485. "value": val,
  486. "type": "regexp",
  487. "matchtype": "regcontent",
  488. }
  489. tmps = append(tmps, tmp)
  490. extinfo[k] = tmps
  491. if val != "" {
  492. if j.Result[v.Field] == nil {
  493. j.Result[k] = [](*ju.ExtField){}
  494. }
  495. j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  496. }
  497. }
  498. }
  499. }
  500. } else {
  501. pos := v.RegCore.Reg.FindStringIndex(text)
  502. val := ""
  503. if len(pos) == 2 {
  504. text = text[pos[1]:]
  505. rs := regexp.MustCompile("[^\r\n\t]+")
  506. tmp := rs.FindAllString(text, -1)
  507. if len(tmp) > 0 {
  508. val = tmp[0]
  509. }
  510. }
  511. if val != "" {
  512. tmps := []map[string]interface{}{}
  513. tmp := map[string]interface{}{
  514. "field": v.Field,
  515. "code": v.Code,
  516. "ruletext": v.RuleText,
  517. "extfrom": extfrom,
  518. "value": val,
  519. "type": "regexp",
  520. "matchtype": "regcontent",
  521. }
  522. tmps = append(tmps, tmp)
  523. extinfo[v.Field] = tmps
  524. if j.Result[v.Field] == nil {
  525. j.Result[v.Field] = [](*ju.ExtField){}
  526. }
  527. j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
  528. }
  529. }
  530. return extinfo
  531. }
  532. //后置过滤
  533. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  534. if in.IsLua {
  535. result := GetResultMapForLua(j)
  536. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  537. if j != nil {
  538. lua.Block = j.Block
  539. }
  540. extinfo := lua.RunScript("back")
  541. for k, v := range extinfo {
  542. if tmps, ok := v.([]map[string]interface{}); ok {
  543. j.Result[k] = [](*ju.ExtField){}
  544. for _, tmp := range tmps {
  545. j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
  546. }
  547. }
  548. }
  549. if len(extinfo) > 0 {
  550. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  551. }
  552. } else {
  553. extinfo := map[string]interface{}{}
  554. if in.Field != "" {
  555. if j.Result[in.Field] != nil {
  556. tmp := j.Result[in.Field]
  557. exts := []interface{}{}
  558. for k, v := range tmp {
  559. text := qu.ObjToString(v.Value)
  560. if text != "" {
  561. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  562. }
  563. j.Result[in.Field][k].Value = text
  564. exts = append(exts, map[string]interface{}{
  565. "field": v.Field,
  566. "code": v.Code,
  567. "ruletext": v.RuleText,
  568. "type": v.Type,
  569. "matchtype": v.MatchType,
  570. "extfrom": v.ExtFrom,
  571. "value": text,
  572. })
  573. }
  574. extinfo[in.Field] = exts
  575. if len(extinfo) > 0 {
  576. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  577. }
  578. }
  579. } else {
  580. for key, tmp := range j.Result {
  581. exts := []interface{}{}
  582. for k, v := range tmp {
  583. text := qu.ObjToString(v.Value)
  584. if text != "" {
  585. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  586. }
  587. j.Result[key][k].Value = text
  588. exts = append(exts, map[string]interface{}{
  589. "field": v.Field,
  590. "code": v.Code,
  591. "ruletext": v.RuleText,
  592. "type": v.Type,
  593. "matchtype": v.MatchType,
  594. "extfrom": v.ExtFrom,
  595. "value": text,
  596. })
  597. }
  598. extinfo[key] = exts
  599. }
  600. if len(extinfo) > 0 {
  601. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  602. }
  603. }
  604. }
  605. }
  606. //获取抽取结果map[string][]interface{},lua脚本使用
  607. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  608. result := map[string][]map[string]interface{}{}
  609. for key, val := range j.Result {
  610. if result[key] == nil {
  611. result[key] = []map[string]interface{}{}
  612. }
  613. for _, v := range val {
  614. tmp := map[string]interface{}{
  615. "field": v.Field,
  616. "code": v.Code,
  617. "ruletext": v.RuleText,
  618. "value": v.Value,
  619. "type": v.Type,
  620. "matchtype": v.MatchType,
  621. "extfrom": v.ExtFrom,
  622. }
  623. result[key] = append(result[key], tmp)
  624. }
  625. }
  626. return result
  627. }
  628. //抽取日志
  629. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  630. if !t.IsEtxLog {
  631. return
  632. }
  633. logdata := map[string]interface{}{
  634. "code": v.Code,
  635. "name": v.Name,
  636. "type": ftype,
  637. "ruletext": v.RuleText,
  638. "islua": v.IsLua,
  639. "field": v.Field,
  640. "version": t.Version,
  641. "taskname": t.Name,
  642. "before": before,
  643. "extinfo": extinfo,
  644. "sid": sid,
  645. "comeintime": time.Now().Unix(),
  646. }
  647. lock.Lock()
  648. ExtLogs[t] = append(ExtLogs[t], logdata)
  649. lock.Unlock()
  650. }
  651. //保存抽取日志
  652. func SaveExtLog() {
  653. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  654. lock.Lock()
  655. tmpLogs = ExtLogs
  656. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  657. lock.Unlock()
  658. for k, v := range tmpLogs {
  659. if len(v) < saveLimit {
  660. db.Mgo.SaveBulk(k.TrackColl, v...)
  661. } else {
  662. for {
  663. if len(v) > saveLimit {
  664. tmp := v[:saveLimit]
  665. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  666. v = v[saveLimit:]
  667. } else {
  668. db.Mgo.SaveBulk(k.TrackColl, v...)
  669. break
  670. }
  671. }
  672. }
  673. }
  674. time.AfterFunc(10*time.Second, SaveExtLog)
  675. }
  676. type FieldValue struct {
  677. Value interface{}
  678. Count int
  679. }
  680. //分析抽取结果并保存
  681. func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
  682. _id := qu.BsonIdToSId((*doc)["_id"])
  683. //结果排序
  684. values := map[string][]*ju.SortObject{}
  685. for key, val := range result {
  686. fieldValue := map[string][]interface{}{}
  687. for _, v := range val {
  688. if fieldValue[fmt.Sprint(v.Value)] == nil {
  689. fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
  690. } else {
  691. fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
  692. }
  693. }
  694. objects := []*ju.SortObject{}
  695. for k, v := range fieldValue {
  696. tmp := &ju.SortObject{
  697. Key: k,
  698. Value: qu.IntAll(v[0]),
  699. Object: v[1],
  700. }
  701. objects = append(objects, tmp)
  702. }
  703. values[key] = ju.ExtSort(objects)
  704. }
  705. //从排序结果中取值
  706. tmp := map[string]interface{}{}
  707. for key, val := range values {
  708. for _, v := range val { //取第一个
  709. if v.Key != "" {
  710. tmp[key] = v.Object
  711. break
  712. }
  713. }
  714. }
  715. if task.TestColl == "" {
  716. if len(tmp) > 0 { //保存抽取结果
  717. task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  718. }
  719. //保存抽取详情
  720. tmp["result"] = result
  721. for k, v := range *doc {
  722. if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
  723. tmp[k] = v
  724. }
  725. }
  726. db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  727. } else { //测试结果
  728. //保存抽取详情
  729. tmp["result"] = result
  730. for k, v := range *doc {
  731. if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
  732. tmp[k] = v
  733. }
  734. }
  735. db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  736. }
  737. }
  738. //抽取城市、省份
  739. func ExtractPC2(result map[string][]*ju.ExtField, province, city, title, addr, sourcemid string) (bres bool, c, p string) {
  740. var pjnarr, buyerarr []string
  741. var pb []interface{}
  742. for n, val := range result["projectname"] {
  743. pjnarr[n] = fmt.Sprint(val.Value)
  744. }
  745. for n, val := range result["buyer"] {
  746. buyerarr[n] = fmt.Sprint(val.Value)
  747. }
  748. pl := len(pjnarr)
  749. bl := len(buyerarr)
  750. max := 0
  751. if pl > bl {
  752. max = pl
  753. } else {
  754. max = bl
  755. }
  756. //city, buyer, addr, projectname, title
  757. if max == 0 { //没有projectname和buyer结果集
  758. tmp1 := []string{city, "", addr, "", title}
  759. pb = append(pb, tmp1)
  760. } else { //至少有一个结果集
  761. if max == pl {
  762. for i := 0; i < max; i++ {
  763. p := pjnarr[i]
  764. b := ""
  765. if i < bl {
  766. b = buyerarr[i]
  767. }
  768. tmp2 := []string{city, b, addr, p, title}
  769. pb = append(pb, tmp2)
  770. }
  771. } else {
  772. for i := 0; i < max; i++ {
  773. b := buyerarr[i]
  774. p := ""
  775. if i < pl {
  776. p = pjnarr[i]
  777. }
  778. tmp3 := []string{city, b, addr, p, title}
  779. pb = append(pb, tmp3)
  780. }
  781. }
  782. }
  783. log.Println(pb)
  784. return
  785. }
  786. func ExtractPC(buyer, projectname, title, city, province, addr string, id interface{}) (bres bool, c, p string) {
  787. defer qu.Catch()
  788. bc := true //是否继续抽取
  789. if city != "" {
  790. if CityBrief[city] == nil { //简称不存在
  791. //log.Println("city err:", city, id)
  792. } else { //简称存在
  793. if province != CityBrief[city].P.Brief { //省份不对
  794. log.Println("province err:", city, province, id)
  795. } else {
  796. bc = false
  797. //原值正确,不用抽取
  798. }
  799. }
  800. }
  801. //有省份
  802. bp := false
  803. if ProvinceBrief[province] != nil {
  804. bp = true
  805. } else { //没有省份,先识别省份
  806. for _, str := range []string{city, buyer, addr, projectname, title} {
  807. word := AreaProvinceGet.CheckSensitiveWord(str) //省全称
  808. if word != "" {
  809. province = ProvinceMap[word] //省简称
  810. bp = true
  811. break
  812. }
  813. }
  814. }
  815. //匹配城市
  816. if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不对,继续抽取
  817. //目前是全匹配模式,如果再加上精简匹配,加一层循环
  818. for pos, GET := range []DFA{AreaGet, AreaSimGet} {
  819. ws := make([]string, 5)
  820. for n, str := range []string{city, buyer, addr, projectname, title} {
  821. if str != "" {
  822. word := GET.CheckSensitiveWord(str)
  823. if pos == 1 { //用简称 后辍为路、集团替换
  824. str1 := strings.Replace(str, word+"路", "", 1)
  825. if str1 != str {
  826. word = GET.CheckSensitiveWord(str1)
  827. }
  828. }
  829. ws[n] = word
  830. if word != "" {
  831. res := AreaToCity[word]
  832. if len(res) == 1 {
  833. //判断省份
  834. if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回
  835. bres = true
  836. c = res[0].Brief
  837. p = res[0].P.Brief
  838. break
  839. } else { //不一致时。。暂时不处理
  840. }
  841. } else { //多个时
  842. }
  843. }
  844. }
  845. }
  846. if !bres {
  847. mc := map[string]int{}
  848. for _, w := range ws {
  849. res := AreaToCity[w]
  850. for _, ct := range res {
  851. if ct == nil {
  852. continue
  853. }
  854. if bp { //有省份
  855. if ct.P != nil && ct.P.Brief == province {
  856. mc[ct.Brief]++
  857. }
  858. } else { //没有省份
  859. mc[ct.Brief]++
  860. }
  861. }
  862. }
  863. //计算mc中最大值且大于1
  864. max := 1
  865. v := ""
  866. for mk, mv := range mc {
  867. if mv > max {
  868. v = mk
  869. }
  870. }
  871. if v != "" {
  872. bres = true
  873. c = CityBrief[v].Brief
  874. p = CityBrief[v].P.Brief
  875. } else if len(mc) > 0 {
  876. //取级别更大的
  877. v := ""
  878. for mk, _ := range mc {
  879. if CityBrief[mk].P.Cap == mk {
  880. bres = true
  881. c = CityBrief[mk].Brief
  882. p = CityBrief[mk].P.Brief
  883. break
  884. } else {
  885. v = mk
  886. }
  887. }
  888. if !bres {
  889. bres = true
  890. c = CityBrief[v].Brief
  891. p = CityBrief[v].P.Brief
  892. }
  893. }
  894. }
  895. if bres {
  896. break
  897. }
  898. }
  899. } else {
  900. return
  901. }
  902. if !bres {
  903. //取默认省会
  904. if ProvinceBrief[province] != nil {
  905. bres = true
  906. c = ProvinceBrief[province].Cap
  907. p = province
  908. }
  909. }
  910. return
  911. }