extract.go 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. "github.com/PuerkitoBio/goquery"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
  22. var (
  23. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  24. cut = ju.NewCut() //获取正文并清理
  25. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  26. TaskList map[string]*ExtractTask //任务列表
  27. ClearTaskList map[string]*ClearTask //清理任务列表
  28. saveLimit = 100 //抽取日志批量保存
  29. PageSize = 5000 //查询分页
  30. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
  31. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  32. )
  33. //启动测试抽取
  34. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  35. defer qu.Catch()
  36. ext := &ExtractTask{}
  37. ext.Id = taskId
  38. ext.IsRun = true
  39. ext.InitTestTaskInfo(resultcoll, trackcoll)
  40. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  41. ext.InitSite()
  42. ext.InitRulePres()
  43. ext.InitRuleBacks(false)
  44. ext.InitRuleBacks(true)
  45. ext.InitRuleCore(false)
  46. ext.InitRuleCore(true)
  47. ext.InitPkgCore()
  48. ext.InitBlockRule()
  49. ext.InfoTypeList()
  50. ext.InitTag(false)
  51. ext.InitTag(true)
  52. ext.InitClearFn(false)
  53. ext.InitClearFn(true)
  54. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  55. //初始化城市DFA信息
  56. ext.InitCityInfo()
  57. //ext.InitCityDFA()
  58. ext.InitAreaCode()
  59. ext.InitPostCode()
  60. }
  61. //质量审核
  62. ext.InitAuditFields()
  63. ext.InitAuditRule()
  64. ext.InitAuditClass()
  65. ext.InitAuditRecogField()
  66. //品牌抽取是否开启
  67. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  68. //附件抽取是否开启
  69. ext.InitFile()
  70. return RunExtractTestTask(ext, startId, num)
  71. }
  72. func IdTrans(startId string) bson.ObjectId {
  73. defer qu.Catch()
  74. return bson.ObjectIdHex(startId)
  75. }
  76. //开始测试任务抽取
  77. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  78. n, _ := strconv.Atoi(num)
  79. id := IdTrans(startId)
  80. if id.Valid() {
  81. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  82. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  83. for _, v := range *list {
  84. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  85. continue
  86. }
  87. var j, jf *ju.Job
  88. var isSite bool
  89. if ext.IsFileField && v["projectinfo"] != nil {
  90. v["isextFile"] = true
  91. j, jf, isSite = ext.PreInfo(v)
  92. } else {
  93. j, _, isSite = ext.PreInfo(v)
  94. }
  95. go ext.ExtractProcess(j, jf, isSite)
  96. ext.TaskInfo.ProcessPool <- true
  97. }
  98. return true
  99. } else {
  100. return false
  101. }
  102. }
  103. //启动抽取
  104. func StartExtractTaskId(taskId string) bool {
  105. defer qu.Catch()
  106. isgo := false
  107. ext := TaskList[taskId]
  108. if ext == nil {
  109. ext = &ExtractTask{}
  110. ext.Id = taskId
  111. ext.InitTaskInfo()
  112. isgo = true
  113. } else {
  114. ext.Id = taskId
  115. ext.InitTaskInfo()
  116. }
  117. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  118. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  119. ext.InitSite()
  120. ext.InitRulePres()
  121. ext.InitRuleBacks(false)
  122. ext.InitRuleBacks(true)
  123. ext.InitRuleCore(false)
  124. ext.InitRuleCore(true)
  125. ext.InitPkgCore()
  126. ext.InitBlockRule()
  127. ext.InfoTypeList()
  128. ext.InitTag(false)
  129. ext.InitTag(true)
  130. ext.InitClearFn(false)
  131. ext.InitClearFn(true)
  132. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  133. //初始化城市DFA信息
  134. //ext.InitCityDFA()
  135. ext.InitCityInfo()
  136. ext.InitAreaCode()
  137. ext.InitPostCode()
  138. }
  139. //质量审核
  140. ext.InitAuditFields()
  141. ext.InitAuditRule()
  142. ext.InitAuditClass()
  143. ext.InitAuditRecogField()
  144. //品牌抽取是否开启
  145. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  146. //附件抽取是否开启
  147. ext.InitFile()
  148. ext.IsRun = true
  149. go ext.ResultSave(true)
  150. go ext.BidSave(true)
  151. if isgo {
  152. go RunExtractTask(taskId)
  153. }
  154. TaskList[taskId] = ext
  155. return true
  156. }
  157. //停止抽取
  158. func StopExtractTaskId(taskId string) bool {
  159. defer qu.Catch()
  160. ext := TaskList[taskId]
  161. if ext != nil {
  162. ext.IsRun = false
  163. TaskList[taskId] = ext
  164. }
  165. //更新task.s_extlastid
  166. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  167. return true
  168. }
  169. //开始抽取
  170. func RunExtractTask(taskId string) {
  171. defer qu.Catch()
  172. ext := TaskList[taskId]
  173. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  174. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  175. pageNum := (count + PageSize - 1) / PageSize
  176. limit := PageSize
  177. if count < PageSize {
  178. limit = count
  179. }
  180. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  181. for i := 0; i < pageNum; i++ {
  182. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  183. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  184. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  185. for _, v := range *list {
  186. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  187. continue
  188. }
  189. //根据标题判断是否抽取
  190. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  191. if !b {
  192. continue
  193. }
  194. _id := qu.BsonIdToSId(v["_id"])
  195. //log.Debug(_id)
  196. if !ext.IsRun {
  197. break
  198. }
  199. var j, jf *ju.Job
  200. var isSite bool
  201. if ext.IsFileField && v["projectinfo"] != nil {
  202. v["isextFile"] = true
  203. j, jf, isSite = ext.PreInfo(v)
  204. } else {
  205. j, _, isSite = ext.PreInfo(v)
  206. }
  207. go ext.ExtractProcess(j, jf, isSite)
  208. ext.TaskInfo.LastExtId = _id
  209. ext.TaskInfo.ProcessPool <- true
  210. }
  211. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  212. if !ext.IsRun {
  213. break
  214. }
  215. }
  216. //更新task.s_extlastid
  217. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  218. }
  219. //信息预处理-不和版本关联,取最新版本的配置项
  220. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  221. return (&ExtractTask{}).PreInfo(doc)
  222. }
  223. //信息预处理-和版本关联
  224. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  225. defer qu.Catch()
  226. //判断是否有附件这个字段
  227. var isextFile bool
  228. if doc["isextFile"] != nil {
  229. isextFile = doc["isextFile"].(bool)
  230. }
  231. detail := ""
  232. d1, _ := doc["detail"].(string)
  233. d2, _ := doc["contenthtml"].(string)
  234. if len(d1) >= len(d2) || d2 == "" {
  235. detail = d1
  236. } else {
  237. detail = d2
  238. }
  239. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  240. d3, _ := doc["summary"].(string)
  241. //全文的需要修复表格
  242. detail = pretreated.RepairCon(detail)
  243. detail = ju.CutLableStr(d3 + "\n" + detail)
  244. detail = cut.ClearHtml(d3 + "\n" + detail)
  245. doc["detail"] = detail
  246. if isextFile {
  247. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  248. }
  249. //正文小于200个字,有附件把附件内容加到正文
  250. tmpDeatil := detail
  251. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  252. if err == nil {
  253. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  254. if conlen < 200 {
  255. if isextFile {
  256. detail += qu.ObjToString(doc["detailfile"])
  257. doc["detail"] = detail
  258. }
  259. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  260. //防止文本过长,造成抽取阻塞
  261. log.Debug("文本太长", doc["_id"], conlen)
  262. doc["detail"] = d3
  263. }
  264. }
  265. toptype := qu.ObjToString(doc["toptype"])
  266. subtype := qu.ObjToString(doc["subtype"])
  267. if qu.ObjToString(doc["type"]) == "bid" {
  268. toptype = "结果"
  269. }
  270. if toptype == "" {
  271. toptype = "all"
  272. }
  273. if subtype == "" {
  274. subtype = "all"
  275. }
  276. if toptype == "其它" || subtype == "其它" || subtype == "其他" || subtype == "结果变更" {
  277. toptype = "all"
  278. subtype = "all"
  279. }
  280. toMap := qu.ObjToMap(doc["jsondata"])
  281. //log.Debug("toMap", toMap)
  282. if (*toMap) != nil {
  283. if (*toMap)["extweight"] == nil {
  284. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  285. }
  286. }
  287. j = &ju.Job{
  288. SourceMid: qu.BsonIdToSId(doc["_id"]),
  289. Category: toptype,
  290. CategorySecond: subtype,
  291. Content: qu.ObjToString(doc["detail"]),
  292. SpiderCode: qu.ObjToString(doc["spidercode"]),
  293. Site: qu.ObjToString(doc["site"]),
  294. //Domain: qu.ObjToString(doc["domain"]),
  295. //Href: qu.ObjToString(doc["href"]),
  296. Title: qu.ObjToString(doc["title"]),
  297. Data: &doc,
  298. City: qu.ObjToString(doc["city"]),
  299. Province: qu.ObjToString(doc["area"]),
  300. Jsondata: toMap,
  301. Result: map[string][]*ju.ExtField{},
  302. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  303. RuleBlock: e.RuleBlock,
  304. }
  305. if isextFile {
  306. jf = &ju.Job{
  307. SourceMid: qu.BsonIdToSId(doc["_id"]),
  308. Category: toptype,
  309. Content: qu.ObjToString(doc["detailfile"]),
  310. SpiderCode: qu.ObjToString(doc["spidercode"]),
  311. Site: qu.ObjToString(doc["site"]),
  312. Title: qu.ObjToString(doc["title"]),
  313. Data: &doc,
  314. City: qu.ObjToString(doc["city"]),
  315. Province: qu.ObjToString(doc["area"]),
  316. Jsondata: toMap,
  317. Result: map[string][]*ju.ExtField{},
  318. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  319. RuleBlock: e.RuleBlock,
  320. IsFile: isextFile,
  321. }
  322. }
  323. //是否配置站点
  324. codeSite := j.SpiderCode
  325. exp, isSite := e.Luacodes.Load(codeSite)
  326. if isSite {
  327. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  328. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  329. }
  330. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  331. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  332. }
  333. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  334. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  335. }
  336. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  337. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  338. }
  339. }
  340. qu.Try(func() {
  341. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  342. if isextFile {
  343. pretreated.AnalyStart(jf, isSite, codeSite)
  344. }
  345. }, func(err interface{}) {
  346. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  347. })
  348. return j, jf, isSite
  349. }
  350. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  351. func file2text(doc *map[string]interface{}) {
  352. var strfileinfo bytes.Buffer
  353. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  354. if va, ok := v["attachments"].(map[string]interface{}); ok {
  355. for _, vaatt := range va {
  356. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  357. if qu.ObjToString(fileinfo["content"]) != "" {
  358. switch fileinfo["content"].(type) {
  359. case string:
  360. lock.Lock()
  361. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  362. lock.Unlock()
  363. case []map[string]interface{}:
  364. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  365. if fv["context"] != nil {
  366. lock.Lock()
  367. strfileinfo.WriteString(fv["context"].(string) + " \n")
  368. lock.Unlock()
  369. }
  370. }
  371. }
  372. }
  373. }
  374. }
  375. }
  376. }
  377. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  378. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  379. }
  380. }
  381. //抽取
  382. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  383. e.ExtractDetail(j, isSite, j.SpiderCode)
  384. if jf != nil && jf.IsFile {
  385. e.ExtractFile(jf, isSite, j.SpiderCode)
  386. }
  387. if isSite {
  388. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  389. if ok && ismerge.(bool) {
  390. tmpj := &ju.Job{
  391. SourceMid: j.SourceMid,
  392. Category: j.Category,
  393. CategorySecond: j.CategorySecond,
  394. Content: j.Content,
  395. SpiderCode: j.SpiderCode,
  396. //Domain: qu.ObjToString(doc["domain"]),
  397. //Href: qu.ObjToString(doc["href"]),
  398. Title: j.Title,
  399. Data: j.Data,
  400. City: j.City,
  401. Province: j.Province,
  402. Jsondata: j.Jsondata,
  403. Result: map[string][]*ju.ExtField{},
  404. BuyerAddr: j.BuyerAddr,
  405. RuleBlock: e.RuleBlock,
  406. }
  407. qu.Try(func() {
  408. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  409. }, func(err interface{}) {
  410. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  411. })
  412. e.ExtractDetail(tmpj, false, "")
  413. //if jf != nil && jf.IsFile {
  414. // e.ExtractFile(jf, false, "")
  415. //}
  416. //合并数据
  417. j.Block = append(j.Block, tmpj.Block...)
  418. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  419. for tmpk, _ := range j.Result {
  420. if len(tmpj.Result[tmpk]) > 0 {
  421. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  422. }
  423. }
  424. for tmpk, _ := range tmpj.Result {
  425. if len(j.Result[tmpk]) == 0 {
  426. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  427. }
  428. }
  429. }
  430. }
  431. // for _, b := range j.Block {
  432. // for _, vv := range b.ColonKV.Kvs {
  433. // qu.Debug(vv.Key, vv.Value)
  434. // }
  435. // for _, vv := range b.SpaceKV.Kvs {
  436. // qu.Debug(vv.Key, vv.Value)
  437. // }
  438. // // for _, vv := range b.TableKV.Kvs {
  439. // // qu.Debug(vv.Key, vv.Value)
  440. // // }
  441. // }
  442. //分析抽取结果并保存
  443. AnalysisSaveResult(j, jf, e)
  444. <-e.TaskInfo.ProcessPool
  445. }
  446. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  447. qu.Try(func() {
  448. doc := *j.Data
  449. //全局前置规则,结果覆盖doc属性
  450. //for _, v := range e.RulePres {
  451. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  452. //}
  453. tmprules := map[string][]*RuleCore{}
  454. lockrule.Lock()
  455. if j.Category == "all" || j.CategorySecond == "all" {
  456. if isSite {
  457. for k, vc1 := range e.SiteRuleCores["all_all"] {
  458. tmprules[k] = vc1
  459. }
  460. } else {
  461. for k, vc1 := range e.RuleCores["all_all"] {
  462. tmprules[k] = vc1
  463. }
  464. }
  465. } else {
  466. if isSite {
  467. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  468. tmprules[k] = vc1
  469. }
  470. } else {
  471. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  472. tmprules[k] = vc1
  473. }
  474. }
  475. }
  476. if len(tmprules) < 1 { //分类未覆盖部分
  477. if isSite {
  478. for k, vc1 := range e.RuleCores["all_all"] {
  479. tmprules[k] = vc1
  480. }
  481. } else {
  482. for k, vc1 := range e.SiteRuleCores["all_all"] {
  483. tmprules[k] = vc1
  484. }
  485. }
  486. }
  487. lockrule.Unlock()
  488. //抽取规则
  489. for _, vc1 := range tmprules {
  490. for _, vc := range vc1 {
  491. tmp := ju.DeepCopy(doc).(map[string]interface{})
  492. //是否进入逻辑
  493. if !ju.Logic(vc.LuaLogic, tmp) {
  494. continue
  495. }
  496. ////抽取-前置规则
  497. //for _, v := range vc.RulePres {
  498. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  499. //}
  500. // log.Debug("抽取-前置规则", tmp)
  501. //抽取-规则
  502. ExtRuleCore(tmp, e, vc, j, isSite)
  503. // log.Debug("抽取-规则", tmp)
  504. //抽取-后置规则
  505. for _, v := range vc.RuleBacks {
  506. ExtRegBack(j, v, e.TaskInfo)
  507. }
  508. //kv规则
  509. for _, v := range vc.KVRuleCores {
  510. ExtRuleKV(j, v, e.TaskInfo)
  511. }
  512. // log.Debug("抽取-后置规则", tmp)
  513. //项目名称未能抽取到,标题来凑
  514. if vc.Field == "projectname" {
  515. if vc.ExtFrom == "title" {
  516. isextitle := true
  517. for _, v := range j.Result[vc.Field] {
  518. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  519. isextitle = false
  520. break
  521. }
  522. }
  523. if isextitle { //标题加入选举
  524. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  525. if isSite {
  526. field.Score = 1
  527. }
  528. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  529. }
  530. }
  531. for i := 0; i < 3; i++ {
  532. for _, v := range vc.RuleBacks {
  533. ExtRegBack(j, v, e.TaskInfo)
  534. }
  535. }
  536. }
  537. }
  538. }
  539. //全局后置规则
  540. if isSite {
  541. for _, v := range e.SiteRuleBacks {
  542. ExtRegBack(j, v, e.TaskInfo)
  543. }
  544. } else {
  545. for _, v := range e.RuleBacks {
  546. ExtRegBack(j, v, e.TaskInfo)
  547. }
  548. }
  549. //函数清理
  550. for key, val := range j.Result {
  551. for _, v := range val {
  552. //qu.Debug(key, v.Value)
  553. lockclear.Lock()
  554. var cfn = []string{}
  555. if isSite {
  556. cfn = e.SiteClearFn[key]
  557. } else {
  558. cfn = e.ClearFn[key]
  559. }
  560. lockclear.Unlock()
  561. if len(cfn) == 0 {
  562. continue
  563. }
  564. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  565. before, _ := v.Value.(string)
  566. v.Value = data[0]
  567. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  568. //添加行数清理的日志
  569. //清理特殊符号
  570. lockclear.Lock()
  571. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  572. text := qu.ObjToString(v.Value)
  573. before = text
  574. v.Value = clear.OtherClean(key, text)
  575. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  576. }
  577. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  578. lockclear.Unlock()
  579. }
  580. }
  581. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  582. // bs, _ := json.Marshal(j.Result)
  583. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  584. }, func(err interface{}) {
  585. log.Debug("ExtractProcess err", err)
  586. })
  587. }
  588. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  589. qu.Try(func() {
  590. doc := *j.Data
  591. //全局前置规则,结果覆盖doc属性
  592. // for _, v := range e.RulePres {
  593. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  594. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  595. // }
  596. // }
  597. //抽取规则
  598. tmprules := map[string][]*RuleCore{}
  599. lockrule.Lock()
  600. if j.Category == "all" || j.CategorySecond == "all" {
  601. for k, vc1 := range e.RuleCores["all_all"] {
  602. tmprules[k] = vc1
  603. }
  604. } else {
  605. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  606. tmprules[k] = vc1
  607. }
  608. }
  609. lockrule.Unlock()
  610. for _, vc1 := range tmprules {
  611. for _, vc := range vc1 {
  612. tmp := ju.DeepCopy(doc).(map[string]interface{})
  613. //是否进入逻辑
  614. if !ju.Logic(vc.LuaLogic, tmp) {
  615. continue
  616. }
  617. //抽取-前置规则
  618. // for _, v := range vc.RulePres {
  619. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  620. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  621. // }
  622. // }
  623. // log.Debug("抽取-前置规则", tmp)
  624. //抽取-规则
  625. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  626. ExtRuleCore(tmp, e, vc, j, isSite)
  627. }
  628. // log.Debug("抽取-规则", tmp)
  629. //抽取-后置规则
  630. for _, v := range vc.RuleBacks {
  631. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  632. ExtRegBack(j, v, e.TaskInfo)
  633. }
  634. }
  635. // log.Debug("抽取-后置规则", tmp)
  636. }
  637. }
  638. //全局后置规则
  639. for _, v := range e.RuleBacks {
  640. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  641. ExtRegBack(j, v, e.TaskInfo)
  642. }
  643. }
  644. //函数清理
  645. for key, val := range j.Result {
  646. for _, v := range val {
  647. lockclear.Lock()
  648. cfn := e.ClearFn[key]
  649. lockclear.Unlock()
  650. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  651. v.Value = data[0]
  652. //清理特殊符号
  653. lockclear.Lock()
  654. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  655. clear.MesField[key] != nil {
  656. text := qu.ObjToString(v.Value)
  657. text = clear.OtherClean(key, text)
  658. v.Value = text
  659. }
  660. lockclear.Unlock()
  661. }
  662. }
  663. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  664. // bs, _ := json.Marshal(j.Result)
  665. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  666. }, func(err interface{}) {
  667. log.Debug("ExtractProcess err", err)
  668. })
  669. }
  670. //前置过滤
  671. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  672. defer qu.Catch()
  673. before := ju.DeepCopy(doc).(map[string]interface{})
  674. extinfo := map[string]interface{}{}
  675. if in.IsLua {
  676. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  677. if j != nil {
  678. lua.Block = j.Block
  679. }
  680. extinfo = lua.RunScript("pre")
  681. for k, v := range extinfo { //结果覆盖原doc
  682. doc[k] = v
  683. }
  684. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  685. } else {
  686. var key string
  687. if !j.IsFile {
  688. key = qu.If(in.Field == "", "detail", in.Field).(string)
  689. } else {
  690. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  691. }
  692. text := qu.ObjToString(doc[key])
  693. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  694. doc[key] = extinfo[key] //结果覆盖原doc
  695. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  696. }
  697. return doc
  698. }
  699. //抽取-规则
  700. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  701. //候选人加入
  702. var kvMap map[string][]map[string]interface{}
  703. extByReg := true
  704. if vc.ExtFrom != "title" {
  705. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  706. }
  707. for _, v := range vc.RuleCores {
  708. if v.IsLua {
  709. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  710. } else if extByReg {
  711. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  712. }
  713. }
  714. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  715. if vc.Field == "budget" && len(kvMap) == 0 {
  716. if len(j.BlockPackage) == 1 {
  717. for _, bp := range j.BlockPackage {
  718. for fieldname, field := range vc.LFields {
  719. if field != vc.Field {
  720. continue
  721. }
  722. tp := ""
  723. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  724. if k == 0 {
  725. tp = "colon"
  726. // for _, vv := range v.Kvs {
  727. // qu.Debug(vv.Key, vv.Value)
  728. // }
  729. } else if k == 1 {
  730. tp = "space"
  731. } else if k == 2 {
  732. tp = "table"
  733. }
  734. if v == nil || v.KvTags == nil {
  735. continue
  736. }
  737. for _, vv := range v.KvTags[fieldname] {
  738. text := ju.TrimLRSpace(vv.Value, "")
  739. if text != "" {
  740. tmp := &ju.ExtField{
  741. Field: vc.Field,
  742. Code: "CL_分包",
  743. Type: tp,
  744. MatchType: "package",
  745. RuleText: bp.Text,
  746. SourceValue: vv.Key,
  747. Value: text,
  748. }
  749. if isSite {
  750. tmp.Score = 1
  751. }
  752. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  753. }
  754. }
  755. }
  756. }
  757. break
  758. }
  759. }
  760. } else {
  761. for k, v := range kvMap {
  762. if j.Result[k] == nil {
  763. j.Result[k] = [](*ju.ExtField){}
  764. }
  765. for _, tmp := range v {
  766. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
  767. if isSite {
  768. field.Score = 1
  769. }
  770. if tmp["blocktag"] != nil {
  771. btag := make(map[string]string)
  772. for k := range tmp["blocktag"].(map[string]bool) {
  773. blocktag.Lock()
  774. if TagConfigDesc[k] != "" {
  775. btag[k] = TagConfigDesc[k]
  776. }
  777. blocktag.Unlock()
  778. }
  779. field.BlockTag = btag
  780. }
  781. j.Result[k] = append(j.Result[k], field)
  782. }
  783. }
  784. }
  785. }
  786. //抽取-规则-kv
  787. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  788. defer qu.Catch()
  789. if extfrom == "title" || !in.IsLua {
  790. return
  791. }
  792. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  793. lua.KvMap = *kvMap
  794. lua.Block = j.Block
  795. extinfo := lua.RunScript("core")
  796. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  797. for _, v := range tmps {
  798. v["core"] = in.Code
  799. }
  800. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  801. }
  802. if len(extinfo) > 0 {
  803. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  804. }
  805. }
  806. //抽取-规则-正则
  807. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  808. defer qu.Catch()
  809. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  810. b := IsExtract(in.Field, j.Title, j.Content)
  811. if !b {
  812. return
  813. }
  814. //全文正则
  815. //text := qu.ObjToString(doc[extfrom])
  816. //if in.Field != "" {
  817. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  818. // if len(extinfo) > 0 {
  819. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  820. // }
  821. //}
  822. //块抽取
  823. if in.Field != "" {
  824. if extfrom == "title" {
  825. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  826. if len(extinfo) > 0 {
  827. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  828. }
  829. } else {
  830. for _, v := range j.Block {
  831. btag := make(map[string]string)
  832. for k := range v.Classify {
  833. blocktag.Lock()
  834. btag[k] = TagConfigDesc[k]
  835. blocktag.Unlock()
  836. }
  837. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  838. if len(extinfo) > 0 {
  839. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  840. }
  841. }
  842. }
  843. }
  844. }
  845. //pkg抽取-规则-正则
  846. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  847. defer qu.Catch()
  848. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  849. b := IsExtract(in.Field, j.Title, j.Content)
  850. if !b {
  851. return
  852. }
  853. //块抽取
  854. if in.Field != "" {
  855. for k, vbpkg := range j.BlockPackage {
  856. rep := map[string]string{}
  857. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  858. if !((in.Field == "budget" && vbpkg.Budget > 0) || (in.Field == "bidamount" && vbpkg.Bidamount > 0) ||
  859. (in.Field == "winner" && vbpkg.Winner == "") || (in.Field == "bidstatus" && vbpkg.BidStatus == "") ||
  860. (in.Field == "projectname" && vbpkg.Name == "")) {
  861. continue
  862. }
  863. //处理正负数修正
  864. ptmp := strings.Split(in.RuleText, "#")
  865. sign := 0
  866. if len(ptmp) == 2 {
  867. if ptmp[1] == "正" {
  868. sign = 1
  869. } else if ptmp[1] == "负" {
  870. sign = -1
  871. }
  872. }
  873. tmp := strings.Split(ptmp[0], "__")
  874. if len(tmp) == 2 {
  875. epos := strings.Split(tmp[1], ",")
  876. posm := map[string]int{}
  877. for _, v := range epos {
  878. ks := strings.Split(v, ":")
  879. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  880. posm[ks[1]] = qu.IntAll(ks[0])
  881. } else {
  882. posm[in.Field] = qu.IntAll(ks[0])
  883. }
  884. }
  885. var pattern string
  886. if strings.Contains(tmp[0], "\\u") {
  887. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  888. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  889. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  890. } else {
  891. pattern = tmp[0]
  892. }
  893. //log.Debug("pattern", pattern)
  894. //fmt.Println(text)
  895. reg := regexp.MustCompile(pattern)
  896. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  897. for i, _ := range apos {
  898. pos := apos[i]
  899. for k, p := range posm {
  900. if len(pos) > p {
  901. if pos[p] == -1 || pos[p+1] == -1 {
  902. continue
  903. }
  904. val := vbpkg.Text[pos[p]:pos[p+1]]
  905. if string(val) == "" {
  906. continue
  907. }
  908. if sign == -1 {
  909. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  910. } else {
  911. rep[k+"_"+fmt.Sprint(i)] = val
  912. }
  913. }
  914. }
  915. }
  916. //fmt.Println(text)
  917. for i := 0; i < len(apos); i++ {
  918. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  919. if in.Field == "budget" && vbpkg.Budget <= 0 {
  920. lock.Lock()
  921. cfn := e.ClearFn[in.Field]
  922. lock.Unlock()
  923. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  924. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  925. break
  926. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  927. lock.Lock()
  928. cfn := e.ClearFn[in.Field]
  929. lock.Unlock()
  930. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  931. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  932. break
  933. } else if in.Field == "winner" {
  934. if j.BlockPackage[k].Winner == "" {
  935. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  936. break
  937. }
  938. } else if in.Field == "bidstatus" {
  939. if j.BlockPackage[k].BidStatus == "" {
  940. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  941. break
  942. }
  943. } else if in.Field == "projectname" {
  944. if j.BlockPackage[k].Name == "" {
  945. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  946. break
  947. }
  948. }
  949. }
  950. }
  951. }
  952. } else {
  953. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  954. val := ""
  955. if len(pos) == 2 {
  956. //"text" = "text"[pos[1]:]
  957. val = "text"[pos[1]:]
  958. rs := regexp.MustCompile("[^\r\n\t]+")
  959. tmp := rs.FindAllString("text", -1)
  960. if len(tmp) > 0 {
  961. val = tmp[0]
  962. }
  963. }
  964. if val != "" {
  965. if in.Field == "budget" && vbpkg.Budget <= 0 {
  966. lock.Lock()
  967. cfn := e.ClearFn[in.Field]
  968. lock.Unlock()
  969. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  970. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  971. break
  972. }
  973. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  974. lock.Lock()
  975. cfn := e.ClearFn[in.Field]
  976. lock.Unlock()
  977. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  978. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  979. break
  980. } else if in.Field == "bidstatus" {
  981. if j.BlockPackage[k].BidStatus == "" {
  982. j.BlockPackage[k].BidStatus = val
  983. break
  984. }
  985. } else if in.Field == "projectname" {
  986. if j.BlockPackage[k].Name == "" {
  987. j.BlockPackage[k].Name = val
  988. break
  989. }
  990. }
  991. }
  992. }
  993. }
  994. }
  995. }
  996. //lua脚本根据属性设置提取kv值
  997. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  998. kvmap := map[string][]map[string]interface{}{}
  999. if len(j.Winnerorder) > 1 {
  1000. if vc.Field == "bidamount" {
  1001. for _, v := range j.Winnerorder {
  1002. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1003. "code": "winnerorder",
  1004. "field": vc.Field,
  1005. "ruletext": "中标候选人",
  1006. "extfrom": vc.ExtFrom,
  1007. "sourcevalue": "中标候选人",
  1008. "value": v["price"],
  1009. "type": "winnerorder",
  1010. "matchtype": "winnerorder",
  1011. })
  1012. }
  1013. //候选人中标金额
  1014. if price := j.Winnerorder[0]["price"]; price != nil {
  1015. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1016. "code": "CL_中标候选人",
  1017. "field": vc.Field,
  1018. "ruletext": "中标候选人",
  1019. "extfrom": vc.ExtFrom,
  1020. "sourcevalue": "中标候选人",
  1021. "value": price,
  1022. "type": "winnerorder",
  1023. "matchtype": "winnerorder",
  1024. })
  1025. return kvmap, false
  1026. }
  1027. }
  1028. //else if vc.Field == "winner" {
  1029. // for _, v := range j.Winnerorder {
  1030. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1031. // "code": "winnerorder",
  1032. // "field": vc.Field,
  1033. // "ruletext": "中标候选人",
  1034. // "extfrom": vc.ExtFrom,
  1035. // "sourcevalue": "中标候选人",
  1036. // "value": v["entname"],
  1037. // "type": "winnerorder",
  1038. // "matchtype": "winnerorder",
  1039. // })
  1040. // }
  1041. // //候选人中标单位
  1042. // if entname := j.Winnerorder[0]["entname"]; entname != nil {
  1043. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1044. // "code": "CL_中标候选人",
  1045. // "field": vc.Field,
  1046. // "ruletext": "中标候选人",
  1047. // "extfrom": vc.ExtFrom,
  1048. // "sourcevalue": "中标候选人",
  1049. // "value": entname,
  1050. // "type": "winnerorder",
  1051. // "matchtype": "winnerorder",
  1052. // })
  1053. // return kvmap, false
  1054. // }
  1055. //}
  1056. }
  1057. for fieldname, field := range vc.LFields {
  1058. if field != vc.Field {
  1059. continue
  1060. }
  1061. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1062. }
  1063. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1064. return kvmap, true
  1065. }
  1066. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1067. //qu.Debug("fieldname+++", fieldname)
  1068. for _, bl := range blocks {
  1069. tp := ""
  1070. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1071. if k == 0 {
  1072. tp = "colon"
  1073. // for _, vv := range v.Kvs {
  1074. // qu.Debug("colon-kvs:", vv.Key, vv.Value)
  1075. // }
  1076. // for kkk, vv := range v.KvTags {
  1077. // for _, vvv := range vv {
  1078. // qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
  1079. // }
  1080. // }
  1081. } else if k == 1 {
  1082. tp = "space"
  1083. } else if k == 2 {
  1084. tp = "table"
  1085. // for _, vv := range v.Kvs {
  1086. // qu.Debug("table-kvs:", vv.Key, vv.Value)
  1087. // }
  1088. // for kkk, vv := range v.KvTags {
  1089. // for _, vvv := range vv {
  1090. // qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
  1091. // }
  1092. // }
  1093. }
  1094. if v == nil || v.KvTags == nil {
  1095. continue
  1096. }
  1097. for _, vv := range v.KvTags[fieldname] {
  1098. text := ju.TrimLRSpace(vv.Value, "")
  1099. if text != "" {
  1100. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1101. "code": "CL_" + vv.Key,
  1102. "field": field,
  1103. "ruletext": vv.Key,
  1104. "extfrom": vc.ExtFrom,
  1105. "sourcevalue": text,
  1106. "value": text,
  1107. "type": tp,
  1108. "matchtype": "tag_string",
  1109. "blocktag": bl.Classify,
  1110. "weight": vv.Weight,
  1111. })
  1112. //if field != "winnertel" && field != "winnerperson" {
  1113. // //break //暂定取第一个
  1114. //}
  1115. }
  1116. }
  1117. }
  1118. if len(kvmap[field]) == 0 {
  1119. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1120. }
  1121. }
  1122. }
  1123. //正则提取结果
  1124. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1125. defer qu.Catch()
  1126. var score int
  1127. if isSite {
  1128. score = 1
  1129. }
  1130. extinfo := map[string][]map[string]interface{}{}
  1131. rep := map[string]string{}
  1132. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1133. //处理正负数修正
  1134. ptmp := strings.Split(vre.RuleText, "#")
  1135. sign := 0
  1136. if len(ptmp) == 2 {
  1137. if ptmp[1] == "正" {
  1138. sign = 1
  1139. } else if ptmp[1] == "负" {
  1140. sign = -1
  1141. }
  1142. }
  1143. tmp := strings.Split(ptmp[0], "__")
  1144. if len(tmp) == 2 {
  1145. epos := strings.Split(tmp[1], ",")
  1146. posm := map[string]int{}
  1147. for _, v := range epos {
  1148. ks := strings.Split(v, ":")
  1149. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1150. posm[ks[1]] = qu.IntAll(ks[0])
  1151. } else {
  1152. posm[vre.Field] = qu.IntAll(ks[0])
  1153. }
  1154. }
  1155. var pattern string
  1156. if strings.Contains(tmp[0], "\\u") {
  1157. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1158. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1159. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1160. } else {
  1161. pattern = tmp[0]
  1162. }
  1163. //log.Debug("pattern", pattern)
  1164. //fmt.Println(text)
  1165. reg := regexp.MustCompile(pattern)
  1166. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1167. for i, _ := range apos {
  1168. pos := apos[i]
  1169. for k, p := range posm {
  1170. if len(pos) > p {
  1171. if pos[p] == -1 || pos[p+1] == -1 {
  1172. continue
  1173. }
  1174. val := text[pos[p]:pos[p+1]]
  1175. if string(val) == "" {
  1176. continue
  1177. }
  1178. if sign == -1 {
  1179. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1180. } else {
  1181. rep[k+"_"+fmt.Sprint(i)] = val
  1182. }
  1183. }
  1184. }
  1185. }
  1186. //fmt.Println(text)
  1187. tmps := []map[string]interface{}{}
  1188. for i := 0; i < len(apos); i++ {
  1189. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1190. tmp := map[string]interface{}{
  1191. "field": vre.Field,
  1192. "code": vre.Code,
  1193. "ruletext": vre.RuleText,
  1194. "extfrom": text,
  1195. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1196. "type": "regexp",
  1197. "matchtype": "regcontent",
  1198. "blocktag": *tag,
  1199. "score": score,
  1200. }
  1201. tmps = append(tmps, tmp)
  1202. exfield := ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)], Value: rep[vre.Field+"_"+fmt.Sprint(i)]}
  1203. if tmp["blocktag"] != nil {
  1204. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1205. }
  1206. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1207. }
  1208. }
  1209. if len(tmps) > 0 {
  1210. extinfo[vre.Field] = tmps
  1211. }
  1212. }
  1213. } else {
  1214. pos := vre.RegCore.Reg.FindStringIndex(text)
  1215. val := ""
  1216. if len(pos) == 2 {
  1217. text = text[pos[1]:]
  1218. rs := regexp.MustCompile("[^\r\n\t]+")
  1219. tmp := rs.FindAllString(text, -1)
  1220. if len(tmp) > 0 {
  1221. val = tmp[0]
  1222. }
  1223. }
  1224. if val != "" {
  1225. tmps := []map[string]interface{}{}
  1226. tmp := map[string]interface{}{
  1227. "field": vre.Field,
  1228. "code": vre.Code,
  1229. "ruletext": vre.RuleText,
  1230. "extfrom": text,
  1231. "value": val,
  1232. "type": "regexp",
  1233. "matchtype": "regcontent",
  1234. "blocktag": *tag,
  1235. "score": score,
  1236. }
  1237. tmps = append(tmps, tmp)
  1238. extinfo[vre.Field] = tmps
  1239. if j.Result[vre.Field] == nil {
  1240. j.Result[vre.Field] = [](*ju.ExtField){}
  1241. }
  1242. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
  1243. if tmp["blocktag"] != nil {
  1244. field.BlockTag = tmp["blocktag"].(map[string]string)
  1245. }
  1246. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1247. }
  1248. }
  1249. return extinfo
  1250. }
  1251. //后置过滤
  1252. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1253. defer qu.Catch()
  1254. if in.IsLua {
  1255. result := GetResultMapForLua(j)
  1256. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1257. if j != nil {
  1258. lua.Block = j.Block
  1259. }
  1260. extinfo := lua.RunScript("back")
  1261. for k, v := range extinfo {
  1262. if tmps, ok := v.([]map[string]interface{}); ok {
  1263. j.Result[k] = [](*ju.ExtField){}
  1264. for _, tmp := range tmps {
  1265. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"]}
  1266. if tmp["blocktag"] != nil {
  1267. field.BlockTag = tmp["blocktag"].(map[string]string)
  1268. }
  1269. j.Result[k] = append(j.Result[k], field)
  1270. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1271. }
  1272. }
  1273. }
  1274. if len(extinfo) > 0 {
  1275. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1276. }
  1277. } else {
  1278. extinfo := map[string]interface{}{}
  1279. if in.Field != "" {
  1280. if j.Result[in.Field] != nil {
  1281. tmp := j.Result[in.Field]
  1282. exts := []interface{}{}
  1283. for k, v := range tmp {
  1284. //table抽取到的数据不清理
  1285. // if v.Type == "table" && v.Field != "projectname" {
  1286. // continue
  1287. // }
  1288. text := qu.ObjToString(v.Value)
  1289. if text != "" {
  1290. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1291. }
  1292. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1293. continue
  1294. }
  1295. j.Result[in.Field][k].Value = text
  1296. exts = append(exts, map[string]interface{}{
  1297. "field": v.Field,
  1298. "code": v.Code,
  1299. "ruletext": v.RuleText,
  1300. "type": v.Type,
  1301. "matchtype": v.MatchType,
  1302. "extfrom": v.ExtFrom,
  1303. "value": text,
  1304. })
  1305. }
  1306. if len(exts) > 0 {
  1307. extinfo[in.Field] = exts
  1308. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1309. }
  1310. }
  1311. } else {
  1312. for key, tmp := range j.Result {
  1313. exts := []interface{}{}
  1314. for k, v := range tmp {
  1315. if v.Type == "table" { //table抽取到的数据不清理
  1316. continue
  1317. }
  1318. text := qu.ObjToString(v.Value)
  1319. if text != "" {
  1320. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1321. }
  1322. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1323. continue
  1324. }
  1325. j.Result[key][k].Value = text
  1326. exts = append(exts, map[string]interface{}{
  1327. "field": v.Field,
  1328. "code": v.Code,
  1329. "ruletext": v.RuleText,
  1330. "type": v.Type,
  1331. "matchtype": v.MatchType,
  1332. "extfrom": v.ExtFrom,
  1333. "value": text,
  1334. })
  1335. }
  1336. if len(exts) > 0 {
  1337. extinfo[key] = exts
  1338. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1339. }
  1340. }
  1341. }
  1342. }
  1343. }
  1344. //后置过滤
  1345. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1346. defer qu.Catch()
  1347. for k, v := range j.BlockPackage {
  1348. if in.Field == "winner" {
  1349. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1350. } else if in.Field == "bidstatus" {
  1351. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1352. } else if in.Field == "" {
  1353. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1354. } else if in.Field == "projectname" {
  1355. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1356. }
  1357. }
  1358. }
  1359. //KV过滤
  1360. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1361. defer qu.Catch()
  1362. extinfo := map[string]interface{}{}
  1363. if in.Field != "" {
  1364. if j.Result[in.Field] != nil {
  1365. tmp := j.Result[in.Field]
  1366. exts := []interface{}{}
  1367. for k, v := range tmp {
  1368. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1369. continue
  1370. }
  1371. text := qu.ObjToString(v.Value)
  1372. if text != "" {
  1373. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1374. }
  1375. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1376. continue
  1377. }
  1378. j.Result[in.Field][k].Value = text
  1379. exts = append(exts, map[string]interface{}{
  1380. "field": v.Field,
  1381. "code": v.Code,
  1382. "ruletext": v.RuleText,
  1383. "type": v.Type,
  1384. "matchtype": v.MatchType,
  1385. "extfrom": v.ExtFrom,
  1386. "value": text,
  1387. })
  1388. }
  1389. if len(exts) > 0 {
  1390. extinfo[in.Field] = exts
  1391. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1392. }
  1393. }
  1394. }
  1395. }
  1396. //获取抽取结果map[string][]interface{},lua脚本使用
  1397. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1398. defer qu.Catch()
  1399. result := map[string][]map[string]interface{}{}
  1400. for key, val := range j.Result {
  1401. if result[key] == nil {
  1402. result[key] = []map[string]interface{}{}
  1403. }
  1404. for _, v := range val {
  1405. tmp := map[string]interface{}{
  1406. "field": v.Field,
  1407. "code": v.Code,
  1408. "ruletext": v.RuleText,
  1409. "value": v.Value,
  1410. "type": v.Type,
  1411. "matchtype": v.MatchType,
  1412. "extfrom": v.ExtFrom,
  1413. }
  1414. result[key] = append(result[key], tmp)
  1415. }
  1416. }
  1417. return result
  1418. }
  1419. //抽取日志
  1420. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1421. defer qu.Catch()
  1422. if !t.IsEtxLog {
  1423. return
  1424. }
  1425. logdata := map[string]interface{}{
  1426. "code": qu.If(v.Code == "", "kv", v.Code),
  1427. "name": v.Name,
  1428. "type": ftype,
  1429. "ruletext": v.RuleText,
  1430. "islua": v.IsLua,
  1431. "field": v.Field,
  1432. "version": t.Version,
  1433. "taskname": t.Name,
  1434. "before": before,
  1435. "extinfo": extinfo,
  1436. "sid": sid,
  1437. "comeintime": time.Now().Unix(),
  1438. }
  1439. lock.Lock()
  1440. ExtLogs[t] = append(ExtLogs[t], logdata)
  1441. lock.Unlock()
  1442. }
  1443. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1444. exts := []map[string]interface{}{}
  1445. exts = append(exts, map[string]interface{}{
  1446. "field": ext.Field,
  1447. "code": ext.Code,
  1448. "type": ftype,
  1449. "matchtype": matchtype,
  1450. "extfrom": ext.ExtFrom,
  1451. "value": ext.Value,
  1452. })
  1453. extinfo := map[string]interface{}{
  1454. ext.Field: exts,
  1455. }
  1456. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1457. }
  1458. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1459. defer qu.Catch()
  1460. if !t.IsEtxLog {
  1461. return
  1462. }
  1463. logdata := map[string]interface{}{
  1464. "code": code,
  1465. "name": name,
  1466. "type": ftype,
  1467. "ruletext": "",
  1468. "islua": false,
  1469. "field": field,
  1470. "version": t.Version,
  1471. "taskname": t.Name,
  1472. "before": before,
  1473. "extinfo": extinfo,
  1474. "sid": sid,
  1475. "comeintime": time.Now().Unix(),
  1476. }
  1477. lock.Lock()
  1478. ExtLogs[t] = append(ExtLogs[t], logdata)
  1479. lock.Unlock()
  1480. }
  1481. //保存抽取日志
  1482. func SaveExtLog() {
  1483. defer qu.Catch()
  1484. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1485. lock.Lock()
  1486. tmpLogs = ExtLogs
  1487. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1488. lock.Unlock()
  1489. for k, v := range tmpLogs {
  1490. if len(v) < saveLimit {
  1491. db.Mgo.SaveBulk(k.TrackColl, v...)
  1492. } else {
  1493. for {
  1494. if len(v) > saveLimit {
  1495. tmp := v[:saveLimit]
  1496. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1497. v = v[saveLimit:]
  1498. } else {
  1499. db.Mgo.SaveBulk(k.TrackColl, v...)
  1500. break
  1501. }
  1502. }
  1503. }
  1504. }
  1505. time.AfterFunc(10*time.Second, SaveExtLog)
  1506. }
// FieldValue pairs an arbitrary value with an integer count.
// NOTE(review): not referenced in the visible part of this file; presumably
// used to tally how often a value occurs — confirm against its callers.
type FieldValue struct {
	Value interface{}
	Count int
}
// AnalysisSaveResult scores and sorts the extraction results of j (and of the
// attachment job jf, when non-nil), assembles the document to persist and
// either queues it for bulk saving or, for test tasks, upserts it directly.
//
// The whole body runs inside qu.Try; any panic is reported through log.Debug.
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
	qu.Try(func() {
		// Re-read the winner candidates after cleaning.
		resetWinnerorder(j)
		doc, result, _id := funcAnalysis(j, e)
		if ju.IsSaveTag {
			// Tag statistics are saved in the background; nothing waits for this goroutine.
			go otherNeedSave(j, result, e)
		}
		auxinfo := auxInfo(j)
		// Pick values from the sorted results.
		tmp := map[string]interface{}{} // extracted values to persist
		tmp["spidercode"] = j.SpiderCode
		tmp["site"] = j.Site
		tmp["jsondata"] = j.Jsondata
		tmp["fieldall"] = auxinfo
		for _, val := range result {
			for _, v := range val { // take the first candidate with a non-negative score; projectname takes the first candidate regardless of score
				if v.Score > -1 {
					tmp[v.Field] = v.Value
					break
				} else if v.Field == "projectname" {
					tmp[v.Field] = v.Value
					break
				}
			}
		}
		if len(j.PackageInfo) > 0 { // package information
			tmp["package"] = j.PackageInfo
		}
		if len(j.Winnerorder) > 0 { // winner candidate information
			tmp["winnerorder"] = j.Winnerorder
		}
		// Attachment handling: same selection, stored under "ffield" / "fieldallf".
		var resultf map[string][]*ju.ExtField
		if jf != nil {
			_, resultf, _ = funcAnalysis(jf, e)
			auxinfof := auxInfo(jf)
			tmp["fieldallf"] = auxinfof
			ffield := map[string]interface{}{}
			for _, val := range resultf {
				for _, v := range val { // take the first candidate with a non-negative score
					if v.Score > -1 {
						ffield[v.Field] = v.Value
						break
					}
				}
			}
			if len(jf.PackageInfo) > 0 { // package information
				ffield["package"] = jf.PackageInfo
			}
			if len(jf.Winnerorder) > 0 { // winner candidate information
				ffield["winnerorder"] = jf.Winnerorder
			}
			tmp["ffield"] = ffield
		}
		// Copy over source-document fields that extraction did not produce.
		for k, v := range *doc {
			// Skip redundant fields.
			if delFiled(k) {
				continue
			}
			if tmp[k] == nil {
				tmp[k] = v
			}
		}
		// Quality audit.
		if ju.QualityAudit {
			e.QualityAudit(tmp)
		}
		if e.IsExtractCity { // city extraction
			//e.ExtractCity(j, tmp, _id)
			e.NewExtractCity(j, tmp, _id)
			// b, p, c, d := e.TransmitData(tmp, _id) // extract province / city
			// // log.Debug("province---", p, "city---", c, "district---", d)
			// tmp["district"] = d
			// if b {
			// tmp["city"] = c
			// tmp["area"] = p
			// }
		}
		// Brand extraction.
		if ju.IsBrandGoods {
			tmp["checkhas"] = map[string]int{
				"hastable": j.HasTable,
				"hasgoods": j.HasGoods,
				"hasbrand": j.HasBrand,
				"haskey":   j.HasKey,
			}
			if len(j.BrandData) > 0 {
				tmp["tablebrand"] = j.BrandData
			}
			// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
		}
		// String made of all key/value pairs.
		var kvtext bytes.Buffer
		blocks := make([]ju.BlockAndTag, 0)
		for _, v := range j.Block {
			// Blocks and their tags.
			if ju.SaveBlock {
				// Round-trip the block through JSON into the storage shape; the
				// marshal error is ignored.
				xx, _ := json.Marshal(v)
				tmpblock := new(ju.TmpBlock)
				err := json.Unmarshal(xx, &tmpblock)
				if err != nil {
					// The direct conversion failed: serialise the package and KV
					// parts to strings by hand instead.
					if v.BPackage != nil {
						bpb, _ := json.Marshal(v.BPackage)
						tmpblock.BPackage = string(bpb)
					}
					tmpblock = rangeBlockToJson(v, *tmpblock)
				}
				blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
			}
			// Concatenate all tagged key/value pairs into one string for storage.
			for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
				if jv == nil {
					continue
				}
				for jv_k, jv_v := range jv.KvTags {
					for _, jv_vv := range jv_v {
						kvtext.WriteString(jv_k)
						kvtext.WriteString(":")
						kvtext.WriteString(jv_vv.Value)
						kvtext.WriteString(" ")
					}
				}
			}
		}
		if kvtext.Len() > 0 {
			tmp["kvtext"] = kvtext.String()
		}
		if len(blocks) > 0 {
			if blocksBytes, err := json.Marshal(blocks); err == nil {
				// Blocks are only stored when the JSON is under 100000 runes.
				if utf8.RuneCount(blocksBytes) < 100000 {
					tmp["blocks"] = string(blocksBytes)
				}
			}
		}
		//tmp["extract_content"] = j.Content
		if e.TaskInfo.TestColl == "" {
			if len(tmp) > 0 { // save the extraction result
				/* Disabled: blanking fields missing from tmp to overwrite data
				   from earlier versions.
				   if len(e.SiteFields) <= 0 {
				       //for field, _ := range e.Fields {
				       //    if tmp[field] == nil && {
				       //        tmp[field] = ""
				       //    }
				       //}
				   } else {
				       //for field, _ := range e.SiteFields {
				       //    if tmp[field] == nil &&{
				       //        tmp[field] = ""
				       //    }
				       //}
				   }*/
				tmp["repeat"] = 0
				// Pair of (selector, update) queued on e.BidArr.
				tmparr := []map[string]interface{}{
					map[string]interface{}{
						"_id": qu.StringTOBsonId(_id),
					},
					map[string]interface{}{"$set": tmp},
				}
				e.RWMutex.Lock()
				e.BidArr = append(e.BidArr, tmparr)
				e.BidTotal++
				e.RWMutex.Unlock()
			}
			if ju.SaveResult {
				// NOTE(review): tmp is the same map already queued in e.BidArr
				// above, so adding "result"/"resultf" and deleting "_id" here is
				// also visible through that queued update — confirm this is intended.
				id := tmp["_id"]
				tmp["result"] = result
				tmp["resultf"] = resultf
				delete(tmp, "_id")
				tmparr := []map[string]interface{}{
					map[string]interface{}{
						"_id": id,
					},
					map[string]interface{}{"$set": tmp},
				}
				e.RWMutex.Lock()
				e.ResultArr = append(e.ResultArr, tmparr)
				e.RWMutex.Unlock()
			}
		} else { // test run: write straight to the test collection
			// fmt.Println("=============extraction result================")
			// for k, v := range tmp {
			// qu.Debug(k, "---", v)
			// }
			// for field, _ := range e.Fields {
			// qu.Debug(field, "---", tmp[field])
			// }
			delete(tmp, "_id")
			if len(j.BlockPackage) > 0 { // package details
				bs, _ := json.Marshal(j.BlockPackage)
				tmp["epackage"] = string(bs)
			}
			tmp["result"] = result
			tmp["resultf"] = resultf
			// The selector is built by string concatenation around _id.
			b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
			if !b {
				log.Debug(e.TaskInfo.TestColl, _id)
			}
		}
	}, func(err interface{}) {
		log.Debug("AnalysisSaveResult err", err)
	})
}
  1714. //保存其他
  1715. //kv、表格、块上的标签凡是新的标签都入库
  1716. //val type times firstid createtime 判定field
  1717. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1718. now := time.Now().Unix()
  1719. coll := e.TaskInfo.TestColl
  1720. if coll == "" {
  1721. coll = "extract_tag_result"
  1722. } else {
  1723. coll += "_tag"
  1724. }
  1725. datas := []map[string]interface{}{}
  1726. kv := map[string]int{}
  1727. for _, v := range j.Block {
  1728. //
  1729. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1730. if vv == nil || vv.KvTags == nil {
  1731. continue
  1732. }
  1733. for kkk, vvv := range vv.KvTags {
  1734. for _, vvvv := range vvv {
  1735. if vvvv.IsInvalid {
  1736. kv[kkk] = kv[kkk] + 1
  1737. break
  1738. }
  1739. }
  1740. }
  1741. }
  1742. for _, vv := range v.NotClassifyTitles {
  1743. datas = append(datas, map[string]interface{}{
  1744. "val": vv,
  1745. "times": 0,
  1746. "type": "block",
  1747. "firstid": j.SourceMid,
  1748. "createtime": now,
  1749. })
  1750. if len(datas) == saveLimit {
  1751. db.Mgo.SaveBulk(coll, datas...)
  1752. datas = []map[string]interface{}{}
  1753. }
  1754. }
  1755. }
  1756. for k, v := range kv {
  1757. datas = append(datas, map[string]interface{}{
  1758. "val": k,
  1759. "times": v,
  1760. "type": "kv",
  1761. "firstid": j.SourceMid,
  1762. "createtime": now,
  1763. })
  1764. if len(datas) == saveLimit {
  1765. db.Mgo.SaveBulk(coll, datas...)
  1766. datas = []map[string]interface{}{}
  1767. }
  1768. }
  1769. if len(datas) > 0 {
  1770. db.Mgo.SaveBulk(coll, datas...)
  1771. }
  1772. }
  1773. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1774. if j == nil {
  1775. return nil
  1776. }
  1777. if len(j.Block) > 0 {
  1778. for i, v := range j.Block {
  1779. rangetmp := new(ju.TmpBlock)
  1780. vb, _ := json.Marshal(v)
  1781. json.Unmarshal(vb, &rangetmp)
  1782. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1783. }
  1784. }
  1785. if j.ColonKV != nil {
  1786. cb, _ := json.Marshal(j.ColonKV)
  1787. tmpblock.ColonKV = string(cb)
  1788. }
  1789. if j.SpaceKV != nil {
  1790. sb, _ := json.Marshal(j.SpaceKV)
  1791. tmpblock.SpaceKV = string(sb)
  1792. }
  1793. if j.TableKV != nil {
  1794. tb, _ := json.Marshal(j.TableKV)
  1795. tmpblock.TableKV = string(tb)
  1796. }
  1797. return &tmpblock
  1798. }
  1799. //去重冗余字段
  1800. func delFiled(k string) bool {
  1801. return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1802. }
  1803. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1804. defer qu.Catch()
  1805. doc := j.Data
  1806. result := j.Result
  1807. _id := qu.BsonIdToSId((*doc)["_id"])
  1808. result = ScoreFields(j, e.Tag) //正负面词打分
  1809. //结果排序
  1810. for _, val := range result {
  1811. ju.Sort(val)
  1812. }
  1813. j.Result = JsonDataMergeProcessing(j, e)
  1814. return doc, result, _id
  1815. }
  1816. //辅助信息,如果没有排序先排序
  1817. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1818. fieldalls := map[string][]map[string]interface{}{}
  1819. for field, val := range j.Result {
  1820. //ju.Sort(val)
  1821. sfields := []map[string]interface{}{}
  1822. for _, v := range val {
  1823. standardized := false
  1824. if field == "buyer" || field == "winner" || field == "agency" {
  1825. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1826. if i > 0 {
  1827. standardized = true
  1828. }
  1829. }
  1830. sfield := map[string]interface{}{
  1831. "val": v.Value,
  1832. "type": v.Type,
  1833. "score": v.Score,
  1834. "blocktag": v.BlockTag,
  1835. "sourceval": v.SourceValue,
  1836. "standardized": standardized,
  1837. }
  1838. sfields = append(sfields, sfield)
  1839. }
  1840. fieldalls[field] = sfields
  1841. }
  1842. return fieldalls
  1843. }
  1844. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1845. defer qu.Catch()
  1846. //获取审核字段
  1847. for _, field := range e.AuditFields {
  1848. //1.分包
  1849. if resulttmp["package"] != nil {
  1850. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1851. for _, val := range packagedata {
  1852. if val[field] != nil {
  1853. fv := qu.ObjToString(val[field])
  1854. if fv != "" {
  1855. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1856. e.RedisMatch(field, fv, val) //redis匹配
  1857. } else { //除了buyer和winner,其他字段走规则匹配
  1858. e.RuleMatch(field, fv, val)
  1859. }
  1860. }
  1861. }
  1862. }
  1863. }
  1864. //2.外围
  1865. if resulttmp[field] != nil {
  1866. fv := qu.ObjToString(resulttmp[field])
  1867. if fv != "" {
  1868. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1869. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1870. } else { //除了buyer和winner,其他字段走规则匹配
  1871. e.RuleMatch(field, fv, resulttmp)
  1872. }
  1873. }
  1874. }
  1875. }
  1876. }
  1877. //Redis匹配
  1878. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1879. defer qu.Catch()
  1880. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1881. if i == 0 { //reids未找到,执行规则匹配
  1882. val[field+"_isredis"] = false
  1883. e.RuleMatch(field, fv, val) //规则匹配
  1884. } else { //redis找到,打标识存库
  1885. val[field+"_isredis"] = true
  1886. }
  1887. }
  1888. //规则匹配
  1889. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1890. defer qu.Catch()
  1891. if fieldval != "" {
  1892. SMap := e.StartMatch(field, fieldval)
  1893. //SMap.AddKey(field+"_isaudit", false)
  1894. for _, k := range SMap.Keys {
  1895. tmpMap[k] = SMap.Map[k]
  1896. }
  1897. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1898. }
  1899. }
  1900. //开始规则匹配
  1901. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1902. defer qu.Catch()
  1903. SMap := pretreated.NewSortMap()
  1904. lock.Lock()
  1905. f := e.RecogFieldMap[field]
  1906. lock.Unlock()
  1907. if len(f) > 0 {
  1908. fid := qu.BsonIdToSId(f["_id"])
  1909. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1910. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1911. if textAfterRecogFieldPrerule != "" {
  1912. lock.Lock()
  1913. classMap := e.FidClassMap[fid]
  1914. lock.Unlock()
  1915. L:
  1916. for _, c := range classMap { //class
  1917. classid := qu.BsonIdToSId(c["_id"])
  1918. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1919. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1920. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1921. if textAfterClassPrerule != "" {
  1922. lock.Lock()
  1923. ruleMap := e.CidRuleMap[classid]
  1924. lock.Unlock()
  1925. for _, r := range ruleMap { //rule
  1926. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1927. s_name := qu.ObjToString(r["s_name"])
  1928. rule := r["rule"].([]interface{})
  1929. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1930. if textAfterRulePrerule != "" {
  1931. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1932. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1933. if savefield != "" { //保存字段不为空,存储代码信息
  1934. SMap.AddKey(field+"_"+savefield, s_name)
  1935. }
  1936. break L
  1937. }
  1938. }
  1939. }
  1940. }
  1941. }
  1942. }
  1943. }
  1944. return SMap
  1945. }
  1946. //中标候选人经过清理之后,重新取出赋值
  1947. func resetWinnerorder(j *ju.Job) {
  1948. if len(j.Winnerorder) == 0 {
  1949. return
  1950. }
  1951. maxlen := len(j.Winnerorder) - 1
  1952. //中标单位
  1953. i := 0
  1954. winners := []*ju.ExtField{}
  1955. for _, v := range j.Result["winner"] {
  1956. if v.Code == "winnerorder" {
  1957. if maxlen < i {
  1958. continue
  1959. }
  1960. j.Winnerorder[i]["entname"] = v.Value
  1961. i++
  1962. } else {
  1963. winners = append(winners, v)
  1964. }
  1965. }
  1966. j.Result["winner"] = winners
  1967. //中标金额
  1968. i = 0
  1969. bidamounts := []*ju.ExtField{}
  1970. for _, v := range j.Result["bidamount"] {
  1971. if v.Code == "winnerorder" {
  1972. if maxlen < i {
  1973. continue
  1974. }
  1975. j.Winnerorder[i]["price"] = v.Value
  1976. i++
  1977. } else {
  1978. bidamounts = append(bidamounts, v)
  1979. }
  1980. }
  1981. j.Result["bidamount"] = bidamounts
  1982. }