// extract.go

  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "sync"
  16. "time"
  17. "unicode/utf8"
  18. "github.com/PuerkitoBio/goquery"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
// Package-level state shared by the extraction functions in this file.
var (
	// Package-wide mutexes. In this file lockrule is held while rule-core maps
	// are read, lockclear while clear-function tables are read, and blocktag
	// while TagConfigDesc is read. locktag is not used in the visible part of
	// the file.
	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
	cut                                          = ju.NewCut() // extracts the main text and cleans it
	// ExtLogs holds extraction logs, keyed by task info.
	ExtLogs map[*TaskInfo][]map[string]interface{}
	// TaskList holds the extraction tasks, keyed by task id.
	// NOTE(review): it is written without being initialised in this chunk —
	// presumably it is created elsewhere; confirm.
	TaskList map[string]*ExtractTask
	// ClearTaskList holds the cleaning tasks, keyed by task id.
	ClearTaskList map[string]*ClearTask
	saveLimit     = 100  // batch size used when saving extraction logs
	PageSize      = 5000 // page size for source-collection queries
	// Fields is the projection passed to Find when loading source documents.
	Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
	// Fields2 is a second, smaller projection; it is not used in the visible
	// part of the file.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
)
  33. //启动测试抽取
  34. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  35. defer qu.Catch()
  36. ext := &ExtractTask{}
  37. ext.Id = taskId
  38. ext.IsRun = true
  39. ext.InitTestTaskInfo(resultcoll, trackcoll)
  40. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  41. ext.InitSite()
  42. ext.InitRulePres()
  43. ext.InitRuleBacks(false)
  44. ext.InitRuleCore(false)
  45. ext.InitPkgCore()
  46. ext.InitBlockRule()
  47. ext.InfoTypeList()
  48. ext.InitTag(false)
  49. ext.InitClearFn(false)
  50. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  51. //初始化城市DFA信息
  52. ext.InitCityInfo()
  53. //ext.InitCityDFA()
  54. ext.InitAreaCode()
  55. ext.InitPostCode()
  56. }
  57. //质量审核
  58. ext.InitAuditFields()
  59. ext.InitAuditRule()
  60. ext.InitAuditClass()
  61. ext.InitAuditRecogField()
  62. //品牌抽取是否开启
  63. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  64. //附件抽取是否开启
  65. ext.InitFile()
  66. return RunExtractTestTask(ext, startId, num)
  67. }
  68. func IdTrans(startId string) bson.ObjectId {
  69. defer qu.Catch()
  70. return bson.ObjectIdHex(startId)
  71. }
  72. //开始测试任务抽取
  73. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  74. n, _ := strconv.Atoi(num)
  75. id := IdTrans(startId)
  76. if id.Valid() {
  77. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  78. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  79. for _, v := range *list {
  80. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  81. continue
  82. }
  83. var j, jf *ju.Job
  84. var isSite bool
  85. if ext.IsFileField && v["projectinfo"] != nil {
  86. v["isextFile"] = true
  87. j, jf, isSite = ext.PreInfo(v)
  88. } else {
  89. j, _, isSite = ext.PreInfo(v)
  90. }
  91. go ext.ExtractProcess(j, jf, isSite)
  92. ext.TaskInfo.ProcessPool <- true
  93. }
  94. return true
  95. } else {
  96. return false
  97. }
  98. }
  99. //启动抽取
  100. func StartExtractTaskId(taskId string) bool {
  101. defer qu.Catch()
  102. isgo := false
  103. ext := TaskList[taskId]
  104. if ext == nil {
  105. ext = &ExtractTask{}
  106. ext.Id = taskId
  107. ext.InitTaskInfo()
  108. isgo = true
  109. } else {
  110. ext.Id = taskId
  111. ext.InitTaskInfo()
  112. }
  113. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  114. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  115. ext.InitSite()
  116. ext.InitRulePres()
  117. ext.InitRuleBacks(false)
  118. ext.InitRuleCore(false)
  119. ext.InitPkgCore()
  120. ext.InitBlockRule()
  121. ext.InfoTypeList()
  122. ext.InitTag(false)
  123. ext.InitClearFn(false)
  124. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  125. //初始化城市DFA信息
  126. //ext.InitCityDFA()
  127. ext.InitCityInfo()
  128. ext.InitAreaCode()
  129. ext.InitPostCode()
  130. }
  131. //质量审核
  132. ext.InitAuditFields()
  133. ext.InitAuditRule()
  134. ext.InitAuditClass()
  135. ext.InitAuditRecogField()
  136. //品牌抽取是否开启
  137. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  138. //附件抽取是否开启
  139. ext.InitFile()
  140. ext.IsRun = true
  141. go ext.ResultSave(true)
  142. go ext.BidSave(true)
  143. if isgo {
  144. go RunExtractTask(taskId)
  145. }
  146. TaskList[taskId] = ext
  147. return true
  148. }
  149. //停止抽取
  150. func StopExtractTaskId(taskId string) bool {
  151. defer qu.Catch()
  152. ext := TaskList[taskId]
  153. if ext != nil {
  154. ext.IsRun = false
  155. TaskList[taskId] = ext
  156. }
  157. //更新task.s_extlastid
  158. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  159. return true
  160. }
  161. //开始抽取
  162. func RunExtractTask(taskId string) {
  163. defer qu.Catch()
  164. ext := TaskList[taskId]
  165. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  166. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  167. pageNum := (count + PageSize - 1) / PageSize
  168. limit := PageSize
  169. if count < PageSize {
  170. limit = count
  171. }
  172. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  173. for i := 0; i < pageNum; i++ {
  174. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  175. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  176. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  177. for _, v := range *list {
  178. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  179. continue
  180. }
  181. //根据标题判断是否抽取
  182. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  183. if !b {
  184. continue
  185. }
  186. _id := qu.BsonIdToSId(v["_id"])
  187. //log.Debug(_id)
  188. if !ext.IsRun {
  189. break
  190. }
  191. var j, jf *ju.Job
  192. var isSite bool
  193. if ext.IsFileField && v["projectinfo"] != nil {
  194. v["isextFile"] = true
  195. j, jf, isSite = ext.PreInfo(v)
  196. } else {
  197. j, _, isSite = ext.PreInfo(v)
  198. }
  199. go ext.ExtractProcess(j, jf, isSite)
  200. ext.TaskInfo.LastExtId = _id
  201. ext.TaskInfo.ProcessPool <- true
  202. }
  203. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  204. if !ext.IsRun {
  205. break
  206. }
  207. }
  208. //更新task.s_extlastid
  209. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  210. }
  211. //信息预处理-不和版本关联,取最新版本的配置项
  212. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  213. return (&ExtractTask{}).PreInfo(doc)
  214. }
  215. //信息预处理-和版本关联
  216. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  217. defer qu.Catch()
  218. //判断是否有附件这个字段
  219. var isextFile bool
  220. if doc["isextFile"] != nil {
  221. isextFile = doc["isextFile"].(bool)
  222. }
  223. detail := ""
  224. d1, _ := doc["detail"].(string)
  225. d2, _ := doc["contenthtml"].(string)
  226. if len(d1) >= len(d2) || d2 == "" {
  227. detail = d1
  228. } else {
  229. detail = d2
  230. }
  231. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  232. d3, _ := doc["summary"].(string)
  233. //全文的需要修复表格
  234. detail = pretreated.RepairCon(detail)
  235. detail = ju.CutLableStr(d3 + "\n" + detail)
  236. detail = cut.ClearHtml(d3 + "\n" + detail)
  237. doc["detail"] = detail
  238. if isextFile {
  239. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  240. }
  241. //正文小于50个字,有附件把附件内容加到正文
  242. tmpDeatil := detail
  243. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  244. if err == nil {
  245. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  246. if conlen < 50 {
  247. if isextFile {
  248. detail += qu.ObjToString(doc["detailfile"])
  249. doc["detail"] = detail
  250. }
  251. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  252. //防止文本过长,造成抽取阻塞
  253. log.Debug("文本太长", doc["_id"], conlen)
  254. doc["detail"] = d3
  255. }
  256. }
  257. toptype := qu.ObjToString(doc["toptype"])
  258. subtype := qu.ObjToString(doc["subtype"])
  259. if qu.ObjToString(doc["type"]) == "bid" {
  260. toptype = "结果"
  261. }
  262. if toptype == "" {
  263. toptype = "all"
  264. }
  265. if subtype == "" {
  266. subtype = "all"
  267. }
  268. toMap := qu.ObjToMap(doc["jsondata"])
  269. if (*toMap) != nil {
  270. if (*toMap)["extweight"] == nil {
  271. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  272. }
  273. }
  274. j = &ju.Job{
  275. SourceMid: qu.BsonIdToSId(doc["_id"]),
  276. Category: toptype,
  277. CategorySecond: subtype,
  278. Content: qu.ObjToString(doc["detail"]),
  279. SpiderCode: qu.ObjToString(doc["spidercode"]),
  280. //Domain: qu.ObjToString(doc["domain"]),
  281. //Href: qu.ObjToString(doc["href"]),
  282. Title: qu.ObjToString(doc["title"]),
  283. Data: &doc,
  284. City: qu.ObjToString(doc["city"]),
  285. Province: qu.ObjToString(doc["area"]),
  286. Jsondata: toMap,
  287. Result: map[string][]*ju.ExtField{},
  288. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  289. RuleBlock: e.RuleBlock,
  290. }
  291. if isextFile {
  292. jf = &ju.Job{
  293. SourceMid: qu.BsonIdToSId(doc["_id"]),
  294. Category: toptype,
  295. Content: qu.ObjToString(doc["detailfile"]),
  296. SpiderCode: qu.ObjToString(doc["spidercode"]),
  297. Title: qu.ObjToString(doc["title"]),
  298. Data: &doc,
  299. City: qu.ObjToString(doc["city"]),
  300. Province: qu.ObjToString(doc["area"]),
  301. Jsondata: toMap,
  302. Result: map[string][]*ju.ExtField{},
  303. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  304. RuleBlock: e.RuleBlock,
  305. IsFile: isextFile,
  306. }
  307. }
  308. //是否配置站点
  309. codeSite := j.SpiderCode
  310. exp, isSite := e.Luacodes.Load(codeSite)
  311. if isSite {
  312. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  313. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  314. }
  315. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  316. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  317. }
  318. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  319. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  320. }
  321. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  322. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  323. }
  324. }
  325. qu.Try(func() {
  326. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  327. if isextFile {
  328. pretreated.AnalyStart(jf, isSite, codeSite)
  329. }
  330. }, func(err interface{}) {
  331. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  332. })
  333. return j, jf, isSite
  334. }
  335. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  336. func file2text(doc *map[string]interface{}) {
  337. var strfileinfo bytes.Buffer
  338. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  339. if va, ok := v["attachments"].(map[string]interface{}); ok {
  340. for _, vaatt := range va {
  341. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  342. if qu.ObjToString(fileinfo["content"]) != "" {
  343. switch fileinfo["content"].(type) {
  344. case string:
  345. lock.Lock()
  346. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  347. lock.Unlock()
  348. case []map[string]interface{}:
  349. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  350. if fv["context"] != nil {
  351. lock.Lock()
  352. strfileinfo.WriteString(fv["context"].(string) + " \n")
  353. lock.Unlock()
  354. }
  355. }
  356. }
  357. }
  358. }
  359. }
  360. }
  361. }
  362. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  363. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  364. }
  365. }
  366. //抽取
  367. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  368. e.ExtractDetail(j, isSite, j.SpiderCode)
  369. if jf != nil && jf.IsFile {
  370. e.ExtractFile(jf, isSite, j.SpiderCode)
  371. }
  372. if isSite {
  373. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  374. if ok && ismerge.(bool) {
  375. tmpj := &ju.Job{
  376. SourceMid: j.SourceMid,
  377. Category: j.Category,
  378. CategorySecond: j.CategorySecond,
  379. Content: j.Content,
  380. SpiderCode: j.SpiderCode,
  381. //Domain: qu.ObjToString(doc["domain"]),
  382. //Href: qu.ObjToString(doc["href"]),
  383. Title: j.Title,
  384. Data: j.Data,
  385. City: j.City,
  386. Province: j.Province,
  387. Jsondata: j.Jsondata,
  388. Result: map[string][]*ju.ExtField{},
  389. BuyerAddr: j.BuyerAddr,
  390. RuleBlock: e.RuleBlock,
  391. }
  392. qu.Try(func() {
  393. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  394. }, func(err interface{}) {
  395. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  396. })
  397. e.ExtractDetail(tmpj, false, "")
  398. //if jf != nil && jf.IsFile {
  399. // e.ExtractFile(jf, false, "")
  400. //}
  401. //合并数据
  402. j.Block = append(j.Block, tmpj.Block...)
  403. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  404. for tmpk, _ := range j.Result {
  405. if len(tmpj.Result[tmpk]) > 0 {
  406. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  407. }
  408. }
  409. for tmpk, _ := range tmpj.Result {
  410. if len(j.Result[tmpk]) == 0 {
  411. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  412. }
  413. }
  414. }
  415. }
  416. //分析抽取结果并保存
  417. AnalysisSaveResult(j, jf, e)
  418. <-e.TaskInfo.ProcessPool
  419. }
  420. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  421. qu.Try(func() {
  422. doc := *j.Data
  423. //全局前置规则,结果覆盖doc属性
  424. //for _, v := range e.RulePres {
  425. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  426. //}
  427. tmprules := map[string][]*RuleCore{}
  428. lockrule.Lock()
  429. if j.Category == "all" || j.CategorySecond == "all" {
  430. if isSite {
  431. for k, vc1 := range e.SiteRuleCores["all_all"] {
  432. tmprules[k] = vc1
  433. }
  434. } else {
  435. for k, vc1 := range e.RuleCores["all_all"] {
  436. tmprules[k] = vc1
  437. }
  438. }
  439. } else {
  440. if isSite {
  441. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  442. tmprules[k] = vc1
  443. }
  444. } else {
  445. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  446. tmprules[k] = vc1
  447. }
  448. }
  449. }
  450. if len(tmprules) < 1 { //分类未覆盖部分
  451. if isSite {
  452. for k, vc1 := range e.RuleCores["all_all"] {
  453. tmprules[k] = vc1
  454. }
  455. } else {
  456. for k, vc1 := range e.SiteRuleCores["all_all"] {
  457. tmprules[k] = vc1
  458. }
  459. }
  460. }
  461. lockrule.Unlock()
  462. //抽取规则
  463. for _, vc1 := range tmprules {
  464. for _, vc := range vc1 {
  465. tmp := ju.DeepCopy(doc).(map[string]interface{})
  466. //是否进入逻辑
  467. if !ju.Logic(vc.LuaLogic, tmp) {
  468. continue
  469. }
  470. ////抽取-前置规则
  471. //for _, v := range vc.RulePres {
  472. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  473. //}
  474. // log.Debug("抽取-前置规则", tmp)
  475. //抽取-规则
  476. ExtRuleCore(tmp, e, vc, j, isSite)
  477. // log.Debug("抽取-规则", tmp)
  478. //抽取-后置规则
  479. for _, v := range vc.RuleBacks {
  480. ExtRegBack(j, v, e.TaskInfo)
  481. }
  482. //kv规则
  483. for _, v := range vc.KVRuleCores {
  484. ExtRuleKV(j, v, e.TaskInfo)
  485. }
  486. // log.Debug("抽取-后置规则", tmp)
  487. //项目名称未能抽取到,标题来凑
  488. if vc.Field == "projectname" {
  489. if vc.ExtFrom == "title" {
  490. isextitle := true
  491. for _, v := range j.Result[vc.Field] {
  492. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  493. isextitle = false
  494. break
  495. }
  496. }
  497. if isextitle { //标题加入选举
  498. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  499. if isSite {
  500. field.Score = 1
  501. }
  502. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  503. }
  504. }
  505. for i := 0; i < 3; i++ {
  506. for _, v := range vc.RuleBacks {
  507. ExtRegBack(j, v, e.TaskInfo)
  508. }
  509. }
  510. }
  511. }
  512. }
  513. //全局后置规则
  514. if isSite {
  515. for _, v := range e.SiteRuleBacks {
  516. ExtRegBack(j, v, e.TaskInfo)
  517. }
  518. } else {
  519. for _, v := range e.RuleBacks {
  520. ExtRegBack(j, v, e.TaskInfo)
  521. }
  522. }
  523. //函数清理
  524. for key, val := range j.Result {
  525. for _, v := range val {
  526. lockclear.Lock()
  527. var cfn = []string{}
  528. if isSite {
  529. cfn = e.SiteClearFn[key]
  530. } else {
  531. cfn = e.ClearFn[key]
  532. }
  533. lockclear.Unlock()
  534. if len(cfn) == 0 {
  535. continue
  536. }
  537. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  538. before, _ := v.Value.(string)
  539. v.Value = data[0]
  540. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  541. //添加行数清理的日志
  542. //清理特殊符号
  543. lockclear.Lock()
  544. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  545. text := qu.ObjToString(v.Value)
  546. before = text
  547. v.Value = clear.OtherClean(key, text)
  548. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  549. }
  550. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  551. lockclear.Unlock()
  552. }
  553. }
  554. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  555. // bs, _ := json.Marshal(j.Result)
  556. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  557. }, func(err interface{}) {
  558. log.Debug("ExtractProcess err", err)
  559. })
  560. }
  561. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  562. qu.Try(func() {
  563. doc := *j.Data
  564. //全局前置规则,结果覆盖doc属性
  565. // for _, v := range e.RulePres {
  566. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  567. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  568. // }
  569. // }
  570. //抽取规则
  571. tmprules := map[string][]*RuleCore{}
  572. lockrule.Lock()
  573. if j.Category == "all" || j.CategorySecond == "all" {
  574. for k, vc1 := range e.RuleCores["all_all"] {
  575. tmprules[k] = vc1
  576. }
  577. } else {
  578. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  579. tmprules[k] = vc1
  580. }
  581. }
  582. lockrule.Unlock()
  583. for _, vc1 := range tmprules {
  584. for _, vc := range vc1 {
  585. tmp := ju.DeepCopy(doc).(map[string]interface{})
  586. //是否进入逻辑
  587. if !ju.Logic(vc.LuaLogic, tmp) {
  588. continue
  589. }
  590. //抽取-前置规则
  591. // for _, v := range vc.RulePres {
  592. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  593. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  594. // }
  595. // }
  596. // log.Debug("抽取-前置规则", tmp)
  597. //抽取-规则
  598. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  599. ExtRuleCore(tmp, e, vc, j, isSite)
  600. }
  601. // log.Debug("抽取-规则", tmp)
  602. //抽取-后置规则
  603. for _, v := range vc.RuleBacks {
  604. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  605. ExtRegBack(j, v, e.TaskInfo)
  606. }
  607. }
  608. // log.Debug("抽取-后置规则", tmp)
  609. }
  610. }
  611. //全局后置规则
  612. for _, v := range e.RuleBacks {
  613. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  614. ExtRegBack(j, v, e.TaskInfo)
  615. }
  616. }
  617. //函数清理
  618. for key, val := range j.Result {
  619. for _, v := range val {
  620. lockclear.Lock()
  621. cfn := e.ClearFn[key]
  622. lockclear.Unlock()
  623. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  624. v.Value = data[0]
  625. //清理特殊符号
  626. lockclear.Lock()
  627. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  628. clear.MesField[key] != nil {
  629. text := qu.ObjToString(v.Value)
  630. text = clear.OtherClean(key, text)
  631. v.Value = text
  632. }
  633. lockclear.Unlock()
  634. }
  635. }
  636. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  637. // bs, _ := json.Marshal(j.Result)
  638. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  639. }, func(err interface{}) {
  640. log.Debug("ExtractProcess err", err)
  641. })
  642. }
  643. //前置过滤
  644. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  645. defer qu.Catch()
  646. before := ju.DeepCopy(doc).(map[string]interface{})
  647. extinfo := map[string]interface{}{}
  648. if in.IsLua {
  649. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  650. if j != nil {
  651. lua.Block = j.Block
  652. }
  653. extinfo = lua.RunScript("pre")
  654. for k, v := range extinfo { //结果覆盖原doc
  655. doc[k] = v
  656. }
  657. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  658. } else {
  659. var key string
  660. if !j.IsFile {
  661. key = qu.If(in.Field == "", "detail", in.Field).(string)
  662. } else {
  663. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  664. }
  665. text := qu.ObjToString(doc[key])
  666. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  667. doc[key] = extinfo[key] //结果覆盖原doc
  668. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  669. }
  670. return doc
  671. }
  672. //抽取-规则
  673. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  674. //候选人加入
  675. var kvMap map[string][]map[string]interface{}
  676. extByReg := true
  677. if vc.ExtFrom != "title" {
  678. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  679. }
  680. for _, v := range vc.RuleCores {
  681. if v.IsLua {
  682. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  683. } else if extByReg {
  684. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  685. }
  686. }
  687. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  688. if vc.Field == "budget" && len(kvMap) == 0 {
  689. if len(j.BlockPackage) == 1 {
  690. for _, bp := range j.BlockPackage {
  691. for fieldname, field := range vc.LFields {
  692. if field != vc.Field {
  693. continue
  694. }
  695. tp := ""
  696. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  697. if k == 0 {
  698. tp = "colon"
  699. } else if k == 1 {
  700. tp = "space"
  701. } else if k == 2 {
  702. tp = "table"
  703. }
  704. if v == nil || v.KvTags == nil {
  705. continue
  706. }
  707. for _, vv := range v.KvTags[fieldname] {
  708. text := ju.TrimLRSpace(vv.Value, "")
  709. if text != "" {
  710. tmp := &ju.ExtField{
  711. Field: vc.Field,
  712. Code: "CL_分包",
  713. Type: tp,
  714. MatchType: "package",
  715. RuleText: bp.Text,
  716. SourceValue: vv.Key,
  717. Value: text,
  718. }
  719. if isSite {
  720. tmp.Score = 1
  721. }
  722. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  723. }
  724. }
  725. }
  726. }
  727. break
  728. }
  729. }
  730. } else {
  731. for k, v := range kvMap {
  732. if j.Result[k] == nil {
  733. j.Result[k] = [](*ju.ExtField){}
  734. }
  735. for _, tmp := range v {
  736. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
  737. if isSite {
  738. field.Score = 1
  739. }
  740. if tmp["blocktag"] != nil {
  741. btag := make(map[string]string)
  742. for k := range tmp["blocktag"].(map[string]bool) {
  743. blocktag.Lock()
  744. if TagConfigDesc[k] != "" {
  745. btag[k] = TagConfigDesc[k]
  746. }
  747. blocktag.Unlock()
  748. }
  749. field.BlockTag = btag
  750. }
  751. j.Result[k] = append(j.Result[k], field)
  752. }
  753. }
  754. }
  755. }
  756. //抽取-规则-kv
  757. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  758. defer qu.Catch()
  759. if extfrom == "title" || !in.IsLua {
  760. return
  761. }
  762. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  763. lua.KvMap = *kvMap
  764. lua.Block = j.Block
  765. extinfo := lua.RunScript("core")
  766. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  767. for _, v := range tmps {
  768. v["core"] = in.Code
  769. }
  770. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  771. }
  772. if len(extinfo) > 0 {
  773. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  774. }
  775. }
  776. //抽取-规则-正则
  777. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  778. defer qu.Catch()
  779. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  780. b := IsExtract(in.Field, j.Title, j.Content)
  781. if !b {
  782. return
  783. }
  784. //全文正则
  785. //text := qu.ObjToString(doc[extfrom])
  786. //if in.Field != "" {
  787. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  788. // if len(extinfo) > 0 {
  789. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  790. // }
  791. //}
  792. //块抽取
  793. if in.Field != "" {
  794. if extfrom == "title" {
  795. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  796. if len(extinfo) > 0 {
  797. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  798. }
  799. } else {
  800. for _, v := range j.Block {
  801. btag := make(map[string]string)
  802. for k := range v.Classify {
  803. blocktag.Lock()
  804. btag[k] = TagConfigDesc[k]
  805. blocktag.Unlock()
  806. }
  807. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  808. if len(extinfo) > 0 {
  809. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  810. }
  811. }
  812. }
  813. }
  814. }
  815. //pkg抽取-规则-正则
  816. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  817. defer qu.Catch()
  818. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  819. b := IsExtract(in.Field, j.Title, j.Content)
  820. if !b {
  821. return
  822. }
  823. //块抽取
  824. if in.Field != "" {
  825. for k, vbpkg := range j.BlockPackage {
  826. rep := map[string]string{}
  827. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  828. if !((in.Field == "budget" && vbpkg.Budget > 0) || (in.Field == "bidamount" && vbpkg.Bidamount > 0) ||
  829. (in.Field == "winner" && vbpkg.Winner == "") || (in.Field == "bidstatus" && vbpkg.BidStatus == "") ||
  830. (in.Field == "projectname" && vbpkg.Name == "")) {
  831. continue
  832. }
  833. //处理正负数修正
  834. ptmp := strings.Split(in.RuleText, "#")
  835. sign := 0
  836. if len(ptmp) == 2 {
  837. if ptmp[1] == "正" {
  838. sign = 1
  839. } else if ptmp[1] == "负" {
  840. sign = -1
  841. }
  842. }
  843. tmp := strings.Split(ptmp[0], "__")
  844. if len(tmp) == 2 {
  845. epos := strings.Split(tmp[1], ",")
  846. posm := map[string]int{}
  847. for _, v := range epos {
  848. ks := strings.Split(v, ":")
  849. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  850. posm[ks[1]] = qu.IntAll(ks[0])
  851. } else {
  852. posm[in.Field] = qu.IntAll(ks[0])
  853. }
  854. }
  855. var pattern string
  856. if strings.Contains(tmp[0], "\\u") {
  857. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  858. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  859. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  860. } else {
  861. pattern = tmp[0]
  862. }
  863. //log.Debug("pattern", pattern)
  864. //fmt.Println(text)
  865. reg := regexp.MustCompile(pattern)
  866. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  867. for i, _ := range apos {
  868. pos := apos[i]
  869. for k, p := range posm {
  870. if len(pos) > p {
  871. if pos[p] == -1 || pos[p+1] == -1 {
  872. continue
  873. }
  874. val := vbpkg.Text[pos[p]:pos[p+1]]
  875. if string(val) == "" {
  876. continue
  877. }
  878. if sign == -1 {
  879. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  880. } else {
  881. rep[k+"_"+fmt.Sprint(i)] = val
  882. }
  883. }
  884. }
  885. }
  886. //fmt.Println(text)
  887. for i := 0; i < len(apos); i++ {
  888. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  889. if in.Field == "budget" && vbpkg.Budget <= 0 {
  890. lock.Lock()
  891. cfn := e.ClearFn[in.Field]
  892. lock.Unlock()
  893. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  894. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  895. break
  896. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  897. lock.Lock()
  898. cfn := e.ClearFn[in.Field]
  899. lock.Unlock()
  900. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  901. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  902. break
  903. } else if in.Field == "winner" {
  904. if j.BlockPackage[k].Winner == "" {
  905. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  906. break
  907. }
  908. } else if in.Field == "bidstatus" {
  909. if j.BlockPackage[k].BidStatus == "" {
  910. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  911. break
  912. }
  913. } else if in.Field == "projectname" {
  914. if j.BlockPackage[k].Name == "" {
  915. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  916. break
  917. }
  918. }
  919. }
  920. }
  921. }
  922. } else {
  923. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  924. val := ""
  925. if len(pos) == 2 {
  926. //"text" = "text"[pos[1]:]
  927. val = "text"[pos[1]:]
  928. rs := regexp.MustCompile("[^\r\n\t]+")
  929. tmp := rs.FindAllString("text", -1)
  930. if len(tmp) > 0 {
  931. val = tmp[0]
  932. }
  933. }
  934. if val != "" {
  935. if in.Field == "budget" && vbpkg.Budget <= 0 {
  936. lock.Lock()
  937. cfn := e.ClearFn[in.Field]
  938. lock.Unlock()
  939. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  940. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  941. break
  942. }
  943. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  944. lock.Lock()
  945. cfn := e.ClearFn[in.Field]
  946. lock.Unlock()
  947. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  948. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  949. break
  950. } else if in.Field == "bidstatus" {
  951. if j.BlockPackage[k].BidStatus == "" {
  952. j.BlockPackage[k].BidStatus = val
  953. break
  954. }
  955. } else if in.Field == "projectname" {
  956. if j.BlockPackage[k].Name == "" {
  957. j.BlockPackage[k].Name = val
  958. break
  959. }
  960. }
  961. }
  962. }
  963. }
  964. }
  965. }
  966. //lua脚本根据属性设置提取kv值
  967. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  968. kvmap := map[string][]map[string]interface{}{}
  969. if len(j.Winnerorder) > 1 {
  970. if vc.Field == "bidamount" {
  971. for _, v := range j.Winnerorder {
  972. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  973. "code": "winnerorder",
  974. "field": vc.Field,
  975. "ruletext": "中标候选人",
  976. "extfrom": vc.ExtFrom,
  977. "sourcevalue": "中标候选人",
  978. "value": v["price"],
  979. "type": "winnerorder",
  980. "matchtype": "winnerorder",
  981. })
  982. }
  983. //候选人中标金额
  984. if price := j.Winnerorder[0]["price"]; price != nil {
  985. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  986. "code": "CL_中标候选人",
  987. "field": vc.Field,
  988. "ruletext": "中标候选人",
  989. "extfrom": vc.ExtFrom,
  990. "sourcevalue": "中标候选人",
  991. "value": price,
  992. "type": "winnerorder",
  993. "matchtype": "winnerorder",
  994. })
  995. return kvmap, false
  996. }
  997. }
  998. //else if vc.Field == "winner" {
  999. // for _, v := range j.Winnerorder {
  1000. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1001. // "code": "winnerorder",
  1002. // "field": vc.Field,
  1003. // "ruletext": "中标候选人",
  1004. // "extfrom": vc.ExtFrom,
  1005. // "sourcevalue": "中标候选人",
  1006. // "value": v["entname"],
  1007. // "type": "winnerorder",
  1008. // "matchtype": "winnerorder",
  1009. // })
  1010. // }
  1011. // //候选人中标单位
  1012. // if entname := j.Winnerorder[0]["entname"]; entname != nil {
  1013. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1014. // "code": "CL_中标候选人",
  1015. // "field": vc.Field,
  1016. // "ruletext": "中标候选人",
  1017. // "extfrom": vc.ExtFrom,
  1018. // "sourcevalue": "中标候选人",
  1019. // "value": entname,
  1020. // "type": "winnerorder",
  1021. // "matchtype": "winnerorder",
  1022. // })
  1023. // return kvmap, false
  1024. // }
  1025. //}
  1026. }
  1027. for fieldname, field := range vc.LFields {
  1028. if field != vc.Field {
  1029. continue
  1030. }
  1031. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1032. }
  1033. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1034. return kvmap, true
  1035. }
  1036. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1037. for _, bl := range blocks {
  1038. tp := ""
  1039. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1040. if k == 0 {
  1041. tp = "colon"
  1042. } else if k == 1 {
  1043. tp = "space"
  1044. } else if k == 2 {
  1045. tp = "table"
  1046. }
  1047. if v == nil || v.KvTags == nil {
  1048. continue
  1049. }
  1050. for _, vv := range v.KvTags[fieldname] {
  1051. text := ju.TrimLRSpace(vv.Value, "")
  1052. if text != "" {
  1053. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1054. "code": "CL_" + vv.Key,
  1055. "field": field,
  1056. "ruletext": vv.Key,
  1057. "extfrom": vc.ExtFrom,
  1058. "sourcevalue": text,
  1059. "value": text,
  1060. "type": tp,
  1061. "matchtype": "tag_string",
  1062. "blocktag": bl.Classify,
  1063. "weight": vv.Weight,
  1064. })
  1065. break //暂定取第一个
  1066. }
  1067. }
  1068. }
  1069. if len(kvmap[field]) == 0 {
  1070. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1071. }
  1072. }
  1073. }
  1074. //正则提取结果
  1075. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1076. defer qu.Catch()
  1077. var score int
  1078. if isSite {
  1079. score = 1
  1080. }
  1081. extinfo := map[string][]map[string]interface{}{}
  1082. rep := map[string]string{}
  1083. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1084. //处理正负数修正
  1085. ptmp := strings.Split(vre.RuleText, "#")
  1086. sign := 0
  1087. if len(ptmp) == 2 {
  1088. if ptmp[1] == "正" {
  1089. sign = 1
  1090. } else if ptmp[1] == "负" {
  1091. sign = -1
  1092. }
  1093. }
  1094. tmp := strings.Split(ptmp[0], "__")
  1095. if len(tmp) == 2 {
  1096. epos := strings.Split(tmp[1], ",")
  1097. posm := map[string]int{}
  1098. for _, v := range epos {
  1099. ks := strings.Split(v, ":")
  1100. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1101. posm[ks[1]] = qu.IntAll(ks[0])
  1102. } else {
  1103. posm[vre.Field] = qu.IntAll(ks[0])
  1104. }
  1105. }
  1106. var pattern string
  1107. if strings.Contains(tmp[0], "\\u") {
  1108. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1109. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1110. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1111. } else {
  1112. pattern = tmp[0]
  1113. }
  1114. //log.Debug("pattern", pattern)
  1115. //fmt.Println(text)
  1116. reg := regexp.MustCompile(pattern)
  1117. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1118. for i, _ := range apos {
  1119. pos := apos[i]
  1120. for k, p := range posm {
  1121. if len(pos) > p {
  1122. if pos[p] == -1 || pos[p+1] == -1 {
  1123. continue
  1124. }
  1125. val := text[pos[p]:pos[p+1]]
  1126. if string(val) == "" {
  1127. continue
  1128. }
  1129. if sign == -1 {
  1130. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1131. } else {
  1132. rep[k+"_"+fmt.Sprint(i)] = val
  1133. }
  1134. }
  1135. }
  1136. }
  1137. //fmt.Println(text)
  1138. tmps := []map[string]interface{}{}
  1139. for i := 0; i < len(apos); i++ {
  1140. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1141. tmp := map[string]interface{}{
  1142. "field": vre.Field,
  1143. "code": vre.Code,
  1144. "ruletext": vre.RuleText,
  1145. "extfrom": text,
  1146. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1147. "type": "regexp",
  1148. "matchtype": "regcontent",
  1149. "blocktag": *tag,
  1150. "score": score,
  1151. }
  1152. tmps = append(tmps, tmp)
  1153. exfield := ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)], Value: rep[vre.Field+"_"+fmt.Sprint(i)]}
  1154. if tmp["blocktag"] != nil {
  1155. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1156. }
  1157. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1158. }
  1159. }
  1160. if len(tmps) > 0 {
  1161. extinfo[vre.Field] = tmps
  1162. }
  1163. }
  1164. } else {
  1165. pos := vre.RegCore.Reg.FindStringIndex(text)
  1166. val := ""
  1167. if len(pos) == 2 {
  1168. text = text[pos[1]:]
  1169. rs := regexp.MustCompile("[^\r\n\t]+")
  1170. tmp := rs.FindAllString(text, -1)
  1171. if len(tmp) > 0 {
  1172. val = tmp[0]
  1173. }
  1174. }
  1175. if val != "" {
  1176. tmps := []map[string]interface{}{}
  1177. tmp := map[string]interface{}{
  1178. "field": vre.Field,
  1179. "code": vre.Code,
  1180. "ruletext": vre.RuleText,
  1181. "extfrom": text,
  1182. "value": val,
  1183. "type": "regexp",
  1184. "matchtype": "regcontent",
  1185. "blocktag": *tag,
  1186. "score": score,
  1187. }
  1188. tmps = append(tmps, tmp)
  1189. extinfo[vre.Field] = tmps
  1190. if j.Result[vre.Field] == nil {
  1191. j.Result[vre.Field] = [](*ju.ExtField){}
  1192. }
  1193. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
  1194. if tmp["blocktag"] != nil {
  1195. field.BlockTag = tmp["blocktag"].(map[string]string)
  1196. }
  1197. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1198. }
  1199. }
  1200. return extinfo
  1201. }
  1202. //后置过滤
  1203. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1204. defer qu.Catch()
  1205. if in.IsLua {
  1206. result := GetResultMapForLua(j)
  1207. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1208. if j != nil {
  1209. lua.Block = j.Block
  1210. }
  1211. extinfo := lua.RunScript("back")
  1212. for k, v := range extinfo {
  1213. if tmps, ok := v.([]map[string]interface{}); ok {
  1214. j.Result[k] = [](*ju.ExtField){}
  1215. for _, tmp := range tmps {
  1216. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"]}
  1217. if tmp["blocktag"] != nil {
  1218. field.BlockTag = tmp["blocktag"].(map[string]string)
  1219. }
  1220. j.Result[k] = append(j.Result[k], field)
  1221. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1222. }
  1223. }
  1224. }
  1225. if len(extinfo) > 0 {
  1226. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1227. }
  1228. } else {
  1229. extinfo := map[string]interface{}{}
  1230. if in.Field != "" {
  1231. if j.Result[in.Field] != nil {
  1232. tmp := j.Result[in.Field]
  1233. exts := []interface{}{}
  1234. for k, v := range tmp {
  1235. //table抽取到的数据不清理
  1236. // if v.Type == "table" && v.Field != "projectname" {
  1237. // continue
  1238. // }
  1239. text := qu.ObjToString(v.Value)
  1240. if text != "" {
  1241. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1242. }
  1243. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1244. continue
  1245. }
  1246. j.Result[in.Field][k].Value = text
  1247. exts = append(exts, map[string]interface{}{
  1248. "field": v.Field,
  1249. "code": v.Code,
  1250. "ruletext": v.RuleText,
  1251. "type": v.Type,
  1252. "matchtype": v.MatchType,
  1253. "extfrom": v.ExtFrom,
  1254. "value": text,
  1255. })
  1256. }
  1257. if len(exts) > 0 {
  1258. extinfo[in.Field] = exts
  1259. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1260. }
  1261. }
  1262. } else {
  1263. for key, tmp := range j.Result {
  1264. exts := []interface{}{}
  1265. for k, v := range tmp {
  1266. if v.Type == "table" { //table抽取到的数据不清理
  1267. continue
  1268. }
  1269. text := qu.ObjToString(v.Value)
  1270. if text != "" {
  1271. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1272. }
  1273. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1274. continue
  1275. }
  1276. j.Result[key][k].Value = text
  1277. exts = append(exts, map[string]interface{}{
  1278. "field": v.Field,
  1279. "code": v.Code,
  1280. "ruletext": v.RuleText,
  1281. "type": v.Type,
  1282. "matchtype": v.MatchType,
  1283. "extfrom": v.ExtFrom,
  1284. "value": text,
  1285. })
  1286. }
  1287. if len(exts) > 0 {
  1288. extinfo[key] = exts
  1289. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1290. }
  1291. }
  1292. }
  1293. }
  1294. }
  1295. //后置过滤
  1296. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1297. defer qu.Catch()
  1298. for k, v := range j.BlockPackage {
  1299. if in.Field == "winner" {
  1300. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1301. } else if in.Field == "bidstatus" {
  1302. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1303. } else if in.Field == "" {
  1304. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1305. } else if in.Field == "projectname" {
  1306. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1307. }
  1308. }
  1309. }
  1310. //KV过滤
  1311. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1312. defer qu.Catch()
  1313. extinfo := map[string]interface{}{}
  1314. if in.Field != "" {
  1315. if j.Result[in.Field] != nil {
  1316. tmp := j.Result[in.Field]
  1317. exts := []interface{}{}
  1318. for k, v := range tmp {
  1319. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1320. continue
  1321. }
  1322. text := qu.ObjToString(v.Value)
  1323. if text != "" {
  1324. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1325. }
  1326. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1327. continue
  1328. }
  1329. j.Result[in.Field][k].Value = text
  1330. exts = append(exts, map[string]interface{}{
  1331. "field": v.Field,
  1332. "code": v.Code,
  1333. "ruletext": v.RuleText,
  1334. "type": v.Type,
  1335. "matchtype": v.MatchType,
  1336. "extfrom": v.ExtFrom,
  1337. "value": text,
  1338. })
  1339. }
  1340. if len(exts) > 0 {
  1341. extinfo[in.Field] = exts
  1342. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1343. }
  1344. }
  1345. }
  1346. }
  1347. //获取抽取结果map[string][]interface{},lua脚本使用
  1348. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1349. defer qu.Catch()
  1350. result := map[string][]map[string]interface{}{}
  1351. for key, val := range j.Result {
  1352. if result[key] == nil {
  1353. result[key] = []map[string]interface{}{}
  1354. }
  1355. for _, v := range val {
  1356. tmp := map[string]interface{}{
  1357. "field": v.Field,
  1358. "code": v.Code,
  1359. "ruletext": v.RuleText,
  1360. "value": v.Value,
  1361. "type": v.Type,
  1362. "matchtype": v.MatchType,
  1363. "extfrom": v.ExtFrom,
  1364. }
  1365. result[key] = append(result[key], tmp)
  1366. }
  1367. }
  1368. return result
  1369. }
  1370. //抽取日志
  1371. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1372. defer qu.Catch()
  1373. if !t.IsEtxLog {
  1374. return
  1375. }
  1376. logdata := map[string]interface{}{
  1377. "code": qu.If(v.Code == "", "kv", v.Code),
  1378. "name": v.Name,
  1379. "type": ftype,
  1380. "ruletext": v.RuleText,
  1381. "islua": v.IsLua,
  1382. "field": v.Field,
  1383. "version": t.Version,
  1384. "taskname": t.Name,
  1385. "before": before,
  1386. "extinfo": extinfo,
  1387. "sid": sid,
  1388. "comeintime": time.Now().Unix(),
  1389. }
  1390. lock.Lock()
  1391. ExtLogs[t] = append(ExtLogs[t], logdata)
  1392. lock.Unlock()
  1393. }
  1394. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1395. exts := []map[string]interface{}{}
  1396. exts = append(exts, map[string]interface{}{
  1397. "field": ext.Field,
  1398. "code": ext.Code,
  1399. "type": ftype,
  1400. "matchtype": matchtype,
  1401. "extfrom": ext.ExtFrom,
  1402. "value": ext.Value,
  1403. })
  1404. extinfo := map[string]interface{}{
  1405. ext.Field: exts,
  1406. }
  1407. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1408. }
  1409. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1410. defer qu.Catch()
  1411. if !t.IsEtxLog {
  1412. return
  1413. }
  1414. logdata := map[string]interface{}{
  1415. "code": code,
  1416. "name": name,
  1417. "type": ftype,
  1418. "ruletext": "",
  1419. "islua": false,
  1420. "field": field,
  1421. "version": t.Version,
  1422. "taskname": t.Name,
  1423. "before": before,
  1424. "extinfo": extinfo,
  1425. "sid": sid,
  1426. "comeintime": time.Now().Unix(),
  1427. }
  1428. lock.Lock()
  1429. ExtLogs[t] = append(ExtLogs[t], logdata)
  1430. lock.Unlock()
  1431. }
  1432. //保存抽取日志
  1433. func SaveExtLog() {
  1434. defer qu.Catch()
  1435. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1436. lock.Lock()
  1437. tmpLogs = ExtLogs
  1438. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1439. lock.Unlock()
  1440. for k, v := range tmpLogs {
  1441. if len(v) < saveLimit {
  1442. db.Mgo.SaveBulk(k.TrackColl, v...)
  1443. } else {
  1444. for {
  1445. if len(v) > saveLimit {
  1446. tmp := v[:saveLimit]
  1447. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1448. v = v[saveLimit:]
  1449. } else {
  1450. db.Mgo.SaveBulk(k.TrackColl, v...)
  1451. break
  1452. }
  1453. }
  1454. }
  1455. }
  1456. time.AfterFunc(10*time.Second, SaveExtLog)
  1457. }
// FieldValue pairs a value with a count.
// NOTE(review): no use of this type is visible in this part of the file;
// the exact meaning of Count should be confirmed against its callers.
type FieldValue struct {
	Value interface{} // the value itself
	Count int         // count associated with Value
}
  1462. //分析抽取结果并保存
  1463. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1464. qu.Try(func() {
  1465. //重新取出清理过后的中标候选人
  1466. resetWinnerorder(j)
  1467. doc, result, _id := funcAnalysis(j, e)
  1468. if isSaveTag, _ := ju.Config["isSaveTag"].(bool); isSaveTag {
  1469. go otherNeedSave(j, result, e)
  1470. }
  1471. auxinfo := auxInfo(j)
  1472. //从排序结果中取值
  1473. tmp := map[string]interface{}{} //抽取值
  1474. tmp["jsondata"] = j.Jsondata
  1475. tmp["fieldall"] = auxinfo
  1476. for _, val := range result {
  1477. for _, v := range val { //取第一个非负数,项目名称除外
  1478. if v.Score > -1 {
  1479. tmp[v.Field] = v.Value
  1480. break
  1481. } else if v.Field == "projectname" {
  1482. tmp[v.Field] = v.Value
  1483. break
  1484. }
  1485. }
  1486. }
  1487. if len(j.PackageInfo) > 0 { //分包信息
  1488. tmp["package"] = j.PackageInfo
  1489. }
  1490. if len(j.Winnerorder) > 0 { //候选人信息
  1491. tmp["winnerorder"] = j.Winnerorder
  1492. }
  1493. //处理附件
  1494. var resultf map[string][]*ju.ExtField
  1495. if jf != nil {
  1496. _, resultf, _ = funcAnalysis(jf, e)
  1497. auxinfof := auxInfo(jf)
  1498. tmp["fieldallf"] = auxinfof
  1499. ffield := map[string]interface{}{}
  1500. for _, val := range resultf {
  1501. for _, v := range val { //取第一个非负数
  1502. if v.Score > -1 {
  1503. ffield[v.Field] = v.Value
  1504. break
  1505. }
  1506. }
  1507. }
  1508. if len(jf.PackageInfo) > 0 { //分包信息
  1509. ffield["package"] = jf.PackageInfo
  1510. }
  1511. if len(jf.Winnerorder) > 0 { //候选人信息
  1512. ffield["winnerorder"] = jf.Winnerorder
  1513. }
  1514. tmp["ffield"] = ffield
  1515. }
  1516. for k, v := range *doc {
  1517. //去重冗余字段
  1518. if delFiled(k) {
  1519. continue
  1520. }
  1521. if tmp[k] == nil {
  1522. tmp[k] = v
  1523. }
  1524. }
  1525. //质量审核
  1526. if ok, _ := ju.Config["qualityaudit"].(bool); ok {
  1527. e.QualityAudit(tmp)
  1528. }
  1529. if e.IsExtractCity { //城市抽取
  1530. //e.ExtractCity(j, tmp, _id)
  1531. e.NewExtractCity(j, tmp, _id)
  1532. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1533. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1534. // tmp["district"] = d
  1535. // if b {
  1536. // tmp["city"] = c
  1537. // tmp["area"] = p
  1538. // }
  1539. }
  1540. //品牌抽取
  1541. if ju.IsBrandGoods {
  1542. tmp["checkhas"] = map[string]int{
  1543. "hastable": j.HasTable,
  1544. "hasgoods": j.HasGoods,
  1545. "hasbrand": j.HasBrand,
  1546. "haskey": j.HasKey,
  1547. }
  1548. if len(j.BrandData) > 0 {
  1549. tmp["tablebrand"] = j.BrandData
  1550. }
  1551. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1552. }
  1553. //所有kv组成的字符串
  1554. var kvtext bytes.Buffer
  1555. blocks := make([]ju.BlockAndTag, 0)
  1556. for _, v := range j.Block {
  1557. //分包和标签
  1558. if ju.Config["saveblock"].(bool) {
  1559. xx, _ := json.Marshal(v)
  1560. tmpblock := new(ju.TmpBlock)
  1561. err := json.Unmarshal(xx, &tmpblock)
  1562. if err != nil {
  1563. if v.BPackage != nil {
  1564. bpb, _ := json.Marshal(v.BPackage)
  1565. tmpblock.BPackage = string(bpb)
  1566. }
  1567. tmpblock = rangeBlockToJson(v, *tmpblock)
  1568. }
  1569. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1570. }
  1571. //把所有kv组装成一个字符串,存库
  1572. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1573. if jv == nil {
  1574. continue
  1575. }
  1576. for jv_k, jv_v := range jv.KvTags {
  1577. for _, jv_vv := range jv_v {
  1578. kvtext.WriteString(jv_k)
  1579. kvtext.WriteString(":")
  1580. kvtext.WriteString(jv_vv.Value)
  1581. kvtext.WriteString(" ")
  1582. }
  1583. }
  1584. }
  1585. }
  1586. if kvtext.Len() > 0 {
  1587. tmp["kvtext"] = kvtext.String()
  1588. }
  1589. if len(blocks) > 0 {
  1590. if blocksBytes, err := json.Marshal(blocks); err == nil {
  1591. if utf8.RuneCount(blocksBytes) < 100000 {
  1592. tmp["blocks"] = string(blocksBytes)
  1593. }
  1594. }
  1595. }
  1596. //tmp["extract_content"] = j.Content
  1597. if e.TaskInfo.TestColl == "" {
  1598. if len(tmp) > 0 { //保存抽取结果
  1599. for field, _ := range e.Fields {
  1600. if tmp[field] == nil {
  1601. tmp[field] = "" //覆盖之前版本数据
  1602. }
  1603. }
  1604. tmp["repeat"] = 0
  1605. tmparr := []map[string]interface{}{
  1606. map[string]interface{}{
  1607. "_id": qu.StringTOBsonId(_id),
  1608. },
  1609. map[string]interface{}{"$set": tmp},
  1610. }
  1611. e.RWMutex.Lock()
  1612. e.BidArr = append(e.BidArr, tmparr)
  1613. e.BidTotal++
  1614. e.RWMutex.Unlock()
  1615. }
  1616. if b, ok := ju.Config["saveresult"].(bool); ok && b {
  1617. id := tmp["_id"]
  1618. tmp["result"] = result
  1619. tmp["resultf"] = resultf
  1620. delete(tmp, "_id")
  1621. tmparr := []map[string]interface{}{
  1622. map[string]interface{}{
  1623. "_id": id,
  1624. },
  1625. map[string]interface{}{"$set": tmp},
  1626. }
  1627. e.RWMutex.Lock()
  1628. e.ResultArr = append(e.ResultArr, tmparr)
  1629. e.RWMutex.Unlock()
  1630. }
  1631. } else { //测试结果
  1632. delete(tmp, "_id")
  1633. if len(j.BlockPackage) > 0 { //分包详情
  1634. bs, _ := json.Marshal(j.BlockPackage)
  1635. tmp["epackage"] = string(bs)
  1636. }
  1637. tmp["result"] = result
  1638. tmp["resultf"] = resultf
  1639. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1640. if !b {
  1641. log.Debug(e.TaskInfo.TestColl, _id)
  1642. }
  1643. }
  1644. }, func(err interface{}) {
  1645. log.Debug("AnalysisSaveResult err", err)
  1646. })
  1647. }
  1648. //保存其他
  1649. //kv、表格、块上的标签凡是新的标签都入库
  1650. //val type times firstid createtime 判定field
  1651. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1652. now := time.Now().Unix()
  1653. coll := e.TaskInfo.TestColl
  1654. if coll == "" {
  1655. coll = "extract_tag_result"
  1656. } else {
  1657. coll += "_tag"
  1658. }
  1659. datas := []map[string]interface{}{}
  1660. kv := map[string]int{}
  1661. for _, v := range j.Block {
  1662. //
  1663. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1664. if vv == nil || vv.KvTags == nil {
  1665. continue
  1666. }
  1667. for kkk, vvv := range vv.KvTags {
  1668. for _, vvvv := range vvv {
  1669. if vvvv.IsInvalid {
  1670. kv[kkk] = kv[kkk] + 1
  1671. break
  1672. }
  1673. }
  1674. }
  1675. }
  1676. for _, vv := range v.NotClassifyTitles {
  1677. datas = append(datas, map[string]interface{}{
  1678. "val": vv,
  1679. "times": 0,
  1680. "type": "block",
  1681. "firstid": j.SourceMid,
  1682. "createtime": now,
  1683. })
  1684. if len(datas) == saveLimit {
  1685. db.Mgo.SaveBulk(coll, datas...)
  1686. datas = []map[string]interface{}{}
  1687. }
  1688. }
  1689. }
  1690. for k, v := range kv {
  1691. datas = append(datas, map[string]interface{}{
  1692. "val": k,
  1693. "times": v,
  1694. "type": "kv",
  1695. "firstid": j.SourceMid,
  1696. "createtime": now,
  1697. })
  1698. if len(datas) == saveLimit {
  1699. db.Mgo.SaveBulk(coll, datas...)
  1700. datas = []map[string]interface{}{}
  1701. }
  1702. }
  1703. if len(datas) > 0 {
  1704. db.Mgo.SaveBulk(coll, datas...)
  1705. }
  1706. }
  1707. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1708. if j == nil {
  1709. return nil
  1710. }
  1711. if len(j.Block) > 0 {
  1712. for i, v := range j.Block {
  1713. rangetmp := new(ju.TmpBlock)
  1714. vb, _ := json.Marshal(v)
  1715. json.Unmarshal(vb, &rangetmp)
  1716. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  1717. }
  1718. }
  1719. if j.ColonKV != nil {
  1720. cb, _ := json.Marshal(j.ColonKV)
  1721. tmpblock.ColonKV = string(cb)
  1722. }
  1723. if j.SpaceKV != nil {
  1724. sb, _ := json.Marshal(j.SpaceKV)
  1725. tmpblock.SpaceKV = string(sb)
  1726. }
  1727. if j.TableKV != nil {
  1728. tb, _ := json.Marshal(j.TableKV)
  1729. tmpblock.TableKV = string(tb)
  1730. }
  1731. return &tmpblock
  1732. }
  1733. //去重冗余字段
  1734. func delFiled(k string) bool {
  1735. return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  1736. }
  1737. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  1738. defer qu.Catch()
  1739. doc := j.Data
  1740. result := j.Result
  1741. _id := qu.BsonIdToSId((*doc)["_id"])
  1742. result = ScoreFields(j, e.Tag)
  1743. //结果排序
  1744. for _, val := range result {
  1745. ju.Sort(val)
  1746. }
  1747. j.Result = JsonDataMergeProcessing(j, e)
  1748. return doc, result, _id
  1749. }
  1750. //辅助信息,如果没有排序先排序
  1751. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  1752. fieldalls := map[string][]map[string]interface{}{}
  1753. for field, val := range j.Result {
  1754. //ju.Sort(val)
  1755. sfields := []map[string]interface{}{}
  1756. for _, v := range val {
  1757. standardized := false
  1758. if field == "buyer" || field == "winner" || field == "agency" {
  1759. i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
  1760. if i > 0 {
  1761. standardized = true
  1762. }
  1763. }
  1764. sfield := map[string]interface{}{
  1765. "val": v.Value,
  1766. "type": v.Type,
  1767. "score": v.Score,
  1768. "blocktag": v.BlockTag,
  1769. "sourceval": v.SourceValue,
  1770. "standardized": standardized,
  1771. }
  1772. sfields = append(sfields, sfield)
  1773. }
  1774. fieldalls[field] = sfields
  1775. }
  1776. return fieldalls
  1777. }
  1778. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  1779. defer qu.Catch()
  1780. //获取审核字段
  1781. for _, field := range e.AuditFields {
  1782. //1.分包
  1783. if resulttmp["package"] != nil {
  1784. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  1785. for _, val := range packagedata {
  1786. if val[field] != nil {
  1787. fv := qu.ObjToString(val[field])
  1788. if fv != "" {
  1789. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1790. e.RedisMatch(field, fv, val) //redis匹配
  1791. } else { //除了buyer和winner,其他字段走规则匹配
  1792. e.RuleMatch(field, fv, val)
  1793. }
  1794. }
  1795. }
  1796. }
  1797. }
  1798. //2.外围
  1799. if resulttmp[field] != nil {
  1800. fv := qu.ObjToString(resulttmp[field])
  1801. if fv != "" {
  1802. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  1803. e.RedisMatch(field, fv, resulttmp) //redis匹配
  1804. } else { //除了buyer和winner,其他字段走规则匹配
  1805. e.RuleMatch(field, fv, resulttmp)
  1806. }
  1807. }
  1808. }
  1809. }
  1810. }
  1811. //Redis匹配
  1812. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  1813. defer qu.Catch()
  1814. i := redis.GetInt(field, field+"_"+fv) //查找redis
  1815. if i == 0 { //reids未找到,执行规则匹配
  1816. val[field+"_isredis"] = false
  1817. e.RuleMatch(field, fv, val) //规则匹配
  1818. } else { //redis找到,打标识存库
  1819. val[field+"_isredis"] = true
  1820. }
  1821. }
  1822. //规则匹配
  1823. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  1824. defer qu.Catch()
  1825. if fieldval != "" {
  1826. SMap := e.StartMatch(field, fieldval)
  1827. //SMap.AddKey(field+"_isaudit", false)
  1828. for _, k := range SMap.Keys {
  1829. tmpMap[k] = SMap.Map[k]
  1830. }
  1831. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  1832. }
  1833. }
  1834. //开始规则匹配
  1835. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  1836. defer qu.Catch()
  1837. SMap := pretreated.NewSortMap()
  1838. lock.Lock()
  1839. f := e.RecogFieldMap[field]
  1840. lock.Unlock()
  1841. if len(f) > 0 {
  1842. fid := qu.BsonIdToSId(f["_id"])
  1843. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  1844. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  1845. if textAfterRecogFieldPrerule != "" {
  1846. lock.Lock()
  1847. classMap := e.FidClassMap[fid]
  1848. lock.Unlock()
  1849. L:
  1850. for _, c := range classMap { //class
  1851. classid := qu.BsonIdToSId(c["_id"])
  1852. classPrerule := qu.ObjToString(c["s_class_prerule"])
  1853. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  1854. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  1855. if textAfterClassPrerule != "" {
  1856. lock.Lock()
  1857. ruleMap := e.CidRuleMap[classid]
  1858. lock.Unlock()
  1859. for _, r := range ruleMap { //rule
  1860. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  1861. s_name := qu.ObjToString(r["s_name"])
  1862. rule := r["rule"].([]interface{})
  1863. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  1864. if textAfterRulePrerule != "" {
  1865. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  1866. if b { //匹配到一个分类下某个规则时,不再继续匹配
  1867. if savefield != "" { //保存字段不为空,存储代码信息
  1868. SMap.AddKey(field+"_"+savefield, s_name)
  1869. }
  1870. break L
  1871. }
  1872. }
  1873. }
  1874. }
  1875. }
  1876. }
  1877. }
  1878. return SMap
  1879. }
  1880. //中标候选人经过清理之后,重新取出赋值
  1881. func resetWinnerorder(j *ju.Job) {
  1882. if len(j.Winnerorder) == 0 {
  1883. return
  1884. }
  1885. maxlen := len(j.Winnerorder) - 1
  1886. //中标单位
  1887. i := 0
  1888. winners := []*ju.ExtField{}
  1889. for _, v := range j.Result["winner"] {
  1890. if v.Code == "winnerorder" {
  1891. if maxlen < i {
  1892. continue
  1893. }
  1894. j.Winnerorder[i]["entname"] = v.Value
  1895. i++
  1896. } else {
  1897. winners = append(winners, v)
  1898. }
  1899. }
  1900. j.Result["winner"] = winners
  1901. //中标金额
  1902. i = 0
  1903. bidamounts := []*ju.ExtField{}
  1904. for _, v := range j.Result["bidamount"] {
  1905. if v.Code == "winnerorder" {
  1906. if maxlen < i {
  1907. continue
  1908. }
  1909. j.Winnerorder[i]["price"] = v.Value
  1910. i++
  1911. } else {
  1912. bidamounts = append(bidamounts, v)
  1913. }
  1914. }
  1915. j.Result["bidamount"] = bidamounts
  1916. }