// extract.go (72 KB)

  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
// Package-level state shared by the extraction entry points in this file.
var (
	// Read/write locks; in this file lockrule is taken around reads of the
	// rule maps and lockclear around reads of the clear-function tables.
	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
	cut = ju.NewCut() // extracts the body text and cleans it
	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
	TaskList map[string]*ExtractTask // task list, keyed by task id
	ClearTaskList map[string]*ClearTask // clear-task list
	saveLimit = 100 // batch size for saving extraction logs
	PageSize = 5000 // query page size
	// Fields is the field projection passed to Find when loading source documents.
	Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
	// Disabled alternative projection with additional fields, kept for reference:
	//Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
	// Fields2 is a second, smaller projection; its user is not visible in this part of the file.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
)
  34. //启动测试抽取
  35. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  36. defer qu.Catch()
  37. ext := TaskList[taskId]
  38. if ext == nil {
  39. ext = &ExtractTask{}
  40. ext.Id = taskId
  41. ext.InitTestTaskInfo(resultcoll, trackcoll)
  42. ext.IsRun = true
  43. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  44. }
  45. ext.InitSite()
  46. ext.InitRulePres()
  47. ext.InitRuleBacks(false)
  48. ext.InitRuleBacks(true)
  49. ext.InitRuleCore(false)
  50. ext.InitRuleCore(true)
  51. ext.InitPkgCore()
  52. ext.InitBlockRule()
  53. ext.InfoTypeList()
  54. ext.InitTag(false)
  55. ext.InitTag(true)
  56. ext.InitClearFn(false)
  57. ext.InitClearFn(true)
  58. ext.Lock()
  59. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  60. //初始化城市DFA信息
  61. ext.InitCityInfo()
  62. //ext.InitCityDFA()
  63. ext.InitAreaCode()
  64. ext.InitPostCode()
  65. }
  66. ext.Unlock()
  67. //质量审核
  68. ext.InitAuditFields()
  69. ext.InitAuditRule()
  70. ext.InitAuditClass()
  71. ext.InitAuditRecogField()
  72. //品牌抽取是否开启
  73. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  74. //价格个数抽取是否开启
  75. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  76. //附件抽取是否开启
  77. ext.InitFile()
  78. ext.TaskInfo.TestColl = resultcoll
  79. TaskList[taskId] = ext
  80. return RunExtractTestTask(ext, startId, num)
  81. }
  82. func IdTrans(startId string) bson.ObjectId {
  83. defer qu.Catch()
  84. return bson.ObjectIdHex(startId)
  85. }
  86. //开始测试任务抽取
  87. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  88. n, _ := strconv.Atoi(num)
  89. id := IdTrans(startId)
  90. if id.Valid() {
  91. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  92. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  93. for _, v := range *list {
  94. //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
  95. // continue
  96. //}
  97. if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时
  98. continue
  99. }
  100. var j, jf *ju.Job
  101. var isSite bool
  102. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  103. v["isextFile"] = true
  104. j, jf, isSite = ext.PreInfo(v)
  105. } else {
  106. j, _, isSite = ext.PreInfo(v)
  107. }
  108. go ext.ExtractProcess(j, jf, isSite)
  109. ext.TaskInfo.ProcessPool <- true
  110. }
  111. return true
  112. } else {
  113. return false
  114. }
  115. }
  116. //启动抽取
  117. func StartExtractTaskId(taskId string) bool {
  118. defer qu.Catch()
  119. isgo := false
  120. ext := TaskList[taskId]
  121. if ext == nil {
  122. ext = &ExtractTask{}
  123. ext.Id = taskId
  124. ext.InitTaskInfo()
  125. isgo = true
  126. } else {
  127. ext.Id = taskId
  128. ext.InitTaskInfo()
  129. }
  130. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  131. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  132. ext.InitSite()
  133. ext.InitRulePres()
  134. ext.InitRuleBacks(false)
  135. ext.InitRuleBacks(true)
  136. ext.InitRuleCore(false)
  137. ext.InitRuleCore(true)
  138. ext.InitPkgCore()
  139. ext.InitBlockRule()
  140. ext.InfoTypeList()
  141. ext.InitTag(false)
  142. ext.InitTag(true)
  143. ext.InitClearFn(false)
  144. ext.InitClearFn(true)
  145. ext.Lock()
  146. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  147. //初始化城市DFA信息
  148. //ext.InitCityDFA()
  149. ext.InitCityInfo()
  150. ext.InitAreaCode()
  151. ext.InitPostCode()
  152. }
  153. ext.Unlock()
  154. //质量审核
  155. ext.InitAuditFields()
  156. ext.InitAuditRule()
  157. ext.InitAuditClass()
  158. ext.InitAuditRecogField()
  159. //品牌抽取是否开启
  160. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  161. //价格个数抽取是否开启
  162. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  163. //附件抽取是否开启
  164. ext.InitFile()
  165. ext.IsRun = true
  166. go ext.ResultSave(true)
  167. go ext.BidSave(true)
  168. if isgo {
  169. go RunExtractTask(taskId)
  170. }
  171. TaskList[taskId] = ext
  172. return true
  173. }
  174. //停止抽取
  175. func StopExtractTaskId(taskId string) bool {
  176. defer qu.Catch()
  177. ext := TaskList[taskId]
  178. if ext != nil {
  179. ext.IsRun = false
  180. TaskList[taskId] = ext
  181. }
  182. //更新task.s_extlastid
  183. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  184. return true
  185. }
  186. //开始抽取
  187. func RunExtractTask(taskId string) {
  188. defer qu.Catch()
  189. ext := TaskList[taskId]
  190. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  191. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  192. pageNum := (count + PageSize - 1) / PageSize
  193. limit := PageSize
  194. if count < PageSize {
  195. limit = count
  196. }
  197. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  198. for i := 0; i < pageNum; i++ {
  199. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  200. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  201. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  202. for _, v := range *list {
  203. //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  204. // continue
  205. //}
  206. //根据标题判断是否抽取
  207. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  208. if !b {
  209. continue
  210. }
  211. _id := qu.BsonIdToSId(v["_id"])
  212. //log.Debug(_id)
  213. if !ext.IsRun {
  214. break
  215. }
  216. var j, jf *ju.Job
  217. var isSite bool
  218. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  219. v["isextFile"] = true
  220. j, jf, isSite = ext.PreInfo(v)
  221. } else {
  222. j, _, isSite = ext.PreInfo(v)
  223. }
  224. go ext.ExtractProcess(j, jf, isSite)
  225. ext.TaskInfo.LastExtId = _id
  226. ext.TaskInfo.ProcessPool <- true
  227. }
  228. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  229. if !ext.IsRun {
  230. break
  231. }
  232. }
  233. //更新task.s_extlastid
  234. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  235. }
  236. //信息预处理-不和版本关联,取最新版本的配置项
  237. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  238. return (&ExtractTask{}).PreInfo(doc)
  239. }
  240. var clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  241. //信息预处理-和版本关联
  242. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  243. defer qu.Catch()
  244. //判断是否有附件这个字段
  245. var isextFile bool
  246. if doc["isextFile"] != nil {
  247. isextFile = doc["isextFile"].(bool)
  248. }
  249. detail := ""
  250. d1, _ := doc["detail"].(string)
  251. d2, _ := doc["contenthtml"].(string)
  252. if len(d1) >= len(d2) || d2 == "" {
  253. detail = d1
  254. } else {
  255. detail = d2
  256. }
  257. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  258. d3, _ := doc["summary"].(string)
  259. //全文的需要修复表格
  260. detail = pretreated.RepairCon(detail)
  261. detail = ju.CutLableStr(d3 + "\n" + detail)
  262. detail = cut.ClearHtml(d3 + "\n" + detail)
  263. doc["detail"] = detail
  264. isClearnMoney := !clearMoneyReg.MatchString(detail)
  265. if isClearnMoney {
  266. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  267. }
  268. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  269. if isextFile {
  270. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  271. }
  272. //正文小于200个字,有附件把附件内容加到正文
  273. //tmpDeatil := detail
  274. //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  275. //if err == nil {
  276. // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  277. // if conlen < 2000 {
  278. // if isextFile {
  279. // detail += qu.ObjToString(doc["detailfile"])
  280. // doc["detail"] = detail
  281. // }
  282. // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
  283. // //防止文本过长,造成抽取阻塞
  284. // log.Debug("文本太长", doc["_id"], conlen)
  285. // doc["detail"] = d3
  286. // }
  287. //}
  288. toptype := qu.ObjToString(doc["toptype"])
  289. subtype := qu.ObjToString(doc["subtype"])
  290. if qu.ObjToString(doc["type"]) == "bid" {
  291. toptype = "结果"
  292. }
  293. if toptype == "" {
  294. toptype = "all"
  295. }
  296. if subtype == "" {
  297. subtype = "all"
  298. }
  299. if toptype == "其它" || subtype == "其它" || subtype == "其他" || subtype == "结果变更" {
  300. toptype = "all"
  301. subtype = "all"
  302. }
  303. toMap := qu.ObjToMap(doc["jsondata"])
  304. //log.Debug("toMap", toMap)
  305. if (*toMap) != nil {
  306. if (*toMap)["extweight"] == nil {
  307. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  308. }
  309. if (*toMap)["jsoncontent"] != nil {
  310. delete(*toMap, "jsoncontent")
  311. }
  312. for k,v := range *toMap{
  313. if _,ok := v.(float64);ok{
  314. continue
  315. }else if _,ok := v.(int64);ok{
  316. continue
  317. }else if _,ok2 := v.(string);ok2{
  318. continue
  319. }else {
  320. delete(*toMap,k)
  321. }
  322. }
  323. }
  324. j = &ju.Job{
  325. SourceMid: qu.BsonIdToSId(doc["_id"]),
  326. Category: toptype,
  327. CategorySecond: subtype,
  328. Content: qu.ObjToString(doc["detail"]),
  329. SpiderCode: qu.ObjToString(doc["spidercode"]),
  330. Site: qu.ObjToString(doc["site"]),
  331. //Domain: qu.ObjToString(doc["domain"]),
  332. //Href: qu.ObjToString(doc["href"]),
  333. Title: qu.ObjToString(doc["title"]),
  334. Data: &doc,
  335. City: qu.ObjToString(doc["city"]),
  336. Province: qu.ObjToString(doc["area"]),
  337. Jsondata: toMap,
  338. Result: map[string][]*ju.ExtField{},
  339. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  340. RuleBlock: e.RuleBlock,
  341. Dataging: qu.IntAll(doc["dataging"]),
  342. IsClearnMoney: isClearnMoneystr,
  343. }
  344. if isextFile {
  345. jf = &ju.Job{
  346. SourceMid: qu.BsonIdToSId(doc["_id"]),
  347. Category: toptype,
  348. CategorySecond: subtype,
  349. Content: qu.ObjToString(doc["detailfile"]),
  350. SpiderCode: qu.ObjToString(doc["spidercode"]),
  351. Site: qu.ObjToString(doc["site"]),
  352. Title: qu.ObjToString(doc["title"]),
  353. Data: &doc,
  354. City: qu.ObjToString(doc["city"]),
  355. Province: qu.ObjToString(doc["area"]),
  356. Jsondata: toMap,
  357. Result: map[string][]*ju.ExtField{},
  358. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  359. RuleBlock: e.RuleBlock,
  360. IsFile: isextFile,
  361. Dataging: qu.IntAll(doc["dataging"]),
  362. IsClearnMoney: isClearnMoneystr,
  363. }
  364. }
  365. codeSite := j.SpiderCode
  366. //是否启用站点
  367. if value, ok := e.SiteMerge.Load(codeSite); ok {
  368. isSite = value.(bool)
  369. }
  370. if isSite {
  371. //是否配置站点
  372. exp, isSite := e.Luacodes.Load(codeSite)
  373. if isSite {
  374. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  375. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  376. }
  377. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  378. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  379. }
  380. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  381. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  382. }
  383. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  384. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  385. }
  386. }
  387. }
  388. qu.Try(func() {
  389. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  390. if isextFile && strings.TrimSpace(jf.Content) != "" {
  391. pretreated.AnalyStart(jf, isSite, codeSite)
  392. }
  393. }, func(err interface{}) {
  394. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  395. })
  396. return j, jf, isSite
  397. }
  398. var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  399. var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  400. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  401. func file2text(doc *map[string]interface{}) {
  402. mnameone := map[string]bool{}
  403. mname := map[string]bool{}
  404. murl := map[string]string{}
  405. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  406. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  407. for _, attachs := range attach_text {
  408. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  409. for _, fileinfo := range fileinfos {
  410. if ff, ok := fileinfo.(map[string]interface{}); ok {
  411. attach_url := qu.ObjToString(ff["attach_url"])
  412. ffname := qu.ObjToString(ff["file_name"])
  413. if clearStrReg.MatchString(ffname) {
  414. continue
  415. }
  416. mname[ffname] = true
  417. murl[ffname] = attach_url
  418. if sortStrReg.MatchString(ffname) {
  419. mnameone[ffname] = true
  420. }
  421. }
  422. }
  423. }
  424. }
  425. }
  426. tmpstr := ""
  427. for k := range mnameone {
  428. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  429. (*doc)["detailfile"] = tmpstr
  430. return
  431. }
  432. bs := ju.OssGetObject(murl[k])
  433. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  434. tmpstr += bs + "\n"
  435. }
  436. }
  437. for k := range mname {
  438. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  439. (*doc)["detailfile"] = tmpstr
  440. return
  441. }
  442. bs := ju.OssGetObject(murl[k])
  443. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  444. tmpstr += bs + "\n"
  445. }
  446. }
  447. (*doc)["detailfile"] = tmpstr
  448. }
  449. //抽取
  450. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  451. e.ExtractDetail(j, isSite, j.SpiderCode)
  452. if jf != nil && jf.IsFile {
  453. e.ExtractDetail(jf, isSite, j.SpiderCode)
  454. for tmpk, xs := range jf.Result {
  455. if len(j.Result[tmpk]) == 0 {
  456. if tmpk == "budget" || tmpk == "bidamount" {
  457. for _, v := range xs {
  458. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  459. j.Result[tmpk] = append(j.Result[tmpk], v)
  460. }
  461. }
  462. } else {
  463. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  464. }
  465. }
  466. }
  467. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  468. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  469. }
  470. }
  471. if isSite {
  472. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  473. if ok && ismerge.(bool) {
  474. tmpj := &ju.Job{
  475. SourceMid: j.SourceMid,
  476. Category: j.Category,
  477. CategorySecond: j.CategorySecond,
  478. Content: j.Content,
  479. SpiderCode: j.SpiderCode,
  480. //Domain: qu.ObjToString(doc["domain"]),
  481. //Href: qu.ObjToString(doc["href"]),
  482. Title: j.Title,
  483. Data: j.Data,
  484. City: j.City,
  485. Province: j.Province,
  486. Jsondata: j.Jsondata,
  487. Result: map[string][]*ju.ExtField{},
  488. BuyerAddr: j.BuyerAddr,
  489. RuleBlock: e.RuleBlock,
  490. }
  491. qu.Try(func() {
  492. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  493. }, func(err interface{}) {
  494. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  495. })
  496. e.ExtractDetail(tmpj, false, "")
  497. //if jf != nil && jf.IsFile {
  498. // e.ExtractFile(jf, false, "")
  499. //}
  500. //合并数据
  501. j.Block = append(j.Block, tmpj.Block...)
  502. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  503. for tmpk, _ := range j.Result {
  504. if len(tmpj.Result[tmpk]) > 0 {
  505. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  506. }
  507. }
  508. for tmpk, _ := range tmpj.Result {
  509. if len(j.Result[tmpk]) == 0 {
  510. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  511. }
  512. }
  513. }
  514. }
  515. //分析抽取结果并保存
  516. AnalysisSaveResult(j, jf, e)
  517. <-e.TaskInfo.ProcessPool
  518. }
  519. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  520. qu.Try(func() {
  521. doc := *j.Data
  522. //全局前置规则,结果覆盖doc属性
  523. //for _, v := range e.RulePres {
  524. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  525. //}
  526. tmprules := map[string][]*RuleCore{}
  527. lockrule.Lock()
  528. if j.Category == "all" || j.CategorySecond == "all" {
  529. if isSite {
  530. for k, vc1 := range e.SiteRuleCores["all_all"] {
  531. tmprules[k] = vc1
  532. }
  533. } else {
  534. for k, vc1 := range e.RuleCores["all_all"] {
  535. tmprules[k] = vc1
  536. }
  537. }
  538. } else {
  539. if isSite {
  540. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  541. tmprules[k] = vc1
  542. }
  543. } else {
  544. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  545. tmprules[k] = vc1
  546. }
  547. }
  548. }
  549. if len(tmprules) < 1 { //分类未覆盖部分
  550. if isSite {
  551. for k, vc1 := range e.RuleCores["all_all"] {
  552. tmprules[k] = vc1
  553. }
  554. } else {
  555. for k, vc1 := range e.SiteRuleCores["all_all"] {
  556. tmprules[k] = vc1
  557. }
  558. }
  559. }
  560. lockrule.Unlock()
  561. //抽取规则
  562. for _, vc1 := range tmprules {
  563. for _, vc := range vc1 {
  564. tmp := ju.DeepCopy(doc).(map[string]interface{})
  565. //是否进入逻辑
  566. if !ju.Logic(vc.LuaLogic, tmp) {
  567. continue
  568. }
  569. ////抽取-前置规则
  570. //for _, v := range vc.RulePres {
  571. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  572. //}
  573. // log.Debug("抽取-前置规则", tmp)
  574. //抽取-规则
  575. ExtRuleCore(tmp, e, vc, j, isSite)
  576. // log.Debug("抽取-规则", tmp)
  577. //抽取-后置规则
  578. for _, v := range vc.RuleBacks {
  579. ExtRegBack(j, v, e.TaskInfo, vc)
  580. }
  581. //kv规则
  582. for _, v := range vc.KVRuleCores {
  583. ExtRuleKV(j, v, e.TaskInfo)
  584. }
  585. // log.Debug("抽取-后置规则", tmp)
  586. //项目名称未能抽取到,标题来凑
  587. if vc.Field == "projectname" {
  588. if vc.ExtFrom == "title" {
  589. isextitle := true
  590. for _, v := range j.Result[vc.Field] {
  591. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  592. isextitle = false
  593. break
  594. }
  595. }
  596. if isextitle { //标题加入选举
  597. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  598. if isSite {
  599. field.Score = 1
  600. }
  601. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  602. }
  603. }
  604. for i := 0; i < 3; i++ {
  605. for _, v := range vc.RuleBacks {
  606. ExtRegBack(j, v, e.TaskInfo, vc)
  607. }
  608. }
  609. }
  610. }
  611. }
  612. //全局后置规则
  613. if isSite {
  614. for _, v := range e.SiteRuleBacks {
  615. ExtRegBack(j, v, e.TaskInfo, nil)
  616. }
  617. } else {
  618. for _, v := range e.RuleBacks {
  619. ExtRegBack(j, v, e.TaskInfo, nil)
  620. }
  621. }
  622. //函数清理
  623. for key, val := range j.Result {
  624. for i, v := range val {
  625. if v.Field == "projectname" && v.Type == "table" {
  626. break
  627. }
  628. lockclear.Lock()
  629. var cfn = []string{}
  630. if isSite {
  631. cfn = e.SiteClearFn[key]
  632. } else {
  633. cfn = e.ClearFn[key]
  634. }
  635. lockclear.Unlock()
  636. if len(cfn) == 0 {
  637. continue
  638. }
  639. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  640. if key == "budget" || key == "bidamount" {
  641. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  642. j.Result[key][i].IsTrue = true
  643. } else {
  644. j.Result[key][i].Value = data[0]
  645. continue
  646. }
  647. }
  648. before, _ := v.Value.(string)
  649. v.Value = data[0]
  650. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  651. //添加行数清理的日志
  652. //清理特殊符号
  653. lockclear.Lock()
  654. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  655. text := qu.ObjToString(v.Value)
  656. before = text
  657. v.Value = clear.OtherClean(key, text)
  658. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  659. }
  660. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  661. lockclear.Unlock()
  662. }
  663. }
  664. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  665. // bs, _ := json.Marshal(j.Result)
  666. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  667. }, func(err interface{}) {
  668. log.Debug("ExtractProcess err", err)
  669. })
  670. }
  671. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  672. qu.Try(func() {
  673. doc := *j.Data
  674. //全局前置规则,结果覆盖doc属性
  675. // for _, v := range e.RulePres {
  676. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  677. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  678. // }
  679. // }
  680. //抽取规则
  681. tmprules := map[string][]*RuleCore{}
  682. lockrule.Lock()
  683. if j.Category == "all" || j.CategorySecond == "all" {
  684. for k, vc1 := range e.RuleCores["all_all"] {
  685. tmprules[k] = vc1
  686. }
  687. } else {
  688. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  689. tmprules[k] = vc1
  690. }
  691. }
  692. lockrule.Unlock()
  693. for _, vc1 := range tmprules {
  694. for _, vc := range vc1 {
  695. tmp := ju.DeepCopy(doc).(map[string]interface{})
  696. //是否进入逻辑
  697. if !ju.Logic(vc.LuaLogic, tmp) {
  698. continue
  699. }
  700. //抽取-前置规则
  701. // for _, v := range vc.RulePres {
  702. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  703. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  704. // }
  705. // }
  706. // log.Debug("抽取-前置规则", tmp)
  707. //抽取-规则
  708. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  709. ExtRuleCore(tmp, e, vc, j, isSite)
  710. }
  711. // log.Debug("抽取-规则", tmp)
  712. //抽取-后置规则
  713. for _, v := range vc.RuleBacks {
  714. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  715. ExtRegBack(j, v, e.TaskInfo, vc)
  716. }
  717. }
  718. // log.Debug("抽取-后置规则", tmp)
  719. }
  720. }
  721. //全局后置规则
  722. for _, v := range e.RuleBacks {
  723. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  724. ExtRegBack(j, v, e.TaskInfo, nil)
  725. }
  726. }
  727. //函数清理
  728. for key, val := range j.Result {
  729. for _, v := range val {
  730. lockclear.Lock()
  731. cfn := e.ClearFn[key]
  732. lockclear.Unlock()
  733. if len(cfn) == 0 {
  734. continue
  735. }
  736. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  737. v.Value = data[0]
  738. //清理特殊符号
  739. lockclear.Lock()
  740. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  741. clear.MesField[key] != nil {
  742. text := qu.ObjToString(v.Value)
  743. text = clear.OtherClean(key, text)
  744. v.Value = text
  745. }
  746. lockclear.Unlock()
  747. }
  748. }
  749. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  750. // bs, _ := json.Marshal(j.Result)
  751. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  752. }, func(err interface{}) {
  753. log.Debug("ExtractProcess err", err)
  754. })
  755. }
  756. //前置过滤
  757. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  758. defer qu.Catch()
  759. before := ju.DeepCopy(doc).(map[string]interface{})
  760. extinfo := map[string]interface{}{}
  761. if in.IsLua {
  762. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  763. if j != nil {
  764. lua.Block = j.Block
  765. }
  766. extinfo = lua.RunScript("pre")
  767. for k, v := range extinfo { //结果覆盖原doc
  768. doc[k] = v
  769. }
  770. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  771. } else {
  772. var key string
  773. if !j.IsFile {
  774. key = qu.If(in.Field == "", "detail", in.Field).(string)
  775. } else {
  776. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  777. }
  778. text := qu.ObjToString(doc[key])
  779. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  780. doc[key] = extinfo[key] //结果覆盖原doc
  781. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  782. }
  783. return doc
  784. }
  785. //抽取-规则
  786. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  787. //候选人加入
  788. var kvMap map[string][]map[string]interface{}
  789. extByReg := true
  790. if vc.ExtFrom != "title" {
  791. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  792. }
  793. for _, v := range vc.RuleCores {
  794. if v.IsLua {
  795. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  796. } else if extByReg {
  797. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  798. }
  799. }
  800. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  801. if vc.Field == "budget" && len(kvMap) == 0 {
  802. if len(j.BlockPackage) == 1 {
  803. for _, bp := range j.BlockPackage {
  804. for fieldname, field := range vc.LFields {
  805. if field != vc.Field {
  806. continue
  807. }
  808. tp := ""
  809. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  810. if k == 0 {
  811. tp = "colon"
  812. } else if k == 1 {
  813. tp = "space"
  814. } else if k == 2 {
  815. tp = "table"
  816. }
  817. if v == nil || v.KvTags == nil {
  818. continue
  819. }
  820. for _, vv := range v.KvTags[fieldname] {
  821. text := ju.TrimLRSpace(vv.Value, "")
  822. if text != "" {
  823. tmp := &ju.ExtField{
  824. ExtFrom: "package",
  825. Field: vc.Field,
  826. Code: "CL_分包",
  827. Type: tp,
  828. MatchType: "package",
  829. RuleText: bp.Text,
  830. SourceValue: vv.Key,
  831. Value: text,
  832. }
  833. if isSite {
  834. tmp.Score = 1
  835. }
  836. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  837. }
  838. }
  839. }
  840. }
  841. break
  842. }
  843. }
  844. } else {
  845. for k, v := range kvMap {
  846. if j.Result[k] == nil {
  847. j.Result[k] = [](*ju.ExtField){}
  848. }
  849. for _, tmp := range v {
  850. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  851. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  852. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  853. MatchType: qu.ObjToString(tmp["matchtype"]),
  854. RuleText: qu.ObjToString(tmp["ruletext"]),
  855. SourceValue: tmp["sourcevalue"],
  856. Value: tmp["value"]}
  857. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  858. field.Score = 1
  859. }
  860. if isSite {
  861. field.Score = 1
  862. }
  863. if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" {
  864. moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
  865. if len(moneys) > 0 {
  866. if vf, ok := moneys[0].(float64); ok {
  867. field.Value = vf
  868. field.IsTrue = moneys[len(moneys)-1].(bool)
  869. } else if vi, ok := moneys[0].(int); ok {
  870. field.Value = float64(vi)
  871. field.IsTrue = moneys[len(moneys)-1].(bool)
  872. }
  873. }
  874. }
  875. if tmp["blocktag"] != nil {
  876. btag := make(map[string]string)
  877. for k := range tmp["blocktag"].(map[string]bool) {
  878. blocktag.Lock()
  879. if TagConfigDesc[k] != "" {
  880. btag[k] = TagConfigDesc[k]
  881. }
  882. blocktag.Unlock()
  883. }
  884. field.BlockTag = btag
  885. }
  886. j.Result[k] = append(j.Result[k], field)
  887. }
  888. }
  889. }
  890. }
  891. //抽取-规则-kv
  892. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  893. defer qu.Catch()
  894. if extfrom == "title" || !in.IsLua {
  895. return
  896. }
  897. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  898. lua.KvMap = *kvMap
  899. lua.Block = j.Block
  900. extinfo := lua.RunScript("core")
  901. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  902. for _, v := range tmps {
  903. v["core"] = in.Code
  904. }
  905. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  906. }
  907. if len(extinfo) > 0 {
  908. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  909. }
  910. }
  911. //抽取-规则-正则
  912. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  913. defer qu.Catch()
  914. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  915. b := IsExtract(in.Field, j.Title, j.Content)
  916. if !b {
  917. return
  918. }
  919. //全文正则
  920. //text := qu.ObjToString(doc[extfrom])
  921. //if in.Field != "" {
  922. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  923. // if len(extinfo) > 0 {
  924. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  925. // }
  926. //}
  927. //块抽取
  928. if in.Field != "" {
  929. if extfrom == "title" {
  930. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  931. if len(extinfo) > 0 {
  932. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  933. }
  934. } else {
  935. for _, v := range j.Block {
  936. btag := make(map[string]string)
  937. for k := range v.Classify {
  938. blocktag.Lock()
  939. btag[k] = TagConfigDesc[k]
  940. blocktag.Unlock()
  941. }
  942. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  943. if len(extinfo) > 0 {
  944. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  945. }
  946. }
  947. }
  948. }
  949. }
  950. //pkg抽取-规则-正则
  951. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  952. defer qu.Catch()
  953. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  954. b := IsExtract(in.Field, j.Title, j.Content)
  955. if !b {
  956. return
  957. }
  958. //块抽取
  959. if in.Field != "" {
  960. for k, vbpkg := range j.BlockPackage {
  961. rep := map[string]string{}
  962. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  963. if in.Field == "budget" && vbpkg.Budget > 0 {
  964. continue
  965. }
  966. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  967. continue
  968. }
  969. if in.Field == "winner" && vbpkg.Winner != "" {
  970. continue
  971. }
  972. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  973. continue
  974. }
  975. if in.Field == "projectname" && vbpkg.Name != "" {
  976. continue
  977. }
  978. if in.Field == "winner" && vbpkg.Winner != "" {
  979. continue
  980. }
  981. if in.Field == "winnerperson" {
  982. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  983. continue
  984. }
  985. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  986. continue
  987. }
  988. }
  989. if in.Field == "winnertel" {
  990. if vbpkg.WinnerPerson == "" {
  991. continue
  992. }
  993. }
  994. //处理正负数修正
  995. ptmp := strings.Split(in.RuleText, "#")
  996. sign := 0
  997. if len(ptmp) == 2 {
  998. if ptmp[1] == "正" {
  999. sign = 1
  1000. } else if ptmp[1] == "负" {
  1001. sign = -1
  1002. }
  1003. }
  1004. tmp := strings.Split(ptmp[0], "__")
  1005. if len(tmp) == 2 {
  1006. epos := strings.Split(tmp[1], ",")
  1007. posm := map[string]int{}
  1008. for _, v := range epos {
  1009. ks := strings.Split(v, ":")
  1010. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1011. posm[ks[1]] = qu.IntAll(ks[0])
  1012. } else {
  1013. posm[in.Field] = qu.IntAll(ks[0])
  1014. }
  1015. }
  1016. var pattern string
  1017. if strings.Contains(tmp[0], "\\u") {
  1018. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1019. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1020. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1021. } else {
  1022. pattern = tmp[0]
  1023. }
  1024. //log.Debug("pattern", pattern)
  1025. //fmt.Println(text)
  1026. reg := regexp.MustCompile(pattern)
  1027. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  1028. for i, _ := range apos {
  1029. pos := apos[i]
  1030. for k, p := range posm {
  1031. if len(pos) > p {
  1032. if pos[p] == -1 || pos[p+1] == -1 {
  1033. continue
  1034. }
  1035. val := vbpkg.Text[pos[p]:pos[p+1]]
  1036. if string(val) == "" {
  1037. continue
  1038. }
  1039. if sign == -1 {
  1040. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1041. } else {
  1042. rep[k+"_"+fmt.Sprint(i)] = val
  1043. }
  1044. }
  1045. }
  1046. }
  1047. //fmt.Println(text)
  1048. for i := 0; i < len(apos); i++ {
  1049. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  1050. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1051. lock.Lock()
  1052. cfn := e.ClearFn[in.Field]
  1053. lock.Unlock()
  1054. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1055. if data[len(data)-1].(bool) {
  1056. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1057. j.BlockPackage[k].IsTrueBudget = true
  1058. }
  1059. break
  1060. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1061. lock.Lock()
  1062. cfn := e.ClearFn[in.Field]
  1063. lock.Unlock()
  1064. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1065. if data[len(data)-1].(bool) {
  1066. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1067. j.BlockPackage[k].IsTrueBidamount = true
  1068. }
  1069. break
  1070. } else if in.Field == "winner" {
  1071. if j.BlockPackage[k].Winner == "" {
  1072. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  1073. break
  1074. }
  1075. } else if in.Field == "winnertel" {
  1076. if j.BlockPackage[k].WinnerTel == "" {
  1077. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1078. break
  1079. }
  1080. } else if in.Field == "winnerperson" {
  1081. if j.BlockPackage[k].WinnerPerson == "" {
  1082. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1083. break
  1084. }
  1085. } else if in.Field == "bidstatus" {
  1086. if j.BlockPackage[k].BidStatus == "" {
  1087. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1088. break
  1089. }
  1090. } else if in.Field == "projectname" {
  1091. if j.BlockPackage[k].Name == "" {
  1092. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1093. break
  1094. }
  1095. } else if in.Field == "winnerperson" {
  1096. if j.BlockPackage[k].WinnerPerson == "" {
  1097. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1098. break
  1099. }
  1100. } else if in.Field == "winnertel" {
  1101. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1102. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1103. break
  1104. }
  1105. }
  1106. }
  1107. }
  1108. }
  1109. } else {
  1110. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1111. val := ""
  1112. if len(pos) == 2 {
  1113. //"text" = "text"[pos[1]:]
  1114. val = "text"[pos[1]:]
  1115. rs := regexp.MustCompile("[^\r\n\t]+")
  1116. tmp := rs.FindAllString("text", -1)
  1117. if len(tmp) > 0 {
  1118. val = tmp[0]
  1119. }
  1120. }
  1121. if val != "" {
  1122. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1123. lock.Lock()
  1124. cfn := e.ClearFn[in.Field]
  1125. lock.Unlock()
  1126. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1127. if data[len(data)-1].(bool) {
  1128. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1129. j.BlockPackage[k].IsTrueBudget = true
  1130. }
  1131. break
  1132. }
  1133. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1134. lock.Lock()
  1135. cfn := e.ClearFn[in.Field]
  1136. lock.Unlock()
  1137. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1138. if data[len(data)-1].(bool) {
  1139. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1140. j.BlockPackage[k].IsTrueBidamount = true
  1141. }
  1142. break
  1143. } else if in.Field == "bidstatus" {
  1144. if j.BlockPackage[k].BidStatus == "" {
  1145. j.BlockPackage[k].BidStatus = val
  1146. break
  1147. }
  1148. } else if in.Field == "projectname" {
  1149. if j.BlockPackage[k].Name == "" {
  1150. j.BlockPackage[k].Name = val
  1151. break
  1152. }
  1153. }
  1154. }
  1155. }
  1156. }
  1157. }
  1158. }
  1159. //lua脚本根据属性设置提取kv值
  1160. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1161. kvmap := map[string][]map[string]interface{}{}
  1162. if len(j.Winnerorder) > 1 {
  1163. if vc.Field == "bidamount" {
  1164. for _, v := range j.Winnerorder {
  1165. if v["price"] == nil {
  1166. continue
  1167. }
  1168. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1169. "code": "winnerorder",
  1170. "field": vc.Field,
  1171. "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]),
  1172. "extfrom": v["sortstr"],
  1173. "sourcevalue": v["price"],
  1174. "value": v["price"],
  1175. "type": "winnerorder",
  1176. "matchtype": "winnerorder",
  1177. })
  1178. return kvmap, false
  1179. }
  1180. //候选人中标金额
  1181. if price := j.Winnerorder[0]["price"]; price != nil {
  1182. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1183. "code": "CL_中标候选人",
  1184. "field": vc.Field,
  1185. "ruletext": "中标候选人",
  1186. "extfrom": j.Winnerorder[0]["sortstr"],
  1187. "sourcevalue": price,
  1188. "value": price,
  1189. "type": "winnerorder",
  1190. "matchtype": "winnerorder",
  1191. })
  1192. return kvmap, false
  1193. }
  1194. }
  1195. }
  1196. for fieldname, field := range vc.LFields {
  1197. if field != vc.Field {
  1198. continue
  1199. }
  1200. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1201. }
  1202. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1203. return kvmap, true
  1204. }
  1205. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1206. //qu.Debug("fieldname+++", fieldname)
  1207. for _, bl := range blocks {
  1208. tp := ""
  1209. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1210. if k == 0 {
  1211. tp = "colon"
  1212. } else if k == 1 {
  1213. tp = "space"
  1214. } else if k == 2 {
  1215. tp = "table"
  1216. }
  1217. if v == nil || v.KvTags == nil {
  1218. continue
  1219. }
  1220. for _, vv := range v.KvTags[fieldname] {
  1221. text := ju.TrimLRSpace(vv.Value, "")
  1222. if text != "" {
  1223. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1224. "code": "CL_" + vv.Key,
  1225. "field": field,
  1226. "ruletext": vv.Key,
  1227. "extfrom": vc.ExtFrom,
  1228. "sourcevalue": text,
  1229. "value": text,
  1230. "type": tp,
  1231. "matchtype": "tag_string",
  1232. "blocktag": bl.Classify,
  1233. "weight": vv.Weight,
  1234. })
  1235. //if field != "winnertel" && field != "winnerperson" {
  1236. // //break //暂定取第一个
  1237. //}
  1238. }
  1239. }
  1240. }
  1241. if len(kvmap[field]) == 0 {
  1242. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1243. }
  1244. }
  1245. }
  1246. //正则提取结果
  1247. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1248. defer qu.Catch()
  1249. var score float64
  1250. score = vre.Score
  1251. if isSite {
  1252. score = score + 1.0
  1253. }
  1254. extinfo := map[string][]map[string]interface{}{}
  1255. rep := map[string]string{}
  1256. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1257. //处理正负数修正
  1258. ptmp := strings.Split(vre.RuleText, "#")
  1259. sign := 0
  1260. if len(ptmp) == 2 {
  1261. if ptmp[1] == "正" {
  1262. sign = 1
  1263. } else if ptmp[1] == "负" {
  1264. sign = -1
  1265. }
  1266. }
  1267. tmp := strings.Split(ptmp[0], "__")
  1268. if len(tmp) == 2 {
  1269. epos := strings.Split(tmp[1], ",")
  1270. posm := map[string]int{}
  1271. for _, v := range epos {
  1272. ks := strings.Split(v, ":")
  1273. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1274. posm[ks[1]] = qu.IntAll(ks[0])
  1275. } else {
  1276. posm[vre.Field] = qu.IntAll(ks[0])
  1277. }
  1278. }
  1279. var pattern string
  1280. if strings.Contains(tmp[0], "\\u") {
  1281. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1282. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1283. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1284. } else {
  1285. pattern = tmp[0]
  1286. }
  1287. //log.Debug("pattern", pattern)
  1288. //fmt.Println(text)
  1289. reg := regexp.MustCompile(pattern)
  1290. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1291. for i, _ := range apos {
  1292. pos := apos[i]
  1293. for k, p := range posm {
  1294. if len(pos) > p {
  1295. if pos[p] == -1 || pos[p+1] == -1 {
  1296. continue
  1297. }
  1298. val := text[pos[p]:pos[p+1]]
  1299. if string(val) == "" {
  1300. continue
  1301. }
  1302. if sign == -1 {
  1303. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1304. } else {
  1305. rep[k+"_"+fmt.Sprint(i)] = val
  1306. }
  1307. }
  1308. }
  1309. }
  1310. //fmt.Println(text)
  1311. tmps := []map[string]interface{}{}
  1312. for i := 0; i < len(apos); i++ {
  1313. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1314. tmp := map[string]interface{}{
  1315. "field": vre.Field,
  1316. "code": vre.Code,
  1317. "ruletext": vre.RuleText,
  1318. "extfrom": text,
  1319. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1320. "type": "regexp",
  1321. "matchtype": "regcontent",
  1322. "blocktag": *tag,
  1323. "score": score,
  1324. }
  1325. tmps = append(tmps, tmp)
  1326. exfield := ju.ExtField{
  1327. BlockTag: *tag,
  1328. Field: vre.Field,
  1329. Code: vre.Code,
  1330. RuleText: vre.RuleText,
  1331. Type: "regexp",
  1332. MatchType: "regcontent",
  1333. ExtFrom: extfrom,
  1334. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1335. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1336. Score: score}
  1337. if tmp["blocktag"] != nil {
  1338. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1339. }
  1340. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1341. }
  1342. }
  1343. if len(tmps) > 0 {
  1344. //fmt.Println(tmps)
  1345. extinfo[vre.Field] = tmps
  1346. }
  1347. }
  1348. } else {
  1349. pos := vre.RegCore.Reg.FindStringIndex(text)
  1350. val := ""
  1351. if len(pos) == 2 {
  1352. text = text[pos[1]:]
  1353. rs := regexp.MustCompile("[^\r\n\t]+")
  1354. tmp := rs.FindAllString(text, -1)
  1355. if len(tmp) > 0 {
  1356. val = tmp[0]
  1357. }
  1358. }
  1359. if val != "" {
  1360. tmps := []map[string]interface{}{}
  1361. tmp := map[string]interface{}{
  1362. "field": vre.Field,
  1363. "code": vre.Code,
  1364. "ruletext": vre.RuleText,
  1365. "extfrom": text,
  1366. "value": val,
  1367. "type": "regexp",
  1368. "matchtype": "regcontent",
  1369. "blocktag": *tag,
  1370. "score": score,
  1371. }
  1372. tmps = append(tmps, tmp)
  1373. extinfo[vre.Field] = tmps
  1374. if j.Result[vre.Field] == nil {
  1375. j.Result[vre.Field] = [](*ju.ExtField){}
  1376. }
  1377. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1378. Value: val,
  1379. Score: score}
  1380. if tmp["blocktag"] != nil {
  1381. field.BlockTag = tmp["blocktag"].(map[string]string)
  1382. }
  1383. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1384. }
  1385. }
  1386. return extinfo
  1387. }
  1388. //后置过滤
  1389. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1390. defer qu.Catch()
  1391. if in.IsLua {
  1392. result := GetResultMapForLua(j)
  1393. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1394. if j != nil {
  1395. lua.Block = j.Block
  1396. }
  1397. extinfo := lua.RunScript("back")
  1398. for k, v := range extinfo {
  1399. if tmps, ok := v.([]map[string]interface{}); ok {
  1400. j.Result[k] = [](*ju.ExtField){}
  1401. for _, tmp := range tmps {
  1402. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1403. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1404. Value: tmp["value"]}
  1405. if tmp["blocktag"] != nil {
  1406. field.BlockTag = tmp["blocktag"].(map[string]string)
  1407. }
  1408. j.Result[k] = append(j.Result[k], field)
  1409. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1410. }
  1411. }
  1412. }
  1413. if len(extinfo) > 0 {
  1414. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1415. }
  1416. } else {
  1417. extinfo := map[string]interface{}{}
  1418. if in.Field != "" {
  1419. clearByTitle := false
  1420. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1421. clearByTitle = true
  1422. }
  1423. if j.Result[in.Field] != nil {
  1424. tmp := j.Result[in.Field]
  1425. exts := []interface{}{}
  1426. for k, v := range tmp {
  1427. if clearByTitle && v.ExtFrom != "title" {
  1428. continue
  1429. }
  1430. //table抽取到的数据不清理
  1431. if v.Type == "table" && v.Field == "projectname" {
  1432. return
  1433. }
  1434. text := qu.ObjToString(v.Value)
  1435. if v.Field == "bidamount" || v.Field == "budget" {
  1436. if strings.Contains(qu.ObjToString(v.SourceValue), "费率") {
  1437. j.Result[in.Field][k].IsTrue = false
  1438. continue
  1439. }
  1440. }
  1441. if text != "" {
  1442. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1443. }
  1444. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1445. continue
  1446. }
  1447. j.Result[in.Field][k].Value = text
  1448. exts = append(exts, map[string]interface{}{
  1449. "field": v.Field,
  1450. "code": v.Code,
  1451. "ruletext": v.RuleText,
  1452. "type": v.Type,
  1453. "matchtype": v.MatchType,
  1454. "extfrom": v.ExtFrom,
  1455. "value": text,
  1456. })
  1457. }
  1458. if len(exts) > 0 {
  1459. extinfo[in.Field] = exts
  1460. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1461. }
  1462. }
  1463. } else {
  1464. for key, tmp := range j.Result {
  1465. exts := []interface{}{}
  1466. for k, v := range tmp {
  1467. //table抽取到的数据不清理
  1468. if v.Type == "table" && v.Field == "projectname" {
  1469. return
  1470. }
  1471. text := qu.ObjToString(v.Value)
  1472. if text != "" {
  1473. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1474. }
  1475. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1476. continue
  1477. }
  1478. j.Result[key][k].Value = text
  1479. exts = append(exts, map[string]interface{}{
  1480. "field": v.Field,
  1481. "code": v.Code,
  1482. "ruletext": v.RuleText,
  1483. "type": v.Type,
  1484. "matchtype": v.MatchType,
  1485. "extfrom": v.ExtFrom,
  1486. "value": text,
  1487. })
  1488. }
  1489. if len(exts) > 0 {
  1490. extinfo[key] = exts
  1491. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1492. }
  1493. }
  1494. }
  1495. }
  1496. }
  1497. //后置过滤
  1498. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1499. defer qu.Catch()
  1500. for k, v := range j.BlockPackage {
  1501. if in.Field == "winner" {
  1502. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1503. } else if in.Field == "bidstatus" {
  1504. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1505. } else if in.Field == "" {
  1506. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1507. } else if in.Field == "projectname" {
  1508. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1509. } else if in.Field == "winnerperson" {
  1510. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1511. } else if in.Field == "winnertel" {
  1512. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1513. }
  1514. }
  1515. }
  1516. //KV过滤
  1517. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1518. defer qu.Catch()
  1519. extinfo := map[string]interface{}{}
  1520. if in.Field != "" {
  1521. if j.Result[in.Field] != nil {
  1522. tmp := j.Result[in.Field]
  1523. exts := []interface{}{}
  1524. for k, v := range tmp {
  1525. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1526. continue
  1527. }
  1528. text := qu.ObjToString(v.Value)
  1529. if text != "" {
  1530. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1531. }
  1532. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1533. continue
  1534. }
  1535. j.Result[in.Field][k].Value = text
  1536. exts = append(exts, map[string]interface{}{
  1537. "field": v.Field,
  1538. "code": v.Code,
  1539. "ruletext": v.RuleText,
  1540. "type": v.Type,
  1541. "matchtype": v.MatchType,
  1542. "extfrom": v.ExtFrom,
  1543. "value": text,
  1544. })
  1545. }
  1546. if len(exts) > 0 {
  1547. extinfo[in.Field] = exts
  1548. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1549. }
  1550. }
  1551. }
  1552. }
  1553. //获取抽取结果map[string][]interface{},lua脚本使用
  1554. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1555. defer qu.Catch()
  1556. result := map[string][]map[string]interface{}{}
  1557. for key, val := range j.Result {
  1558. if result[key] == nil {
  1559. result[key] = []map[string]interface{}{}
  1560. }
  1561. for _, v := range val {
  1562. tmp := map[string]interface{}{
  1563. "field": v.Field,
  1564. "code": v.Code,
  1565. "ruletext": v.RuleText,
  1566. "value": v.Value,
  1567. "type": v.Type,
  1568. "matchtype": v.MatchType,
  1569. "extfrom": v.ExtFrom,
  1570. }
  1571. result[key] = append(result[key], tmp)
  1572. }
  1573. }
  1574. return result
  1575. }
  1576. //抽取日志
  1577. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1578. defer qu.Catch()
  1579. if !t.IsEtxLog {
  1580. return
  1581. }
  1582. logdata := map[string]interface{}{
  1583. "code": qu.If(v.Code == "", "kv", v.Code),
  1584. "name": v.Name,
  1585. "type": ftype,
  1586. "ruletext": v.RuleText,
  1587. "islua": v.IsLua,
  1588. "field": v.Field,
  1589. "version": t.Version,
  1590. "taskname": t.Name,
  1591. "before": before,
  1592. "extinfo": extinfo,
  1593. "sid": sid,
  1594. "comeintime": time.Now().Unix(),
  1595. }
  1596. lock.Lock()
  1597. ExtLogs[t] = append(ExtLogs[t], logdata)
  1598. lock.Unlock()
  1599. }
  1600. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1601. exts := []map[string]interface{}{}
  1602. exts = append(exts, map[string]interface{}{
  1603. "field": ext.Field,
  1604. "code": ext.Code,
  1605. "type": ftype,
  1606. "matchtype": matchtype,
  1607. "extfrom": ext.ExtFrom,
  1608. "value": ext.Value,
  1609. })
  1610. extinfo := map[string]interface{}{
  1611. ext.Field: exts,
  1612. }
  1613. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1614. }
  1615. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1616. defer qu.Catch()
  1617. if !t.IsEtxLog {
  1618. return
  1619. }
  1620. logdata := map[string]interface{}{
  1621. "code": code,
  1622. "name": name,
  1623. "type": ftype,
  1624. "ruletext": "",
  1625. "islua": false,
  1626. "field": field,
  1627. "version": t.Version,
  1628. "taskname": t.Name,
  1629. "before": before,
  1630. "extinfo": extinfo,
  1631. "sid": sid,
  1632. "comeintime": time.Now().Unix(),
  1633. }
  1634. lock.Lock()
  1635. ExtLogs[t] = append(ExtLogs[t], logdata)
  1636. lock.Unlock()
  1637. }
  1638. //保存抽取日志
  1639. func SaveExtLog() {
  1640. defer qu.Catch()
  1641. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1642. lock.Lock()
  1643. tmpLogs = ExtLogs
  1644. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1645. lock.Unlock()
  1646. for k, v := range tmpLogs {
  1647. if len(v) < saveLimit {
  1648. db.Mgo.SaveBulk(k.TrackColl, v...)
  1649. } else {
  1650. for {
  1651. if len(v) > saveLimit {
  1652. tmp := v[:saveLimit]
  1653. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1654. v = v[saveLimit:]
  1655. } else {
  1656. db.Mgo.SaveBulk(k.TrackColl, v...)
  1657. break
  1658. }
  1659. }
  1660. }
  1661. }
  1662. time.AfterFunc(10*time.Second, SaveExtLog)
  1663. }
// FieldValue pairs a value of any type with an integer count.
// NOTE(review): the type is not used in this part of the file; Count is
// presumably an occurrence tally for Value — confirm against its callers.
type FieldValue struct {
	Value interface{}
	Count int
}
// clearWinnerReg matches the label fragments "名称", "施工" and "拟定供应商名称"
// as well as colons. AnalysisSaveResult removes every match from a package's
// winner name before the names are joined into "s_winner".
// NOTE(review): the last two alternatives are the same ASCII colon as the
// file stands; one of them was possibly a full-width colon — confirm.
var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1669. //分析抽取结果并保存
  1670. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1671. qu.Try(func() {
  1672. if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
  1673. if j.CategorySecond != "单一" {
  1674. delete(j.Result, "winner")
  1675. delete(j.Result, "bidamount")
  1676. for _, v := range j.BlockPackage {
  1677. v.Bidamount = 0
  1678. v.IsTrueBidamount = false
  1679. if v.Winner != "" {
  1680. v.Winner = ""
  1681. if v.SpaceKV != nil {
  1682. delete(v.SpaceKV.KvTags, "中标单位")
  1683. }
  1684. if v.TableKV != nil {
  1685. delete(v.TableKV.KvTags, "中标单位")
  1686. }
  1687. if v.ColonKV != nil {
  1688. delete(v.ColonKV.KvTags, "中标单位")
  1689. }
  1690. }
  1691. }
  1692. for _, v := range j.PackageInfo {
  1693. delete(v, "winner")
  1694. delete(v, "bidamount")
  1695. }
  1696. }
  1697. }
  1698. //重新取出清理过后的中标候选人
  1699. resetWinnerorder(j)
  1700. doc, result, _id := funcAnalysis(j, e)
  1701. if ju.IsSaveTag {
  1702. go otherNeedSave(j, result, e)
  1703. }
  1704. //从排序结果中取值
  1705. tmp := map[string]interface{}{} //抽取值
  1706. tmp["spidercode"] = j.SpiderCode
  1707. tmp["site"] = j.Site
  1708. if len(*j.Jsondata) > 0 {
  1709. tmp["jsondata"] = j.Jsondata
  1710. }
  1711. for _, val := range result {
  1712. for _, v := range val { //取第一个非负数,项目名称除外
  1713. //存0是否有效
  1714. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue{
  1715. tmp[v.Field] = v.Value
  1716. break
  1717. }
  1718. if v.Score > -1 {
  1719. tmp[v.Field] = v.Value
  1720. break
  1721. }
  1722. }
  1723. }
  1724. if len(j.PackageInfo) > 15 {
  1725. for k, v := range j.PackageInfo {
  1726. j.PackageInfo = map[string]map[string]interface{}{}
  1727. j.PackageInfo[k] = v
  1728. break
  1729. }
  1730. }
  1731. if len(j.PackageInfo) > 0 { //分包信息
  1732. tmp["package"] = j.PackageInfo
  1733. //包预算,中标金额合并大于抽取就覆盖
  1734. var tmpBidamount, tmpBudget float64
  1735. //s_winner逗号分隔拼接,分包中标人
  1736. var tmpstr, savewinner []string
  1737. //按包排序
  1738. for b, v := range j.PackageInfo {
  1739. if v["winner"] != nil && v["winner"] != "" {
  1740. tmpstr = append(tmpstr, b)
  1741. }
  1742. }
  1743. //包预算,中标金额合并大于抽取就覆盖
  1744. if len(j.PackageInfo) >= 1 {
  1745. //包数大于1累加
  1746. for _, v := range j.PackageInfo {
  1747. if v["budget"] != nil {
  1748. tmpBudget += qu.Float64All(v["budget"])
  1749. }
  1750. if v["bidamount"] != nil {
  1751. tmpBidamount += qu.Float64All(v["bidamount"])
  1752. }
  1753. }
  1754. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1755. tmp["budget"] = tmpBudget
  1756. }
  1757. if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
  1758. tmp["bidamount"] = tmpBidamount
  1759. } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1760. tmp["bidamount"] = tmpBidamount
  1761. }
  1762. } else {
  1763. //包数等于1,tmp没有值取包里的值
  1764. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1765. for _, v := range j.PackageInfo {
  1766. if v["budget"] != nil {
  1767. tmp["budget"] = v["budget"]
  1768. }
  1769. }
  1770. }
  1771. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1772. for _, v := range j.PackageInfo {
  1773. if v["bidamount"] != nil {
  1774. tmp["bidamount"] = v["bidamount"]
  1775. }
  1776. }
  1777. }
  1778. }
  1779. //s_winner逗号分隔拼接,分包中标人
  1780. sort.Strings(tmpstr)
  1781. for _, v := range tmpstr {
  1782. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1783. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1784. if savevvv == "" {
  1785. continue
  1786. }
  1787. savewinner = append(savewinner, savevvv)
  1788. }
  1789. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1790. tmp["s_winner"] = tmp["winner"]
  1791. } else if savewinner != nil {
  1792. savewinner = RemoveReplicaSliceString(savewinner)
  1793. tmp["s_winner"] = strings.Join(savewinner, ",")
  1794. }
  1795. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1796. //没有分包取winner
  1797. tmp["s_winner"] = tmp["winner"]
  1798. }
  1799. if len(j.Winnerorder) > 0 { //候选人信息
  1800. for i, v := range j.Winnerorder {
  1801. if v["price"] != nil {
  1802. tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  1803. if tmpPrice[len(tmpPrice)-1].(bool) {
  1804. j.Winnerorder[i]["price"] = tmpPrice[0]
  1805. } else {
  1806. delete(j.Winnerorder[i], "price")
  1807. }
  1808. }
  1809. }
  1810. tmp["winnerorder"] = j.Winnerorder
  1811. }
  1812. //处理附件
  1813. var resultf map[string][]*ju.ExtField
  1814. ffield := map[string]interface{}{}
  1815. if jf != nil {
  1816. _, resultf, _ = funcAnalysis(jf, e)
  1817. for _, val := range resultf {
  1818. for _, v := range val { //取第一个非负数
  1819. if v.Score > -1 {
  1820. ffield[v.Field] = v.Value
  1821. if tmp[v.Field] == nil {
  1822. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
  1823. tmp[v.Field] = v.Value
  1824. break
  1825. }
  1826. }
  1827. break
  1828. }
  1829. }
  1830. }
  1831. if len(jf.PackageInfo) > 0 { //分包信息
  1832. ffield["package"] = jf.PackageInfo
  1833. }
  1834. if len(jf.Winnerorder) > 0 { //候选人信息
  1835. ffield["winnerorder"] = jf.Winnerorder
  1836. }
  1837. }
  1838. for k, v := range *doc {
  1839. if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
  1840. (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
  1841. }
  1842. //去重冗余字段
  1843. if delFiled(k) {
  1844. continue
  1845. }
  1846. if tmp[k] == nil {
  1847. tmp[k] = v
  1848. }
  1849. }
  1850. //质量审核
  1851. if ju.QualityAudit {
  1852. e.QualityAudit(tmp)
  1853. }
  1854. //城市抽取
  1855. if e.IsExtractCity {
  1856. //e.ExtractCity(j, tmp, _id)
  1857. e.NewExtractCity(j, &tmp, _id)
  1858. }
  1859. //品牌抽取
  1860. if ju.IsBrandGoods {
  1861. tmp["checkhas"] = map[string]int{
  1862. "hastable": j.HasTable,
  1863. "hasgoods": j.HasGoods,
  1864. "hasbrand": j.HasBrand,
  1865. "haskey": j.HasKey,
  1866. }
  1867. if len(j.BrandData) > 0 {
  1868. tmp["tablebrand"] = j.BrandData
  1869. }
  1870. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1871. }
  1872. //prince和number抽取
  1873. if ju.IsPriceNumber {
  1874. priceNumberLen := len(j.PriceNumberData)
  1875. if priceNumberLen > 1 { //table数据去重
  1876. tmpPriceNumberData := []map[string]interface{}{}
  1877. tableStrs := map[string]bool{}
  1878. for _, tb := range j.PriceNumberData {
  1879. has := false
  1880. bytes, _ := json.Marshal(tb)
  1881. str := string(bytes)
  1882. if len(tableStrs) > 0 && tableStrs[str] {
  1883. has = true
  1884. } else {
  1885. tableStrs[str] = true
  1886. }
  1887. if !has {
  1888. for _, data := range tb {
  1889. tmpPriceNumberData = append(tmpPriceNumberData, data)
  1890. }
  1891. }
  1892. }
  1893. tmp["pricenumber"] = tmpPriceNumberData
  1894. } else if priceNumberLen == 1 {
  1895. tmp["pricenumber"] = j.PriceNumberData[0]
  1896. }
  1897. }
  1898. //所有kv组成的字符串
  1899. var kvtext bytes.Buffer
  1900. blocks := make([]ju.BlockAndTag, 0)
  1901. for _, v := range j.Block {
  1902. //分包和标签
  1903. if ju.SaveBlock {
  1904. xx, _ := json.Marshal(v)
  1905. tmpblock := new(ju.TmpBlock)
  1906. err := json.Unmarshal(xx, &tmpblock)
  1907. if err != nil {
  1908. if v.BPackage != nil {
  1909. bpb, _ := json.Marshal(v.BPackage)
  1910. tmpblock.BPackage = string(bpb)
  1911. }
  1912. tmpblock = rangeBlockToJson(v, *tmpblock)
  1913. }
  1914. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1915. }
  1916. //把所有kv组装成一个字符串,存库
  1917. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1918. if jv == nil {
  1919. continue
  1920. }
  1921. for jv_k, jv_v := range jv.KvTags {
  1922. for _, jv_vv := range jv_v {
  1923. kvtext.WriteString(jv_k)
  1924. kvtext.WriteString(":")
  1925. kvtext.WriteString(jv_vv.Value)
  1926. kvtext.WriteString("\n")
  1927. }
  1928. }
  1929. }
  1930. }
  1931. if kvtext.Len() > 0 {
  1932. tmp["kvtext"] = kvtext.String()
  1933. }
  1934. if len(blocks) > 0 {
  1935. if blocksBytes, err := json.Marshal(blocks); err == nil {
  1936. if utf8.RuneCount(blocksBytes) < 100000 {
  1937. tmp["blocks"] = string(blocksBytes)
  1938. }
  1939. }
  1940. }
  1941. tmp["dataging"] = j.Dataging
  1942. //budget bidamount
  1943. if bg, ok := tmp["budget"].(float64); ok && bg >= 500000000000 {
  1944. delete(tmp, "budget")
  1945. }
  1946. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 500000000000 {
  1947. delete(tmp, "bidamount")
  1948. }
  1949. //检查字段
  1950. tmp = checkFields(tmp)
  1951. if tmp["projectname"] == nil || tmp["projectname"] == "" {
  1952. tmp["projectname"] = j.Title
  1953. }
  1954. tmp["repeat"] = 0
  1955. if ju.Ffield {
  1956. if len(ffield) > 0 {
  1957. tmp["ffield"] = ffield
  1958. }
  1959. }
  1960. if e.TaskInfo.TestColl == "" {
  1961. if len(tmp) > 0 { //保存抽取结果
  1962. tmparr := []map[string]interface{}{
  1963. map[string]interface{}{
  1964. "_id": qu.StringTOBsonId(_id),
  1965. },
  1966. map[string]interface{}{"$set": tmp},
  1967. }
  1968. e.RWMutex.Lock()
  1969. e.BidArr = append(e.BidArr, tmparr)
  1970. e.BidTotal++
  1971. e.RWMutex.Unlock()
  1972. }
  1973. if ju.SaveResult {
  1974. id := tmp["_id"]
  1975. tmp["result"] = result
  1976. tmp["resultf"] = resultf
  1977. delete(tmp, "_id")
  1978. tmparr := []map[string]interface{}{
  1979. map[string]interface{}{
  1980. "_id": id,
  1981. },
  1982. map[string]interface{}{"$set": tmp},
  1983. }
  1984. e.RWMutex.Lock()
  1985. e.ResultArr = append(e.ResultArr, tmparr)
  1986. e.RWMutex.Unlock()
  1987. }
  1988. } else { //测试结果
  1989. delete(tmp, "_id")
  1990. delete(tmp, "fieldall")
  1991. if len(j.BlockPackage) > 0 { //分包详情
  1992. if len(j.BlockPackage) > 10 {
  1993. tmp["epackage"] = "分包异常"
  1994. } else {
  1995. bs, _ := json.Marshal(j.BlockPackage)
  1996. tmp["epackage"] = string(bs)
  1997. }
  1998. }
  1999. tmp["result"] = result
  2000. //tmp["resultf"] = resultf
  2001. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  2002. if !b {
  2003. log.Debug(e.TaskInfo.TestColl, _id)
  2004. }
  2005. }
  2006. }, func(err interface{}) {
  2007. log.Debug("AnalysisSaveResult err", err)
  2008. })
  2009. }
  2010. func checkFields(tmp map[string]interface{}) map[string]interface{} {
  2011. delete(tmp, "contenthtml")
  2012. delete(tmp, "detail")
  2013. //delete(tmp, "toptype")
  2014. //delete(tmp, "subtype")
  2015. if _, ok := tmp["bidamount"].(string); ok {
  2016. delete(tmp, "bidamount")
  2017. } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]) {
  2018. delete(tmp, "bidamount")
  2019. }
  2020. if _, ok := tmp["budget"].(string); ok {
  2021. delete(tmp, "budget")
  2022. }
  2023. if _, ok := tmp["bidopentime"].(string); ok {
  2024. delete(tmp, "bidopentime")
  2025. }
  2026. if _, ok := tmp["signaturedate"].(string); ok {
  2027. delete(tmp, "signaturedate")
  2028. }
  2029. for k, v := range tmp {
  2030. if v == "" {
  2031. delete(tmp, k)
  2032. }
  2033. }
  2034. return tmp
  2035. }
  2036. //保存其他
  2037. //kv、表格、块上的标签凡是新的标签都入库
  2038. //val type times firstid createtime 判定field
  2039. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  2040. now := time.Now().Unix()
  2041. coll := e.TaskInfo.TestColl
  2042. if coll == "" {
  2043. coll = "extract_tag_result"
  2044. } else {
  2045. coll += "_tag"
  2046. }
  2047. datas := []map[string]interface{}{}
  2048. kv := map[string]int{}
  2049. for _, v := range j.Block {
  2050. //
  2051. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  2052. if vv == nil || vv.KvTags == nil {
  2053. continue
  2054. }
  2055. for kkk, vvv := range vv.KvTags {
  2056. for _, vvvv := range vvv {
  2057. if vvvv.IsInvalid {
  2058. kv[kkk] = kv[kkk] + 1
  2059. break
  2060. }
  2061. }
  2062. }
  2063. }
  2064. for _, vv := range v.NotClassifyTitles {
  2065. datas = append(datas, map[string]interface{}{
  2066. "val": vv,
  2067. "times": 0,
  2068. "type": "block",
  2069. "firstid": j.SourceMid,
  2070. "createtime": now,
  2071. })
  2072. if len(datas) == saveLimit {
  2073. db.Mgo.SaveBulk(coll, datas...)
  2074. datas = []map[string]interface{}{}
  2075. }
  2076. }
  2077. }
  2078. for k, v := range kv {
  2079. datas = append(datas, map[string]interface{}{
  2080. "val": k,
  2081. "times": v,
  2082. "type": "kv",
  2083. "firstid": j.SourceMid,
  2084. "createtime": now,
  2085. })
  2086. if len(datas) == saveLimit {
  2087. db.Mgo.SaveBulk(coll, datas...)
  2088. datas = []map[string]interface{}{}
  2089. }
  2090. }
  2091. if len(datas) > 0 {
  2092. db.Mgo.SaveBulk(coll, datas...)
  2093. }
  2094. }
  2095. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2096. if j == nil {
  2097. return nil
  2098. }
  2099. if len(j.Block) > 0 {
  2100. for i, v := range j.Block {
  2101. rangetmp := new(ju.TmpBlock)
  2102. vb, _ := json.Marshal(v)
  2103. json.Unmarshal(vb, &rangetmp)
  2104. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2105. }
  2106. }
  2107. if j.ColonKV != nil {
  2108. cb, _ := json.Marshal(j.ColonKV)
  2109. tmpblock.ColonKV = string(cb)
  2110. }
  2111. if j.SpaceKV != nil {
  2112. sb, _ := json.Marshal(j.SpaceKV)
  2113. tmpblock.SpaceKV = string(sb)
  2114. }
  2115. if j.TableKV != nil {
  2116. tb, _ := json.Marshal(j.TableKV)
  2117. tmpblock.TableKV = string(tb)
  2118. }
  2119. return &tmpblock
  2120. }
  2121. //去重冗余字段
  2122. func delFiled(k string) bool {
  2123. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2124. }
  2125. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2126. defer qu.Catch()
  2127. doc := j.Data
  2128. result := j.Result
  2129. _id := qu.BsonIdToSId((*doc)["_id"])
  2130. result = ScoreFields(j, e.Tag) //正负面词打分
  2131. //结果排序
  2132. for _, val := range result {
  2133. ju.Sort(val)
  2134. }
  2135. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2136. //jsondata清理
  2137. clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
  2138. marshalbt, _ := json.Marshal(j.Jsondata)
  2139. tmpjddata := make(map[string]interface{})
  2140. json.Unmarshal(marshalbt, &tmpjddata)
  2141. for _, jdkey := range ju.JsonData {
  2142. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2143. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2144. if jdkey == "budget" || jdkey == "bidamount" {
  2145. lockclear.Lock()
  2146. cfn := e.ClearFn[jdkey]
  2147. lockclear.Unlock()
  2148. if len(cfn) == 0 {
  2149. continue
  2150. }
  2151. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney)
  2152. if tmpv.Value == newNum[0] {
  2153. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2154. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2155. ju.Sort(j.Result[jdkey])
  2156. delete((*j.Jsondata), jdkey)
  2157. break
  2158. }
  2159. } else {
  2160. if (*j.Jsondata)[jdkey] == tmpv.Value {
  2161. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2162. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2163. ju.Sort(j.Result[jdkey])
  2164. delete((*j.Jsondata), jdkey)
  2165. break
  2166. }
  2167. }
  2168. }
  2169. }
  2170. }
  2171. if len(*j.Jsondata) > 0 {
  2172. j.Result = JsonDataMergeProcessing(j, e)
  2173. }
  2174. j.Jsondata = &tmpjddata
  2175. }
  2176. return doc, result, _id
  2177. }
  2178. //辅助信息,如果没有排序先排序
  2179. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2180. fieldalls := map[string][]map[string]interface{}{}
  2181. if j == nil {
  2182. return fieldalls
  2183. }
  2184. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2185. defer qykredis.Close()
  2186. db := 0
  2187. for field, val := range j.Result {
  2188. //ju.Sort(val)
  2189. if field == "buyer" {
  2190. db = ju.BuyerDB
  2191. } else if field == "winner" {
  2192. db = ju.WinnerDB
  2193. } else if field == "agency" {
  2194. db = ju.AgencyDB
  2195. }
  2196. sfields := []map[string]interface{}{}
  2197. for _, v := range val {
  2198. standardized := false
  2199. if _, err := qykredis.Do("SELECT", db); err != nil {
  2200. fmt.Println("redis select err", err)
  2201. } else {
  2202. rep, err := qykredis.Do("GET", v.Value)
  2203. if rep != nil && err == nil {
  2204. standardized = true
  2205. }
  2206. }
  2207. if field == "budget" || field == "bidamount" {
  2208. if !v.IsTrue {
  2209. continue
  2210. }
  2211. }
  2212. sfield := map[string]interface{}{
  2213. "val": v.Value,
  2214. "type": v.Type,
  2215. "score": v.Score,
  2216. "blocktag": v.BlockTag,
  2217. "sourceval": v.SourceValue,
  2218. "standardized": standardized,
  2219. }
  2220. sfields = append(sfields, sfield)
  2221. }
  2222. fieldalls[field] = sfields
  2223. }
  2224. return fieldalls
  2225. }
  2226. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2227. defer qu.Catch()
  2228. //获取审核字段
  2229. for _, field := range e.AuditFields {
  2230. //1.分包
  2231. if resulttmp["package"] != nil {
  2232. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2233. for _, val := range packagedata {
  2234. if val[field] != nil {
  2235. fv := qu.ObjToString(val[field])
  2236. if fv != "" {
  2237. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2238. e.RedisMatch(field, fv, val) //redis匹配
  2239. } else { //除了buyer和winner,其他字段走规则匹配
  2240. e.RuleMatch(field, fv, val)
  2241. }
  2242. }
  2243. }
  2244. }
  2245. }
  2246. //2.外围
  2247. if resulttmp[field] != nil {
  2248. fv := qu.ObjToString(resulttmp[field])
  2249. if fv != "" {
  2250. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2251. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2252. } else { //除了buyer和winner,其他字段走规则匹配
  2253. e.RuleMatch(field, fv, resulttmp)
  2254. }
  2255. }
  2256. }
  2257. }
  2258. }
  2259. //Redis匹配
  2260. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2261. defer qu.Catch()
  2262. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2263. if i == 0 { //reids未找到,执行规则匹配
  2264. val[field+"_isredis"] = false
  2265. e.RuleMatch(field, fv, val) //规则匹配
  2266. } else { //redis找到,打标识存库
  2267. val[field+"_isredis"] = true
  2268. }
  2269. }
  2270. //规则匹配
  2271. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2272. defer qu.Catch()
  2273. if fieldval != "" {
  2274. SMap := e.StartMatch(field, fieldval)
  2275. //SMap.AddKey(field+"_isaudit", false)
  2276. for _, k := range SMap.Keys {
  2277. tmpMap[k] = SMap.Map[k]
  2278. }
  2279. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2280. }
  2281. }
  2282. //开始规则匹配
  2283. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2284. defer qu.Catch()
  2285. SMap := pretreated.NewSortMap()
  2286. lock.Lock()
  2287. f := e.RecogFieldMap[field]
  2288. lock.Unlock()
  2289. if len(f) > 0 {
  2290. fid := qu.BsonIdToSId(f["_id"])
  2291. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2292. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2293. if textAfterRecogFieldPrerule != "" {
  2294. lock.Lock()
  2295. classMap := e.FidClassMap[fid]
  2296. lock.Unlock()
  2297. L:
  2298. for _, c := range classMap { //class
  2299. classid := qu.BsonIdToSId(c["_id"])
  2300. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2301. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2302. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2303. if textAfterClassPrerule != "" {
  2304. lock.Lock()
  2305. ruleMap := e.CidRuleMap[classid]
  2306. lock.Unlock()
  2307. for _, r := range ruleMap { //rule
  2308. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2309. s_name := qu.ObjToString(r["s_name"])
  2310. rule := r["rule"].([]interface{})
  2311. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2312. if textAfterRulePrerule != "" {
  2313. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2314. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2315. if savefield != "" { //保存字段不为空,存储代码信息
  2316. SMap.AddKey(field+"_"+savefield, s_name)
  2317. }
  2318. break L
  2319. }
  2320. }
  2321. }
  2322. }
  2323. }
  2324. }
  2325. }
  2326. return SMap
  2327. }
  2328. //中标候选人经过清理之后,重新取出赋值
  2329. func resetWinnerorder(j *ju.Job) {
  2330. if len(j.Winnerorder) == 0 {
  2331. return
  2332. }
  2333. maxlen := len(j.Winnerorder) - 1
  2334. //中标单位
  2335. //i := 0
  2336. winners := []*ju.ExtField{}
  2337. bidamounts := []*ju.ExtField{}
  2338. if maxlen > 0 {
  2339. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2340. if j.Winnerorder[0]["price"] != nil {
  2341. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2342. if tmpPrice[len(tmpPrice)-1].(bool) {
  2343. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5})
  2344. }
  2345. }
  2346. }
  2347. if j.Result["winner"] == nil && len(winners) > 0 {
  2348. j.Result["winner"] = winners
  2349. } else if len(winners) > 0 {
  2350. j.Result["winner"] = append(j.Result["winner"], winners...)
  2351. }
  2352. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2353. j.Result["bidamount"] = bidamounts
  2354. } else if len(bidamounts) > 0 {
  2355. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2356. }
  2357. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  2358. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2359. j.Result["winner"] = winners
  2360. if j.Winnerorder[0]["price"] != nil {
  2361. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2362. if tmpPrice[len(tmpPrice)-1].(bool) {
  2363. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2364. }
  2365. j.Result["bidamount"] = bidamounts
  2366. }
  2367. }
  2368. }
  2369. func RemoveReplicaSliceString(slc []string) []string {
  2370. result := make([]string, 0)
  2371. tempMap := make(map[string]bool, len(slc))
  2372. for _, e := range slc {
  2373. if tempMap[e] == false {
  2374. tempMap[e] = true
  2375. result = append(result, e)
  2376. }
  2377. }
  2378. return result
  2379. }