extract.go 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. "github.com/PuerkitoBio/goquery"
  20. log "github.com/donnie4w/go-logger/logger"
  21. "gopkg.in/mgo.v2/bson"
  22. )
  23. var (
  24. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  25. cut = ju.NewCut() //获取正文并清理
  26. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  27. TaskList map[string]*ExtractTask //任务列表
  28. ClearTaskList map[string]*ClearTask //清理任务列表
  29. saveLimit = 100 //抽取日志批量保存
  30. PageSize = 5000 //查询分页
  31. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
  32. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  33. )
  34. //启动测试抽取
  35. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  36. defer qu.Catch()
  37. ext := &ExtractTask{}
  38. ext.Id = taskId
  39. ext.IsRun = true
  40. ext.InitTestTaskInfo(resultcoll, trackcoll)
  41. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  42. ext.InitSite()
  43. ext.InitRulePres()
  44. ext.InitRuleBacks(false)
  45. ext.InitRuleBacks(true)
  46. ext.InitRuleCore(false)
  47. ext.InitRuleCore(true)
  48. ext.InitPkgCore()
  49. ext.InitBlockRule()
  50. ext.InfoTypeList()
  51. ext.InitTag(false)
  52. ext.InitTag(true)
  53. ext.InitClearFn(false)
  54. ext.InitClearFn(true)
  55. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  56. //初始化城市DFA信息
  57. ext.InitCityInfo()
  58. //ext.InitCityDFA()
  59. ext.InitAreaCode()
  60. ext.InitPostCode()
  61. }
  62. //质量审核
  63. ext.InitAuditFields()
  64. ext.InitAuditRule()
  65. ext.InitAuditClass()
  66. ext.InitAuditRecogField()
  67. //品牌抽取是否开启
  68. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  69. //价格个数抽取是否开启
  70. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  71. //附件抽取是否开启
  72. ext.InitFile()
  73. return RunExtractTestTask(ext, startId, num)
  74. }
  75. func IdTrans(startId string) bson.ObjectId {
  76. defer qu.Catch()
  77. return bson.ObjectIdHex(startId)
  78. }
  79. //开始测试任务抽取
  80. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  81. n, _ := strconv.Atoi(num)
  82. id := IdTrans(startId)
  83. if id.Valid() {
  84. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  85. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  86. for _, v := range *list {
  87. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  88. continue
  89. }
  90. var j, jf *ju.Job
  91. var isSite bool
  92. if ext.IsFileField && v["projectinfo"] != nil {
  93. v["isextFile"] = true
  94. j, jf, isSite = ext.PreInfo(v)
  95. } else {
  96. j, _, isSite = ext.PreInfo(v)
  97. }
  98. go ext.ExtractProcess(j, jf, isSite)
  99. ext.TaskInfo.ProcessPool <- true
  100. }
  101. return true
  102. } else {
  103. return false
  104. }
  105. }
  106. //启动抽取
  107. func StartExtractTaskId(taskId string) bool {
  108. defer qu.Catch()
  109. isgo := false
  110. ext := TaskList[taskId]
  111. if ext == nil {
  112. ext = &ExtractTask{}
  113. ext.Id = taskId
  114. ext.InitTaskInfo()
  115. isgo = true
  116. } else {
  117. ext.Id = taskId
  118. ext.InitTaskInfo()
  119. }
  120. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  121. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  122. ext.InitSite()
  123. ext.InitRulePres()
  124. ext.InitRuleBacks(false)
  125. ext.InitRuleBacks(true)
  126. ext.InitRuleCore(false)
  127. ext.InitRuleCore(true)
  128. ext.InitPkgCore()
  129. ext.InitBlockRule()
  130. ext.InfoTypeList()
  131. ext.InitTag(false)
  132. ext.InitTag(true)
  133. ext.InitClearFn(false)
  134. ext.InitClearFn(true)
  135. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  136. //初始化城市DFA信息
  137. //ext.InitCityDFA()
  138. ext.InitCityInfo()
  139. ext.InitAreaCode()
  140. ext.InitPostCode()
  141. }
  142. //质量审核
  143. ext.InitAuditFields()
  144. ext.InitAuditRule()
  145. ext.InitAuditClass()
  146. ext.InitAuditRecogField()
  147. //品牌抽取是否开启
  148. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  149. //价格个数抽取是否开启
  150. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  151. //附件抽取是否开启
  152. ext.InitFile()
  153. ext.IsRun = true
  154. go ext.ResultSave(true)
  155. go ext.BidSave(true)
  156. if isgo {
  157. go RunExtractTask(taskId)
  158. }
  159. TaskList[taskId] = ext
  160. return true
  161. }
  162. //停止抽取
  163. func StopExtractTaskId(taskId string) bool {
  164. defer qu.Catch()
  165. ext := TaskList[taskId]
  166. if ext != nil {
  167. ext.IsRun = false
  168. TaskList[taskId] = ext
  169. }
  170. //更新task.s_extlastid
  171. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  172. return true
  173. }
  174. //开始抽取
  175. func RunExtractTask(taskId string) {
  176. defer qu.Catch()
  177. ext := TaskList[taskId]
  178. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  179. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  180. pageNum := (count + PageSize - 1) / PageSize
  181. limit := PageSize
  182. if count < PageSize {
  183. limit = count
  184. }
  185. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  186. for i := 0; i < pageNum; i++ {
  187. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  188. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  189. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  190. for _, v := range *list {
  191. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  192. continue
  193. }
  194. //根据标题判断是否抽取
  195. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  196. if !b {
  197. continue
  198. }
  199. _id := qu.BsonIdToSId(v["_id"])
  200. //log.Debug(_id)
  201. if !ext.IsRun {
  202. break
  203. }
  204. var j, jf *ju.Job
  205. var isSite bool
  206. if ext.IsFileField && v["projectinfo"] != nil {
  207. v["isextFile"] = true
  208. j, jf, isSite = ext.PreInfo(v)
  209. } else {
  210. j, _, isSite = ext.PreInfo(v)
  211. }
  212. go ext.ExtractProcess(j, jf, isSite)
  213. ext.TaskInfo.LastExtId = _id
  214. ext.TaskInfo.ProcessPool <- true
  215. }
  216. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  217. if !ext.IsRun {
  218. break
  219. }
  220. }
  221. //更新task.s_extlastid
  222. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  223. }
  224. //信息预处理-不和版本关联,取最新版本的配置项
  225. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  226. return (&ExtractTask{}).PreInfo(doc)
  227. }
  228. //信息预处理-和版本关联
  229. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  230. defer qu.Catch()
  231. //判断是否有附件这个字段
  232. var isextFile bool
  233. if doc["isextFile"] != nil {
  234. isextFile = doc["isextFile"].(bool)
  235. }
  236. detail := ""
  237. d1, _ := doc["detail"].(string)
  238. d2, _ := doc["contenthtml"].(string)
  239. if len(d1) >= len(d2) || d2 == "" {
  240. detail = d1
  241. } else {
  242. detail = d2
  243. }
  244. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  245. d3, _ := doc["summary"].(string)
  246. //全文的需要修复表格
  247. detail = pretreated.RepairCon(detail)
  248. detail = ju.CutLableStr(d3 + "\n" + detail)
  249. detail = cut.ClearHtml(d3 + "\n" + detail)
  250. doc["detail"] = detail
  251. if isextFile {
  252. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  253. }
  254. //正文小于200个字,有附件把附件内容加到正文
  255. tmpDeatil := detail
  256. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  257. if err == nil {
  258. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  259. if conlen < 200 {
  260. if isextFile {
  261. detail += qu.ObjToString(doc["detailfile"])
  262. doc["detail"] = detail
  263. }
  264. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  265. //防止文本过长,造成抽取阻塞
  266. log.Debug("文本太长", doc["_id"], conlen)
  267. doc["detail"] = d3
  268. }
  269. }
  270. toptype := qu.ObjToString(doc["toptype"])
  271. subtype := qu.ObjToString(doc["subtype"])
  272. if qu.ObjToString(doc["type"]) == "bid" {
  273. toptype = "结果"
  274. }
  275. if toptype == "" {
  276. toptype = "all"
  277. }
  278. if subtype == "" {
  279. subtype = "all"
  280. }
  281. if toptype == "其它" || subtype == "其它" || subtype == "其他" || subtype == "结果变更" {
  282. toptype = "all"
  283. subtype = "all"
  284. }
  285. toMap := qu.ObjToMap(doc["jsondata"])
  286. //log.Debug("toMap", toMap)
  287. if (*toMap) != nil {
  288. if (*toMap)["extweight"] == nil {
  289. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  290. }
  291. }
  292. j = &ju.Job{
  293. SourceMid: qu.BsonIdToSId(doc["_id"]),
  294. Category: toptype,
  295. CategorySecond: subtype,
  296. Content: qu.ObjToString(doc["detail"]),
  297. SpiderCode: qu.ObjToString(doc["spidercode"]),
  298. Site: qu.ObjToString(doc["site"]),
  299. //Domain: qu.ObjToString(doc["domain"]),
  300. //Href: qu.ObjToString(doc["href"]),
  301. Title: qu.ObjToString(doc["title"]),
  302. Data: &doc,
  303. City: qu.ObjToString(doc["city"]),
  304. Province: qu.ObjToString(doc["area"]),
  305. Jsondata: toMap,
  306. Result: map[string][]*ju.ExtField{},
  307. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  308. RuleBlock: e.RuleBlock,
  309. }
  310. if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
  311. delete((*j.Jsondata), "jsoncontent")
  312. }
  313. if isextFile {
  314. jf = &ju.Job{
  315. SourceMid: qu.BsonIdToSId(doc["_id"]),
  316. Category: toptype,
  317. Content: qu.ObjToString(doc["detailfile"]),
  318. SpiderCode: qu.ObjToString(doc["spidercode"]),
  319. Site: qu.ObjToString(doc["site"]),
  320. Title: qu.ObjToString(doc["title"]),
  321. Data: &doc,
  322. City: qu.ObjToString(doc["city"]),
  323. Province: qu.ObjToString(doc["area"]),
  324. Jsondata: toMap,
  325. Result: map[string][]*ju.ExtField{},
  326. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  327. RuleBlock: e.RuleBlock,
  328. IsFile: isextFile,
  329. }
  330. if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
  331. delete((*jf.Jsondata), "jsoncontent")
  332. }
  333. }
  334. codeSite := j.SpiderCode
  335. //是否启用站点
  336. if value, ok := e.SiteMerge.Load(codeSite); ok {
  337. isSite = value.(bool)
  338. }
  339. if isSite {
  340. //是否配置站点
  341. exp, isSite := e.Luacodes.Load(codeSite)
  342. if isSite {
  343. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  344. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  345. }
  346. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  347. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  348. }
  349. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  350. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  351. }
  352. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  353. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  354. }
  355. }
  356. }
  357. qu.Try(func() {
  358. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  359. if isextFile {
  360. pretreated.AnalyStart(jf, isSite, codeSite)
  361. }
  362. }, func(err interface{}) {
  363. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  364. })
  365. return j, jf, isSite
  366. }
  367. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  368. func file2text(doc *map[string]interface{}) {
  369. var strfileinfo bytes.Buffer
  370. if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
  371. if va, ok := v["attachments"].(map[string]interface{}); ok {
  372. for _, vaatt := range va {
  373. if fileinfo, ok := vaatt.(map[string]interface{}); ok {
  374. if qu.ObjToString(fileinfo["content"]) != "" {
  375. switch fileinfo["content"].(type) {
  376. case string:
  377. lock.Lock()
  378. strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
  379. lock.Unlock()
  380. case []map[string]interface{}:
  381. for _, fv := range fileinfo["content"].([]map[string]interface{}) {
  382. if fv["context"] != nil {
  383. lock.Lock()
  384. strfileinfo.WriteString(fv["context"].(string) + " \n")
  385. lock.Unlock()
  386. }
  387. }
  388. }
  389. }
  390. }
  391. }
  392. }
  393. }
  394. if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  395. (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
  396. }
  397. }
  398. //抽取
  399. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  400. e.ExtractDetail(j, isSite, j.SpiderCode)
  401. if jf != nil && jf.IsFile {
  402. e.ExtractFile(jf, isSite, j.SpiderCode)
  403. }
  404. if isSite {
  405. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  406. if ok && ismerge.(bool) {
  407. tmpj := &ju.Job{
  408. SourceMid: j.SourceMid,
  409. Category: j.Category,
  410. CategorySecond: j.CategorySecond,
  411. Content: j.Content,
  412. SpiderCode: j.SpiderCode,
  413. //Domain: qu.ObjToString(doc["domain"]),
  414. //Href: qu.ObjToString(doc["href"]),
  415. Title: j.Title,
  416. Data: j.Data,
  417. City: j.City,
  418. Province: j.Province,
  419. Jsondata: j.Jsondata,
  420. Result: map[string][]*ju.ExtField{},
  421. BuyerAddr: j.BuyerAddr,
  422. RuleBlock: e.RuleBlock,
  423. }
  424. qu.Try(func() {
  425. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  426. }, func(err interface{}) {
  427. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  428. })
  429. e.ExtractDetail(tmpj, false, "")
  430. //if jf != nil && jf.IsFile {
  431. // e.ExtractFile(jf, false, "")
  432. //}
  433. //合并数据
  434. j.Block = append(j.Block, tmpj.Block...)
  435. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  436. for tmpk, _ := range j.Result {
  437. if len(tmpj.Result[tmpk]) > 0 {
  438. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  439. }
  440. }
  441. for tmpk, _ := range tmpj.Result {
  442. if len(j.Result[tmpk]) == 0 {
  443. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  444. }
  445. }
  446. }
  447. }
  448. //分析抽取结果并保存
  449. AnalysisSaveResult(j, jf, e)
  450. <-e.TaskInfo.ProcessPool
  451. }
  452. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  453. qu.Try(func() {
  454. doc := *j.Data
  455. //全局前置规则,结果覆盖doc属性
  456. //for _, v := range e.RulePres {
  457. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  458. //}
  459. tmprules := map[string][]*RuleCore{}
  460. lockrule.Lock()
  461. if j.Category == "all" || j.CategorySecond == "all" {
  462. if isSite {
  463. for k, vc1 := range e.SiteRuleCores["all_all"] {
  464. tmprules[k] = vc1
  465. }
  466. } else {
  467. for k, vc1 := range e.RuleCores["all_all"] {
  468. tmprules[k] = vc1
  469. }
  470. }
  471. } else {
  472. if isSite {
  473. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  474. tmprules[k] = vc1
  475. }
  476. } else {
  477. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  478. tmprules[k] = vc1
  479. }
  480. }
  481. }
  482. if len(tmprules) < 1 { //分类未覆盖部分
  483. if isSite {
  484. for k, vc1 := range e.RuleCores["all_all"] {
  485. tmprules[k] = vc1
  486. }
  487. } else {
  488. for k, vc1 := range e.SiteRuleCores["all_all"] {
  489. tmprules[k] = vc1
  490. }
  491. }
  492. }
  493. lockrule.Unlock()
  494. //抽取规则
  495. for _, vc1 := range tmprules {
  496. for _, vc := range vc1 {
  497. tmp := ju.DeepCopy(doc).(map[string]interface{})
  498. //是否进入逻辑
  499. if !ju.Logic(vc.LuaLogic, tmp) {
  500. continue
  501. }
  502. ////抽取-前置规则
  503. //for _, v := range vc.RulePres {
  504. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  505. //}
  506. // log.Debug("抽取-前置规则", tmp)
  507. //抽取-规则
  508. ExtRuleCore(tmp, e, vc, j, isSite)
  509. // log.Debug("抽取-规则", tmp)
  510. //抽取-后置规则
  511. for _, v := range vc.RuleBacks {
  512. ExtRegBack(j, v, e.TaskInfo)
  513. }
  514. //kv规则
  515. for _, v := range vc.KVRuleCores {
  516. ExtRuleKV(j, v, e.TaskInfo)
  517. }
  518. // log.Debug("抽取-后置规则", tmp)
  519. //项目名称未能抽取到,标题来凑
  520. if vc.Field == "projectname" {
  521. if vc.ExtFrom == "title" {
  522. isextitle := true
  523. for _, v := range j.Result[vc.Field] {
  524. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  525. isextitle = false
  526. break
  527. }
  528. }
  529. if isextitle { //标题加入选举
  530. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  531. if isSite {
  532. field.Score = 1
  533. }
  534. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  535. }
  536. }
  537. for i := 0; i < 3; i++ {
  538. for _, v := range vc.RuleBacks {
  539. ExtRegBack(j, v, e.TaskInfo)
  540. }
  541. }
  542. }
  543. }
  544. }
  545. //全局后置规则
  546. if isSite {
  547. for _, v := range e.SiteRuleBacks {
  548. ExtRegBack(j, v, e.TaskInfo)
  549. }
  550. } else {
  551. for _, v := range e.RuleBacks {
  552. ExtRegBack(j, v, e.TaskInfo)
  553. }
  554. }
  555. //函数清理
  556. for key, val := range j.Result {
  557. for i, v := range val {
  558. // if v.ExtFrom == "title"&& v.Field == "buyer"{
  559. // qu.Debug("title---",v.Value)
  560. // }else if v.Field == "buyer"{
  561. // qu.Debug("text---",v.Value)
  562. // }
  563. lockclear.Lock()
  564. var cfn = []string{}
  565. if isSite {
  566. cfn = e.SiteClearFn[key]
  567. } else {
  568. cfn = e.ClearFn[key]
  569. }
  570. lockclear.Unlock()
  571. if len(cfn) == 0 {
  572. continue
  573. }
  574. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  575. if key == "budget" || key == "bidamount" {
  576. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  577. j.Result[key][i].IsTrue = true
  578. } else {
  579. j.Result[key][i].Value = data[0]
  580. continue
  581. }
  582. }
  583. before, _ := v.Value.(string)
  584. v.Value = data[0]
  585. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  586. //添加行数清理的日志
  587. //清理特殊符号
  588. lockclear.Lock()
  589. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  590. text := qu.ObjToString(v.Value)
  591. before = text
  592. v.Value = clear.OtherClean(key, text)
  593. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  594. }
  595. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  596. lockclear.Unlock()
  597. }
  598. }
  599. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  600. // bs, _ := json.Marshal(j.Result)
  601. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  602. }, func(err interface{}) {
  603. log.Debug("ExtractProcess err", err)
  604. })
  605. }
  606. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  607. qu.Try(func() {
  608. doc := *j.Data
  609. //全局前置规则,结果覆盖doc属性
  610. // for _, v := range e.RulePres {
  611. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  612. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  613. // }
  614. // }
  615. //抽取规则
  616. tmprules := map[string][]*RuleCore{}
  617. lockrule.Lock()
  618. if j.Category == "all" || j.CategorySecond == "all" {
  619. for k, vc1 := range e.RuleCores["all_all"] {
  620. tmprules[k] = vc1
  621. }
  622. } else {
  623. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  624. tmprules[k] = vc1
  625. }
  626. }
  627. lockrule.Unlock()
  628. for _, vc1 := range tmprules {
  629. for _, vc := range vc1 {
  630. tmp := ju.DeepCopy(doc).(map[string]interface{})
  631. //是否进入逻辑
  632. if !ju.Logic(vc.LuaLogic, tmp) {
  633. continue
  634. }
  635. //抽取-前置规则
  636. // for _, v := range vc.RulePres {
  637. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  638. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  639. // }
  640. // }
  641. // log.Debug("抽取-前置规则", tmp)
  642. //抽取-规则
  643. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  644. ExtRuleCore(tmp, e, vc, j, isSite)
  645. }
  646. // log.Debug("抽取-规则", tmp)
  647. //抽取-后置规则
  648. for _, v := range vc.RuleBacks {
  649. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  650. ExtRegBack(j, v, e.TaskInfo)
  651. }
  652. }
  653. // log.Debug("抽取-后置规则", tmp)
  654. }
  655. }
  656. //全局后置规则
  657. for _, v := range e.RuleBacks {
  658. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  659. ExtRegBack(j, v, e.TaskInfo)
  660. }
  661. }
  662. //函数清理
  663. for key, val := range j.Result {
  664. for _, v := range val {
  665. lockclear.Lock()
  666. cfn := e.ClearFn[key]
  667. lockclear.Unlock()
  668. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  669. v.Value = data[0]
  670. //清理特殊符号
  671. lockclear.Lock()
  672. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  673. clear.MesField[key] != nil {
  674. text := qu.ObjToString(v.Value)
  675. text = clear.OtherClean(key, text)
  676. v.Value = text
  677. }
  678. lockclear.Unlock()
  679. }
  680. }
  681. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  682. // bs, _ := json.Marshal(j.Result)
  683. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  684. }, func(err interface{}) {
  685. log.Debug("ExtractProcess err", err)
  686. })
  687. }
  688. //前置过滤
  689. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  690. defer qu.Catch()
  691. before := ju.DeepCopy(doc).(map[string]interface{})
  692. extinfo := map[string]interface{}{}
  693. if in.IsLua {
  694. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  695. if j != nil {
  696. lua.Block = j.Block
  697. }
  698. extinfo = lua.RunScript("pre")
  699. for k, v := range extinfo { //结果覆盖原doc
  700. doc[k] = v
  701. }
  702. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  703. } else {
  704. var key string
  705. if !j.IsFile {
  706. key = qu.If(in.Field == "", "detail", in.Field).(string)
  707. } else {
  708. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  709. }
  710. text := qu.ObjToString(doc[key])
  711. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  712. doc[key] = extinfo[key] //结果覆盖原doc
  713. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  714. }
  715. return doc
  716. }
  717. //抽取-规则
  718. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  719. //候选人加入
  720. var kvMap map[string][]map[string]interface{}
  721. extByReg := true
  722. if vc.ExtFrom != "title" {
  723. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  724. }
  725. for _, v := range vc.RuleCores {
  726. if v.IsLua {
  727. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  728. } else if extByReg {
  729. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  730. }
  731. }
  732. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  733. if vc.Field == "budget" && len(kvMap) == 0 {
  734. if len(j.BlockPackage) == 1 {
  735. for _, bp := range j.BlockPackage {
  736. for fieldname, field := range vc.LFields {
  737. if field != vc.Field {
  738. continue
  739. }
  740. tp := ""
  741. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  742. if k == 0 {
  743. tp = "colon"
  744. } else if k == 1 {
  745. tp = "space"
  746. } else if k == 2 {
  747. tp = "table"
  748. }
  749. if v == nil || v.KvTags == nil {
  750. continue
  751. }
  752. for _, vv := range v.KvTags[fieldname] {
  753. text := ju.TrimLRSpace(vv.Value, "")
  754. if text != "" {
  755. tmp := &ju.ExtField{
  756. ExtFrom: "package",
  757. Field: vc.Field,
  758. Code: "CL_分包",
  759. Type: tp,
  760. MatchType: "package",
  761. RuleText: bp.Text,
  762. SourceValue: vv.Key,
  763. Value: text,
  764. }
  765. if isSite {
  766. tmp.Score = 1
  767. }
  768. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  769. }
  770. }
  771. }
  772. }
  773. break
  774. }
  775. }
  776. } else {
  777. for k, v := range kvMap {
  778. if j.Result[k] == nil {
  779. j.Result[k] = [](*ju.ExtField){}
  780. }
  781. for _, tmp := range v {
  782. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  783. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  784. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  785. MatchType: qu.ObjToString(tmp["matchtype"]),
  786. RuleText: qu.ObjToString(tmp["ruletext"]),
  787. SourceValue: tmp["sourcevalue"],
  788. Value: tmp["value"]}
  789. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  790. field.Score = 1
  791. }
  792. if isSite {
  793. field.Score = 1
  794. }
  795. if tmp["blocktag"] != nil {
  796. btag := make(map[string]string)
  797. for k := range tmp["blocktag"].(map[string]bool) {
  798. blocktag.Lock()
  799. if TagConfigDesc[k] != "" {
  800. btag[k] = TagConfigDesc[k]
  801. }
  802. blocktag.Unlock()
  803. }
  804. field.BlockTag = btag
  805. }
  806. j.Result[k] = append(j.Result[k], field)
  807. }
  808. }
  809. }
  810. }
  811. //抽取-规则-kv
  812. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  813. defer qu.Catch()
  814. if extfrom == "title" || !in.IsLua {
  815. return
  816. }
  817. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  818. lua.KvMap = *kvMap
  819. lua.Block = j.Block
  820. extinfo := lua.RunScript("core")
  821. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  822. for _, v := range tmps {
  823. v["core"] = in.Code
  824. }
  825. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  826. }
  827. if len(extinfo) > 0 {
  828. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  829. }
  830. }
  831. //抽取-规则-正则
  832. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  833. defer qu.Catch()
  834. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  835. b := IsExtract(in.Field, j.Title, j.Content)
  836. if !b {
  837. return
  838. }
  839. //全文正则
  840. //text := qu.ObjToString(doc[extfrom])
  841. //if in.Field != "" {
  842. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  843. // if len(extinfo) > 0 {
  844. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  845. // }
  846. //}
  847. //块抽取
  848. if in.Field != "" {
  849. if extfrom == "title" {
  850. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  851. if len(extinfo) > 0 {
  852. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  853. }
  854. } else {
  855. for _, v := range j.Block {
  856. btag := make(map[string]string)
  857. for k := range v.Classify {
  858. blocktag.Lock()
  859. btag[k] = TagConfigDesc[k]
  860. blocktag.Unlock()
  861. }
  862. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  863. if len(extinfo) > 0 {
  864. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  865. }
  866. }
  867. }
  868. }
  869. }
  870. //pkg抽取-规则-正则
  871. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  872. defer qu.Catch()
  873. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  874. b := IsExtract(in.Field, j.Title, j.Content)
  875. if !b {
  876. return
  877. }
  878. //块抽取
  879. if in.Field != "" {
  880. for k, vbpkg := range j.BlockPackage {
  881. rep := map[string]string{}
  882. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  883. if in.Field == "budget" && vbpkg.Budget > 0 {
  884. continue
  885. }
  886. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  887. continue
  888. }
  889. if in.Field == "winner" && vbpkg.Winner != "" {
  890. continue
  891. }
  892. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  893. continue
  894. }
  895. if in.Field == "projectname" && vbpkg.Name != "" {
  896. continue
  897. }
  898. if in.Field == "winner" && vbpkg.Winner != "" {
  899. continue
  900. }
  901. if in.Field == "winnerperson" {
  902. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  903. continue
  904. }
  905. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  906. continue
  907. }
  908. }
  909. if in.Field == "winnertel" {
  910. if vbpkg.WinnerPerson == "" {
  911. continue
  912. }
  913. }
  914. //处理正负数修正
  915. ptmp := strings.Split(in.RuleText, "#")
  916. sign := 0
  917. if len(ptmp) == 2 {
  918. if ptmp[1] == "正" {
  919. sign = 1
  920. } else if ptmp[1] == "负" {
  921. sign = -1
  922. }
  923. }
  924. tmp := strings.Split(ptmp[0], "__")
  925. if len(tmp) == 2 {
  926. epos := strings.Split(tmp[1], ",")
  927. posm := map[string]int{}
  928. for _, v := range epos {
  929. ks := strings.Split(v, ":")
  930. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  931. posm[ks[1]] = qu.IntAll(ks[0])
  932. } else {
  933. posm[in.Field] = qu.IntAll(ks[0])
  934. }
  935. }
  936. var pattern string
  937. if strings.Contains(tmp[0], "\\u") {
  938. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  939. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  940. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  941. } else {
  942. pattern = tmp[0]
  943. }
  944. //log.Debug("pattern", pattern)
  945. //fmt.Println(text)
  946. reg := regexp.MustCompile(pattern)
  947. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  948. for i, _ := range apos {
  949. pos := apos[i]
  950. for k, p := range posm {
  951. if len(pos) > p {
  952. if pos[p] == -1 || pos[p+1] == -1 {
  953. continue
  954. }
  955. val := vbpkg.Text[pos[p]:pos[p+1]]
  956. if string(val) == "" {
  957. continue
  958. }
  959. if sign == -1 {
  960. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  961. } else {
  962. rep[k+"_"+fmt.Sprint(i)] = val
  963. }
  964. }
  965. }
  966. }
  967. //fmt.Println(text)
  968. for i := 0; i < len(apos); i++ {
  969. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  970. if in.Field == "budget" && vbpkg.Budget <= 0 {
  971. lock.Lock()
  972. cfn := e.ClearFn[in.Field]
  973. lock.Unlock()
  974. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  975. if data[len(data)-1].(bool) {
  976. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  977. j.BlockPackage[k].IsTrueBudget = true
  978. }
  979. break
  980. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  981. lock.Lock()
  982. cfn := e.ClearFn[in.Field]
  983. lock.Unlock()
  984. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  985. if data[len(data)-1].(bool) {
  986. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  987. j.BlockPackage[k].IsTrueBidamount = true
  988. }
  989. break
  990. } else if in.Field == "winner" {
  991. if j.BlockPackage[k].Winner == "" {
  992. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  993. break
  994. }
  995. } else if in.Field == "winnertel" {
  996. if j.BlockPackage[k].WinnerTel == "" {
  997. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  998. break
  999. }
  1000. } else if in.Field == "winnerperson" {
  1001. if j.BlockPackage[k].WinnerPerson == "" {
  1002. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1003. break
  1004. }
  1005. } else if in.Field == "bidstatus" {
  1006. if j.BlockPackage[k].BidStatus == "" {
  1007. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1008. break
  1009. }
  1010. } else if in.Field == "projectname" {
  1011. if j.BlockPackage[k].Name == "" {
  1012. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1013. break
  1014. }
  1015. } else if in.Field == "winnerperson" {
  1016. if j.BlockPackage[k].WinnerPerson == "" {
  1017. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1018. break
  1019. }
  1020. } else if in.Field == "winnertel" {
  1021. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1022. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1023. break
  1024. }
  1025. }
  1026. }
  1027. }
  1028. }
  1029. } else {
  1030. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1031. val := ""
  1032. if len(pos) == 2 {
  1033. //"text" = "text"[pos[1]:]
  1034. val = "text"[pos[1]:]
  1035. rs := regexp.MustCompile("[^\r\n\t]+")
  1036. tmp := rs.FindAllString("text", -1)
  1037. if len(tmp) > 0 {
  1038. val = tmp[0]
  1039. }
  1040. }
  1041. if val != "" {
  1042. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1043. lock.Lock()
  1044. cfn := e.ClearFn[in.Field]
  1045. lock.Unlock()
  1046. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1047. if data[len(data)-1].(bool) {
  1048. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1049. j.BlockPackage[k].IsTrueBudget = true
  1050. }
  1051. break
  1052. }
  1053. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1054. lock.Lock()
  1055. cfn := e.ClearFn[in.Field]
  1056. lock.Unlock()
  1057. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1058. if data[len(data)-1].(bool) {
  1059. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1060. j.BlockPackage[k].IsTrueBidamount = true
  1061. }
  1062. break
  1063. } else if in.Field == "bidstatus" {
  1064. if j.BlockPackage[k].BidStatus == "" {
  1065. j.BlockPackage[k].BidStatus = val
  1066. break
  1067. }
  1068. } else if in.Field == "projectname" {
  1069. if j.BlockPackage[k].Name == "" {
  1070. j.BlockPackage[k].Name = val
  1071. break
  1072. }
  1073. }
  1074. }
  1075. }
  1076. }
  1077. }
  1078. }
  1079. //lua脚本根据属性设置提取kv值
  1080. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1081. kvmap := map[string][]map[string]interface{}{}
  1082. if len(j.Winnerorder) > 1 {
  1083. if vc.Field == "bidamount" {
  1084. for _, v := range j.Winnerorder {
  1085. if v["price"] == nil {
  1086. continue
  1087. }
  1088. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1089. "code": "winnerorder",
  1090. "field": vc.Field,
  1091. "ruletext": "中标候选人_" + v["sortstr"].(string),
  1092. "extfrom": v["sortstr"],
  1093. "sourcevalue": v["price"],
  1094. "value": v["price"],
  1095. "type": "winnerorder",
  1096. "matchtype": "winnerorder",
  1097. })
  1098. return kvmap, false
  1099. }
  1100. //候选人中标金额
  1101. if price := j.Winnerorder[0]["price"]; price != nil {
  1102. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1103. "code": "CL_中标候选人",
  1104. "field": vc.Field,
  1105. "ruletext": "中标候选人",
  1106. "extfrom": j.Winnerorder[0]["sortstr"],
  1107. "sourcevalue": price,
  1108. "value": price,
  1109. "type": "winnerorder",
  1110. "matchtype": "winnerorder",
  1111. })
  1112. return kvmap, false
  1113. }
  1114. }
  1115. //else if vc.Field == "winner" {
  1116. // for _, v := range j.Winnerorder {
  1117. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1118. // "code": "winnerorder",
  1119. // "field": vc.Field,
  1120. // "ruletext": "中标候选人",
  1121. // "extfrom": vc.ExtFrom,
  1122. // "sourcevalue": "中标候选人",
  1123. // "value": v["entname"],
  1124. // "type": "winnerorder",
  1125. // "matchtype": "winnerorder",
  1126. // })
  1127. // }
  1128. // //候选人中标单位
  1129. // if entname := j.Winnerorder[0]["entname"]; entname != nil {
  1130. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1131. // "code": "CL_中标候选人",
  1132. // "field": vc.Field,
  1133. // "ruletext": "中标候选人",
  1134. // "extfrom": vc.ExtFrom,
  1135. // "sourcevalue": "中标候选人",
  1136. // "value": entname,
  1137. // "type": "winnerorder",
  1138. // "matchtype": "winnerorder",
  1139. // })
  1140. // return kvmap, false
  1141. // }
  1142. //}
  1143. }
  1144. for fieldname, field := range vc.LFields {
  1145. if field != vc.Field {
  1146. continue
  1147. }
  1148. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1149. }
  1150. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1151. return kvmap, true
  1152. }
  1153. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1154. //qu.Debug("fieldname+++", fieldname)
  1155. for _, bl := range blocks {
  1156. tp := ""
  1157. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1158. if k == 0 {
  1159. tp = "colon"
  1160. // for _, vv := range v.Kvs {
  1161. // qu.Debug("colon-kvs:", vv.Key, vv.Value)
  1162. // }
  1163. // for kkk, vv := range v.KvTags {
  1164. // for _, vvv := range vv {
  1165. // qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
  1166. // }
  1167. // }
  1168. } else if k == 1 {
  1169. tp = "space"
  1170. // for _, vv := range v.Kvs {
  1171. // qu.Debug("space-kvs:", vv.Key, vv.Value)
  1172. // }
  1173. // for kkk, vv := range v.KvTags {
  1174. // for _, vvv := range vv {
  1175. // qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
  1176. // }
  1177. // }
  1178. } else if k == 2 {
  1179. tp = "table"
  1180. // for _, vv := range v.Kvs {
  1181. // qu.Debug("table-kvs:", vv.Key, vv.Value)
  1182. // }
  1183. // for kkk, vv := range v.KvTags {
  1184. // for _, vvv := range vv {
  1185. // qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
  1186. // }
  1187. // }
  1188. }
  1189. if v == nil || v.KvTags == nil {
  1190. continue
  1191. }
  1192. for _, vv := range v.KvTags[fieldname] {
  1193. text := ju.TrimLRSpace(vv.Value, "")
  1194. if text != "" {
  1195. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1196. "code": "CL_" + vv.Key,
  1197. "field": field,
  1198. "ruletext": vv.Key,
  1199. "extfrom": vc.ExtFrom,
  1200. "sourcevalue": text,
  1201. "value": text,
  1202. "type": tp,
  1203. "matchtype": "tag_string",
  1204. "blocktag": bl.Classify,
  1205. "weight": vv.Weight,
  1206. })
  1207. //if field != "winnertel" && field != "winnerperson" {
  1208. // //break //暂定取第一个
  1209. //}
  1210. }
  1211. }
  1212. }
  1213. if len(kvmap[field]) == 0 {
  1214. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1215. }
  1216. }
  1217. }
  1218. //正则提取结果
  1219. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1220. defer qu.Catch()
  1221. var score float64
  1222. score = vre.Score
  1223. if isSite {
  1224. score = score + 1.0
  1225. }
  1226. extinfo := map[string][]map[string]interface{}{}
  1227. rep := map[string]string{}
  1228. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1229. //处理正负数修正
  1230. ptmp := strings.Split(vre.RuleText, "#")
  1231. sign := 0
  1232. if len(ptmp) == 2 {
  1233. if ptmp[1] == "正" {
  1234. sign = 1
  1235. } else if ptmp[1] == "负" {
  1236. sign = -1
  1237. }
  1238. }
  1239. tmp := strings.Split(ptmp[0], "__")
  1240. if len(tmp) == 2 {
  1241. epos := strings.Split(tmp[1], ",")
  1242. posm := map[string]int{}
  1243. for _, v := range epos {
  1244. ks := strings.Split(v, ":")
  1245. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1246. posm[ks[1]] = qu.IntAll(ks[0])
  1247. } else {
  1248. posm[vre.Field] = qu.IntAll(ks[0])
  1249. }
  1250. }
  1251. var pattern string
  1252. if strings.Contains(tmp[0], "\\u") {
  1253. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1254. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1255. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1256. } else {
  1257. pattern = tmp[0]
  1258. }
  1259. //log.Debug("pattern", pattern)
  1260. //fmt.Println(text)
  1261. reg := regexp.MustCompile(pattern)
  1262. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1263. for i, _ := range apos {
  1264. pos := apos[i]
  1265. for k, p := range posm {
  1266. if len(pos) > p {
  1267. if pos[p] == -1 || pos[p+1] == -1 {
  1268. continue
  1269. }
  1270. val := text[pos[p]:pos[p+1]]
  1271. if string(val) == "" {
  1272. continue
  1273. }
  1274. if sign == -1 {
  1275. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1276. } else {
  1277. rep[k+"_"+fmt.Sprint(i)] = val
  1278. }
  1279. }
  1280. }
  1281. }
  1282. //fmt.Println(text)
  1283. tmps := []map[string]interface{}{}
  1284. for i := 0; i < len(apos); i++ {
  1285. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1286. tmp := map[string]interface{}{
  1287. "field": vre.Field,
  1288. "code": vre.Code,
  1289. "ruletext": vre.RuleText,
  1290. "extfrom": text,
  1291. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1292. "type": "regexp",
  1293. "matchtype": "regcontent",
  1294. "blocktag": *tag,
  1295. "score": score,
  1296. }
  1297. tmps = append(tmps, tmp)
  1298. exfield := ju.ExtField{
  1299. BlockTag: *tag,
  1300. Field: vre.Field,
  1301. Code: vre.Code,
  1302. RuleText: vre.RuleText,
  1303. Type: "regexp",
  1304. MatchType: "regcontent",
  1305. ExtFrom: extfrom,
  1306. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1307. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1308. Score: score}
  1309. if tmp["blocktag"] != nil {
  1310. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1311. }
  1312. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1313. }
  1314. }
  1315. if len(tmps) > 0 {
  1316. //fmt.Println(tmps)
  1317. extinfo[vre.Field] = tmps
  1318. }
  1319. }
  1320. } else {
  1321. pos := vre.RegCore.Reg.FindStringIndex(text)
  1322. val := ""
  1323. if len(pos) == 2 {
  1324. text = text[pos[1]:]
  1325. rs := regexp.MustCompile("[^\r\n\t]+")
  1326. tmp := rs.FindAllString(text, -1)
  1327. if len(tmp) > 0 {
  1328. val = tmp[0]
  1329. }
  1330. }
  1331. if val != "" {
  1332. tmps := []map[string]interface{}{}
  1333. tmp := map[string]interface{}{
  1334. "field": vre.Field,
  1335. "code": vre.Code,
  1336. "ruletext": vre.RuleText,
  1337. "extfrom": text,
  1338. "value": val,
  1339. "type": "regexp",
  1340. "matchtype": "regcontent",
  1341. "blocktag": *tag,
  1342. "score": score,
  1343. }
  1344. tmps = append(tmps, tmp)
  1345. extinfo[vre.Field] = tmps
  1346. if j.Result[vre.Field] == nil {
  1347. j.Result[vre.Field] = [](*ju.ExtField){}
  1348. }
  1349. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1350. Value: val,
  1351. Score: score}
  1352. if tmp["blocktag"] != nil {
  1353. field.BlockTag = tmp["blocktag"].(map[string]string)
  1354. }
  1355. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1356. }
  1357. }
  1358. return extinfo
  1359. }
  1360. //后置过滤
  1361. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1362. defer qu.Catch()
  1363. if in.IsLua {
  1364. result := GetResultMapForLua(j)
  1365. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1366. if j != nil {
  1367. lua.Block = j.Block
  1368. }
  1369. extinfo := lua.RunScript("back")
  1370. for k, v := range extinfo {
  1371. if tmps, ok := v.([]map[string]interface{}); ok {
  1372. j.Result[k] = [](*ju.ExtField){}
  1373. for _, tmp := range tmps {
  1374. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1375. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1376. Value: tmp["value"]}
  1377. if tmp["blocktag"] != nil {
  1378. field.BlockTag = tmp["blocktag"].(map[string]string)
  1379. }
  1380. j.Result[k] = append(j.Result[k], field)
  1381. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1382. }
  1383. }
  1384. }
  1385. if len(extinfo) > 0 {
  1386. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1387. }
  1388. } else {
  1389. extinfo := map[string]interface{}{}
  1390. if in.Field != "" {
  1391. if j.Result[in.Field] != nil {
  1392. tmp := j.Result[in.Field]
  1393. exts := []interface{}{}
  1394. for k, v := range tmp {
  1395. //table抽取到的数据不清理
  1396. // if v.Type == "table" && v.Field != "projectname" {
  1397. // continue
  1398. // }
  1399. text := qu.ObjToString(v.Value)
  1400. if text != "" {
  1401. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1402. }
  1403. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1404. continue
  1405. }
  1406. j.Result[in.Field][k].Value = text
  1407. exts = append(exts, map[string]interface{}{
  1408. "field": v.Field,
  1409. "code": v.Code,
  1410. "ruletext": v.RuleText,
  1411. "type": v.Type,
  1412. "matchtype": v.MatchType,
  1413. "extfrom": v.ExtFrom,
  1414. "value": text,
  1415. })
  1416. }
  1417. if len(exts) > 0 {
  1418. extinfo[in.Field] = exts
  1419. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1420. }
  1421. }
  1422. } else {
  1423. for key, tmp := range j.Result {
  1424. exts := []interface{}{}
  1425. for k, v := range tmp {
  1426. if v.Type == "table" { //table抽取到的数据不清理
  1427. continue
  1428. }
  1429. text := qu.ObjToString(v.Value)
  1430. if text != "" {
  1431. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1432. }
  1433. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1434. continue
  1435. }
  1436. j.Result[key][k].Value = text
  1437. exts = append(exts, map[string]interface{}{
  1438. "field": v.Field,
  1439. "code": v.Code,
  1440. "ruletext": v.RuleText,
  1441. "type": v.Type,
  1442. "matchtype": v.MatchType,
  1443. "extfrom": v.ExtFrom,
  1444. "value": text,
  1445. })
  1446. }
  1447. if len(exts) > 0 {
  1448. extinfo[key] = exts
  1449. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1450. }
  1451. }
  1452. }
  1453. }
  1454. }
  1455. //后置过滤
  1456. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1457. defer qu.Catch()
  1458. for k, v := range j.BlockPackage {
  1459. if in.Field == "winner" {
  1460. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1461. } else if in.Field == "bidstatus" {
  1462. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1463. } else if in.Field == "" {
  1464. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1465. } else if in.Field == "projectname" {
  1466. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1467. } else if in.Field == "winnerperson" {
  1468. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1469. } else if in.Field == "winnertel" {
  1470. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1471. }
  1472. }
  1473. }
  1474. //KV过滤
  1475. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1476. defer qu.Catch()
  1477. extinfo := map[string]interface{}{}
  1478. if in.Field != "" {
  1479. if j.Result[in.Field] != nil {
  1480. tmp := j.Result[in.Field]
  1481. exts := []interface{}{}
  1482. for k, v := range tmp {
  1483. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1484. continue
  1485. }
  1486. text := qu.ObjToString(v.Value)
  1487. if text != "" {
  1488. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1489. }
  1490. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1491. continue
  1492. }
  1493. j.Result[in.Field][k].Value = text
  1494. exts = append(exts, map[string]interface{}{
  1495. "field": v.Field,
  1496. "code": v.Code,
  1497. "ruletext": v.RuleText,
  1498. "type": v.Type,
  1499. "matchtype": v.MatchType,
  1500. "extfrom": v.ExtFrom,
  1501. "value": text,
  1502. })
  1503. }
  1504. if len(exts) > 0 {
  1505. extinfo[in.Field] = exts
  1506. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1507. }
  1508. }
  1509. }
  1510. }
  1511. //获取抽取结果map[string][]interface{},lua脚本使用
  1512. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1513. defer qu.Catch()
  1514. result := map[string][]map[string]interface{}{}
  1515. for key, val := range j.Result {
  1516. if result[key] == nil {
  1517. result[key] = []map[string]interface{}{}
  1518. }
  1519. for _, v := range val {
  1520. tmp := map[string]interface{}{
  1521. "field": v.Field,
  1522. "code": v.Code,
  1523. "ruletext": v.RuleText,
  1524. "value": v.Value,
  1525. "type": v.Type,
  1526. "matchtype": v.MatchType,
  1527. "extfrom": v.ExtFrom,
  1528. }
  1529. result[key] = append(result[key], tmp)
  1530. }
  1531. }
  1532. return result
  1533. }
  1534. //抽取日志
  1535. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1536. defer qu.Catch()
  1537. if !t.IsEtxLog {
  1538. return
  1539. }
  1540. logdata := map[string]interface{}{
  1541. "code": qu.If(v.Code == "", "kv", v.Code),
  1542. "name": v.Name,
  1543. "type": ftype,
  1544. "ruletext": v.RuleText,
  1545. "islua": v.IsLua,
  1546. "field": v.Field,
  1547. "version": t.Version,
  1548. "taskname": t.Name,
  1549. "before": before,
  1550. "extinfo": extinfo,
  1551. "sid": sid,
  1552. "comeintime": time.Now().Unix(),
  1553. }
  1554. lock.Lock()
  1555. ExtLogs[t] = append(ExtLogs[t], logdata)
  1556. lock.Unlock()
  1557. }
  1558. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1559. exts := []map[string]interface{}{}
  1560. exts = append(exts, map[string]interface{}{
  1561. "field": ext.Field,
  1562. "code": ext.Code,
  1563. "type": ftype,
  1564. "matchtype": matchtype,
  1565. "extfrom": ext.ExtFrom,
  1566. "value": ext.Value,
  1567. })
  1568. extinfo := map[string]interface{}{
  1569. ext.Field: exts,
  1570. }
  1571. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1572. }
  1573. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1574. defer qu.Catch()
  1575. if !t.IsEtxLog {
  1576. return
  1577. }
  1578. logdata := map[string]interface{}{
  1579. "code": code,
  1580. "name": name,
  1581. "type": ftype,
  1582. "ruletext": "",
  1583. "islua": false,
  1584. "field": field,
  1585. "version": t.Version,
  1586. "taskname": t.Name,
  1587. "before": before,
  1588. "extinfo": extinfo,
  1589. "sid": sid,
  1590. "comeintime": time.Now().Unix(),
  1591. }
  1592. lock.Lock()
  1593. ExtLogs[t] = append(ExtLogs[t], logdata)
  1594. lock.Unlock()
  1595. }
  1596. //保存抽取日志
  1597. func SaveExtLog() {
  1598. defer qu.Catch()
  1599. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1600. lock.Lock()
  1601. tmpLogs = ExtLogs
  1602. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1603. lock.Unlock()
  1604. for k, v := range tmpLogs {
  1605. if len(v) < saveLimit {
  1606. db.Mgo.SaveBulk(k.TrackColl, v...)
  1607. } else {
  1608. for {
  1609. if len(v) > saveLimit {
  1610. tmp := v[:saveLimit]
  1611. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1612. v = v[saveLimit:]
  1613. } else {
  1614. db.Mgo.SaveBulk(k.TrackColl, v...)
  1615. break
  1616. }
  1617. }
  1618. }
  1619. }
  1620. time.AfterFunc(10*time.Second, SaveExtLog)
  1621. }
// FieldValue pairs a value with an integer count.
// NOTE(review): not referenced in the visible part of this file; presumably a
// tally of how often a candidate value occurred — confirm against its callers.
type FieldValue struct {
	Value interface{}
	Count int
}
// clearWinnerReg matches the label fragments and colons that
// AnalysisSaveResult strips from a package's winner name before joining the
// names into "s_winner".
// NOTE(review): the last two alternatives are identical in this copy; one of
// them was presumably the full-width colon — confirm against the repository.
var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1627. //分析抽取结果并保存
  1628. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1629. qu.Try(func() {
  1630. //重新取出清理过后的中标候选人
  1631. resetWinnerorder(j)
  1632. doc, result, _id := funcAnalysis(j, e)
  1633. if ju.IsSaveTag {
  1634. go otherNeedSave(j, result, e)
  1635. }
  1636. auxinfo := auxInfo(j)
  1637. //从排序结果中取值
  1638. tmp := map[string]interface{}{} //抽取值
  1639. tmp["spidercode"] = j.SpiderCode
  1640. tmp["site"] = j.Site
  1641. tmp["jsondata"] = j.Jsondata
  1642. tmp["fieldall"] = auxinfo
  1643. for _, val := range result {
  1644. for _, v := range val { //取第一个非负数,项目名称除外
  1645. //存0是否有效
  1646. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
  1647. tmp[v.Field] = v.Value
  1648. break
  1649. }
  1650. if v.Score > -1 {
  1651. tmp[v.Field] = v.Value
  1652. break
  1653. } else if v.Field == "projectname" {
  1654. tmp[v.Field] = v.Value
  1655. break
  1656. }
  1657. }
  1658. }
  1659. if len(j.PackageInfo) > 15 {
  1660. for k, v := range j.PackageInfo {
  1661. j.PackageInfo = map[string]map[string]interface{}{}
  1662. j.PackageInfo[k] = v
  1663. break
  1664. }
  1665. }
  1666. if len(j.PackageInfo) > 0 { //分包信息
  1667. tmp["package"] = j.PackageInfo
  1668. //包预算,中标金额合并大于抽取就覆盖
  1669. var tmpBidamount, tmpBudget float64
  1670. //s_winner逗号分隔拼接,分包中标人
  1671. var tmpstr, savewinner []string
  1672. //按包排序
  1673. for b, v := range j.PackageInfo {
  1674. if v["winner"] != nil && v["winner"] != "" {
  1675. tmpstr = append(tmpstr, b)
  1676. }
  1677. }
  1678. //包预算,中标金额合并大于抽取就覆盖
  1679. if len(j.PackageInfo) >= 1 {
  1680. //包数大于1累加
  1681. for _, v := range j.PackageInfo {
  1682. if v["budget"] != nil {
  1683. tmpBudget += qu.Float64All(v["budget"])
  1684. }
  1685. if v["bidamount"] != nil {
  1686. tmpBidamount += qu.Float64All(v["bidamount"])
  1687. }
  1688. }
  1689. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1690. tmp["budget"] = tmpBudget
  1691. }
  1692. if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1693. tmp["bidamount"] = tmpBidamount
  1694. }
  1695. } else {
  1696. //包数等于1,tmp没有值取包里的值
  1697. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1698. for _, v := range j.PackageInfo {
  1699. if v["budget"] != nil {
  1700. tmp["budget"] = v["budget"]
  1701. }
  1702. }
  1703. }
  1704. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1705. for _, v := range j.PackageInfo {
  1706. if v["bidamount"] != nil {
  1707. tmp["bidamount"] = v["bidamount"]
  1708. }
  1709. }
  1710. }
  1711. }
  1712. //s_winner逗号分隔拼接,分包中标人
  1713. sort.Strings(tmpstr)
  1714. for _, v := range tmpstr {
  1715. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1716. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1717. if savevvv == "" {
  1718. continue
  1719. }
  1720. savewinner = append(savewinner, savevvv)
  1721. }
  1722. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1723. tmp["s_winner"] = tmp["winner"]
  1724. } else if savewinner != nil {
  1725. savewinner = RemoveReplicaSliceString(savewinner)
  1726. tmp["s_winner"] = strings.Join(savewinner, ",")
  1727. }
  1728. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1729. //没有分包取winner
  1730. tmp["s_winner"] = tmp["winner"]
  1731. }
  1732. if len(j.Winnerorder) > 0 { //候选人信息
  1733. for i, v := range j.Winnerorder {
  1734. if v["price"] != nil {
  1735. j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
  1736. }
  1737. }
  1738. tmp["winnerorder"] = j.Winnerorder
  1739. }
  1740. //处理附件
  1741. var resultf map[string][]*ju.ExtField
  1742. if jf != nil {
  1743. _, resultf, _ = funcAnalysis(jf, e)
  1744. auxinfof := auxInfo(jf)
  1745. tmp["fieldallf"] = auxinfof
  1746. ffield := map[string]interface{}{}
  1747. for _, val := range resultf {
  1748. for _, v := range val { //取第一个非负数
  1749. if v.Score > -1 {
  1750. ffield[v.Field] = v.Value
  1751. break
  1752. }
  1753. }
  1754. }
  1755. if len(jf.PackageInfo) > 0 { //分包信息
  1756. ffield["package"] = jf.PackageInfo
  1757. }
  1758. if len(jf.Winnerorder) > 0 { //候选人信息
  1759. ffield["winnerorder"] = jf.Winnerorder
  1760. }
  1761. tmp["ffield"] = ffield
  1762. }
  1763. for k, v := range *doc {
  1764. //去重冗余字段
  1765. if delFiled(k) {
  1766. continue
  1767. }
  1768. if tmp[k] == nil {
  1769. tmp[k] = v
  1770. }
  1771. }
  1772. //质量审核
  1773. if ju.QualityAudit {
  1774. e.QualityAudit(tmp)
  1775. }
  1776. if e.IsExtractCity { //城市抽取
  1777. //e.ExtractCity(j, tmp, _id)
  1778. e.NewExtractCity(j, tmp, _id)
  1779. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1780. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1781. // tmp["district"] = d
  1782. // if b {
  1783. // tmp["city"] = c
  1784. // tmp["area"] = p
  1785. // }
  1786. }
  1787. //品牌抽取
  1788. if ju.IsBrandGoods {
  1789. tmp["checkhas"] = map[string]int{
  1790. "hastable": j.HasTable,
  1791. "hasgoods": j.HasGoods,
  1792. "hasbrand": j.HasBrand,
  1793. "haskey": j.HasKey,
  1794. }
  1795. if len(j.BrandData) > 0 {
  1796. tmp["tablebrand"] = j.BrandData
  1797. }
  1798. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1799. }
  1800. //prince和number抽取
  1801. if ju.IsPriceNumber {
  1802. priceNumberLen := len(j.PriceNumberData)
  1803. if priceNumberLen > 1 { //table数据去重
  1804. tmpPriceNumberData := []map[string]interface{}{}
  1805. tableStrs := map[string]bool{}
  1806. for _, tb := range j.PriceNumberData {
  1807. has := false
  1808. bytes, _ := json.Marshal(tb)
  1809. str := string(bytes)
  1810. if len(tableStrs) > 0 && tableStrs[str] {
  1811. has = true
  1812. } else {
  1813. tableStrs[str] = true
  1814. }
  1815. if !has {
  1816. for _, data := range tb {
  1817. tmpPriceNumberData = append(tmpPriceNumberData, data)
  1818. }
  1819. }
  1820. }
  1821. tmp["pricenumber"] = tmpPriceNumberData
  1822. } else if priceNumberLen == 1 {
  1823. tmp["pricenumber"] = j.PriceNumberData[0]
  1824. }
  1825. }
  1826. //所有kv组成的字符串
  1827. var kvtext bytes.Buffer
  1828. blocks := make([]ju.BlockAndTag, 0)
  1829. for _, v := range j.Block {
  1830. //分包和标签
  1831. if ju.SaveBlock {
  1832. xx, _ := json.Marshal(v)
  1833. tmpblock := new(ju.TmpBlock)
  1834. err := json.Unmarshal(xx, &tmpblock)
  1835. if err != nil {
  1836. if v.BPackage != nil {
  1837. bpb, _ := json.Marshal(v.BPackage)
  1838. tmpblock.BPackage = string(bpb)
  1839. }
  1840. tmpblock = rangeBlockToJson(v, *tmpblock)
  1841. }
  1842. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1843. }
  1844. //把所有kv组装成一个字符串,存库
  1845. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1846. if jv == nil {
  1847. continue
  1848. }
  1849. for jv_k, jv_v := range jv.KvTags {
  1850. for _, jv_vv := range jv_v {
  1851. kvtext.WriteString(jv_k)
  1852. kvtext.WriteString(":")
  1853. kvtext.WriteString(jv_vv.Value)
  1854. kvtext.WriteString(" ")
  1855. }
  1856. }
  1857. }
  1858. }
  1859. if kvtext.Len() > 0 {
  1860. tmp["kvtext"] = kvtext.String()
  1861. }
  1862. if len(blocks) > 0 {
  1863. if blocksBytes, err := json.Marshal(blocks); err == nil {
  1864. if utf8.RuneCount(blocksBytes) < 100000 {
  1865. tmp["blocks"] = string(blocksBytes)
  1866. }
  1867. }
  1868. }
  1869. // fmt.Println("=============抽取结果================")
  1870. // for k, v := range tmp {
  1871. // qu.Debug(k, "---", v)
  1872. // }
  1873. //tmp["extract_content"] = j.Content
  1874. if e.TaskInfo.TestColl == "" {
  1875. if len(tmp) > 0 { //保存抽取结果
  1876. /* if len(e.SiteFields) <= 0 {
  1877. //for field, _ := range e.Fields {
  1878. // if tmp[field] == nil && {
  1879. // tmp[field] = "" //覆盖之前版本数据
  1880. // }
  1881. //}
  1882. } else {
  1883. //for field, _ := range e.SiteFields {
  1884. // if tmp[field] == nil &&{
  1885. // tmp[field] = "" //覆盖之前版本数据
  1886. // }
  1887. //}
  1888. }*/
  1889. tmp["repeat"] = 0
  1890. tmparr := []map[string]interface{}{
  1891. map[string]interface{}{
  1892. "_id": qu.StringTOBsonId(_id),
  1893. },
  1894. map[string]interface{}{"$set": tmp},
  1895. }
  1896. e.RWMutex.Lock()
  1897. e.BidArr = append(e.BidArr, tmparr)
  1898. e.BidTotal++
  1899. e.RWMutex.Unlock()
  1900. }
  1901. if ju.SaveResult {
  1902. id := tmp["_id"]
  1903. tmp["result"] = result
  1904. tmp["resultf"] = resultf
  1905. delete(tmp, "_id")
  1906. tmparr := []map[string]interface{}{
  1907. map[string]interface{}{
  1908. "_id": id,
  1909. },
  1910. map[string]interface{}{"$set": tmp},
  1911. }
  1912. e.RWMutex.Lock()
  1913. e.ResultArr = append(e.ResultArr, tmparr)
  1914. e.RWMutex.Unlock()
  1915. }
  1916. } else { //测试结果
  1917. delete(tmp, "_id")
  1918. delete(tmp, "fieldall")
  1919. if len(j.BlockPackage) > 0 { //分包详情
  1920. if len(j.BlockPackage) > 10 {
  1921. tmp["epackage"] = "分包异常"
  1922. } else {
  1923. bs, _ := json.Marshal(j.BlockPackage)
  1924. tmp["epackage"] = string(bs)
  1925. }
  1926. }
  1927. tmp["result"] = result
  1928. tmp["resultf"] = resultf
  1929. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1930. if !b {
  1931. log.Debug(e.TaskInfo.TestColl, _id)
  1932. }
  1933. }
  1934. }, func(err interface{}) {
  1935. log.Debug("AnalysisSaveResult err", err)
  1936. })
  1937. }
  1938. //保存其他
  1939. //kv、表格、块上的标签凡是新的标签都入库
  1940. //val type times firstid createtime 判定field
  1941. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1942. now := time.Now().Unix()
  1943. coll := e.TaskInfo.TestColl
  1944. if coll == "" {
  1945. coll = "extract_tag_result"
  1946. } else {
  1947. coll += "_tag"
  1948. }
  1949. datas := []map[string]interface{}{}
  1950. kv := map[string]int{}
  1951. for _, v := range j.Block {
  1952. //
  1953. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1954. if vv == nil || vv.KvTags == nil {
  1955. continue
  1956. }
  1957. for kkk, vvv := range vv.KvTags {
  1958. for _, vvvv := range vvv {
  1959. if vvvv.IsInvalid {
  1960. kv[kkk] = kv[kkk] + 1
  1961. break
  1962. }
  1963. }
  1964. }
  1965. }
  1966. for _, vv := range v.NotClassifyTitles {
  1967. datas = append(datas, map[string]interface{}{
  1968. "val": vv,
  1969. "times": 0,
  1970. "type": "block",
  1971. "firstid": j.SourceMid,
  1972. "createtime": now,
  1973. })
  1974. if len(datas) == saveLimit {
  1975. db.Mgo.SaveBulk(coll, datas...)
  1976. datas = []map[string]interface{}{}
  1977. }
  1978. }
  1979. }
  1980. for k, v := range kv {
  1981. datas = append(datas, map[string]interface{}{
  1982. "val": k,
  1983. "times": v,
  1984. "type": "kv",
  1985. "firstid": j.SourceMid,
  1986. "createtime": now,
  1987. })
  1988. if len(datas) == saveLimit {
  1989. db.Mgo.SaveBulk(coll, datas...)
  1990. datas = []map[string]interface{}{}
  1991. }
  1992. }
  1993. if len(datas) > 0 {
  1994. db.Mgo.SaveBulk(coll, datas...)
  1995. }
  1996. }
  1997. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  1998. if j == nil {
  1999. return nil
  2000. }
  2001. if len(j.Block) > 0 {
  2002. for i, v := range j.Block {
  2003. rangetmp := new(ju.TmpBlock)
  2004. vb, _ := json.Marshal(v)
  2005. json.Unmarshal(vb, &rangetmp)
  2006. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2007. }
  2008. }
  2009. if j.ColonKV != nil {
  2010. cb, _ := json.Marshal(j.ColonKV)
  2011. tmpblock.ColonKV = string(cb)
  2012. }
  2013. if j.SpaceKV != nil {
  2014. sb, _ := json.Marshal(j.SpaceKV)
  2015. tmpblock.SpaceKV = string(sb)
  2016. }
  2017. if j.TableKV != nil {
  2018. tb, _ := json.Marshal(j.TableKV)
  2019. tmpblock.TableKV = string(tb)
  2020. }
  2021. return &tmpblock
  2022. }
  // delFiled reports whether k names a redundant source-document field that
  // must not be copied into the extraction result.
  func delFiled(k string) bool {
  	switch k {
  	case "summary", "detail", "contenthtml", "site", "spidercode", "projectinfo", "jsondata":
  		return true
  	}
  	return false
  }
  2027. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2028. defer qu.Catch()
  2029. doc := j.Data
  2030. result := j.Result
  2031. _id := qu.BsonIdToSId((*doc)["_id"])
  2032. result = ScoreFields(j, e.Tag) //正负面词打分
  2033. //结果排序
  2034. for _, val := range result {
  2035. ju.Sort(val)
  2036. }
  2037. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2038. marshalbt, _ := json.Marshal(j.Jsondata)
  2039. tmpjddata := make(map[string]interface{})
  2040. json.Unmarshal(marshalbt,&tmpjddata)
  2041. for _, jdkey := range ju.JsonData {
  2042. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2043. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2044. if jdkey == "budget" || jdkey == "bidamount" {
  2045. lockclear.Lock()
  2046. cfn := e.ClearFn[jdkey]
  2047. lockclear.Unlock()
  2048. if len(cfn) == 0 {
  2049. continue
  2050. }
  2051. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
  2052. if tmpv.Value == newNum[0] {
  2053. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2054. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2055. ju.Sort(j.Result[jdkey])
  2056. delete((*j.Jsondata), jdkey)
  2057. break
  2058. }
  2059. }else {
  2060. if (*j.Jsondata)[jdkey] == tmpv.Value{
  2061. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2062. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2063. ju.Sort(j.Result[jdkey])
  2064. delete((*j.Jsondata), jdkey)
  2065. break
  2066. }
  2067. }
  2068. }
  2069. }
  2070. }
  2071. if len(*j.Jsondata)>0{
  2072. j.Result = JsonDataMergeProcessing(j, e)
  2073. }
  2074. j.Jsondata = &tmpjddata
  2075. }
  2076. return doc, result, _id
  2077. }
  2078. //辅助信息,如果没有排序先排序
  2079. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2080. fieldalls := map[string][]map[string]interface{}{}
  2081. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2082. defer qykredis.Close()
  2083. db := 0
  2084. for field, val := range j.Result {
  2085. //ju.Sort(val)
  2086. if field == "buyer" {
  2087. db = ju.BuyerDB
  2088. } else if field == "winner" {
  2089. db = ju.WinnerDB
  2090. } else if field == "agency" {
  2091. db = ju.AgencyDB
  2092. }
  2093. sfields := []map[string]interface{}{}
  2094. for _, v := range val {
  2095. standardized := false
  2096. if _, err := qykredis.Do("SELECT", db); err != nil {
  2097. fmt.Println("redis select err", err)
  2098. } else {
  2099. rep, err := qykredis.Do("GET", v.Value)
  2100. if rep != nil && err == nil {
  2101. standardized = true
  2102. }
  2103. }
  2104. if field == "budget" || field == "bidamount" {
  2105. if !v.IsTrue {
  2106. continue
  2107. }
  2108. }
  2109. sfield := map[string]interface{}{
  2110. "val": v.Value,
  2111. "type": v.Type,
  2112. "score": v.Score,
  2113. "blocktag": v.BlockTag,
  2114. "sourceval": v.SourceValue,
  2115. "standardized": standardized,
  2116. }
  2117. sfields = append(sfields, sfield)
  2118. }
  2119. fieldalls[field] = sfields
  2120. }
  2121. return fieldalls
  2122. }
  2123. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2124. defer qu.Catch()
  2125. //获取审核字段
  2126. for _, field := range e.AuditFields {
  2127. //1.分包
  2128. if resulttmp["package"] != nil {
  2129. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2130. for _, val := range packagedata {
  2131. if val[field] != nil {
  2132. fv := qu.ObjToString(val[field])
  2133. if fv != "" {
  2134. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2135. e.RedisMatch(field, fv, val) //redis匹配
  2136. } else { //除了buyer和winner,其他字段走规则匹配
  2137. e.RuleMatch(field, fv, val)
  2138. }
  2139. }
  2140. }
  2141. }
  2142. }
  2143. //2.外围
  2144. if resulttmp[field] != nil {
  2145. fv := qu.ObjToString(resulttmp[field])
  2146. if fv != "" {
  2147. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2148. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2149. } else { //除了buyer和winner,其他字段走规则匹配
  2150. e.RuleMatch(field, fv, resulttmp)
  2151. }
  2152. }
  2153. }
  2154. }
  2155. }
  2156. //Redis匹配
  2157. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2158. defer qu.Catch()
  2159. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2160. if i == 0 { //reids未找到,执行规则匹配
  2161. val[field+"_isredis"] = false
  2162. e.RuleMatch(field, fv, val) //规则匹配
  2163. } else { //redis找到,打标识存库
  2164. val[field+"_isredis"] = true
  2165. }
  2166. }
  2167. //规则匹配
  2168. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2169. defer qu.Catch()
  2170. if fieldval != "" {
  2171. SMap := e.StartMatch(field, fieldval)
  2172. //SMap.AddKey(field+"_isaudit", false)
  2173. for _, k := range SMap.Keys {
  2174. tmpMap[k] = SMap.Map[k]
  2175. }
  2176. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2177. }
  2178. }
  2179. //开始规则匹配
  2180. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2181. defer qu.Catch()
  2182. SMap := pretreated.NewSortMap()
  2183. lock.Lock()
  2184. f := e.RecogFieldMap[field]
  2185. lock.Unlock()
  2186. if len(f) > 0 {
  2187. fid := qu.BsonIdToSId(f["_id"])
  2188. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2189. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2190. if textAfterRecogFieldPrerule != "" {
  2191. lock.Lock()
  2192. classMap := e.FidClassMap[fid]
  2193. lock.Unlock()
  2194. L:
  2195. for _, c := range classMap { //class
  2196. classid := qu.BsonIdToSId(c["_id"])
  2197. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2198. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2199. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2200. if textAfterClassPrerule != "" {
  2201. lock.Lock()
  2202. ruleMap := e.CidRuleMap[classid]
  2203. lock.Unlock()
  2204. for _, r := range ruleMap { //rule
  2205. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2206. s_name := qu.ObjToString(r["s_name"])
  2207. rule := r["rule"].([]interface{})
  2208. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2209. if textAfterRulePrerule != "" {
  2210. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2211. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2212. if savefield != "" { //保存字段不为空,存储代码信息
  2213. SMap.AddKey(field+"_"+savefield, s_name)
  2214. }
  2215. break L
  2216. }
  2217. }
  2218. }
  2219. }
  2220. }
  2221. }
  2222. }
  2223. return SMap
  2224. }
  2225. //中标候选人经过清理之后,重新取出赋值
  2226. func resetWinnerorder(j *ju.Job) {
  2227. if len(j.Winnerorder) == 0 {
  2228. return
  2229. }
  2230. maxlen := len(j.Winnerorder) - 1
  2231. //中标单位
  2232. //i := 0
  2233. winners := []*ju.ExtField{}
  2234. bidamounts := []*ju.ExtField{}
  2235. //for _, v := range j.Result["winner"] {
  2236. // if v.Code == "winnerorder" {
  2237. // if maxlen < i {
  2238. // continue
  2239. // }
  2240. // j.Winnerorder[i]["entname"] = v.Value
  2241. // i++
  2242. // } else {
  2243. // winners = append(winners, v)
  2244. // }
  2245. //}
  2246. if maxlen > 0 {
  2247. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2248. if j.Winnerorder[0]["price"] != nil {
  2249. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
  2250. if tmpPrice[len(tmpPrice)-1].(bool) {
  2251. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
  2252. }
  2253. }
  2254. }
  2255. if j.Result["winner"] == nil && len(winners) > 0 {
  2256. j.Result["winner"] = winners
  2257. } else if len(winners) > 0 {
  2258. j.Result["winner"] = append(j.Result["winner"], winners...)
  2259. }
  2260. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2261. j.Result["bidamount"] = bidamounts
  2262. } else if len(bidamounts) > 0 {
  2263. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2264. }
  2265. //j.Result["winner"] = winners
  2266. //中标金额
  2267. //i = 0
  2268. //bidamounts := []*ju.ExtField{}
  2269. //for _, v := range j.Result["bidamount"] {
  2270. // if v.Code == "winnerorder" {
  2271. // if maxlen < i {
  2272. // continue
  2273. // }
  2274. // j.Winnerorder[i]["price"] = v.Value
  2275. // i++
  2276. // } else {
  2277. // bidamounts = append(bidamounts, v)
  2278. // }
  2279. //}
  2280. //j.Result["bidamount"] = bidamounts
  2281. }
  // RemoveReplicaSliceString returns the distinct strings of slc in order of
  // first appearance. The result is always a non-nil slice, even when slc is
  // nil or empty.
  func RemoveReplicaSliceString(slc []string) []string {
  	seen := make(map[string]struct{}, len(slc))
  	unique := []string{}
  	for i := range slc {
  		if _, dup := seen[slc[i]]; dup {
  			continue
  		}
  		seen[slc[i]] = struct{}{}
  		unique = append(unique, slc[i])
  	}
  	return unique
  }