extract.go 87 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "github.com/shopspring/decimal"
  7. "go.mongodb.org/mongo-driver/bson/primitive"
  8. "jy/clear"
  9. db "jy/mongodbutil"
  10. "jy/pretreated"
  11. ju "jy/util"
  12. qu "qfw/util"
  13. "qfw/util/redis"
  14. "regexp"
  15. "sort"
  16. "strconv"
  17. "strings"
  18. "sync"
  19. "time"
  20. "unicode/utf8"
  21. log "github.com/donnie4w/go-logger/logger"
  22. "gopkg.in/mgo.v2/bson"
  23. )
// Package-level state shared by the extraction tasks.
var (
	// Read/write mutexes shared across the package.
	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
	// JYUrl is the format string for an article URL; %s is filled with the article id.
	JYUrl = "https://www.jianyu360.com/article/content/%s.html"
	cut = ju.NewCut() // extracts the body text and cleans it
	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
	TaskList map[string]*ExtractTask // task list, keyed by task id
	ClearTaskList map[string]*ClearTask // cleanup task list
	saveLimit = 100 // batch size used when saving extraction logs
	PageSize = 5000 // page size for paged queries
	// Fields is the field projection passed to Find when loading source documents.
	Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
	// Alternative, wider projection (disabled):
	//Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
	// Fields2 is a second, smaller projection (not referenced in the code visible here).
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
	// Disabled field whitelists kept for reference.
	/*f = map[string]bool{
	"T": true,
	"_d": true,
	"area": true,
	"channel": true,
	"comeintime": true,
	"competehref": true,
	"href": true,
	"l_np_publishtime": true,
	"publishtime": true,
	"sendflag": true,
	"site": true,
	"spidercode": true,
	"title": true,
	"projectname": true,
	}*/
	/*f = map[string]bool{
	"contentid": true,
	"progName": true,
	"updateTime": true,
	"url": true,
	"areaId": true,
	"areaName": true,
	"popTitle": true,
	"showTitle": true,
	"progId": true,
	"catid": true,
	"isConcern": true,
	"followCount": true,
	"followSuggestion": true,
	"isBoutique": true,
	"canTj": true,
	"tenderAmountNumber": true,
	"tenderAmountUnit": true,
	"bidderAmountNumber": true,
	"bidderAmountUnit": true,
	"registrationBeginTime": true,
	"registrationEndTime": true,
	"starNum": true,
	"title": true,
	"proInvested": true,
	"projectname": true,
	}*/
	// spidercode is a set of spider codes; RunExtractTestTask skips documents
	// whose "spidercode" is listed here (marked there as temporary bid-opening records).
	spidercode = map[string]bool{
		"gd_zhsggzyjyzx_jsgc_fjczbgg": true,
		"js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
		"zj_tzsyhggzyjyzx_jsgc_kbqk": true,
		"hb_tmsggzyjyxxw_jsgc_kbqk": true,
		"zj_nbsyyggzyjyw_jsgc_kbqk": true,
		"zj_zjsggzyjyzx_jyxx_kbjg": true,
		"zj_zjzdgcjyw_ztbjglxx_kbjg": true,
		"zj_lssggzyjyw_jsgc_kbsk": true,
		"zj_qzslyxggzyjyzx_gggs_xkbjl": true,
		"sc_mssggzydzjypt_jsgc_kbjl": true,
		"sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
		"a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
		"a_hbszbtbggfwpt_kbjl": true,
		"a_szsjsgcjyfwzxbafzx_kbqkgs": true,
		"a_szldzbyxgs_kbxx": true,
		"zj_zssssxggzyjyw_gcjs_kbjggs": true,
		"gd_szszfhjsj_kbqkgs": true,
		"a_gjggzyjypt_gcjs_kbjl": true,
		"a_gjggzyjypt_gcjs_kbjl_new": true,
		"zj_tzsyhggzyjyzx_kbjggg": true,
		"a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
		"ah_czsggzyjyw_jsgc_kbjl": true,
		"ah_czsggzyjyw_zfcg_kbxx": true,
		"ah_whsggzyjyfww_kbxx_cgxm": true,
		"ah_whsggzyjyfww_kbxx_gcxm": true,
	}
)
  107. //启动测试抽取-、、、、结果追踪
  108. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  109. defer qu.Catch()
  110. ext := TaskList[taskId]
  111. if ext == nil {
  112. ext = &ExtractTask{}
  113. ext.Id = taskId
  114. ext.InitTestTaskInfo(resultcoll, trackcoll)
  115. ext.IsRun = true
  116. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  117. }
  118. ext.InitSite()
  119. ext.InitRulePres()
  120. ext.InitRuleBacks(false)
  121. ext.InitRuleBacks(true)
  122. ext.InitRuleCore(false)
  123. ext.InitRuleCore(true)
  124. ext.InitPkgCore()
  125. ext.InitBlockRule()
  126. ext.InfoTypeList()
  127. ext.InitTag(false)
  128. ext.InitTag(true)
  129. ext.InitClearFn(false)
  130. ext.InitClearFn(true)
  131. ext.Lock()
  132. //ext.IsExtractCity = false
  133. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  134. //初始化城市DFA信息
  135. ext.InitCityInfo()
  136. //ext.InitCityDFA()
  137. ext.InitAreaCode()
  138. ext.InitPostCode()
  139. }
  140. ext.Unlock()
  141. //质量审核
  142. ext.InitAuditFields()
  143. ext.InitAuditRule()
  144. ext.InitAuditClass()
  145. ext.InitAuditRecogField()
  146. //品牌抽取是否开启
  147. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  148. //价格个数抽取是否开启
  149. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  150. //附件抽取是否开启
  151. ext.InitFile()
  152. ext.TaskInfo.TestColl = resultcoll
  153. TaskList[taskId] = ext
  154. return RunExtractTestTask(ext, startId, num)
  155. }
  156. func IdTrans(startId string) bson.ObjectId {
  157. defer qu.Catch()
  158. return bson.ObjectIdHex(startId)
  159. }
  160. func StringTOBsonId(id string) primitive.ObjectID {
  161. objectId, _ := primitive.ObjectIDFromHex(id)
  162. return objectId
  163. }
  164. func BsonTOStringId(id interface{}) string {
  165. return id.(primitive.ObjectID).Hex()
  166. }
  167. //开始测试任务抽取
  168. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  169. n, _ := strconv.Atoi(num)
  170. id := IdTrans(startId)
  171. if id.Valid() {
  172. //query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  173. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  174. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  175. for _, v := range *list {
  176. //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
  177. // continue
  178. //}
  179. if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
  180. continue
  181. }
  182. var j, jf *ju.Job
  183. var isSite bool
  184. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  185. v["isextFile"] = true
  186. j, jf, isSite = ext.PreInfo(v)
  187. } else {//无附件
  188. j, _, isSite = ext.PreInfo(v)
  189. }
  190. go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
  191. ext.TaskInfo.ProcessPool <- true
  192. }
  193. return true
  194. } else {
  195. return false
  196. }
  197. }
  198. //启动抽取
  199. func StartExtractTaskId(taskId string) bool {
  200. defer qu.Catch()
  201. isgo := false
  202. ext := TaskList[taskId]
  203. if ext == nil {
  204. ext = &ExtractTask{}
  205. ext.Id = taskId
  206. ext.InitTaskInfo()
  207. isgo = true
  208. } else {
  209. ext.Id = taskId
  210. ext.InitTaskInfo()
  211. }
  212. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  213. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  214. ext.InitSite()
  215. ext.InitRulePres()
  216. ext.InitRuleBacks(false)
  217. ext.InitRuleBacks(true)
  218. ext.InitRuleCore(false)
  219. ext.InitRuleCore(true)
  220. ext.InitPkgCore()
  221. ext.InitBlockRule()
  222. ext.InfoTypeList()
  223. ext.InitTag(false)
  224. ext.InitTag(true)
  225. ext.InitClearFn(false)
  226. ext.InitClearFn(true)
  227. ext.Lock()
  228. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  229. //初始化城市DFA信息
  230. //ext.InitCityDFA()
  231. ext.InitCityInfo()
  232. ext.InitAreaCode()
  233. ext.InitPostCode()
  234. }
  235. ext.Unlock()
  236. //质量审核
  237. ext.InitAuditFields()
  238. ext.InitAuditRule()
  239. ext.InitAuditClass()
  240. ext.InitAuditRecogField()
  241. //品牌抽取是否开启
  242. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  243. //价格个数抽取是否开启
  244. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  245. //附件抽取是否开启
  246. ext.InitFile()
  247. ext.IsRun = true
  248. go ext.ResultSave(true)
  249. go ext.BidSave(true)
  250. if isgo {
  251. go RunExtractTask(taskId)
  252. }
  253. TaskList[taskId] = ext
  254. return true
  255. }
  256. //停止抽取
  257. func StopExtractTaskId(taskId string) bool {
  258. defer qu.Catch()
  259. ext := TaskList[taskId]
  260. if ext != nil {
  261. ext.IsRun = false
  262. TaskList[taskId] = ext
  263. }
  264. //更新task.s_extlastid
  265. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  266. return true
  267. }
  268. //开始抽取
  269. func RunExtractTask(taskId string) {
  270. defer qu.Catch()
  271. ext := TaskList[taskId]
  272. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  273. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  274. pageNum := (count + PageSize - 1) / PageSize
  275. limit := PageSize
  276. if count < PageSize {
  277. limit = count
  278. }
  279. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  280. for i := 0; i < pageNum; i++ {
  281. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  282. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  283. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  284. for _, v := range *list {
  285. //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  286. // continue
  287. //}
  288. //根据标题判断是否抽取
  289. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  290. if !b {
  291. continue
  292. }
  293. _id := qu.BsonIdToSId(v["_id"])
  294. //log.Debug(_id)
  295. if !ext.IsRun {
  296. break
  297. }
  298. var j, jf *ju.Job
  299. var isSite bool
  300. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  301. v["isextFile"] = true
  302. j, jf, isSite = ext.PreInfo(v)
  303. } else {
  304. j, _, isSite = ext.PreInfo(v)
  305. }
  306. go ext.ExtractProcess(j, jf, isSite)
  307. ext.TaskInfo.LastExtId = _id
  308. ext.TaskInfo.ProcessPool <- true
  309. }
  310. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  311. if !ext.IsRun {
  312. break
  313. }
  314. }
  315. //更新task.s_extlastid
  316. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  317. }
  318. //信息预处理-不和版本关联,取最新版本的配置项
  319. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  320. return (&ExtractTask{}).PreInfo(doc)
  321. }
  322. var clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  323. //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
  324. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  325. defer qu.Catch()
  326. //判断是否有附件这个字段
  327. var isextFile bool
  328. if doc["isextFile"] != nil {
  329. isextFile = doc["isextFile"].(bool)
  330. }
  331. detail := ""
  332. d1, _ := doc["detail"].(string)
  333. d2, _ := doc["contenthtml"].(string)
  334. if len(d1) >= len(d2) || d2 == "" {
  335. detail = d1
  336. } else {
  337. detail = d2
  338. }
  339. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  340. d3, _ := doc["summary"].(string)
  341. //全文的需要修复表格
  342. detail = pretreated.RepairCon(detail)
  343. detail = ju.CutLableStr(d3 + "\n" + detail)
  344. detail = cut.ClearHtml(d3 + "\n" + detail)
  345. doc["detail"] = detail
  346. isClearnMoney := !clearMoneyReg.MatchString(detail)
  347. if isClearnMoney {
  348. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  349. }
  350. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  351. if isextFile {
  352. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  353. }
  354. //正文小于200个字,有附件把附件内容加到正文
  355. //tmpDeatil := detail
  356. //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  357. //if err == nil {
  358. // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  359. // if conlen < 2000 {
  360. // if isextFile {
  361. // detail += qu.ObjToString(doc["detailfile"])
  362. // doc["detail"] = detail
  363. // }
  364. // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
  365. // //防止文本过长,造成抽取阻塞
  366. // log.Debug("文本太长", doc["_id"], conlen)
  367. // doc["detail"] = d3
  368. // }
  369. //}
  370. toptype := qu.ObjToString(doc["toptype"])
  371. subtype := qu.ObjToString(doc["subtype"])
  372. if qu.ObjToString(doc["type"]) == "bid" {
  373. toptype = "结果"
  374. }
  375. if toptype == "" {
  376. toptype = "all"
  377. }
  378. if subtype == "" {
  379. subtype = "all"
  380. }
  381. if subtype == "其他" {
  382. subtype = "其它"
  383. }
  384. toMap := qu.ObjToMap(doc["jsondata"])
  385. //log.Debug("toMap", toMap)
  386. if (*toMap) != nil {
  387. if (*toMap)["extweight"] == nil {
  388. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  389. }
  390. if (*toMap)["jsoncontent"] != nil {
  391. delete(*toMap, "jsoncontent")
  392. }
  393. for k, v := range *toMap {
  394. if _, ok := v.(float64); ok {
  395. continue
  396. } else if _, ok := v.(int64); ok {
  397. continue
  398. } else if _, ok2 := v.(string); ok2 {
  399. continue
  400. } else {
  401. delete(*toMap, k)
  402. }
  403. }
  404. }
  405. j = &ju.Job{
  406. SourceMid: qu.BsonIdToSId(doc["_id"]),
  407. Category: toptype,
  408. CategorySecond: subtype,
  409. Content: qu.ObjToString(doc["detail"]),
  410. SpiderCode: qu.ObjToString(doc["spidercode"]),
  411. Site: qu.ObjToString(doc["site"]),
  412. //Domain: qu.ObjToString(doc["domain"]),
  413. //Href: qu.ObjToString(doc["href"]),
  414. Title: qu.ObjToString(doc["title"]),
  415. Data: &doc,
  416. City: qu.ObjToString(doc["city"]),
  417. Province: qu.ObjToString(doc["area"]),
  418. Jsondata: toMap,
  419. Result: map[string][]*ju.ExtField{},
  420. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  421. RuleBlock: e.RuleBlock,
  422. Dataging: qu.IntAll(doc["dataging"]),
  423. IsClearnMoney: isClearnMoneystr,
  424. }
  425. if isextFile {
  426. jf = &ju.Job{
  427. SourceMid: qu.BsonIdToSId(doc["_id"]),
  428. Category: toptype,
  429. CategorySecond: subtype,
  430. Content: qu.ObjToString(doc["detailfile"]),
  431. SpiderCode: qu.ObjToString(doc["spidercode"]),
  432. Site: qu.ObjToString(doc["site"]),
  433. Title: qu.ObjToString(doc["title"]),
  434. Data: &doc,
  435. City: qu.ObjToString(doc["city"]),
  436. Province: qu.ObjToString(doc["area"]),
  437. Jsondata: toMap,
  438. Result: map[string][]*ju.ExtField{},
  439. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  440. RuleBlock: e.RuleBlock,
  441. IsFile: isextFile,
  442. Dataging: qu.IntAll(doc["dataging"]),
  443. IsClearnMoney: isClearnMoneystr,
  444. }
  445. }
  446. codeSite := j.SpiderCode
  447. //是否启用站点
  448. if value, ok := e.SiteMerge.Load(codeSite); ok {
  449. isSite = value.(bool)
  450. }
  451. if isSite {
  452. //是否配置站点
  453. exp, isSite := e.Luacodes.Load(codeSite)
  454. if isSite {
  455. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  456. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  457. }
  458. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  459. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  460. }
  461. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  462. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  463. }
  464. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  465. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  466. }
  467. }
  468. }
  469. qu.Try(func() {
  470. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  471. if isextFile && strings.TrimSpace(jf.Content) != "" {
  472. pretreated.AnalyStart(jf, isSite, codeSite)
  473. }
  474. }, func(err interface{}) {
  475. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  476. })
  477. return j, jf, isSite
  478. }
  479. var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  480. var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  481. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  482. func file2text(doc *map[string]interface{}) {
  483. mnameone := map[string]bool{}
  484. mname := map[string]bool{}
  485. murl := map[string]string{}
  486. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  487. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  488. for _, attachs := range attach_text {
  489. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  490. for _, fileinfo := range fileinfos {
  491. if ff, ok := fileinfo.(map[string]interface{}); ok {
  492. attach_url := qu.ObjToString(ff["attach_url"])
  493. ffname := qu.ObjToString(ff["file_name"])
  494. if clearStrReg.MatchString(ffname) {
  495. continue
  496. }
  497. mname[ffname] = true
  498. murl[ffname] = attach_url
  499. if sortStrReg.MatchString(ffname) {
  500. mnameone[ffname] = true
  501. }
  502. }
  503. }
  504. }
  505. }
  506. }
  507. tmpstr := ""
  508. for k := range mnameone {
  509. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  510. (*doc)["detailfile"] = tmpstr
  511. return
  512. }
  513. bs := ju.OssGetObject(murl[k])
  514. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  515. tmpstr += bs + "\n"
  516. }
  517. }
  518. for k := range mname {
  519. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  520. (*doc)["detailfile"] = tmpstr
  521. return
  522. }
  523. bs := ju.OssGetObject(murl[k])
  524. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  525. tmpstr += bs + "\n"
  526. }
  527. }
  528. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  529. }
  530. //抽取-正文
  531. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  532. e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性
  533. if jf != nil && jf.IsFile { //附件jf → j 合并
  534. e.ExtractDetail(jf, isSite, j.SpiderCode)
  535. for tmpk, xs := range jf.Result {
  536. if len(j.Result[tmpk]) == 0 {
  537. if tmpk == "budget" || tmpk == "bidamount" {
  538. for _, v := range xs {
  539. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  540. j.Result[tmpk] = append(j.Result[tmpk], v)
  541. }
  542. }
  543. } else {
  544. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  545. }
  546. }
  547. }
  548. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  549. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  550. }
  551. if len(j.PackageInfo) == 0 && jf.PackageInfo != nil && len(jf.PackageInfo) > 0 {
  552. j.PackageInfo = jf.PackageInfo
  553. }
  554. }
  555. if isSite {
  556. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  557. if ok && ismerge.(bool) {
  558. tmpj := &ju.Job{
  559. SourceMid: j.SourceMid,
  560. Category: j.Category,
  561. CategorySecond: j.CategorySecond,
  562. Content: j.Content,
  563. SpiderCode: j.SpiderCode,
  564. //Domain: qu.ObjToString(doc["domain"]),
  565. //Href: qu.ObjToString(doc["href"]),
  566. Title: j.Title,
  567. Data: j.Data,
  568. City: j.City,
  569. Province: j.Province,
  570. Jsondata: j.Jsondata,
  571. Result: map[string][]*ju.ExtField{},
  572. BuyerAddr: j.BuyerAddr,
  573. RuleBlock: e.RuleBlock,
  574. }
  575. qu.Try(func() {
  576. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  577. }, func(err interface{}) {
  578. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  579. })
  580. e.ExtractDetail(tmpj, false, "")
  581. //if jf != nil && jf.IsFile {
  582. // e.ExtractFile(jf, false, "")
  583. //}
  584. //合并数据
  585. j.Block = append(j.Block, tmpj.Block...)
  586. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  587. for tmpk, _ := range j.Result {
  588. if len(tmpj.Result[tmpk]) > 0 {
  589. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  590. }
  591. }
  592. for tmpk, _ := range tmpj.Result {
  593. if len(j.Result[tmpk]) == 0 {
  594. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  595. }
  596. }
  597. }
  598. }
  599. //分析抽取结果并保存
  600. AnalysisSaveResult(j, jf, e)
  601. <-e.TaskInfo.ProcessPool
  602. }
  603. //抽取-正文-规则等 detail
  604. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  605. qu.Try(func() {
  606. doc := *j.Data
  607. //全局前置规则,结果覆盖doc属性
  608. //for _, v := range e.RulePres {
  609. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  610. //}
  611. tmprules := map[string][]*RuleCore{}
  612. lockrule.Lock()
  613. //加载分类抽取配置
  614. if j.Category == "all" || j.CategorySecond == "all" {
  615. if isSite {
  616. for k, vc1 := range e.SiteRuleCores["all_all"] {
  617. tmprules[k] = vc1
  618. }
  619. } else {
  620. for k, vc1 := range e.RuleCores["all_all"] {
  621. tmprules[k] = vc1
  622. }
  623. }
  624. } else {
  625. if isSite {
  626. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  627. tmprules[k] = vc1
  628. }
  629. } else {
  630. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  631. tmprules[k] = vc1
  632. }
  633. }
  634. }
  635. if len(tmprules) < 1 { //分类未覆盖部分
  636. if isSite {
  637. for k, vc1 := range e.RuleCores["all_all"] {
  638. tmprules[k] = vc1
  639. }
  640. } else {
  641. for k, vc1 := range e.SiteRuleCores["all_all"] {
  642. tmprules[k] = vc1
  643. }
  644. }
  645. }
  646. lockrule.Unlock()
  647. //抽取规则
  648. for _, vc1 := range tmprules {
  649. for _, vc := range vc1 {
  650. tmp := ju.DeepCopy(doc).(map[string]interface{})
  651. //是否进入逻辑
  652. if !ju.Logic(vc.LuaLogic, tmp) {
  653. continue
  654. }
  655. if vc.Field =="buyer" {
  656. //log.Debug("调试抽取字段")
  657. }
  658. ////抽取-前置规则
  659. //for _, v := range vc.RulePres {
  660. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  661. //}
  662. // log.Debug("抽取-前置规则", tmp)
  663. //抽取-规则
  664. ExtRuleCore(tmp, e, vc, j, isSite)
  665. // log.Debug("抽取-规则", tmp)
  666. //抽取-后置规则
  667. for _, v := range vc.RuleBacks {
  668. ExtRegBack(j, v, e.TaskInfo, vc)
  669. }
  670. //kv规则
  671. for _, v := range vc.KVRuleCores {
  672. ExtRuleKV(j, v, e.TaskInfo)
  673. }
  674. // log.Debug("抽取-后置规则", tmp)
  675. //项目名称未能抽取到,标题来凑
  676. if vc.Field == "projectname" {
  677. if vc.ExtFrom == "title" {
  678. isextitle := true
  679. for _, v := range j.Result[vc.Field] {
  680. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  681. isextitle = false
  682. break
  683. }
  684. }
  685. if isextitle { //标题加入选举
  686. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  687. if isSite {
  688. field.Score = 1
  689. }
  690. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  691. }
  692. }
  693. for i := 0; i < 3; i++ {
  694. for _, v := range vc.RuleBacks {
  695. ExtRegBack(j, v, e.TaskInfo, vc)
  696. }
  697. }
  698. }
  699. }
  700. }
  701. //全局后置规则
  702. if isSite {
  703. for _, v := range e.SiteRuleBacks {
  704. ExtRegBack(j, v, e.TaskInfo, nil)
  705. }
  706. } else {
  707. for _, v := range e.RuleBacks {
  708. ExtRegBack(j, v, e.TaskInfo, nil)
  709. }
  710. }
  711. //函数清理
  712. for key, val := range j.Result {
  713. for i, v := range val {
  714. if v.Field == "projectname" && v.Type == "table" {
  715. break
  716. }
  717. if key == "budget" || key == "bidamount" {
  718. if _, ok := v.Value.(float64); ok && !v.IsTrue {
  719. continue
  720. }
  721. }
  722. lockclear.Lock()
  723. var cfn = []string{}
  724. if isSite {
  725. cfn = e.SiteClearFn[key]
  726. if len(cfn) == 0 {
  727. cfn = e.ClearFn[key]
  728. }
  729. } else {
  730. cfn = e.ClearFn[key]
  731. }
  732. lockclear.Unlock()
  733. if len(cfn) == 0 {
  734. continue
  735. }
  736. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  737. if key == "budget" || key == "bidamount" {
  738. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  739. j.Result[key][i].IsTrue = true
  740. } else {
  741. j.Result[key][i].Value = data[0]
  742. continue
  743. }
  744. }
  745. before, _ := v.Value.(string)
  746. v.Value = data[0]
  747. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  748. //添加行数清理的日志
  749. //清理特殊符号
  750. lockclear.Lock()
  751. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  752. text := qu.ObjToString(v.Value)
  753. before = text
  754. //指定清理--新增-函数清理-其他清理
  755. if key=="winner"||key=="agency"||key=="buyer" {
  756. text = strings.ReplaceAll(text,"【","")
  757. text = strings.ReplaceAll(text,"】","")
  758. }
  759. v.Value = clear.OtherClean(key, text)
  760. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  761. }
  762. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  763. lockclear.Unlock()
  764. }
  765. }
  766. PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
  767. // bs, _ := json.Marshal(j.Result)
  768. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  769. }, func(err interface{}) {
  770. log.Debug("ExtractProcess err", err, j.SourceMid)
  771. })
  772. }
  773. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  774. qu.Try(func() {
  775. doc := *j.Data
  776. //全局前置规则,结果覆盖doc属性
  777. // for _, v := range e.RulePres {
  778. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  779. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  780. // }
  781. // }
  782. //抽取规则
  783. tmprules := map[string][]*RuleCore{}
  784. lockrule.Lock()
  785. if j.Category == "all" || j.CategorySecond == "all" {
  786. for k, vc1 := range e.RuleCores["all_all"] {
  787. tmprules[k] = vc1
  788. }
  789. } else {
  790. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  791. tmprules[k] = vc1
  792. }
  793. }
  794. lockrule.Unlock()
  795. for _, vc1 := range tmprules {
  796. for _, vc := range vc1 {
  797. tmp := ju.DeepCopy(doc).(map[string]interface{})
  798. //是否进入逻辑
  799. if !ju.Logic(vc.LuaLogic, tmp) {
  800. continue
  801. }
  802. //抽取-前置规则
  803. // for _, v := range vc.RulePres {
  804. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  805. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  806. // }
  807. // }
  808. // log.Debug("抽取-前置规则", tmp)
  809. //抽取-规则
  810. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  811. ExtRuleCore(tmp, e, vc, j, isSite)
  812. }
  813. // log.Debug("抽取-规则", tmp)
  814. //抽取-后置规则
  815. for _, v := range vc.RuleBacks {
  816. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  817. ExtRegBack(j, v, e.TaskInfo, vc)
  818. }
  819. }
  820. // log.Debug("抽取-后置规则", tmp)
  821. }
  822. }
  823. //全局后置规则
  824. for _, v := range e.RuleBacks {
  825. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  826. ExtRegBack(j, v, e.TaskInfo, nil)
  827. }
  828. }
  829. //函数清理
  830. for key, val := range j.Result {
  831. for _, v := range val {
  832. lockclear.Lock()
  833. var cfn = []string{}
  834. if isSite {
  835. cfn = e.SiteClearFn[key]
  836. if len(cfn) == 0 {
  837. cfn = e.ClearFn[key]
  838. }
  839. } else {
  840. cfn = e.ClearFn[key]
  841. }
  842. lockclear.Unlock()
  843. if len(cfn) == 0 {
  844. continue
  845. }
  846. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  847. v.Value = data[0]
  848. //清理特殊符号
  849. lockclear.Lock()
  850. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  851. clear.MesField[key] != nil {
  852. text := qu.ObjToString(v.Value)
  853. text = clear.OtherClean(key, text)
  854. v.Value = text
  855. }
  856. lockclear.Unlock()
  857. }
  858. }
  859. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  860. // bs, _ := json.Marshal(j.Result)
  861. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  862. }, func(err interface{}) {
  863. log.Debug("ExtractProcess err", err)
  864. })
  865. }
  866. //前置过滤
  867. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  868. defer qu.Catch()
  869. before := ju.DeepCopy(doc).(map[string]interface{})
  870. extinfo := map[string]interface{}{}
  871. if in.IsLua {
  872. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  873. if j != nil {
  874. lua.Block = j.Block
  875. }
  876. extinfo = lua.RunScript("pre")
  877. for k, v := range extinfo { //结果覆盖原doc
  878. doc[k] = v
  879. }
  880. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  881. } else {
  882. var key string
  883. if !j.IsFile {
  884. key = qu.If(in.Field == "", "detail", in.Field).(string)
  885. } else {
  886. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  887. }
  888. text := qu.ObjToString(doc[key])
  889. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  890. doc[key] = extinfo[key] //结果覆盖原doc
  891. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  892. }
  893. return doc
  894. }
  895. //抽取-规则
  896. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  897. //候选人加入
  898. var kvMap map[string][]map[string]interface{}
  899. extByReg := true
  900. if vc.ExtFrom != "title" {
  901. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  902. }
  903. for _, v := range vc.RuleCores {
  904. if v.IsLua {
  905. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  906. } else if extByReg {
  907. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  908. }
  909. }
  910. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  911. if vc.Field == "budget" && len(kvMap) == 0 {
  912. if len(j.BlockPackage) == 1 {
  913. for _, bp := range j.BlockPackage {
  914. for fieldname, field := range vc.LFields {
  915. if field != vc.Field {
  916. continue
  917. }
  918. tp := ""
  919. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  920. if k == 0 {
  921. tp = "colon"
  922. } else if k == 1 {
  923. tp = "space"
  924. } else if k == 2 {
  925. tp = "table"
  926. }
  927. if v == nil || v.KvTags == nil {
  928. continue
  929. }
  930. for _, vv := range v.KvTags[fieldname] {
  931. text := ju.TrimLRSpace(vv.Value, "")
  932. if text != "" {
  933. tmp := &ju.ExtField{
  934. ExtFrom: "package",
  935. Field: vc.Field,
  936. Code: "CL_分包",
  937. Type: tp,
  938. MatchType: "package",
  939. RuleText: bp.Text,
  940. SourceValue: vv.Key,
  941. Value: text,
  942. }
  943. if isSite {
  944. tmp.Score = 1
  945. }
  946. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  947. }
  948. }
  949. }
  950. }
  951. break
  952. }
  953. }
  954. } else {
  955. for k, v := range kvMap {
  956. if j.Result[k] == nil {
  957. j.Result[k] = [](*ju.ExtField){}
  958. }
  959. for _, tmp := range v {
  960. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  961. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  962. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  963. MatchType: qu.ObjToString(tmp["matchtype"]),
  964. RuleText: qu.ObjToString(tmp["ruletext"]),
  965. SourceValue: tmp["sourcevalue"],
  966. Value: tmp["value"]}
  967. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  968. field.Score = 1
  969. }
  970. if isSite {
  971. field.Score = 1
  972. }
  973. if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" {
  974. moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
  975. if len(moneys) > 0 {
  976. if vf, ok := moneys[0].(float64); ok {
  977. field.Value = vf
  978. field.IsTrue = moneys[len(moneys)-1].(bool)
  979. } else if vi, ok := moneys[0].(int); ok {
  980. field.Value = float64(vi)
  981. field.IsTrue = moneys[len(moneys)-1].(bool)
  982. }
  983. }
  984. }
  985. if tmp["blocktag"] != nil {
  986. btag := make(map[string]string)
  987. for k := range tmp["blocktag"].(map[string]bool) {
  988. blocktag.Lock()
  989. if TagConfigDesc[k] != "" {
  990. btag[k] = TagConfigDesc[k]
  991. }
  992. blocktag.Unlock()
  993. }
  994. field.BlockTag = btag
  995. }
  996. j.Result[k] = append(j.Result[k], field)
  997. }
  998. }
  999. }
  1000. }
  1001. //抽取-规则-kv
  1002. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  1003. defer qu.Catch()
  1004. if extfrom == "title" || !in.IsLua {
  1005. return
  1006. }
  1007. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  1008. lua.KvMap = *kvMap
  1009. lua.Block = j.Block
  1010. extinfo := lua.RunScript("core")
  1011. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  1012. for _, v := range tmps {
  1013. v["core"] = in.Code
  1014. }
  1015. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  1016. }
  1017. if len(extinfo) > 0 {
  1018. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1019. }
  1020. }
  1021. //抽取-规则-正则
  1022. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  1023. defer qu.Catch()
  1024. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1025. b := IsExtract(in.Field, j.Title, j.Content)
  1026. if !b {
  1027. return
  1028. }
  1029. //全文正则
  1030. //text := qu.ObjToString(doc[extfrom])
  1031. //if in.Field != "" {
  1032. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  1033. // if len(extinfo) > 0 {
  1034. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1035. // }
  1036. //}
  1037. //块抽取
  1038. if in.Field != "" {
  1039. if extfrom == "title" {
  1040. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  1041. if len(extinfo) > 0 {
  1042. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1043. }
  1044. } else if in.Field == "qualifies" {
  1045. extinfo := extRegCoreToResult(extfrom, pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{}, j, in, isSite)
  1046. if len(extinfo) > 0 {
  1047. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1048. }
  1049. } else {
  1050. for _, v := range j.Block {
  1051. btag := make(map[string]string)
  1052. for k := range v.Classify {
  1053. blocktag.Lock()
  1054. btag[k] = TagConfigDesc[k]
  1055. blocktag.Unlock()
  1056. }
  1057. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  1058. if len(extinfo) > 0 {
  1059. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1060. }
  1061. }
  1062. }
  1063. }
  1064. }
  1065. //pkg抽取-规则-正则
  1066. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  1067. defer qu.Catch()
  1068. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1069. b := IsExtract(in.Field, j.Title, j.Content)
  1070. if !b {
  1071. return
  1072. }
  1073. //块抽取
  1074. if in.Field != "" {
  1075. for k, vbpkg := range j.BlockPackage {
  1076. rep := map[string]string{}
  1077. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1078. if in.Field == "budget" && vbpkg.Budget > 0 {
  1079. continue
  1080. }
  1081. if in.Field == "agencyfee" && vbpkg.Agencyfee > 0 {
  1082. continue
  1083. }
  1084. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  1085. continue
  1086. }
  1087. if in.Field == "winner" && vbpkg.Winner != "" {
  1088. continue
  1089. }
  1090. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  1091. continue
  1092. }
  1093. if in.Field == "projectname" && vbpkg.Name != "" {
  1094. continue
  1095. }
  1096. if in.Field == "winner" && vbpkg.Winner != "" {
  1097. continue
  1098. }
  1099. if in.Field == "winnerperson" {
  1100. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  1101. continue
  1102. }
  1103. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  1104. continue
  1105. }
  1106. }
  1107. if in.Field == "winnertel" {
  1108. if vbpkg.WinnerPerson == "" {
  1109. continue
  1110. }
  1111. }
  1112. //处理正负数修正
  1113. ptmp := strings.Split(in.RuleText, "#")
  1114. sign := 0
  1115. if len(ptmp) == 2 {
  1116. if ptmp[1] == "正" {
  1117. sign = 1
  1118. } else if ptmp[1] == "负" {
  1119. sign = -1
  1120. }
  1121. }
  1122. tmp := strings.Split(ptmp[0], "__")
  1123. if len(tmp) == 2 {
  1124. epos := strings.Split(tmp[1], ",")
  1125. posm := map[string]int{}
  1126. for _, v := range epos {
  1127. ks := strings.Split(v, ":")
  1128. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1129. posm[ks[1]] = qu.IntAll(ks[0])
  1130. } else {
  1131. posm[in.Field] = qu.IntAll(ks[0])
  1132. }
  1133. }
  1134. var pattern string
  1135. if strings.Contains(tmp[0], "\\u") {
  1136. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1137. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1138. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1139. } else {
  1140. pattern = tmp[0]
  1141. }
  1142. //log.Debug("pattern", pattern)
  1143. //fmt.Println(text)
  1144. reg := regexp.MustCompile(pattern)
  1145. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  1146. for i, _ := range apos {
  1147. pos := apos[i]
  1148. for k, p := range posm {
  1149. if len(pos) > p {
  1150. if pos[p] == -1 || pos[p+1] == -1 {
  1151. continue
  1152. }
  1153. val := vbpkg.Text[pos[p]:pos[p+1]]
  1154. if string(val) == "" {
  1155. continue
  1156. }
  1157. if sign == -1 {
  1158. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1159. } else {
  1160. rep[k+"_"+fmt.Sprint(i)] = val
  1161. }
  1162. }
  1163. }
  1164. }
  1165. //fmt.Println(text)
  1166. for i := 0; i < len(apos); i++ {
  1167. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  1168. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1169. lock.Lock()
  1170. cfn := e.ClearFn[in.Field]
  1171. lock.Unlock()
  1172. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1173. if data[len(data)-1].(bool) {
  1174. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1175. j.BlockPackage[k].IsTrueBudget = true
  1176. }
  1177. break
  1178. } else if in.Field == "agencyfee" && vbpkg.Agencyfee <= 0 {
  1179. lock.Lock()
  1180. cfn := e.ClearFn[in.Field]
  1181. lock.Unlock()
  1182. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1183. if data[len(data)-1].(bool) {
  1184. j.BlockPackage[k].Agencyfee = qu.Float64All(data[0])
  1185. j.BlockPackage[k].IsTrueAgencyfee = true
  1186. }
  1187. break
  1188. }else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1189. lock.Lock()
  1190. cfn := e.ClearFn[in.Field]
  1191. lock.Unlock()
  1192. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1193. if data[len(data)-1].(bool) {
  1194. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1195. j.BlockPackage[k].IsTrueBidamount = true
  1196. }
  1197. break
  1198. } else if in.Field == "winner" {
  1199. if j.BlockPackage[k].Winner == "" {
  1200. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  1201. break
  1202. }
  1203. } else if in.Field == "winnertel" {
  1204. if j.BlockPackage[k].WinnerTel == "" {
  1205. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1206. break
  1207. }
  1208. } else if in.Field == "winnerperson" {
  1209. if j.BlockPackage[k].WinnerPerson == "" {
  1210. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1211. break
  1212. }
  1213. } else if in.Field == "bidstatus" {
  1214. if j.BlockPackage[k].BidStatus == "" {
  1215. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1216. break
  1217. }
  1218. } else if in.Field == "projectname" {
  1219. if j.BlockPackage[k].Name == "" {
  1220. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1221. break
  1222. }
  1223. } else if in.Field == "winnerperson" {
  1224. if j.BlockPackage[k].WinnerPerson == "" {
  1225. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1226. break
  1227. }
  1228. } else if in.Field == "winnertel" {
  1229. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1230. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1231. break
  1232. }
  1233. }
  1234. }
  1235. }
  1236. }
  1237. } else {
  1238. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1239. val := ""
  1240. if len(pos) == 2 {
  1241. //"text" = "text"[pos[1]:]
  1242. val = "text"[pos[1]:]
  1243. rs := regexp.MustCompile("[^\r\n\t]+")
  1244. tmp := rs.FindAllString("text", -1)
  1245. if len(tmp) > 0 {
  1246. val = tmp[0]
  1247. }
  1248. }
  1249. if val != "" {
  1250. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1251. lock.Lock()
  1252. cfn := e.ClearFn[in.Field]
  1253. lock.Unlock()
  1254. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1255. if data[len(data)-1].(bool) {
  1256. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1257. j.BlockPackage[k].IsTrueBudget = true
  1258. }
  1259. break
  1260. }
  1261. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1262. lock.Lock()
  1263. cfn := e.ClearFn[in.Field]
  1264. lock.Unlock()
  1265. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1266. if data[len(data)-1].(bool) {
  1267. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1268. j.BlockPackage[k].IsTrueBidamount = true
  1269. }
  1270. break
  1271. } else if in.Field == "bidstatus" {
  1272. if j.BlockPackage[k].BidStatus == "" {
  1273. j.BlockPackage[k].BidStatus = val
  1274. break
  1275. }
  1276. } else if in.Field == "projectname" {
  1277. if j.BlockPackage[k].Name == "" {
  1278. j.BlockPackage[k].Name = val
  1279. break
  1280. }
  1281. }
  1282. }
  1283. }
  1284. }
  1285. }
  1286. }
  1287. //lua脚本根据属性设置提取kv值
  1288. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1289. kvmap := map[string][]map[string]interface{}{}
  1290. if len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  1291. if vc.Field == "bidamount" {
  1292. for _, v := range j.Winnerorder {
  1293. if v["price"] == nil {
  1294. continue
  1295. }
  1296. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1297. "code": "winnerorder",
  1298. "field": vc.Field,
  1299. "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]),
  1300. "extfrom": v["sortstr"],
  1301. "sourcevalue": v["price"],
  1302. "value": v["price"],
  1303. "type": "winnerorder",
  1304. "matchtype": "winnerorder",
  1305. })
  1306. return kvmap, false
  1307. }
  1308. //候选人中标金额
  1309. if price := j.Winnerorder[0]["price"]; price != nil {
  1310. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1311. "code": "CL_中标候选人",
  1312. "field": vc.Field,
  1313. "ruletext": "中标候选人",
  1314. "extfrom": j.Winnerorder[0]["sortstr"],
  1315. "sourcevalue": price,
  1316. "value": price,
  1317. "type": "winnerorder",
  1318. "matchtype": "winnerorder",
  1319. })
  1320. return kvmap, false
  1321. }
  1322. }
  1323. }
  1324. for fieldname, field := range vc.LFields {
  1325. if field != vc.Field {
  1326. continue
  1327. }
  1328. extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category)
  1329. }
  1330. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1331. return kvmap, true
  1332. }
  1333. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) {
  1334. //qu.Debug("fieldname+++", fieldname)
  1335. for _, bl := range blocks {
  1336. tp := ""
  1337. if strings.Contains(bl.Title, "保证金") && (field == "bid_bond" || field == "contract_bond") {
  1338. if text := ju.TrimLRSpace(bl.Text, ""); text != "" {
  1339. if Category == "招标" || Category == "拟建" || Category == "预告" {
  1340. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1341. "code": "CL_块内容",
  1342. "field": field,
  1343. "ruletext": "投标保证金",
  1344. "extfrom": "投标保证金_块内容",
  1345. "sourcevalue": bl.Text,
  1346. "value": text,
  1347. "type": "投标保证金_块内容",
  1348. "matchtype": "tag_string",
  1349. "blocktag": bl.Classify,
  1350. "weight": 0,
  1351. })
  1352. } else if Category == "结果" {
  1353. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1354. "code": "CL_",
  1355. "field": field,
  1356. "ruletext": "履约保证金",
  1357. "extfrom": "履约保证金_块内容",
  1358. "sourcevalue": bl.Text,
  1359. "value": text,
  1360. "type": "履约保证金_块内容",
  1361. "matchtype": "tag_string",
  1362. "blocktag": bl.Classify,
  1363. "weight": 0,
  1364. })
  1365. }
  1366. }
  1367. return
  1368. }
  1369. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1370. if k == 0 {
  1371. tp = "colon"
  1372. } else if k == 1 {
  1373. tp = "space"
  1374. } else if k == 2 {
  1375. tp = "table"
  1376. }
  1377. if v == nil || v.KvTags == nil {
  1378. continue
  1379. }
  1380. for _, vv := range v.KvTags[fieldname] {
  1381. text := ju.TrimLRSpace(vv.Value, "")
  1382. if text != "" {
  1383. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1384. "code": "CL_" + vv.Key,
  1385. "field": field,
  1386. "ruletext": vv.Key,
  1387. "extfrom": vc.ExtFrom,
  1388. "sourcevalue": text,
  1389. "value": text,
  1390. "type": tp,
  1391. "matchtype": "tag_string",
  1392. "blocktag": bl.Classify,
  1393. "weight": vv.Weight,
  1394. })
  1395. //if field != "winnertel" && field != "winnerperson" {
  1396. // //break //暂定取第一个
  1397. //}
  1398. }
  1399. }
  1400. }
  1401. if len(kvmap[field]) == 0 {
  1402. extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category)
  1403. }
  1404. }
  1405. }
  1406. //正则提取结果
  1407. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1408. defer qu.Catch()
  1409. var score float64
  1410. score = vre.Score
  1411. if isSite {
  1412. score = score + 1.0
  1413. }
  1414. extinfo := map[string][]map[string]interface{}{}
  1415. rep := map[string]string{}
  1416. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1417. //处理正负数修正
  1418. ptmp := strings.Split(vre.RuleText, "#")
  1419. sign := 0
  1420. if len(ptmp) == 2 {
  1421. if ptmp[1] == "正" {
  1422. sign = 1
  1423. } else if ptmp[1] == "负" {
  1424. sign = -1
  1425. }
  1426. }
  1427. tmp := strings.Split(ptmp[0], "__")
  1428. if len(tmp) == 2 {
  1429. epos := strings.Split(tmp[1], ",")
  1430. posm := map[string]int{}
  1431. for _, v := range epos {
  1432. ks := strings.Split(v, ":")
  1433. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1434. posm[ks[1]] = qu.IntAll(ks[0])
  1435. } else {
  1436. posm[vre.Field] = qu.IntAll(ks[0])
  1437. }
  1438. }
  1439. var pattern string
  1440. if strings.Contains(tmp[0], "\\u") {
  1441. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1442. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1443. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1444. } else {
  1445. pattern = tmp[0]
  1446. }
  1447. //log.Debug("pattern", pattern)
  1448. //fmt.Println(text)
  1449. reg := regexp.MustCompile(pattern)
  1450. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1451. for i, _ := range apos {
  1452. pos := apos[i]
  1453. for k, p := range posm {
  1454. if len(pos) > p {
  1455. if pos[p] == -1 || pos[p+1] == -1 {
  1456. continue
  1457. }
  1458. val := text[pos[p]:pos[p+1]]
  1459. if string(val) == "" {
  1460. continue
  1461. }
  1462. if sign == -1 {
  1463. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1464. } else {
  1465. rep[k+"_"+fmt.Sprint(i)] = val
  1466. }
  1467. }
  1468. }
  1469. }
  1470. tmps := []map[string]interface{}{}
  1471. for i := 0; i < len(apos); i++ {
  1472. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1473. tmp := map[string]interface{}{
  1474. "field": vre.Field,
  1475. "code": vre.Code,
  1476. "ruletext": vre.RuleText,
  1477. "extfrom": text,
  1478. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1479. "type": "regexp",
  1480. "matchtype": "regcontent",
  1481. "blocktag": *tag,
  1482. "score": score,
  1483. }
  1484. exfield := ju.ExtField{
  1485. BlockTag: *tag,
  1486. Field: vre.Field,
  1487. Code: vre.Code,
  1488. RuleText: vre.RuleText,
  1489. Type: "regexp",
  1490. MatchType: "regcontent",
  1491. ExtFrom: extfrom,
  1492. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1493. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1494. Score: score,
  1495. }
  1496. if vre.Field == "qualifies" {
  1497. if len(rep) >= 2 {
  1498. tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1499. exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1500. }
  1501. }
  1502. tmps = append(tmps, tmp)
  1503. if tmp["blocktag"] != nil {
  1504. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1505. }
  1506. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1507. }
  1508. }
  1509. if len(tmps) > 0 {
  1510. //fmt.Println(tmps)
  1511. extinfo[vre.Field] = tmps
  1512. }
  1513. }
  1514. } else {
  1515. pos := vre.RegCore.Reg.FindStringIndex(text)
  1516. val := ""
  1517. if len(pos) == 2 {
  1518. text = text[pos[1]:]
  1519. rs := regexp.MustCompile("[^\r\n\t]+")
  1520. tmp := rs.FindAllString(text, -1)
  1521. if len(tmp) > 0 {
  1522. val = tmp[0]
  1523. }
  1524. }
  1525. if val != "" {
  1526. tmps := []map[string]interface{}{}
  1527. tmp := map[string]interface{}{
  1528. "field": vre.Field,
  1529. "code": vre.Code,
  1530. "ruletext": vre.RuleText,
  1531. "extfrom": text,
  1532. "value": val,
  1533. "type": "regexp",
  1534. "matchtype": "regcontent",
  1535. "blocktag": *tag,
  1536. "score": score,
  1537. }
  1538. tmps = append(tmps, tmp)
  1539. extinfo[vre.Field] = tmps
  1540. if j.Result[vre.Field] == nil {
  1541. j.Result[vre.Field] = [](*ju.ExtField){}
  1542. }
  1543. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1544. Value: val,
  1545. Score: score}
  1546. if tmp["blocktag"] != nil {
  1547. field.BlockTag = tmp["blocktag"].(map[string]string)
  1548. }
  1549. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1550. }
  1551. }
  1552. return extinfo
  1553. }
  1554. //后置过滤
  1555. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1556. defer qu.Catch()
  1557. if in.IsLua {
  1558. result := GetResultMapForLua(j)
  1559. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1560. if j != nil {
  1561. lua.Block = j.Block
  1562. }
  1563. extinfo := lua.RunScript("back")
  1564. for k, v := range extinfo {
  1565. if tmps, ok := v.([]map[string]interface{}); ok {
  1566. j.Result[k] = [](*ju.ExtField){}
  1567. for _, tmp := range tmps {
  1568. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1569. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1570. Value: tmp["value"]}
  1571. if tmp["blocktag"] != nil {
  1572. field.BlockTag = tmp["blocktag"].(map[string]string)
  1573. }
  1574. j.Result[k] = append(j.Result[k], field)
  1575. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1576. }
  1577. }
  1578. }
  1579. if len(extinfo) > 0 {
  1580. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1581. }
  1582. } else {
  1583. extinfo := map[string]interface{}{}
  1584. if in.Field != "" {
  1585. clearByTitle := false
  1586. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1587. clearByTitle = true
  1588. }
  1589. if j.Result[in.Field] != nil {
  1590. tmp := j.Result[in.Field]
  1591. exts := []interface{}{}
  1592. for k, v := range tmp {
  1593. if clearByTitle && v.ExtFrom != "title" {
  1594. continue
  1595. }
  1596. //table抽取到的数据不清理
  1597. if v.Type == "table" && v.Field == "projectname" {
  1598. return
  1599. }
  1600. text := qu.ObjToString(v.Value)
  1601. if v.Field == "bidamount" || v.Field == "budget" {
  1602. if (strings.Contains(qu.ObjToString(v.SourceValue), "费率")||
  1603. strings.Contains(qu.ObjToString(v.SourceValue), "税率") ||
  1604. strings.Contains(qu.ObjToString(v.SourceValue), "(%)") ) &&
  1605. !strings.Contains(qu.ObjToString(v.SourceValue), "工程设计费"){
  1606. j.Result[in.Field][k].IsTrue = false
  1607. continue
  1608. }
  1609. }
  1610. if text != "" {
  1611. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1612. }
  1613. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1614. continue
  1615. }
  1616. j.Result[in.Field][k].Value = text
  1617. exts = append(exts, map[string]interface{}{
  1618. "field": v.Field,
  1619. "code": v.Code,
  1620. "ruletext": v.RuleText,
  1621. "type": v.Type,
  1622. "matchtype": v.MatchType,
  1623. "extfrom": v.ExtFrom,
  1624. "value": text,
  1625. })
  1626. }
  1627. if len(exts) > 0 {
  1628. extinfo[in.Field] = exts
  1629. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1630. }
  1631. }
  1632. } else {
  1633. for key, tmp := range j.Result {
  1634. exts := []interface{}{}
  1635. for k, v := range tmp {
  1636. //table抽取到的数据不清理
  1637. if v.Type == "table" && v.Field == "projectname" {
  1638. return
  1639. }
  1640. text := qu.ObjToString(v.Value)
  1641. if text != "" {
  1642. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1643. }
  1644. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1645. continue
  1646. }
  1647. j.Result[key][k].Value = text
  1648. exts = append(exts, map[string]interface{}{
  1649. "field": v.Field,
  1650. "code": v.Code,
  1651. "ruletext": v.RuleText,
  1652. "type": v.Type,
  1653. "matchtype": v.MatchType,
  1654. "extfrom": v.ExtFrom,
  1655. "value": text,
  1656. })
  1657. }
  1658. if len(exts) > 0 {
  1659. extinfo[key] = exts
  1660. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1661. }
  1662. }
  1663. }
  1664. }
  1665. }
  1666. //后置过滤
  1667. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1668. defer qu.Catch()
  1669. for k, v := range j.BlockPackage {
  1670. if in.Field == "winner" {
  1671. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1672. } else if in.Field == "bidstatus" {
  1673. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1674. } else if in.Field == "" {
  1675. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1676. } else if in.Field == "projectname" {
  1677. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1678. } else if in.Field == "winnerperson" {
  1679. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1680. } else if in.Field == "winnertel" {
  1681. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1682. }
  1683. }
  1684. }
  1685. //KV过滤
  1686. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1687. defer qu.Catch()
  1688. extinfo := map[string]interface{}{}
  1689. if in.Field != "" {
  1690. if j.Result[in.Field] != nil {
  1691. tmp := j.Result[in.Field]
  1692. exts := []interface{}{}
  1693. for k, v := range tmp {
  1694. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1695. continue
  1696. }
  1697. text := qu.ObjToString(v.Value)
  1698. if text != "" {
  1699. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1700. }
  1701. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1702. continue
  1703. }
  1704. j.Result[in.Field][k].Value = text
  1705. exts = append(exts, map[string]interface{}{
  1706. "field": v.Field,
  1707. "code": v.Code,
  1708. "ruletext": v.RuleText,
  1709. "type": v.Type,
  1710. "matchtype": v.MatchType,
  1711. "extfrom": v.ExtFrom,
  1712. "value": text,
  1713. })
  1714. }
  1715. if len(exts) > 0 {
  1716. extinfo[in.Field] = exts
  1717. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1718. }
  1719. }
  1720. }
  1721. }
  1722. //获取抽取结果map[string][]interface{},lua脚本使用
  1723. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1724. defer qu.Catch()
  1725. result := map[string][]map[string]interface{}{}
  1726. for key, val := range j.Result {
  1727. if result[key] == nil {
  1728. result[key] = []map[string]interface{}{}
  1729. }
  1730. for _, v := range val {
  1731. tmp := map[string]interface{}{
  1732. "field": v.Field,
  1733. "code": v.Code,
  1734. "ruletext": v.RuleText,
  1735. "value": v.Value,
  1736. "type": v.Type,
  1737. "matchtype": v.MatchType,
  1738. "extfrom": v.ExtFrom,
  1739. }
  1740. result[key] = append(result[key], tmp)
  1741. }
  1742. }
  1743. return result
  1744. }
  1745. //抽取日志
  1746. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1747. defer qu.Catch()
  1748. if !t.IsEtxLog {
  1749. return
  1750. }
  1751. logdata := map[string]interface{}{
  1752. "code": qu.If(v.Code == "", "kv", v.Code),
  1753. "name": v.Name,
  1754. "type": ftype,
  1755. "ruletext": v.RuleText,
  1756. "islua": v.IsLua,
  1757. "field": v.Field,
  1758. "version": t.Version,
  1759. "taskname": t.Name,
  1760. "before": before,
  1761. "extinfo": extinfo,
  1762. "sid": sid,
  1763. "comeintime": time.Now().Unix(),
  1764. }
  1765. lock.Lock()
  1766. ExtLogs[t] = append(ExtLogs[t], logdata)
  1767. lock.Unlock()
  1768. }
  1769. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1770. exts := []map[string]interface{}{}
  1771. exts = append(exts, map[string]interface{}{
  1772. "field": ext.Field,
  1773. "code": ext.Code,
  1774. "type": ftype,
  1775. "matchtype": matchtype,
  1776. "extfrom": ext.ExtFrom,
  1777. "value": ext.Value,
  1778. })
  1779. extinfo := map[string]interface{}{
  1780. ext.Field: exts,
  1781. }
  1782. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1783. }
  1784. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1785. defer qu.Catch()
  1786. if !t.IsEtxLog {
  1787. return
  1788. }
  1789. logdata := map[string]interface{}{
  1790. "code": code,
  1791. "name": name,
  1792. "type": ftype,
  1793. "ruletext": "",
  1794. "islua": false,
  1795. "field": field,
  1796. "version": t.Version,
  1797. "taskname": t.Name,
  1798. "before": before,
  1799. "extinfo": extinfo,
  1800. "sid": sid,
  1801. "comeintime": time.Now().Unix(),
  1802. }
  1803. lock.Lock()
  1804. ExtLogs[t] = append(ExtLogs[t], logdata)
  1805. lock.Unlock()
  1806. }
  1807. //保存抽取日志
  1808. func SaveExtLog() {
  1809. defer qu.Catch()
  1810. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1811. lock.Lock()
  1812. tmpLogs = ExtLogs
  1813. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1814. lock.Unlock()
  1815. for k, v := range tmpLogs {
  1816. if len(v) < saveLimit {
  1817. db.Mgo.SaveBulk(k.TrackColl, v...)
  1818. } else {
  1819. for {
  1820. if len(v) > saveLimit {
  1821. tmp := v[:saveLimit]
  1822. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1823. v = v[saveLimit:]
  1824. } else {
  1825. db.Mgo.SaveBulk(k.TrackColl, v...)
  1826. break
  1827. }
  1828. }
  1829. }
  1830. }
  1831. time.AfterFunc(10*time.Second, SaveExtLog)
  1832. }
  1833. type FieldValue struct {
  1834. Value interface{}
  1835. Count int
  1836. }
  1837. var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1838. var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
  1839. //分析抽取结果并保存
  1840. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1841. qu.Try(func() {
  1842. if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
  1843. if j.CategorySecond != "单一" {
  1844. delete(j.Result, "winner")
  1845. delete(j.Result, "bidamount")
  1846. for _, v := range j.BlockPackage {
  1847. v.Bidamount = 0
  1848. v.IsTrueBidamount = false
  1849. if v.Winner != "" {
  1850. v.Winner = ""
  1851. if v.SpaceKV != nil {
  1852. delete(v.SpaceKV.KvTags, "中标单位")
  1853. }
  1854. if v.TableKV != nil {
  1855. delete(v.TableKV.KvTags, "中标单位")
  1856. }
  1857. if v.ColonKV != nil {
  1858. delete(v.ColonKV.KvTags, "中标单位")
  1859. }
  1860. }
  1861. }
  1862. for _, v := range j.PackageInfo {
  1863. delete(v, "winner")
  1864. delete(v, "bidamount")
  1865. }
  1866. j.Winnerorder = nil
  1867. if jf != nil && jf.Winnerorder != nil {
  1868. jf.Winnerorder = nil
  1869. }
  1870. }
  1871. }
  1872. //重新取出清理过后的中标候选人
  1873. resetWinnerorder(j)
  1874. //打分
  1875. doc, result, _id := funcAnalysis(j, e)
  1876. //_, result, _id := funcAnalysis(j, e)
  1877. if ju.IsSaveTag {
  1878. go otherNeedSave(j, result, e)
  1879. }
  1880. //从排序结果中取值
  1881. tmp := map[string]interface{}{} //抽取值
  1882. tmp["spidercode"] = j.SpiderCode
  1883. tmp["site"] = j.Site
  1884. if len(*j.Jsondata) > 0 {
  1885. tmp["jsondata"] = j.Jsondata
  1886. }
  1887. //字段-抽取来源
  1888. fieldSource := make(map[string]interface{},0)
  1889. //字段-抽取来源
  1890. for k, val := range result {
  1891. if k == "qualifies" {
  1892. squalifies := make([]interface{}, 0)
  1893. squalifiesMap := make(map[string]*scoreIndex, 0)
  1894. for _, kv := range val {
  1895. skey := kv.RuleText
  1896. if kv.Score > 0 {
  1897. if squalifiesMap[skey] == nil {
  1898. squalifiesMap = map[string]*scoreIndex{
  1899. skey: &scoreIndex{
  1900. Score: kv.Score,
  1901. Index: len(squalifies),
  1902. },
  1903. }
  1904. squalifies = append(squalifies, map[string]interface{}{
  1905. "key": skey,
  1906. "value": kv.Value,
  1907. })
  1908. } else {
  1909. if squalifiesMap[skey].Score < kv.Score {
  1910. squalifies[squalifiesMap[skey].Index] = map[string]interface{}{
  1911. "key": skey,
  1912. "value": kv.Value,
  1913. }
  1914. }
  1915. }
  1916. }
  1917. }
  1918. tmp[k] = squalifies
  1919. continue
  1920. }
  1921. for _, v := range val { //取第一个非负数,项目名称除外
  1922. //存0是否有效
  1923. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 {
  1924. tmp[v.Field] = v.Value
  1925. fieldSource[v.Field] = map[string]interface{}{
  1926. "ext_type":v.Type,
  1927. "ext_from":v.ExtFrom,
  1928. }
  1929. break
  1930. }
  1931. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  1932. tmp[v.Field] = v.Value
  1933. fieldSource[v.Field] = map[string]interface{}{
  1934. "ext_type":v.Type,
  1935. "ext_from":v.ExtFrom,
  1936. }
  1937. break
  1938. }
  1939. }
  1940. }
  1941. tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",")
  1942. if len(j.PackageInfo) > 15 {
  1943. for k, v := range j.PackageInfo {
  1944. j.PackageInfo = map[string]map[string]interface{}{}
  1945. j.PackageInfo[k] = v
  1946. break
  1947. }
  1948. }
  1949. if len(j.PackageInfo) > 0 { //分包信息
  1950. tmp["package"] = j.PackageInfo
  1951. //包预算,中标金额合并大于抽取就覆盖
  1952. var tmpBidamount, tmpBudget,tmpAgencyfee float64
  1953. //s_winner逗号分隔拼接,分包中标人
  1954. var tmpstr, savewinner []string
  1955. //按包排序
  1956. for b, v := range j.PackageInfo {
  1957. if v["winner"] != nil && v["winner"] != "" {
  1958. tmpstr = append(tmpstr, b)
  1959. }
  1960. }
  1961. //包预算,中标金额合并大于抽取就覆盖
  1962. if len(j.PackageInfo) >= 1 {
  1963. //包数大于1累加
  1964. for _, v := range j.PackageInfo {
  1965. if v["budget"] != nil {
  1966. tmpBudget += qu.Float64All(v["budget"])
  1967. }
  1968. if v["bidamount"] != nil {
  1969. tmpBidamount += qu.Float64All(v["bidamount"])
  1970. }
  1971. if v["agencyfee"] != nil {
  1972. tmpAgencyfee += qu.Float64All(v["agencyfee"])
  1973. }
  1974. }
  1975. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1976. fieldSource["budget"] = map[string]interface{}{
  1977. "ext_type":"",
  1978. "ext_from":"package",
  1979. }
  1980. tmp["budget"] = tmpBudget
  1981. }
  1982. if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee {
  1983. fieldSource["agencyfee"] = map[string]interface{}{
  1984. "ext_type":"",
  1985. "ext_from":"package",
  1986. }
  1987. tmp["agencyfee"] = tmpAgencyfee
  1988. }
  1989. if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
  1990. fieldSource["bidamount"] = map[string]interface{}{
  1991. "ext_type":"",
  1992. "ext_from":"package",
  1993. }
  1994. tmp["bidamount"] = tmpBidamount
  1995. } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1996. fieldSource["bidamount"] = map[string]interface{}{
  1997. "ext_type":"",
  1998. "ext_from":"package",
  1999. }
  2000. tmp["bidamount"] = tmpBidamount
  2001. }
  2002. } else {
  2003. //包数等于1,tmp没有值取包里的值
  2004. if tmp["budget"] == nil || tmp["budget"] == 0 {
  2005. for _, v := range j.PackageInfo {
  2006. if v["budget"] != nil {
  2007. fieldSource["budget"] = map[string]interface{}{
  2008. "ext_type":"",
  2009. "ext_from":"package",
  2010. }
  2011. tmp["budget"] = v["budget"]
  2012. }
  2013. }
  2014. }
  2015. if tmp["agencyfee"] == nil || tmp["agencyfee"] == 0 {
  2016. for _, v := range j.PackageInfo {
  2017. if v["agencyfee"] != nil {
  2018. fieldSource["agencyfee"] = map[string]interface{}{
  2019. "ext_type":"",
  2020. "ext_from":"package",
  2021. }
  2022. tmp["agencyfee"] = v["agencyfee"]
  2023. }
  2024. }
  2025. }
  2026. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  2027. for _, v := range j.PackageInfo {
  2028. if v["bidamount"] != nil {
  2029. fieldSource["bidamount"] = map[string]interface{}{
  2030. "ext_type":"",
  2031. "ext_from":"package",
  2032. }
  2033. tmp["bidamount"] = v["bidamount"]
  2034. }
  2035. }
  2036. }
  2037. }
  2038. //s_winner逗号分隔拼接,分包中标人
  2039. sort.Strings(tmpstr)
  2040. for _, v := range tmpstr {
  2041. winner := qu.ObjToString(j.PackageInfo[v]["winner"])
  2042. new_winner := clearWinnerReg.ReplaceAllString(winner, "")
  2043. if new_winner == "" {
  2044. continue
  2045. }
  2046. //名称黑名单
  2047. if unPackageWinnerReg.MatchString(new_winner) {
  2048. continue
  2049. }
  2050. savewinner = append(savewinner, new_winner)
  2051. }
  2052. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  2053. tmp["s_winner"] = tmp["winner"]
  2054. fieldSource["s_winner"] = fieldSource["winner"]
  2055. } else if savewinner != nil {
  2056. savewinner = RemoveReplicaSliceString(savewinner)
  2057. tmp["s_winner"] = strings.Join(savewinner, ",")
  2058. if len(savewinner)==1 {
  2059. fieldSource["s_winner"] = fieldSource["winner"]
  2060. }else if len(savewinner)>1{
  2061. fieldSource["s_winner"] = map[string]interface{}{
  2062. "ext_type":"",
  2063. "ext_from":"package",
  2064. }
  2065. }
  2066. }
  2067. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  2068. //没有分包取winner
  2069. tmp["s_winner"] = tmp["winner"]
  2070. fieldSource["s_winner"] = fieldSource["winner"]
  2071. }
  2072. if len(j.Winnerorder) > 0 { //候选人信息
  2073. for i, v := range j.Winnerorder {
  2074. if v["price"] != nil {
  2075. tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2076. if tmpPrice[len(tmpPrice)-1].(bool) {
  2077. j.Winnerorder[i]["price"] = tmpPrice[0]
  2078. } else {
  2079. delete(j.Winnerorder[i], "price")
  2080. }
  2081. }
  2082. }
  2083. tmp["winnerorder"] = j.Winnerorder
  2084. }
  2085. //处理附件
  2086. var resultf map[string][]*ju.ExtField
  2087. ffield := map[string]interface{}{}
  2088. if jf != nil {
  2089. _, resultf, _ = funcAnalysis(jf, e)
  2090. for _, val := range resultf {
  2091. for _, v := range val { //取第一个非负数
  2092. if v.Score > -1 {
  2093. ffield[v.Field] = v.Value
  2094. if tmp[v.Field] == nil {
  2095. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
  2096. tmp[v.Field] = v.Value
  2097. fieldSource[v.Field] = map[string]interface{}{
  2098. "ext_type":v.Type,
  2099. "ext_from":"ff",
  2100. }
  2101. break
  2102. }
  2103. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  2104. tmp[v.Field] = v.Value
  2105. fieldSource[v.Field] = map[string]interface{}{
  2106. "ext_type":v.Type,
  2107. "ext_from":"ff",
  2108. }
  2109. break
  2110. }
  2111. }
  2112. break
  2113. }
  2114. }
  2115. }
  2116. if len(jf.PackageInfo) > 0 { //分包信息
  2117. ffield["package"] = jf.PackageInfo
  2118. }
  2119. if len(jf.Winnerorder) > 0 { //候选人信息
  2120. ffield["winnerorder"] = jf.Winnerorder
  2121. }
  2122. }
  2123. //添加字段来源
  2124. tmp["field_source"] = fieldSource
  2125. //添加字段来源
  2126. for k, v := range *doc {
  2127. if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
  2128. (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
  2129. }
  2130. //去重冗余字段
  2131. if delFiled(k) {
  2132. continue
  2133. }
  2134. if tmp[k] == nil {
  2135. tmp[k] = v
  2136. }
  2137. }
  2138. //质量审核
  2139. if ju.QualityAudit {
  2140. e.QualityAudit(tmp)
  2141. }
  2142. //城市抽取
  2143. if e.IsExtractCity {
  2144. e.NewExtractCity(j, &tmp, _id)
  2145. }
  2146. //品牌抽取
  2147. if ju.IsBrandGoods {
  2148. tmp["checkhas"] = map[string]int{
  2149. "hastable": j.HasTable,
  2150. "hasgoods": j.HasGoods,
  2151. "hasbrand": j.HasBrand,
  2152. "haskey": j.HasKey,
  2153. }
  2154. if len(j.BrandData) > 0 {
  2155. tmp["tablebrand"] = j.BrandData
  2156. }
  2157. }
  2158. //prince和number抽取
  2159. if ju.IsPriceNumber {
  2160. priceNumberLen := len(j.PriceNumberData)
  2161. if priceNumberLen > 1 { //table数据去重
  2162. tmpPriceNumberData := []map[string]interface{}{}
  2163. tableStrs := map[string]bool{}
  2164. for _, tb := range j.PriceNumberData {
  2165. has := false
  2166. bytes, _ := json.Marshal(tb)
  2167. str := string(bytes)
  2168. if len(tableStrs) > 0 && tableStrs[str] {
  2169. has = true
  2170. } else {
  2171. tableStrs[str] = true
  2172. }
  2173. if !has {
  2174. for _, data := range tb {
  2175. tmpPriceNumberData = append(tmpPriceNumberData, data)
  2176. }
  2177. }
  2178. }
  2179. tmp["pricenumber"] = tmpPriceNumberData
  2180. } else if priceNumberLen == 1 {
  2181. tmp["pricenumber"] = j.PriceNumberData[0]
  2182. }
  2183. }
  2184. //所有kv组成的字符串
  2185. var kvtext bytes.Buffer
  2186. blocks := make([]ju.BlockAndTag, 0)
  2187. for _, v := range j.Block {
  2188. //分包和标签
  2189. if ju.SaveBlock {
  2190. xx, _ := json.Marshal(v)
  2191. tmpblock := new(ju.TmpBlock)
  2192. err := json.Unmarshal(xx, &tmpblock)
  2193. if err != nil {
  2194. if v.BPackage != nil {
  2195. bpb, _ := json.Marshal(v.BPackage)
  2196. tmpblock.BPackage = string(bpb)
  2197. }
  2198. tmpblock = rangeBlockToJson(v, *tmpblock)
  2199. }
  2200. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  2201. }
  2202. //把所有kv组装成一个字符串,存库
  2203. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  2204. if jv == nil {
  2205. continue
  2206. }
  2207. for jv_k, jv_v := range jv.KvTags {
  2208. for _, jv_vv := range jv_v {
  2209. kvtext.WriteString(jv_k)
  2210. kvtext.WriteString(":")
  2211. kvtext.WriteString(jv_vv.Value)
  2212. kvtext.WriteString("\n")
  2213. }
  2214. }
  2215. }
  2216. }
  2217. if kvtext.Len() > 0 {
  2218. tmp["kvtext"] = kvtext.String()
  2219. }
  2220. if len(blocks) > 0 {
  2221. if blocksBytes, err := json.Marshal(blocks); err == nil {
  2222. if utf8.RuneCount(blocksBytes) < 100000 {
  2223. tmp["blocks"] = string(blocksBytes)
  2224. }
  2225. }
  2226. }
  2227. tmp["dataging"] = j.Dataging
  2228. /*for k, v := range *j.Data {
  2229. if f[k] {
  2230. tmp[k] = v
  2231. }
  2232. }
  2233. for k := range tmp {
  2234. if !f[k]{
  2235. delete(tmp,k)
  2236. }
  2237. }*/
  2238. //检查字段
  2239. tmp = checkFields(tmp)
  2240. if tmp["projectname"] == nil || tmp["projectname"] == "" {
  2241. tmp["projectname"] = j.Title
  2242. }
  2243. tmp["repeat"] = 0
  2244. if ju.Ffield {
  2245. if len(ffield) > 0 {
  2246. tmp["ffield"] = ffield
  2247. }
  2248. }
  2249. if e.TaskInfo.TestColl == "" {
  2250. if len(tmp) > 0 { //保存抽取结果
  2251. delete(tmp, "_id")
  2252. tmparr := []map[string]interface{}{
  2253. map[string]interface{}{
  2254. "_id": qu.StringTOBsonId(_id),
  2255. },
  2256. map[string]interface{}{"$set": tmp},
  2257. }
  2258. e.RWMutex.Lock()
  2259. e.BidArr = append(e.BidArr, tmparr)
  2260. e.BidTotal++
  2261. e.RWMutex.Unlock()
  2262. }
  2263. if ju.SaveResult {
  2264. id := tmp["_id"]
  2265. tmp["result"] = result
  2266. tmp["resultf"] = resultf
  2267. delete(tmp, "_id")
  2268. tmparr := []map[string]interface{}{
  2269. map[string]interface{}{
  2270. "_id": id,
  2271. },
  2272. map[string]interface{}{"$set": tmp},
  2273. }
  2274. e.RWMutex.Lock()
  2275. e.ResultArr = append(e.ResultArr, tmparr)
  2276. e.RWMutex.Unlock()
  2277. }
  2278. } else { //测试结果
  2279. delete(tmp, "_id")
  2280. delete(tmp, "fieldall")
  2281. if len(j.BlockPackage) > 0 { //分包详情
  2282. if len(j.BlockPackage) > 10 {
  2283. tmp["epackage"] = "分包异常"
  2284. } else {
  2285. bs, _ := json.Marshal(j.BlockPackage)
  2286. tmp["epackage"] = string(bs)
  2287. }
  2288. }
  2289. tmp["result"] = result
  2290. //tmp["resultf"] = resultf
  2291. //_,err :=db.Mgo.Get().DB("zhengkun").C("result_data").Upsert(`{"_id":"`+_id+`"}`,map[string]interface{}{"$set": tmp})
  2292. //log.Debug("save:",err)
  2293. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  2294. if !b {
  2295. log.Debug(e.TaskInfo.TestColl, _id)
  2296. }
  2297. }
  2298. }, func(err interface{}) {
  2299. log.Debug("AnalysisSaveResult err", err)
  2300. })
  2301. }
  2302. //检查字段-
  2303. func checkFields(tmp map[string]interface{}) map[string]interface{} {
  2304. delete(tmp, "contenthtml")
  2305. delete(tmp, "detail")
  2306. tmp["repeat"] = 0
  2307. //指定爬虫-金额处理-预算-中标金额异常
  2308. if qu.ObjToString(tmp["spidercode"])=="xz_xzzzqjzscjgycxxxpt_zbtzs" {
  2309. if budget, ok := tmp["budget"].(float64); ok && budget>0 && budget < 1000000{
  2310. tmp["budget"] = budget*10000.0
  2311. }
  2312. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{
  2313. tmp["bidamount"] = bidamount/10000.0
  2314. }
  2315. }
  2316. if qu.ObjToString(tmp["spidercode"])=="js_jsszbtbw_zbhxrgs" {
  2317. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{
  2318. tmp["bidamount"] = bidamount/10000.0
  2319. }
  2320. }
  2321. if _, ok := tmp["bidamount"].(string); ok {
  2322. delete(tmp, "bidamount")
  2323. } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/5 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
  2324. delete(tmp, "bidamount")
  2325. }
  2326. if _, ok := tmp["budget"].(string); ok {
  2327. delete(tmp, "budget")
  2328. }
  2329. if _, ok := tmp["unitprice"].(string); ok {
  2330. delete(tmp, "unitprice")
  2331. }
  2332. if _, ok := tmp["bidopentime"].(string); ok {
  2333. delete(tmp, "bidopentime")
  2334. }
  2335. if _, ok := tmp["signaturedate"].(string); ok {
  2336. delete(tmp, "signaturedate")
  2337. }
  2338. if _, ok := tmp["supervisorrate"].(string); ok {
  2339. delete(tmp, "supervisorrate")
  2340. }
  2341. for k, v := range tmp {
  2342. if k == "qualifies" {
  2343. continue
  2344. }
  2345. if k == "contract_guarantee" || k == "bid_guarantee" {
  2346. if len(fmt.Sprint(v)) > 0 {
  2347. tmp[k] = true
  2348. } else {
  2349. delete(tmp, k)
  2350. }
  2351. }
  2352. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") {
  2353. delete(tmp, k)
  2354. }
  2355. }
  2356. //项目周期-有效值
  2357. projectperiod := qu.ObjToString(tmp["projectperiod"])
  2358. if projectperiod !="" {
  2359. //项目周期包含日期,数字及日期单位可保留,其余可清洗
  2360. isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
  2361. if !isNeedValueReg.MatchString(projectperiod) {
  2362. delete(tmp, "projectperiod")
  2363. }
  2364. }
  2365. //工期单位是否有效-清理
  2366. if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
  2367. dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
  2368. if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit)>4 {
  2369. delete(tmp, "project_timeunit")
  2370. }
  2371. //年-0 >5 删除
  2372. if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"])==0 || qu.Int64All(tmp["project_duration"])>5 ){
  2373. delete(tmp, "project_timeunit")
  2374. }
  2375. }
  2376. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  2377. strwin := qu.ObjToString(tmp["winner"])
  2378. strwin_s := qu.ObjToString(tmp["s_winner"])
  2379. if !strings.Contains(strwin_s, strwin) {
  2380. tmp["s_winner"] = strwin
  2381. }
  2382. }
  2383. //budget bidamount
  2384. if bg, ok := tmp["budget"].(float64); ok {
  2385. if bg >= 50000000000 {
  2386. tmp["budget_max_err"] = bg
  2387. delete(tmp, "budget")
  2388. }
  2389. }
  2390. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
  2391. code := qu.ObjToString(tmp["spidercode"])
  2392. if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" &&
  2393. code != "js_jsszbtbw_zbhxrgs"{
  2394. tmp["bidamount_max_err"] = bg
  2395. delete(tmp, "bidamount")
  2396. }
  2397. }
  2398. //投标方式-
  2399. bidway := qu.IntAll(tmp["bidway"])
  2400. if bidway == 1 {
  2401. tmp["bidway"] = "纸质投标"
  2402. }else if bidway == 2 {
  2403. tmp["bidway"] = "电子投标"
  2404. }else {
  2405. delete(tmp, "bidway")
  2406. }
  2407. //折扣系数
  2408. discount := dealWithDiscountBid(tmp)
  2409. if discount >0.0 {
  2410. tmp["biddiscount"] = discount
  2411. }else {
  2412. delete(tmp, "biddiscount")
  2413. }
  2414. delete(tmp, "biddiscount_up")
  2415. delete(tmp, "biddiscount_down")
  2416. //临时
  2417. //bidopentime := qu.Int64All(tmp["bidopentime"])
  2418. //bidendtime := qu.Int64All(tmp["bidendtime"])
  2419. //timeLayout := "2006-01-02 15:04:05"
  2420. //
  2421. //if bidopentime>0 {
  2422. // bidopentime_str := time.Unix(bidopentime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
  2423. // tmp["bidopentime"] = bidopentime_str
  2424. //}
  2425. //if bidendtime>0 {
  2426. // bidendtime_str := time.Unix(bidendtime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
  2427. // tmp["bidendtime"] = bidendtime_str
  2428. //}
  2429. jyhref:= fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
  2430. tmp["jytest_href"] = jyhref
  2431. return tmp
  2432. }
  2433. //处理折扣系数
  2434. func dealWithDiscountBid(tmp map[string]interface{}) float64 {
  2435. biddiscount := qu.Float64All(tmp["biddiscount"])
  2436. biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
  2437. biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
  2438. baseCount := float64(1)
  2439. if biddiscount_down >0.0 {
  2440. num1:=decimal.NewFromFloat(baseCount)
  2441. num2:=decimal.NewFromFloat(biddiscount_down)
  2442. decimalValue := num1.Sub(num2)
  2443. res,_ := decimalValue.Float64()
  2444. //log.Debug("下浮后折扣系数:",res)
  2445. return res
  2446. }
  2447. if biddiscount_up >0.0 {
  2448. num1:=decimal.NewFromFloat(baseCount)
  2449. num2:=decimal.NewFromFloat(biddiscount_up)
  2450. decimalValue := num1.Add(num2)
  2451. res,_ := decimalValue.Float64()
  2452. //log.Debug("上浮后折扣系数:",res)
  2453. return res
  2454. }
  2455. if biddiscount>0.0 {
  2456. if biddiscount > 1.0 && biddiscount<=10.0 {
  2457. num1:=decimal.NewFromFloat(10.0)
  2458. num2:=decimal.NewFromFloat(biddiscount_up)
  2459. decimalValue := num2.Div(num1)
  2460. res,_ := decimalValue.Float64()
  2461. //log.Debug("标准-①折扣系数:",res)
  2462. return res
  2463. }else if biddiscount>10.0 {
  2464. num1:=decimal.NewFromFloat(100.0)
  2465. num2:=decimal.NewFromFloat(biddiscount_up)
  2466. decimalValue := num2.Div(num1)
  2467. res,_ := decimalValue.Float64()
  2468. //log.Debug("标准-⑩折扣系数:",res)
  2469. return res
  2470. }else {
  2471. //log.Debug("标准折扣系数:",biddiscount)
  2472. return biddiscount
  2473. }
  2474. }
  2475. return 0.0
  2476. }
  2477. //保存其他
  2478. //kv、表格、块上的标签凡是新的标签都入库
  2479. //val type times firstid createtime 判定field
  2480. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  2481. now := time.Now().Unix()
  2482. coll := e.TaskInfo.TestColl
  2483. if coll == "" {
  2484. coll = "extract_tag_result"
  2485. } else {
  2486. coll += "_tag"
  2487. }
  2488. datas := []map[string]interface{}{}
  2489. kv := map[string]int{}
  2490. for _, v := range j.Block {
  2491. //
  2492. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  2493. if vv == nil || vv.KvTags == nil {
  2494. continue
  2495. }
  2496. for kkk, vvv := range vv.KvTags {
  2497. for _, vvvv := range vvv {
  2498. if vvvv.IsInvalid {
  2499. kv[kkk] = kv[kkk] + 1
  2500. break
  2501. }
  2502. }
  2503. }
  2504. }
  2505. for _, vv := range v.NotClassifyTitles {
  2506. datas = append(datas, map[string]interface{}{
  2507. "val": vv,
  2508. "times": 0,
  2509. "type": "block",
  2510. "firstid": j.SourceMid,
  2511. "createtime": now,
  2512. })
  2513. if len(datas) == saveLimit {
  2514. db.Mgo.SaveBulk(coll, datas...)
  2515. datas = []map[string]interface{}{}
  2516. }
  2517. }
  2518. }
  2519. for k, v := range kv {
  2520. datas = append(datas, map[string]interface{}{
  2521. "val": k,
  2522. "times": v,
  2523. "type": "kv",
  2524. "firstid": j.SourceMid,
  2525. "createtime": now,
  2526. })
  2527. if len(datas) == saveLimit {
  2528. db.Mgo.SaveBulk(coll, datas...)
  2529. datas = []map[string]interface{}{}
  2530. }
  2531. }
  2532. if len(datas) > 0 {
  2533. db.Mgo.SaveBulk(coll, datas...)
  2534. }
  2535. }
  2536. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2537. if j == nil {
  2538. return nil
  2539. }
  2540. if len(j.Block) > 0 {
  2541. for i, v := range j.Block {
  2542. rangetmp := new(ju.TmpBlock)
  2543. vb, _ := json.Marshal(v)
  2544. json.Unmarshal(vb, &rangetmp)
  2545. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2546. }
  2547. }
  2548. if j.ColonKV != nil {
  2549. cb, _ := json.Marshal(j.ColonKV)
  2550. tmpblock.ColonKV = string(cb)
  2551. }
  2552. if j.SpaceKV != nil {
  2553. sb, _ := json.Marshal(j.SpaceKV)
  2554. tmpblock.SpaceKV = string(sb)
  2555. }
  2556. if j.TableKV != nil {
  2557. tb, _ := json.Marshal(j.TableKV)
  2558. tmpblock.TableKV = string(tb)
  2559. }
  2560. return &tmpblock
  2561. }
  2562. //去重冗余字段
  2563. func delFiled(k string) bool {
  2564. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2565. }
  2566. //分析-打分排序
  2567. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2568. defer qu.Catch()
  2569. doc := j.Data
  2570. result := j.Result
  2571. _id := qu.BsonIdToSId((*doc)["_id"])
  2572. result = ScoreFields(j, e.Tag) //正负面词打分
  2573. //结果排序
  2574. for _, val := range result {
  2575. ju.Sort(val)
  2576. }
  2577. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2578. clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
  2579. marshalbt, _ := json.Marshal(j.Jsondata)
  2580. tmpjddata := make(map[string]interface{})
  2581. json.Unmarshal(marshalbt, &tmpjddata)
  2582. for _, jdkey := range ju.JsonData {
  2583. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2584. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2585. if jdkey == "budget" || jdkey == "bidamount" {
  2586. lockclear.Lock()
  2587. cfn := e.ClearFn[jdkey]
  2588. lockclear.Unlock()
  2589. if len(cfn) == 0 {
  2590. continue
  2591. }
  2592. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney)
  2593. if tmpv.Value == newNum[0] {
  2594. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2595. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2596. ju.Sort(j.Result[jdkey])
  2597. delete((*j.Jsondata), jdkey)
  2598. break
  2599. }
  2600. } else {
  2601. if (*j.Jsondata)[jdkey] == tmpv.Value {
  2602. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2603. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2604. ju.Sort(j.Result[jdkey])
  2605. delete((*j.Jsondata), jdkey)
  2606. break
  2607. }
  2608. }
  2609. }
  2610. }
  2611. }
  2612. if len(*j.Jsondata) > 0 {
  2613. j.Result = JsonDataMergeProcessing(j, e)
  2614. }
  2615. j.Jsondata = &tmpjddata
  2616. }
  2617. return doc, result, _id
  2618. }
  2619. //辅助信息,如果没有排序先排序
  2620. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2621. fieldalls := map[string][]map[string]interface{}{}
  2622. if j == nil {
  2623. return fieldalls
  2624. }
  2625. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2626. defer qykredis.Close()
  2627. db := 0
  2628. for field, val := range j.Result {
  2629. //ju.Sort(val)
  2630. if field == "buyer" {
  2631. db = ju.BuyerDB
  2632. } else if field == "winner" {
  2633. db = ju.WinnerDB
  2634. } else if field == "agency" {
  2635. db = ju.AgencyDB
  2636. }
  2637. sfields := []map[string]interface{}{}
  2638. for _, v := range val {
  2639. standardized := false
  2640. if _, err := qykredis.Do("SELECT", db); err != nil {
  2641. fmt.Println("redis select err", err)
  2642. } else {
  2643. rep, err := qykredis.Do("GET", v.Value)
  2644. if rep != nil && err == nil {
  2645. standardized = true
  2646. }
  2647. }
  2648. if field == "budget" || field == "bidamount" {
  2649. if !v.IsTrue {
  2650. continue
  2651. }
  2652. }
  2653. sfield := map[string]interface{}{
  2654. "val": v.Value,
  2655. "type": v.Type,
  2656. "score": v.Score,
  2657. "blocktag": v.BlockTag,
  2658. "sourceval": v.SourceValue,
  2659. "standardized": standardized,
  2660. }
  2661. sfields = append(sfields, sfield)
  2662. }
  2663. fieldalls[field] = sfields
  2664. }
  2665. return fieldalls
  2666. }
  2667. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2668. defer qu.Catch()
  2669. //获取审核字段
  2670. for _, field := range e.AuditFields {
  2671. //1.分包
  2672. if resulttmp["package"] != nil {
  2673. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2674. for _, val := range packagedata {
  2675. if val[field] != nil {
  2676. fv := qu.ObjToString(val[field])
  2677. if fv != "" {
  2678. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2679. e.RedisMatch(field, fv, val) //redis匹配
  2680. } else { //除了buyer和winner,其他字段走规则匹配
  2681. e.RuleMatch(field, fv, val)
  2682. }
  2683. }
  2684. }
  2685. }
  2686. }
  2687. //2.外围
  2688. if resulttmp[field] != nil {
  2689. fv := qu.ObjToString(resulttmp[field])
  2690. if fv != "" {
  2691. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2692. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2693. } else { //除了buyer和winner,其他字段走规则匹配
  2694. e.RuleMatch(field, fv, resulttmp)
  2695. }
  2696. }
  2697. }
  2698. }
  2699. }
  2700. //Redis匹配
  2701. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2702. defer qu.Catch()
  2703. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2704. if i == 0 { //reids未找到,执行规则匹配
  2705. val[field+"_isredis"] = false
  2706. e.RuleMatch(field, fv, val) //规则匹配
  2707. } else { //redis找到,打标识存库
  2708. val[field+"_isredis"] = true
  2709. }
  2710. }
  2711. //规则匹配
  2712. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2713. defer qu.Catch()
  2714. if fieldval != "" {
  2715. SMap := e.StartMatch(field, fieldval)
  2716. //SMap.AddKey(field+"_isaudit", false)
  2717. for _, k := range SMap.Keys {
  2718. tmpMap[k] = SMap.Map[k]
  2719. }
  2720. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2721. }
  2722. }
  2723. //开始规则匹配
  2724. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2725. defer qu.Catch()
  2726. SMap := pretreated.NewSortMap()
  2727. lock.Lock()
  2728. f := e.RecogFieldMap[field]
  2729. lock.Unlock()
  2730. if len(f) > 0 {
  2731. fid := qu.BsonIdToSId(f["_id"])
  2732. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2733. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2734. if textAfterRecogFieldPrerule != "" {
  2735. lock.Lock()
  2736. classMap := e.FidClassMap[fid]
  2737. lock.Unlock()
  2738. L:
  2739. for _, c := range classMap { //class
  2740. classid := qu.BsonIdToSId(c["_id"])
  2741. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2742. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2743. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2744. if textAfterClassPrerule != "" {
  2745. lock.Lock()
  2746. ruleMap := e.CidRuleMap[classid]
  2747. lock.Unlock()
  2748. for _, r := range ruleMap { //rule
  2749. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2750. s_name := qu.ObjToString(r["s_name"])
  2751. rule := r["rule"].([]interface{})
  2752. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2753. if textAfterRulePrerule != "" {
  2754. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2755. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2756. if savefield != "" { //保存字段不为空,存储代码信息
  2757. SMap.AddKey(field+"_"+savefield, s_name)
  2758. }
  2759. break L
  2760. }
  2761. }
  2762. }
  2763. }
  2764. }
  2765. }
  2766. }
  2767. return SMap
  2768. }
  2769. //筛选重复候选人-相关
  2770. func filterRepeatWinArr(j *ju.Job) {
  2771. if j.SpiderCode=="sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
  2772. sort_WinOrder_Arr := make([][]map[string]interface{},0)
  2773. sort_arr := make([]map[string]interface{},0)
  2774. for _,v := range j.Winnerorder{
  2775. sort := qu.IntAll(v["sort"])
  2776. if sort==1 { //为一组
  2777. if len(sort_arr)>0 {
  2778. sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr)
  2779. }
  2780. sort_arr = make([]map[string]interface{},0)
  2781. }
  2782. sort_arr = append(sort_arr,v)
  2783. }
  2784. if len(sort_arr)>0 {
  2785. sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr)
  2786. }
  2787. if len(sort_WinOrder_Arr)>0 { //有重复排序组-开始筛选清理
  2788. isIndex :=0
  2789. for index,winArr := range sort_WinOrder_Arr {
  2790. if len(winArr)>0 {
  2791. if qu.ObjToString(winArr[0]["price"])!=""&&
  2792. qu.ObjToString(winArr[0]["entname"])!="" {
  2793. isIndex = index
  2794. break
  2795. }
  2796. }
  2797. }
  2798. j.Winnerorder = sort_WinOrder_Arr[isIndex]
  2799. }
  2800. }
  2801. }
  2802. //中标候选人经过清理之后,重新取出赋值
  2803. func resetWinnerorder(j *ju.Job) {
  2804. if len(j.Winnerorder) == 0 {
  2805. return
  2806. }
  2807. maxlen := len(j.Winnerorder) - 1
  2808. //中标单位
  2809. //i := 0
  2810. winners := []*ju.ExtField{}
  2811. bidamounts := []*ju.ExtField{}
  2812. if maxlen > 0 {
  2813. //新增-指定爬虫中标候选人过滤
  2814. filterRepeatWinArr(j)
  2815. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  2816. return
  2817. }
  2818. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2819. if j.Winnerorder[0]["price"] != nil {
  2820. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2821. if tmpPrice[len(tmpPrice)-1].(bool) {
  2822. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2823. }
  2824. }
  2825. }
  2826. if j.Result["winner"] == nil && len(winners) > 0 {
  2827. j.Result["winner"] = winners
  2828. } else if len(winners) > 0 {
  2829. j.Result["winner"] = append(j.Result["winner"], winners...)
  2830. }
  2831. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2832. j.Result["bidamount"] = bidamounts
  2833. } else if len(bidamounts) > 0 {
  2834. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2835. }
  2836. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  2837. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2838. j.Result["winner"] = winners
  2839. if j.Winnerorder[0]["price"] != nil {
  2840. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2841. if tmpPrice[len(tmpPrice)-1].(bool) {
  2842. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2843. }
  2844. j.Result["bidamount"] = bidamounts
  2845. }
  2846. }
  2847. }
  2848. func RemoveReplicaSliceString(slc []string) []string {
  2849. result := make([]string, 0)
  2850. tempMap := make(map[string]bool, len(slc))
  2851. for _, e := range slc {
  2852. if tempMap[e] == false {
  2853. tempMap[e] = true
  2854. result = append(result, e)
  2855. }
  2856. }
  2857. return result
  2858. }
  2859. type scoreIndex struct {
  2860. Score float64
  2861. Index int
  2862. }