extract.go 98 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "github.com/shopspring/decimal"
  7. "go.mongodb.org/mongo-driver/bson/primitive"
  8. "jy/clear"
  9. db "jy/mongodbutil"
  10. "jy/pretreated"
  11. ju "jy/util"
  12. qu "qfw/util"
  13. "qfw/util/redis"
  14. "regexp"
  15. "sort"
  16. "strconv"
  17. "strings"
  18. "sync"
  19. "time"
  20. "unicode/utf8"
  21. log "github.com/donnie4w/go-logger/logger"
  22. "gopkg.in/mgo.v2/bson"
  23. )
  24. var (
  25. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  26. JYUrl = "https://www.jianyu360.com/article/content/%s.html"
  27. cut = ju.NewCut() //获取正文并清理
  28. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  29. TaskList map[string]*ExtractTask //任务列表
  30. ClearTaskList map[string]*ClearTask //清理任务列表
  31. saveLimit = 100 //抽取日志批量保存
  32. PageSize = 5000 //查询分页
  33. Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
  34. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  35. NiJianField = []string{
  36. "string#approvecode",
  37. "string#total_investment",
  38. "string#funds",
  39. "string#owner",
  40. "string#projectaddr",
  41. "string#projectperiod",
  42. "string#project_scale",
  43. "string#project_person",
  44. "string#project_phone",
  45. "string#approvenumber",
  46. "string#projecttype",
  47. "string#approvestatus",
  48. "time#project_startdate",
  49. "time#project_completedate",
  50. "map#construction_area",
  51. "map#floor_area",
  52. }
  53. spidercode = map[string]bool{
  54. "gd_zhsggzyjyzx_jsgc_fjczbgg": true,
  55. "js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
  56. "zj_tzsyhggzyjyzx_jsgc_kbqk": true,
  57. "hb_tmsggzyjyxxw_jsgc_kbqk": true,
  58. "zj_nbsyyggzyjyw_jsgc_kbqk": true,
  59. "zj_zjsggzyjyzx_jyxx_kbjg": true,
  60. "zj_zjzdgcjyw_ztbjglxx_kbjg": true,
  61. "zj_lssggzyjyw_jsgc_kbsk": true,
  62. "zj_qzslyxggzyjyzx_gggs_xkbjl": true,
  63. "sc_mssggzydzjypt_jsgc_kbjl": true,
  64. "sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
  65. "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
  66. "a_hbszbtbggfwpt_kbjl": true,
  67. "a_szsjsgcjyfwzxbafzx_kbqkgs": true,
  68. "a_szldzbyxgs_kbxx": true,
  69. "zj_zssssxggzyjyw_gcjs_kbjggs": true,
  70. "gd_szszfhjsj_kbqkgs": true,
  71. "a_gjggzyjypt_gcjs_kbjl": true,
  72. "a_gjggzyjypt_gcjs_kbjl_new": true,
  73. "zj_tzsyhggzyjyzx_kbjggg": true,
  74. "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
  75. "ah_czsggzyjyw_jsgc_kbjl": true,
  76. "ah_czsggzyjyw_zfcg_kbxx": true,
  77. "ah_whsggzyjyfww_kbxx_cgxm": true,
  78. "ah_whsggzyjyfww_kbxx_gcxm": true,
  79. }
  80. clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  81. sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  82. clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  83. clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否)") //保证金
  84. )
  85. //启动测试抽取-、、、、结果追踪
  86. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  87. defer qu.Catch()
  88. ext := TaskList[taskId]
  89. if ext == nil {
  90. ext = &ExtractTask{}
  91. ext.Id = taskId
  92. ext.InitTestTaskInfo(resultcoll, trackcoll)
  93. ext.IsRun = true
  94. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  95. }
  96. ext.InitSite()
  97. ext.InitRulePres()
  98. ext.InitRuleBacks(false)
  99. ext.InitRuleBacks(true)
  100. ext.InitRuleCore(false)
  101. ext.InitRuleCore(true)
  102. ext.InitPkgCore()
  103. ext.InitBlockRule()
  104. ext.InfoTypeList()
  105. ext.InitTag(false)
  106. ext.InitTag(true)
  107. ext.InitClearFn(false)
  108. ext.InitClearFn(true)
  109. ext.Lock()
  110. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  111. ext.InitCityInfo()
  112. ext.InitAreaCode()
  113. ext.InitPostCode()
  114. }
  115. ext.Unlock()
  116. //质量审核
  117. ext.InitAuditFields()
  118. ext.InitAuditRule()
  119. ext.InitAuditClass()
  120. ext.InitAuditRecogField()
  121. //品牌抽取是否开启
  122. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  123. //价格个数抽取是否开启
  124. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  125. //附件抽取是否开启
  126. ext.InitFile()
  127. ext.TaskInfo.TestColl = resultcoll
  128. TaskList[taskId] = ext
  129. return RunExtractTestTask(ext, startId, num)
  130. }
  131. func IdTrans(startId string) bson.ObjectId {
  132. defer qu.Catch()
  133. return bson.ObjectIdHex(startId)
  134. }
  135. func StringTOBsonId(id string) primitive.ObjectID {
  136. objectId, _ := primitive.ObjectIDFromHex(id)
  137. return objectId
  138. }
  139. func BsonTOStringId(id interface{}) string {
  140. return id.(primitive.ObjectID).Hex()
  141. }
  142. //开始测试任务抽取~结果追踪
  143. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  144. n, _ := strconv.Atoi(num)
  145. id := IdTrans(startId)
  146. if id.Valid() {
  147. //query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  148. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  149. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  150. for _, v := range *list {
  151. //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
  152. // continue
  153. //}
  154. if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
  155. continue
  156. }
  157. var j, jf *ju.Job
  158. var isSite bool
  159. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  160. v["isextFile"] = true
  161. j, jf, isSite = ext.PreInfo(v)
  162. } else { //无附件
  163. j, _, isSite = ext.PreInfo(v)
  164. }
  165. go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
  166. ext.TaskInfo.ProcessPool <- true
  167. }
  168. return true
  169. } else {
  170. return false
  171. }
  172. }
  173. //启动抽取
  174. func StartExtractTaskId(taskId string) bool {
  175. defer qu.Catch()
  176. isgo := false
  177. ext := TaskList[taskId]
  178. if ext == nil {
  179. ext = &ExtractTask{}
  180. ext.Id = taskId
  181. ext.InitTaskInfo()
  182. isgo = true
  183. } else {
  184. ext.Id = taskId
  185. ext.InitTaskInfo()
  186. }
  187. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  188. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  189. ext.InitSite()
  190. ext.InitRulePres()
  191. ext.InitRuleBacks(false)
  192. ext.InitRuleBacks(true)
  193. ext.InitRuleCore(false)
  194. ext.InitRuleCore(true)
  195. ext.InitPkgCore()
  196. ext.InitBlockRule()
  197. ext.InfoTypeList()
  198. ext.InitTag(false)
  199. ext.InitTag(true)
  200. ext.InitClearFn(false)
  201. ext.InitClearFn(true)
  202. ext.Lock()
  203. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  204. ext.InitCityInfo()
  205. ext.InitAreaCode()
  206. ext.InitPostCode()
  207. }
  208. ext.Unlock()
  209. //质量审核
  210. ext.InitAuditFields()
  211. ext.InitAuditRule()
  212. ext.InitAuditClass()
  213. ext.InitAuditRecogField()
  214. //品牌抽取是否开启
  215. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  216. //价格个数抽取是否开启
  217. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  218. //附件抽取是否开启
  219. ext.InitFile()
  220. ext.IsRun = true
  221. go ext.ResultSave(true)
  222. go ext.BidSave(true)
  223. if isgo {
  224. go RunExtractTask(taskId)
  225. }
  226. TaskList[taskId] = ext
  227. return true
  228. }
  229. //停止抽取
  230. func StopExtractTaskId(taskId string) bool {
  231. defer qu.Catch()
  232. ext := TaskList[taskId]
  233. if ext != nil {
  234. ext.IsRun = false
  235. TaskList[taskId] = ext
  236. }
  237. //更新task.s_extlastid
  238. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  239. return true
  240. }
  241. //开始抽取
  242. func RunExtractTask(taskId string) {
  243. defer qu.Catch()
  244. ext := TaskList[taskId]
  245. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  246. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  247. pageNum := (count + PageSize - 1) / PageSize
  248. limit := PageSize
  249. if count < PageSize {
  250. limit = count
  251. }
  252. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  253. for i := 0; i < pageNum; i++ {
  254. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  255. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  256. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  257. for _, v := range *list {
  258. //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  259. // continue
  260. //}
  261. //根据标题判断是否抽取
  262. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  263. if !b {
  264. continue
  265. }
  266. _id := qu.BsonIdToSId(v["_id"])
  267. //log.Debug(_id)
  268. if !ext.IsRun {
  269. break
  270. }
  271. var j, jf *ju.Job
  272. var isSite bool
  273. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  274. v["isextFile"] = true
  275. j, jf, isSite = ext.PreInfo(v)
  276. } else {
  277. j, _, isSite = ext.PreInfo(v)
  278. }
  279. go ext.ExtractProcess(j, jf, isSite)
  280. ext.TaskInfo.LastExtId = _id
  281. ext.TaskInfo.ProcessPool <- true
  282. }
  283. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  284. if !ext.IsRun {
  285. break
  286. }
  287. }
  288. //更新task.s_extlastid
  289. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  290. }
  291. //信息预处理-不和版本关联,取最新版本的配置项
  292. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  293. return (&ExtractTask{}).PreInfo(doc)
  294. }
  295. func CleanDetailText(detail string, summary string) string {
  296. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  297. detail = pretreated.RepairCon(detail)
  298. detail = ju.CutLableStr(summary + "\n" + detail)
  299. detail = cut.ClearHtml(summary + "\n" + detail)
  300. return detail
  301. }
  302. //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
  303. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  304. defer qu.Catch()
  305. //判断是否有附件这个字段
  306. var isextFile bool
  307. if doc["isextFile"] != nil {
  308. isextFile = doc["isextFile"].(bool)
  309. }
  310. detail := ""
  311. summary := qu.ObjToString(doc["summary"])
  312. d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
  313. d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
  314. if len(d1) >= len(d2) || d2 == "" {
  315. detail = d1
  316. } else {
  317. detail = d2
  318. }
  319. doc["detail"] = detail
  320. isClearnMoney := !clearMoneyReg.MatchString(detail)
  321. if isClearnMoney {
  322. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  323. }
  324. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  325. if isextFile {
  326. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  327. }
  328. //正文小于200个字,有附件把附件内容加到正文
  329. //tmpDeatil := detail
  330. //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  331. //if err == nil {
  332. // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  333. // if conlen < 2000 {
  334. // if isextFile {
  335. // detail += qu.ObjToString(doc["detailfile"])
  336. // doc["detail"] = detail
  337. // }
  338. // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
  339. // //防止文本过长,造成抽取阻塞
  340. // log.Debug("文本太长", doc["_id"], conlen)
  341. // doc["detail"] = d3
  342. // }
  343. //}
  344. toptype := qu.ObjToString(doc["toptype"])
  345. subtype := qu.ObjToString(doc["subtype"])
  346. if qu.ObjToString(doc["type"]) == "bid" {
  347. toptype = "结果"
  348. }
  349. if toptype == "" || toptype == "采购意向" {
  350. toptype = "all"
  351. }
  352. if subtype == "" || subtype == "采购意向" {
  353. subtype = "all"
  354. }
  355. if subtype == "其他" {
  356. subtype = "其它"
  357. }
  358. toMap := qu.ObjToMap(doc["jsondata"])
  359. //log.Debug("toMap", toMap)
  360. if (*toMap) != nil {
  361. if (*toMap)["extweight"] == nil {
  362. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  363. }
  364. if (*toMap)["jsoncontent"] != nil {
  365. delete(*toMap, "jsoncontent")
  366. }
  367. for k, v := range *toMap {
  368. if _, ok := v.(float64); ok {
  369. continue
  370. } else if _, ok := v.(int64); ok {
  371. continue
  372. } else if _, ok2 := v.(string); ok2 {
  373. continue
  374. } else {
  375. delete(*toMap, k)
  376. }
  377. }
  378. }
  379. j = &ju.Job{
  380. SourceMid: qu.BsonIdToSId(doc["_id"]),
  381. Category: toptype,
  382. CategorySecond: subtype,
  383. Content: qu.ObjToString(doc["detail"]),
  384. SpiderCode: qu.ObjToString(doc["spidercode"]),
  385. Site: qu.ObjToString(doc["site"]),
  386. //Domain: qu.ObjToString(doc["domain"]),
  387. //Href: qu.ObjToString(doc["href"]),
  388. Title: qu.ObjToString(doc["title"]),
  389. Data: &doc,
  390. City: qu.ObjToString(doc["city"]),
  391. Province: qu.ObjToString(doc["area"]),
  392. Jsondata: toMap,
  393. Result: map[string][]*ju.ExtField{},
  394. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  395. RuleBlock: e.RuleBlock,
  396. Dataging: qu.IntAll(doc["dataging"]),
  397. IsClearnMoney: isClearnMoneystr,
  398. IsUnRulesTab: false,
  399. }
  400. if isextFile {
  401. jf = &ju.Job{
  402. SourceMid: qu.BsonIdToSId(doc["_id"]),
  403. Category: toptype,
  404. CategorySecond: subtype,
  405. Content: qu.ObjToString(doc["detailfile"]),
  406. SpiderCode: qu.ObjToString(doc["spidercode"]),
  407. Site: qu.ObjToString(doc["site"]),
  408. Title: qu.ObjToString(doc["title"]),
  409. Data: &doc,
  410. City: qu.ObjToString(doc["city"]),
  411. Province: qu.ObjToString(doc["area"]),
  412. Jsondata: toMap,
  413. Result: map[string][]*ju.ExtField{},
  414. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  415. RuleBlock: e.RuleBlock,
  416. IsFile: isextFile,
  417. Dataging: qu.IntAll(doc["dataging"]),
  418. IsClearnMoney: isClearnMoneystr,
  419. IsUnRulesTab: false,
  420. }
  421. }
  422. codeSite := j.SpiderCode
  423. //是否启用站点
  424. if value, ok := e.SiteMerge.Load(codeSite); ok {
  425. isSite = value.(bool)
  426. }
  427. if isSite {
  428. //是否配置站点
  429. exp, isSite := e.Luacodes.Load(codeSite)
  430. if isSite {
  431. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  432. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  433. }
  434. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  435. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  436. }
  437. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  438. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  439. }
  440. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  441. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  442. }
  443. }
  444. }
  445. qu.Try(func() {
  446. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  447. if isextFile && strings.TrimSpace(jf.Content) != "" {
  448. pretreated.AnalyStart(jf, isSite, codeSite)
  449. }
  450. }, func(err interface{}) {
  451. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  452. })
  453. return j, jf, isSite
  454. }
  455. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  456. func file2text(doc *map[string]interface{}) {
  457. mnameone := map[string]bool{}
  458. mname := map[string]bool{}
  459. murl := map[string]string{}
  460. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  461. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  462. for _, attachs := range attach_text {
  463. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  464. for _, fileinfo := range fileinfos {
  465. if ff, ok := fileinfo.(map[string]interface{}); ok {
  466. attach_url := qu.ObjToString(ff["attach_url"])
  467. ffname := qu.ObjToString(ff["file_name"])
  468. if clearStrReg.MatchString(ffname) {
  469. continue
  470. }
  471. mname[ffname] = true
  472. murl[ffname] = attach_url
  473. if sortStrReg.MatchString(ffname) {
  474. mnameone[ffname] = true
  475. }
  476. }
  477. }
  478. }
  479. }
  480. }
  481. tmpstr := ""
  482. for k := range mnameone {
  483. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  484. (*doc)["detailfile"] = tmpstr
  485. return
  486. }
  487. bs := ju.OssGetObject(murl[k])
  488. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  489. tmpstr += bs + "\n"
  490. }
  491. }
  492. for k := range mname {
  493. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  494. (*doc)["detailfile"] = tmpstr
  495. return
  496. }
  497. bs := ju.OssGetObject(murl[k])
  498. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  499. tmpstr += bs + "\n"
  500. }
  501. }
  502. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  503. }
  504. //判断-附件分包是否有效
  505. func isUsedPackageJF(jf_package map[string]map[string]interface{}) bool {
  506. if jf_package == nil || len(jf_package) == 0 {
  507. return false
  508. }
  509. for _, pack := range jf_package {
  510. budget := qu.Float64All(pack["budget"])
  511. bidamount := qu.Float64All(pack["bidamount"])
  512. //winner := qu.Float64All(pack["winner"])
  513. //text := qu.ObjToString(pack["text"])
  514. //13.投标报价\n13.1本次报价
  515. //14.投标报价\n14.1投标报价
  516. if budget > 0.0 && budget <= 1.0 {
  517. return false
  518. }
  519. if bidamount > 0.0 && bidamount <= 1.0 {
  520. return false
  521. }
  522. }
  523. return true
  524. }
  525. //抽取-正文
  526. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  527. e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性
  528. if jf != nil && jf.IsFile { //附件jf → j 合并
  529. e.ExtractDetail(jf, isSite, j.SpiderCode)
  530. for tmpk, xs := range jf.Result {
  531. if len(j.Result[tmpk]) == 0 {
  532. if tmpk == "budget" || tmpk == "bidamount" {
  533. for _, v := range xs {
  534. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  535. j.Result[tmpk] = append(j.Result[tmpk], v)
  536. }
  537. }
  538. } else {
  539. if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
  540. //log.Debug("不采用~招标类附件中标信息")
  541. continue
  542. }
  543. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  544. }
  545. } else {
  546. if tmpk == "winner" { //均没有有效值~采用附件的
  547. isUsed := false
  548. for _, v := range j.Result[tmpk] {
  549. if v.Value != "" {
  550. isUsed = true
  551. break
  552. }
  553. }
  554. if !isUsed {
  555. if j.Category == "招标" && j.CategorySecond != "单一" {
  556. //log.Debug("不采用~招标类附件中标信息~")
  557. continue
  558. }
  559. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  560. }
  561. }
  562. }
  563. }
  564. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  565. if j.Category == "招标" && j.CategorySecond != "单一" {
  566. //log.Debug("不采用~招标类附件中标信息~~")
  567. } else {
  568. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  569. }
  570. }
  571. if len(j.PackageInfo) == 0 && isUsedPackageJF(jf.PackageInfo) {
  572. j.PackageInfo = jf.PackageInfo
  573. }
  574. }
  575. if isSite {
  576. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  577. if ok && ismerge.(bool) {
  578. tmpj := &ju.Job{
  579. SourceMid: j.SourceMid,
  580. Category: j.Category,
  581. CategorySecond: j.CategorySecond,
  582. Content: j.Content,
  583. SpiderCode: j.SpiderCode,
  584. //Domain: qu.ObjToString(doc["domain"]),
  585. //Href: qu.ObjToString(doc["href"]),
  586. Title: j.Title,
  587. Data: j.Data,
  588. City: j.City,
  589. Province: j.Province,
  590. Jsondata: j.Jsondata,
  591. Result: map[string][]*ju.ExtField{},
  592. BuyerAddr: j.BuyerAddr,
  593. RuleBlock: e.RuleBlock,
  594. }
  595. qu.Try(func() {
  596. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  597. }, func(err interface{}) {
  598. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  599. })
  600. e.ExtractDetail(tmpj, false, "")
  601. //if jf != nil && jf.IsFile {
  602. // e.ExtractFile(jf, false, "")
  603. //}
  604. //合并数据
  605. j.Block = append(j.Block, tmpj.Block...)
  606. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  607. for tmpk, _ := range j.Result {
  608. if len(tmpj.Result[tmpk]) > 0 {
  609. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  610. }
  611. }
  612. for tmpk, _ := range tmpj.Result {
  613. if len(j.Result[tmpk]) == 0 {
  614. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  615. }
  616. }
  617. }
  618. }
  619. //分析抽取结果并保存
  620. AnalysisSaveResult(j, jf, e)
  621. <-e.TaskInfo.ProcessPool
  622. }
  623. //抽取-正文-规则等 detail
  624. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  625. qu.Try(func() {
  626. doc := *j.Data
  627. //全局前置规则,结果覆盖doc属性
  628. //for _, v := range e.RulePres {
  629. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  630. //}
  631. tmprules := map[string][]*RuleCore{}
  632. lockrule.Lock()
  633. //加载分类抽取配置
  634. if j.Category == "all" || j.CategorySecond == "all" {
  635. if isSite {
  636. for k, vc1 := range e.SiteRuleCores["all_all"] {
  637. tmprules[k] = vc1
  638. }
  639. } else {
  640. for k, vc1 := range e.RuleCores["all_all"] {
  641. tmprules[k] = vc1
  642. }
  643. }
  644. } else {
  645. if isSite {
  646. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  647. tmprules[k] = vc1
  648. }
  649. } else {
  650. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  651. tmprules[k] = vc1
  652. }
  653. }
  654. }
  655. if len(tmprules) < 1 { //分类未覆盖部分
  656. if isSite {
  657. for k, vc1 := range e.RuleCores["all_all"] {
  658. tmprules[k] = vc1
  659. }
  660. } else {
  661. for k, vc1 := range e.SiteRuleCores["all_all"] {
  662. tmprules[k] = vc1
  663. }
  664. }
  665. }
  666. lockrule.Unlock()
  667. //抽取规则
  668. for _, vc1 := range tmprules {
  669. for _, vc := range vc1 {
  670. tmp := ju.DeepCopy(doc).(map[string]interface{})
  671. //是否进入逻辑
  672. if !ju.Logic(vc.LuaLogic, tmp) {
  673. continue
  674. }
  675. if vc.Field == "winner" {
  676. //log.Debug("调试抽取字段")
  677. }
  678. ////抽取-前置规则
  679. //for _, v := range vc.RulePres {
  680. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  681. //}
  682. // log.Debug("抽取-前置规则", tmp)
  683. //抽取-规则
  684. ExtRuleCore(tmp, e, vc, j, isSite)
  685. // log.Debug("抽取-规则", tmp)
  686. //抽取-后置规则
  687. for _, v := range vc.RuleBacks {
  688. ExtRegBack(j, v, e.TaskInfo, vc)
  689. }
  690. //kv规则
  691. for _, v := range vc.KVRuleCores {
  692. ExtRuleKV(j, v, e.TaskInfo)
  693. }
  694. // log.Debug("抽取-后置规则", tmp)
  695. //项目名称未能抽取到,标题来凑
  696. if vc.Field == "projectname" {
  697. if vc.ExtFrom == "title" {
  698. isextitle := true
  699. for _, v := range j.Result[vc.Field] {
  700. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  701. isextitle = false
  702. break
  703. }
  704. }
  705. if isextitle { //标题加入选举
  706. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  707. if isSite {
  708. field.Score = 1
  709. }
  710. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  711. }
  712. }
  713. for i := 0; i < 3; i++ {
  714. for _, v := range vc.RuleBacks {
  715. ExtRegBack(j, v, e.TaskInfo, vc)
  716. }
  717. }
  718. }
  719. }
  720. }
  721. //全局后置规则
  722. if isSite {
  723. for _, v := range e.SiteRuleBacks {
  724. ExtRegBack(j, v, e.TaskInfo, nil)
  725. }
  726. } else {
  727. for _, v := range e.RuleBacks {
  728. ExtRegBack(j, v, e.TaskInfo, nil)
  729. }
  730. }
  731. //函数清理
  732. for key, val := range j.Result {
  733. for i, v := range val {
  734. if v.Field == "projectname" && v.Type == "table" {
  735. break
  736. }
  737. if key == "budget" || key == "bidamount" {
  738. if _, ok := v.Value.(float64); ok && !v.IsTrue {
  739. continue
  740. }
  741. }
  742. lockclear.Lock()
  743. var cfn = []string{}
  744. if isSite {
  745. cfn = e.SiteClearFn[key]
  746. if len(cfn) == 0 {
  747. cfn = e.ClearFn[key]
  748. }
  749. } else {
  750. cfn = e.ClearFn[key]
  751. }
  752. lockclear.Unlock()
  753. if len(cfn) == 0 {
  754. continue
  755. }
  756. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  757. if key == "budget" || key == "bidamount" {
  758. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  759. j.Result[key][i].IsTrue = true
  760. } else {
  761. j.Result[key][i].Value = data[0]
  762. continue
  763. }
  764. }
  765. before, _ := v.Value.(string)
  766. v.Value = data[0]
  767. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  768. //添加行数清理的日志
  769. //清理特殊符号
  770. lockclear.Lock()
  771. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  772. text := qu.ObjToString(v.Value)
  773. before = text
  774. //指定清理--新增-函数清理-其他清理
  775. if key == "winner" || key == "agency" || key == "buyer" {
  776. text = strings.ReplaceAll(text, "【", "")
  777. text = strings.ReplaceAll(text, "】", "")
  778. }
  779. v.Value = clear.OtherClean(key, text)
  780. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  781. }
  782. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  783. lockclear.Unlock()
  784. }
  785. }
  786. PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
  787. // bs, _ := json.Marshal(j.Result)
  788. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  789. }, func(err interface{}) {
  790. log.Debug("ExtractProcess err", err, j.SourceMid)
  791. })
  792. }
  793. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  794. qu.Try(func() {
  795. doc := *j.Data
  796. //全局前置规则,结果覆盖doc属性
  797. // for _, v := range e.RulePres {
  798. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  799. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  800. // }
  801. // }
  802. //抽取规则
  803. tmprules := map[string][]*RuleCore{}
  804. lockrule.Lock()
  805. if j.Category == "all" || j.CategorySecond == "all" {
  806. for k, vc1 := range e.RuleCores["all_all"] {
  807. tmprules[k] = vc1
  808. }
  809. } else {
  810. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  811. tmprules[k] = vc1
  812. }
  813. }
  814. lockrule.Unlock()
  815. for _, vc1 := range tmprules {
  816. for _, vc := range vc1 {
  817. tmp := ju.DeepCopy(doc).(map[string]interface{})
  818. //是否进入逻辑
  819. if !ju.Logic(vc.LuaLogic, tmp) {
  820. continue
  821. }
  822. //抽取-前置规则
  823. // for _, v := range vc.RulePres {
  824. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  825. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  826. // }
  827. // }
  828. // log.Debug("抽取-前置规则", tmp)
  829. //抽取-规则
  830. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  831. ExtRuleCore(tmp, e, vc, j, isSite)
  832. }
  833. // log.Debug("抽取-规则", tmp)
  834. //抽取-后置规则
  835. for _, v := range vc.RuleBacks {
  836. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  837. ExtRegBack(j, v, e.TaskInfo, vc)
  838. }
  839. }
  840. // log.Debug("抽取-后置规则", tmp)
  841. }
  842. }
  843. //全局后置规则
  844. for _, v := range e.RuleBacks {
  845. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  846. ExtRegBack(j, v, e.TaskInfo, nil)
  847. }
  848. }
  849. //函数清理
  850. for key, val := range j.Result {
  851. for _, v := range val {
  852. lockclear.Lock()
  853. var cfn = []string{}
  854. if isSite {
  855. cfn = e.SiteClearFn[key]
  856. if len(cfn) == 0 {
  857. cfn = e.ClearFn[key]
  858. }
  859. } else {
  860. cfn = e.ClearFn[key]
  861. }
  862. lockclear.Unlock()
  863. if len(cfn) == 0 {
  864. continue
  865. }
  866. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  867. v.Value = data[0]
  868. //清理特殊符号
  869. lockclear.Lock()
  870. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  871. clear.MesField[key] != nil {
  872. text := qu.ObjToString(v.Value)
  873. text = clear.OtherClean(key, text)
  874. v.Value = text
  875. }
  876. lockclear.Unlock()
  877. }
  878. }
  879. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  880. // bs, _ := json.Marshal(j.Result)
  881. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  882. }, func(err interface{}) {
  883. log.Debug("ExtractProcess err", err)
  884. })
  885. }
  886. //前置过滤
  887. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  888. defer qu.Catch()
  889. before := ju.DeepCopy(doc).(map[string]interface{})
  890. extinfo := map[string]interface{}{}
  891. if in.IsLua {
  892. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  893. if j != nil {
  894. lua.Block = j.Block
  895. }
  896. extinfo = lua.RunScript("pre")
  897. for k, v := range extinfo { //结果覆盖原doc
  898. doc[k] = v
  899. }
  900. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  901. } else {
  902. var key string
  903. if !j.IsFile {
  904. key = qu.If(in.Field == "", "detail", in.Field).(string)
  905. } else {
  906. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  907. }
  908. text := qu.ObjToString(doc[key])
  909. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  910. doc[key] = extinfo[key] //结果覆盖原doc
  911. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  912. }
  913. return doc
  914. }
  915. //抽取-规则
  916. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  917. //候选人加入
  918. var kvMap map[string][]map[string]interface{}
  919. extByReg := true
  920. if vc.ExtFrom != "title" {
  921. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  922. }
  923. for _, v := range vc.RuleCores {
  924. if v.IsLua {
  925. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  926. } else if extByReg {
  927. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  928. }
  929. }
  930. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  931. if vc.Field == "budget" && len(kvMap) == 0 {
  932. if len(j.BlockPackage) == 1 {
  933. for _, bp := range j.BlockPackage {
  934. for fieldname, field := range vc.LFields {
  935. if field != vc.Field {
  936. continue
  937. }
  938. tp := ""
  939. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  940. if k == 0 {
  941. tp = "colon"
  942. } else if k == 1 {
  943. tp = "space"
  944. } else if k == 2 {
  945. tp = "table"
  946. }
  947. if v == nil || v.KvTags == nil {
  948. continue
  949. }
  950. for _, vv := range v.KvTags[fieldname] {
  951. text := ju.TrimLRSpace(vv.Value, "")
  952. if text != "" {
  953. tmp := &ju.ExtField{
  954. ExtFrom: "package",
  955. Field: vc.Field,
  956. Code: "CL_分包",
  957. Type: tp,
  958. MatchType: "package",
  959. RuleText: bp.Text,
  960. SourceValue: vv.Key,
  961. Value: text,
  962. }
  963. if isSite {
  964. tmp.Score = 1
  965. }
  966. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  967. }
  968. }
  969. }
  970. }
  971. break
  972. }
  973. }
  974. } else {
  975. for k, v := range kvMap {
  976. if j.Result[k] == nil {
  977. j.Result[k] = [](*ju.ExtField){}
  978. }
  979. for _, tmp := range v {
  980. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  981. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  982. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  983. MatchType: qu.ObjToString(tmp["matchtype"]),
  984. RuleText: qu.ObjToString(tmp["ruletext"]),
  985. SourceValue: tmp["sourcevalue"],
  986. Value: tmp["value"]}
  987. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  988. field.Score = 1
  989. }
  990. if isSite {
  991. field.Score = 1
  992. }
  993. if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" {
  994. moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
  995. if len(moneys) > 0 {
  996. if vf, ok := moneys[0].(float64); ok {
  997. field.Value = vf
  998. field.IsTrue = moneys[len(moneys)-1].(bool)
  999. } else if vi, ok := moneys[0].(int); ok {
  1000. field.Value = float64(vi)
  1001. field.IsTrue = moneys[len(moneys)-1].(bool)
  1002. }
  1003. }
  1004. }
  1005. if tmp["blocktag"] != nil {
  1006. btag := make(map[string]string)
  1007. for k := range tmp["blocktag"].(map[string]bool) {
  1008. blocktag.Lock()
  1009. if TagConfigDesc[k] != "" {
  1010. btag[k] = TagConfigDesc[k]
  1011. }
  1012. blocktag.Unlock()
  1013. }
  1014. field.BlockTag = btag
  1015. }
  1016. j.Result[k] = append(j.Result[k], field)
  1017. }
  1018. }
  1019. }
  1020. }
  1021. //抽取-规则-kv
  1022. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  1023. defer qu.Catch()
  1024. if extfrom == "title" || !in.IsLua {
  1025. return
  1026. }
  1027. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  1028. lua.KvMap = *kvMap
  1029. lua.Block = j.Block
  1030. extinfo := lua.RunScript("core")
  1031. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  1032. for _, v := range tmps {
  1033. v["core"] = in.Code
  1034. }
  1035. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  1036. }
  1037. if len(extinfo) > 0 {
  1038. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1039. }
  1040. }
  1041. //抽取-规则-正则
  1042. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  1043. defer qu.Catch()
  1044. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1045. b := IsExtract(in.Field, j.Title, j.Content)
  1046. if !b {
  1047. return
  1048. }
  1049. //全文正则
  1050. //text := qu.ObjToString(doc[extfrom])
  1051. //if in.Field != "" {
  1052. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  1053. // if len(extinfo) > 0 {
  1054. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1055. // }
  1056. //}
  1057. //块抽取
  1058. if in.Field != "" {
  1059. if extfrom == "title" {
  1060. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  1061. if len(extinfo) > 0 {
  1062. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1063. }
  1064. } else if in.Field == "qualifies" {
  1065. extinfo := extRegCoreToResult(extfrom, pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{}, j, in, isSite)
  1066. if len(extinfo) > 0 {
  1067. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1068. }
  1069. } else {
  1070. for _, v := range j.Block {
  1071. btag := make(map[string]string)
  1072. for k := range v.Classify {
  1073. blocktag.Lock()
  1074. btag[k] = TagConfigDesc[k]
  1075. blocktag.Unlock()
  1076. }
  1077. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  1078. if len(extinfo) > 0 {
  1079. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1080. }
  1081. }
  1082. }
  1083. }
  1084. }
  1085. //pkg抽取-规则-正则
  1086. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  1087. defer qu.Catch()
  1088. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1089. b := IsExtract(in.Field, j.Title, j.Content)
  1090. if !b {
  1091. return
  1092. }
  1093. //块抽取
  1094. if in.Field != "" {
  1095. //临时调试分包抽取字段-
  1096. if in.Field == "bidamount" {
  1097. //log.Debug("分包-调试字段...")
  1098. }
  1099. for k, vbpkg := range j.BlockPackage {
  1100. rep := map[string]string{}
  1101. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1102. if in.Field == "budget" && vbpkg.Budget > 0 {
  1103. continue
  1104. }
  1105. if in.Field == "agencyfee" && vbpkg.Agencyfee > 0 {
  1106. continue
  1107. }
  1108. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  1109. continue
  1110. }
  1111. if in.Field == "winner" && vbpkg.Winner != "" {
  1112. continue
  1113. }
  1114. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  1115. continue
  1116. }
  1117. if in.Field == "projectname" && vbpkg.Name != "" {
  1118. continue
  1119. }
  1120. if in.Field == "winner" && vbpkg.Winner != "" {
  1121. continue
  1122. }
  1123. if in.Field == "winnerperson" {
  1124. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  1125. continue
  1126. }
  1127. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  1128. continue
  1129. }
  1130. }
  1131. if in.Field == "winnertel" {
  1132. if vbpkg.WinnerPerson == "" {
  1133. continue
  1134. }
  1135. }
  1136. //处理正负数修正
  1137. ptmp := strings.Split(in.RuleText, "#")
  1138. sign := 0
  1139. if len(ptmp) == 2 {
  1140. if ptmp[1] == "正" {
  1141. sign = 1
  1142. } else if ptmp[1] == "负" {
  1143. sign = -1
  1144. }
  1145. }
  1146. tmp := strings.Split(ptmp[0], "__")
  1147. if len(tmp) == 2 {
  1148. epos := strings.Split(tmp[1], ",")
  1149. posm := map[string]int{}
  1150. for _, v := range epos {
  1151. ks := strings.Split(v, ":")
  1152. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1153. posm[ks[1]] = qu.IntAll(ks[0])
  1154. } else {
  1155. posm[in.Field] = qu.IntAll(ks[0])
  1156. }
  1157. }
  1158. var pattern string
  1159. if strings.Contains(tmp[0], "\\u") {
  1160. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1161. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1162. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1163. } else {
  1164. pattern = tmp[0]
  1165. }
  1166. //log.Debug("pattern", pattern)
  1167. //fmt.Println(text)
  1168. reg := regexp.MustCompile(pattern)
  1169. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  1170. for i, _ := range apos {
  1171. pos := apos[i]
  1172. for k, p := range posm {
  1173. if len(pos) > p {
  1174. if pos[p] == -1 || pos[p+1] == -1 {
  1175. continue
  1176. }
  1177. val := vbpkg.Text[pos[p]:pos[p+1]]
  1178. if string(val) == "" {
  1179. continue
  1180. }
  1181. if sign == -1 {
  1182. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1183. } else {
  1184. rep[k+"_"+fmt.Sprint(i)] = val
  1185. }
  1186. }
  1187. }
  1188. }
  1189. //fmt.Println(text)
  1190. for i := 0; i < len(apos); i++ {
  1191. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  1192. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1193. lock.Lock()
  1194. cfn := e.ClearFn[in.Field]
  1195. lock.Unlock()
  1196. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1197. if data[len(data)-1].(bool) {
  1198. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1199. j.BlockPackage[k].IsTrueBudget = true
  1200. }
  1201. break
  1202. } else if in.Field == "agencyfee" && vbpkg.Agencyfee <= 0 {
  1203. lock.Lock()
  1204. cfn := e.ClearFn[in.Field]
  1205. lock.Unlock()
  1206. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1207. if data[len(data)-1].(bool) {
  1208. j.BlockPackage[k].Agencyfee = qu.Float64All(data[0])
  1209. j.BlockPackage[k].IsTrueAgencyfee = true
  1210. }
  1211. break
  1212. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1213. lock.Lock()
  1214. cfn := e.ClearFn[in.Field]
  1215. lock.Unlock()
  1216. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1217. if data[len(data)-1].(bool) {
  1218. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1219. j.BlockPackage[k].IsTrueBidamount = true
  1220. }
  1221. break
  1222. } else if in.Field == "winner" {
  1223. if j.BlockPackage[k].Winner == "" {
  1224. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  1225. break
  1226. }
  1227. } else if in.Field == "winnertel" {
  1228. if j.BlockPackage[k].WinnerTel == "" {
  1229. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1230. break
  1231. }
  1232. } else if in.Field == "winnerperson" {
  1233. if j.BlockPackage[k].WinnerPerson == "" {
  1234. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1235. break
  1236. }
  1237. } else if in.Field == "bidstatus" {
  1238. if j.BlockPackage[k].BidStatus == "" {
  1239. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1240. break
  1241. }
  1242. } else if in.Field == "projectname" {
  1243. if j.BlockPackage[k].Name == "" {
  1244. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1245. break
  1246. }
  1247. } else if in.Field == "winnerperson" {
  1248. if j.BlockPackage[k].WinnerPerson == "" {
  1249. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1250. break
  1251. }
  1252. } else if in.Field == "winnertel" {
  1253. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1254. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1255. break
  1256. }
  1257. }
  1258. }
  1259. }
  1260. }
  1261. } else {
  1262. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1263. val := ""
  1264. if len(pos) == 2 {
  1265. //"text" = "text"[pos[1]:]
  1266. val = "text"[pos[1]:]
  1267. rs := regexp.MustCompile("[^\r\n\t]+")
  1268. tmp := rs.FindAllString("text", -1)
  1269. if len(tmp) > 0 {
  1270. val = tmp[0]
  1271. }
  1272. }
  1273. if val != "" {
  1274. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1275. lock.Lock()
  1276. cfn := e.ClearFn[in.Field]
  1277. lock.Unlock()
  1278. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1279. if data[len(data)-1].(bool) {
  1280. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1281. j.BlockPackage[k].IsTrueBudget = true
  1282. }
  1283. break
  1284. }
  1285. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1286. lock.Lock()
  1287. cfn := e.ClearFn[in.Field]
  1288. lock.Unlock()
  1289. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1290. if data[len(data)-1].(bool) {
  1291. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1292. j.BlockPackage[k].IsTrueBidamount = true
  1293. }
  1294. break
  1295. } else if in.Field == "bidstatus" {
  1296. if j.BlockPackage[k].BidStatus == "" {
  1297. j.BlockPackage[k].BidStatus = val
  1298. break
  1299. }
  1300. } else if in.Field == "projectname" {
  1301. if j.BlockPackage[k].Name == "" {
  1302. j.BlockPackage[k].Name = val
  1303. break
  1304. }
  1305. }
  1306. }
  1307. }
  1308. }
  1309. }
  1310. }
  1311. //lua脚本根据属性设置提取kv值
  1312. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1313. kvmap := map[string][]map[string]interface{}{}
  1314. if len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  1315. if vc.Field == "bidamount" {
  1316. for _, v := range j.Winnerorder {
  1317. if v["price"] == nil {
  1318. continue
  1319. }
  1320. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1321. "code": "winnerorder",
  1322. "field": vc.Field,
  1323. "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]),
  1324. "extfrom": v["sortstr"],
  1325. "sourcevalue": v["price"],
  1326. "value": v["price"],
  1327. "type": "winnerorder",
  1328. "matchtype": "winnerorder",
  1329. })
  1330. if len(j.Winnerorder) < 4 {
  1331. return kvmap, false
  1332. }
  1333. }
  1334. //候选人中标金额
  1335. if price := j.Winnerorder[0]["price"]; price != nil {
  1336. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1337. "code": "CL_中标候选人",
  1338. "field": vc.Field,
  1339. "ruletext": "中标候选人",
  1340. "extfrom": j.Winnerorder[0]["sortstr"],
  1341. "sourcevalue": price,
  1342. "value": price,
  1343. "type": "winnerorder",
  1344. "matchtype": "winnerorder",
  1345. })
  1346. if len(j.Winnerorder) < 4 {
  1347. return kvmap, false
  1348. }
  1349. }
  1350. }
  1351. }
  1352. for fieldname, field := range vc.LFields {
  1353. if field != vc.Field {
  1354. continue
  1355. }
  1356. extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category)
  1357. }
  1358. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1359. return kvmap, true
  1360. }
  1361. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) {
  1362. //qu.Debug("fieldname+++", fieldname)
  1363. for _, bl := range blocks {
  1364. tp := ""
  1365. if strings.Contains(bl.Title, "保证金") && (field == "bid_bond" || field == "contract_bond") {
  1366. if text := ju.TrimLRSpace(bl.Text, ""); text != "" {
  1367. if Category == "招标" || Category == "拟建" || Category == "预告" {
  1368. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1369. "code": "CL_块内容",
  1370. "field": field,
  1371. "ruletext": "投标保证金",
  1372. "extfrom": "投标保证金_块内容",
  1373. "sourcevalue": bl.Text,
  1374. "value": text,
  1375. "type": "投标保证金_块内容",
  1376. "matchtype": "tag_string",
  1377. "blocktag": bl.Classify,
  1378. "weight": 0,
  1379. })
  1380. } else if Category == "结果" {
  1381. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1382. "code": "CL_",
  1383. "field": field,
  1384. "ruletext": "履约保证金",
  1385. "extfrom": "履约保证金_块内容",
  1386. "sourcevalue": bl.Text,
  1387. "value": text,
  1388. "type": "履约保证金_块内容",
  1389. "matchtype": "tag_string",
  1390. "blocktag": bl.Classify,
  1391. "weight": 0,
  1392. })
  1393. }
  1394. }
  1395. return
  1396. }
  1397. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1398. if k == 0 {
  1399. tp = "colon"
  1400. } else if k == 1 {
  1401. tp = "space"
  1402. } else if k == 2 {
  1403. tp = "table"
  1404. }
  1405. if v == nil || v.KvTags == nil {
  1406. continue
  1407. }
  1408. for _, vv := range v.KvTags[fieldname] {
  1409. text := ju.TrimLRSpace(vv.Value, "")
  1410. if text != "" {
  1411. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1412. "code": "CL_" + vv.Key,
  1413. "field": field,
  1414. "ruletext": vv.Key,
  1415. "extfrom": vc.ExtFrom,
  1416. "sourcevalue": text,
  1417. "value": text,
  1418. "type": tp,
  1419. "matchtype": "tag_string",
  1420. "blocktag": bl.Classify,
  1421. "weight": vv.Weight,
  1422. })
  1423. //if field != "winnertel" && field != "winnerperson" {
  1424. // //break //暂定取第一个
  1425. //}
  1426. }
  1427. }
  1428. }
  1429. if len(kvmap[field]) == 0 {
  1430. extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category)
  1431. }
  1432. }
  1433. }
  1434. //正则提取结果
  1435. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1436. defer qu.Catch()
  1437. var score float64
  1438. score = vre.Score
  1439. if isSite {
  1440. score = score + 1.0
  1441. }
  1442. extinfo := map[string][]map[string]interface{}{}
  1443. rep := map[string]string{}
  1444. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1445. //处理正负数修正
  1446. ptmp := strings.Split(vre.RuleText, "#")
  1447. sign := 0
  1448. if len(ptmp) == 2 {
  1449. if ptmp[1] == "正" {
  1450. sign = 1
  1451. } else if ptmp[1] == "负" {
  1452. sign = -1
  1453. }
  1454. }
  1455. tmp := strings.Split(ptmp[0], "__")
  1456. if len(tmp) == 2 {
  1457. epos := strings.Split(tmp[1], ",")
  1458. posm := map[string]int{}
  1459. for _, v := range epos {
  1460. ks := strings.Split(v, ":")
  1461. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1462. posm[ks[1]] = qu.IntAll(ks[0])
  1463. } else {
  1464. posm[vre.Field] = qu.IntAll(ks[0])
  1465. }
  1466. }
  1467. var pattern string
  1468. if strings.Contains(tmp[0], "\\u") {
  1469. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1470. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1471. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1472. } else {
  1473. pattern = tmp[0]
  1474. }
  1475. //log.Debug("pattern", pattern)
  1476. //fmt.Println(text)
  1477. reg := regexp.MustCompile(pattern)
  1478. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1479. for i, _ := range apos {
  1480. pos := apos[i]
  1481. for k, p := range posm {
  1482. if len(pos) > p {
  1483. if pos[p] == -1 || pos[p+1] == -1 {
  1484. continue
  1485. }
  1486. val := text[pos[p]:pos[p+1]]
  1487. if string(val) == "" {
  1488. continue
  1489. }
  1490. if sign == -1 {
  1491. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1492. } else {
  1493. rep[k+"_"+fmt.Sprint(i)] = val
  1494. }
  1495. }
  1496. }
  1497. }
  1498. tmps := []map[string]interface{}{}
  1499. for i := 0; i < len(apos); i++ {
  1500. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1501. tmp := map[string]interface{}{
  1502. "field": vre.Field,
  1503. "code": vre.Code,
  1504. "ruletext": vre.RuleText,
  1505. "extfrom": text,
  1506. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1507. "type": "regexp",
  1508. "matchtype": "regcontent",
  1509. "blocktag": *tag,
  1510. "score": score,
  1511. }
  1512. exfield := ju.ExtField{
  1513. BlockTag: *tag,
  1514. Field: vre.Field,
  1515. Code: vre.Code,
  1516. RuleText: vre.RuleText,
  1517. Type: "regexp",
  1518. MatchType: "regcontent",
  1519. ExtFrom: extfrom,
  1520. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1521. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1522. Score: score,
  1523. }
  1524. if vre.Field == "qualifies" {
  1525. if len(rep) >= 2 {
  1526. tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1527. exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1528. }
  1529. }
  1530. tmps = append(tmps, tmp)
  1531. if tmp["blocktag"] != nil {
  1532. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1533. }
  1534. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1535. }
  1536. }
  1537. if len(tmps) > 0 {
  1538. //fmt.Println(tmps)
  1539. extinfo[vre.Field] = tmps
  1540. }
  1541. }
  1542. } else {
  1543. pos := vre.RegCore.Reg.FindStringIndex(text)
  1544. val := ""
  1545. if len(pos) == 2 {
  1546. text = text[pos[1]:]
  1547. rs := regexp.MustCompile("[^\r\n\t]+")
  1548. tmp := rs.FindAllString(text, -1)
  1549. if len(tmp) > 0 {
  1550. val = tmp[0]
  1551. }
  1552. }
  1553. if val != "" {
  1554. tmps := []map[string]interface{}{}
  1555. tmp := map[string]interface{}{
  1556. "field": vre.Field,
  1557. "code": vre.Code,
  1558. "ruletext": vre.RuleText,
  1559. "extfrom": text,
  1560. "value": val,
  1561. "type": "regexp",
  1562. "matchtype": "regcontent",
  1563. "blocktag": *tag,
  1564. "score": score,
  1565. }
  1566. tmps = append(tmps, tmp)
  1567. extinfo[vre.Field] = tmps
  1568. if j.Result[vre.Field] == nil {
  1569. j.Result[vre.Field] = [](*ju.ExtField){}
  1570. }
  1571. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1572. Value: val,
  1573. Score: score}
  1574. if tmp["blocktag"] != nil {
  1575. field.BlockTag = tmp["blocktag"].(map[string]string)
  1576. }
  1577. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1578. }
  1579. }
  1580. return extinfo
  1581. }
  1582. //后置过滤
  1583. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1584. defer qu.Catch()
  1585. if in.IsLua {
  1586. result := GetResultMapForLua(j)
  1587. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1588. if j != nil {
  1589. lua.Block = j.Block
  1590. }
  1591. extinfo := lua.RunScript("back")
  1592. for k, v := range extinfo {
  1593. if tmps, ok := v.([]map[string]interface{}); ok {
  1594. j.Result[k] = [](*ju.ExtField){}
  1595. for _, tmp := range tmps {
  1596. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1597. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1598. Value: tmp["value"]}
  1599. if tmp["blocktag"] != nil {
  1600. field.BlockTag = tmp["blocktag"].(map[string]string)
  1601. }
  1602. j.Result[k] = append(j.Result[k], field)
  1603. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1604. }
  1605. }
  1606. }
  1607. if len(extinfo) > 0 {
  1608. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1609. }
  1610. } else {
  1611. extinfo := map[string]interface{}{}
  1612. if in.Field != "" {
  1613. clearByTitle := false
  1614. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1615. clearByTitle = true
  1616. }
  1617. if j.Result[in.Field] != nil {
  1618. tmp := j.Result[in.Field]
  1619. exts := []interface{}{}
  1620. for k, v := range tmp {
  1621. if clearByTitle && v.ExtFrom != "title" {
  1622. continue
  1623. }
  1624. //table抽取到的数据不清理
  1625. if v.Type == "table" && v.Field == "projectname" {
  1626. return
  1627. }
  1628. text := qu.ObjToString(v.Value)
  1629. if v.Field == "bidamount" || v.Field == "budget" {
  1630. if (strings.Contains(qu.ObjToString(v.SourceValue), "费率") ||
  1631. strings.Contains(qu.ObjToString(v.SourceValue), "税率") ||
  1632. strings.Contains(qu.ObjToString(v.SourceValue), "(%)")) &&
  1633. !strings.Contains(qu.ObjToString(v.SourceValue), "工程设计费") {
  1634. j.Result[in.Field][k].IsTrue = false
  1635. continue
  1636. }
  1637. }
  1638. if text != "" {
  1639. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1640. }
  1641. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1642. continue
  1643. }
  1644. j.Result[in.Field][k].Value = text
  1645. exts = append(exts, map[string]interface{}{
  1646. "field": v.Field,
  1647. "code": v.Code,
  1648. "ruletext": v.RuleText,
  1649. "type": v.Type,
  1650. "matchtype": v.MatchType,
  1651. "extfrom": v.ExtFrom,
  1652. "value": text,
  1653. })
  1654. }
  1655. if len(exts) > 0 {
  1656. extinfo[in.Field] = exts
  1657. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1658. }
  1659. }
  1660. } else {
  1661. for key, tmp := range j.Result {
  1662. exts := []interface{}{}
  1663. for k, v := range tmp {
  1664. //table抽取到的数据不清理
  1665. if v.Type == "table" && v.Field == "projectname" {
  1666. return
  1667. }
  1668. text := qu.ObjToString(v.Value)
  1669. if text != "" {
  1670. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1671. }
  1672. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1673. continue
  1674. }
  1675. j.Result[key][k].Value = text
  1676. exts = append(exts, map[string]interface{}{
  1677. "field": v.Field,
  1678. "code": v.Code,
  1679. "ruletext": v.RuleText,
  1680. "type": v.Type,
  1681. "matchtype": v.MatchType,
  1682. "extfrom": v.ExtFrom,
  1683. "value": text,
  1684. })
  1685. }
  1686. if len(exts) > 0 {
  1687. extinfo[key] = exts
  1688. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1689. }
  1690. }
  1691. }
  1692. }
  1693. }
  1694. //后置过滤
  1695. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1696. defer qu.Catch()
  1697. for k, v := range j.BlockPackage {
  1698. if in.Field == "winner" {
  1699. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1700. } else if in.Field == "bidstatus" {
  1701. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1702. } else if in.Field == "" {
  1703. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1704. } else if in.Field == "projectname" {
  1705. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1706. } else if in.Field == "winnerperson" {
  1707. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1708. } else if in.Field == "winnertel" {
  1709. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1710. }
  1711. }
  1712. }
  1713. //KV过滤
  1714. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1715. defer qu.Catch()
  1716. extinfo := map[string]interface{}{}
  1717. if in.Field != "" {
  1718. if j.Result[in.Field] != nil {
  1719. tmp := j.Result[in.Field]
  1720. exts := []interface{}{}
  1721. for k, v := range tmp {
  1722. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1723. continue
  1724. }
  1725. if v.Field == "docendtime" {
  1726. //log.Debug("调试字段...")
  1727. }
  1728. text := qu.ObjToString(v.Value)
  1729. if text != "" {
  1730. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1731. }
  1732. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1733. continue
  1734. }
  1735. j.Result[in.Field][k].Value = text
  1736. exts = append(exts, map[string]interface{}{
  1737. "field": v.Field,
  1738. "code": v.Code,
  1739. "ruletext": v.RuleText,
  1740. "type": v.Type,
  1741. "matchtype": v.MatchType,
  1742. "extfrom": v.ExtFrom,
  1743. "value": text,
  1744. })
  1745. }
  1746. if len(exts) > 0 {
  1747. extinfo[in.Field] = exts
  1748. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1749. }
  1750. }
  1751. }
  1752. }
  1753. //获取抽取结果map[string][]interface{},lua脚本使用
  1754. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1755. defer qu.Catch()
  1756. result := map[string][]map[string]interface{}{}
  1757. for key, val := range j.Result {
  1758. if result[key] == nil {
  1759. result[key] = []map[string]interface{}{}
  1760. }
  1761. for _, v := range val {
  1762. tmp := map[string]interface{}{
  1763. "field": v.Field,
  1764. "code": v.Code,
  1765. "ruletext": v.RuleText,
  1766. "value": v.Value,
  1767. "type": v.Type,
  1768. "matchtype": v.MatchType,
  1769. "extfrom": v.ExtFrom,
  1770. }
  1771. result[key] = append(result[key], tmp)
  1772. }
  1773. }
  1774. return result
  1775. }
  1776. //抽取日志
  1777. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1778. defer qu.Catch()
  1779. if !t.IsEtxLog {
  1780. return
  1781. }
  1782. logdata := map[string]interface{}{
  1783. "code": qu.If(v.Code == "", "kv", v.Code),
  1784. "name": v.Name,
  1785. "type": ftype,
  1786. "ruletext": v.RuleText,
  1787. "islua": v.IsLua,
  1788. "field": v.Field,
  1789. "version": t.Version,
  1790. "taskname": t.Name,
  1791. "before": before,
  1792. "extinfo": extinfo,
  1793. "sid": sid,
  1794. "comeintime": time.Now().Unix(),
  1795. }
  1796. lock.Lock()
  1797. ExtLogs[t] = append(ExtLogs[t], logdata)
  1798. lock.Unlock()
  1799. }
  1800. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1801. exts := []map[string]interface{}{}
  1802. exts = append(exts, map[string]interface{}{
  1803. "field": ext.Field,
  1804. "code": ext.Code,
  1805. "type": ftype,
  1806. "matchtype": matchtype,
  1807. "extfrom": ext.ExtFrom,
  1808. "value": ext.Value,
  1809. })
  1810. extinfo := map[string]interface{}{
  1811. ext.Field: exts,
  1812. }
  1813. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1814. }
  1815. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1816. defer qu.Catch()
  1817. if !t.IsEtxLog {
  1818. return
  1819. }
  1820. logdata := map[string]interface{}{
  1821. "code": code,
  1822. "name": name,
  1823. "type": ftype,
  1824. "ruletext": "",
  1825. "islua": false,
  1826. "field": field,
  1827. "version": t.Version,
  1828. "taskname": t.Name,
  1829. "before": before,
  1830. "extinfo": extinfo,
  1831. "sid": sid,
  1832. "comeintime": time.Now().Unix(),
  1833. }
  1834. lock.Lock()
  1835. ExtLogs[t] = append(ExtLogs[t], logdata)
  1836. lock.Unlock()
  1837. }
  1838. //保存抽取日志
  1839. func SaveExtLog() {
  1840. defer qu.Catch()
  1841. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1842. lock.Lock()
  1843. tmpLogs = ExtLogs
  1844. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1845. lock.Unlock()
  1846. for k, v := range tmpLogs {
  1847. if len(v) < saveLimit {
  1848. db.Mgo.SaveBulk(k.TrackColl, v...)
  1849. } else {
  1850. for {
  1851. if len(v) > saveLimit {
  1852. tmp := v[:saveLimit]
  1853. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1854. v = v[saveLimit:]
  1855. } else {
  1856. db.Mgo.SaveBulk(k.TrackColl, v...)
  1857. break
  1858. }
  1859. }
  1860. }
  1861. }
  1862. time.AfterFunc(10*time.Second, SaveExtLog)
  1863. }
  // FieldValue pairs an arbitrary value with an integer counter.
  type FieldValue struct {
  	// Value is the stored value; it may be of any type.
  	Value interface{}
  	// Count is the counter attached to Value.
  	// NOTE(review): presumably the number of times Value occurred — confirm against callers.
  	Count int
  }
  // Package-level patterns used when assembling and validating entity names.
  //
  // The organisation-suffix alternation used below ends in 联合[会体]
  // ("联合会" or "联合体"). It was previously written 联合[会|体], which, being
  // a character class, also accepted a literal '|' after "联合".
  var (
  	// clearWinnerReg matches label fragments that are stripped from a
  	// package winner name.
  	clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")

  	// unPackageWinnerReg matches package "winners" that are really a
  	// re-tender notice and must not be kept.
  	unPackageWinnerReg = regexp.MustCompile("(重新招标)")

  	// letter_entity matches an organisation name that contains Latin letters:
  	// 1-10 Chinese characters, 1-5 letters, 1-10 Chinese characters, then an
  	// organisation suffix.
  	letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体])$")

  	// inscribe_entity_1 extracts a signature ("落款") organisation: a line
  	// holding an organisation name, followed by a line holding a date.
  	inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")

  	// inscribe_entity_2 extracts a signature organisation followed by a date
  	// on the same line, with a second organisation name on the next line.
  	inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))")

  	// exclude_entity matches words typical of agencies / service providers.
  	// NOTE(review): presumably used to reject such names as signature
  	// entities — confirm at the call site.
  	exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
  )
  1876. //特殊金额-处理判断-倍率关系
  1877. func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
  1878. //金额结果只有两种 - 倍率关系10000 - 过10E
  1879. moneyIndex := []int{}
  1880. moneyArr := []float64{}
  1881. first_money := float64(0)
  1882. difValue := map[string]interface{}{}
  1883. for k, v := range val { //取第一个非负数,项目名称除外
  1884. if v.IsTrue && v.Score > -1 {
  1885. moneyArr = append(moneyArr, qu.Float64All(v.Value))
  1886. moneyIndex = append(moneyIndex, k)
  1887. key := ""
  1888. if m, ok := v.Value.(float64); ok {
  1889. key = fmt.Sprintf("%f", m)
  1890. } else {
  1891. key = qu.ObjToString(v.Value)
  1892. }
  1893. if difValue[key] == nil {
  1894. difValue[key] = 1
  1895. }
  1896. //if len(difValue) > 2 {
  1897. // return false, 0
  1898. //}
  1899. }
  1900. }
  1901. //计算金额数组
  1902. if len(difValue) == 2 {
  1903. money_1, money_2 := float64(0), float64(0)
  1904. for k, v := range moneyArr {
  1905. if k == 0 {
  1906. money_1 = v
  1907. } else {
  1908. if v != money_1 {
  1909. money_2 = v
  1910. break
  1911. }
  1912. }
  1913. }
  1914. isRatio, new_money := false, float64(0) //判断金额是否为倍率关系
  1915. if money_1 != float64(0) && money_2 != float64(0) {
  1916. if money_1 == money_2*float64(10000) && money_1 >= 100000000 {
  1917. isRatio = true
  1918. new_money = money_2
  1919. }
  1920. if money_2 == money_1*float64(10000) && money_2 >= 100000000 {
  1921. isRatio = true
  1922. new_money = money_1
  1923. }
  1924. if isRatio { //采用新值
  1925. for k, v := range moneyArr {
  1926. if v == new_money {
  1927. return true, moneyIndex[k]
  1928. }
  1929. }
  1930. }
  1931. }
  1932. } else if len(difValue) > 2 { //多组金额
  1933. is_exists := false
  1934. for _, v := range moneyArr {
  1935. if v >= 1000000000 {
  1936. is_exists = true
  1937. first_money = v
  1938. }
  1939. }
  1940. if is_exists {
  1941. for k, v := range moneyArr {
  1942. if v*10000 == first_money {
  1943. return true, moneyIndex[k]
  1944. }
  1945. }
  1946. }
  1947. } else {
  1948. }
  1949. return false, 0
  1950. }
  1951. //分析抽取结果并保存
  1952. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1953. qu.Try(func() {
  1954. //(j.Category == "招标" || j.Category == "预告")
  1955. if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
  1956. if j.CategorySecond != "单一" {
  1957. delete(j.Result, "winner")
  1958. delete(j.Result, "bidamount")
  1959. for _, v := range j.BlockPackage {
  1960. v.Bidamount = 0
  1961. v.IsTrueBidamount = false
  1962. if v.Winner != "" {
  1963. v.Winner = ""
  1964. if v.SpaceKV != nil {
  1965. delete(v.SpaceKV.KvTags, "中标单位")
  1966. }
  1967. if v.TableKV != nil {
  1968. delete(v.TableKV.KvTags, "中标单位")
  1969. }
  1970. if v.ColonKV != nil {
  1971. delete(v.ColonKV.KvTags, "中标单位")
  1972. }
  1973. }
  1974. }
  1975. for _, v := range j.PackageInfo {
  1976. delete(v, "winner")
  1977. delete(v, "bidamount")
  1978. }
  1979. j.Winnerorder = nil
  1980. if jf != nil && jf.Winnerorder != nil {
  1981. jf.Winnerorder = nil
  1982. }
  1983. }
  1984. }
  1985. //重新取出清理过后的中标候选人重置候选人
  1986. resetWinnerorder(j)
  1987. //打分
  1988. doc, result, _id := funcAnalysis(j, e)
  1989. //_, result, _id := funcAnalysis(j, e)
  1990. if ju.IsSaveTag {
  1991. go otherNeedSave(j, result, e)
  1992. }
  1993. //从排序结果中取值
  1994. tmp := map[string]interface{}{} //抽取值
  1995. tmp["spidercode"] = j.SpiderCode
  1996. tmp["site"] = j.Site
  1997. if len(*j.Jsondata) > 0 {
  1998. tmp["jsondata"] = j.Jsondata
  1999. }
  2000. //字段-抽取来源
  2001. fieldSource := make(map[string]interface{}, 0)
  2002. //字段-抽取来源
  2003. for k, val := range result {
  2004. if k == "qualifies" {
  2005. squalifies := make([]interface{}, 0)
  2006. squalifiesMap := make(map[string]*scoreIndex, 0)
  2007. for _, kv := range val {
  2008. skey := kv.RuleText
  2009. if kv.Score > 0 {
  2010. if squalifiesMap[skey] == nil {
  2011. squalifiesMap = map[string]*scoreIndex{
  2012. skey: &scoreIndex{
  2013. Score: kv.Score,
  2014. Index: len(squalifies),
  2015. },
  2016. }
  2017. squalifies = append(squalifies, map[string]interface{}{
  2018. "key": skey,
  2019. "value": kv.Value,
  2020. })
  2021. } else {
  2022. if squalifiesMap[skey].Score < kv.Score {
  2023. squalifies[squalifiesMap[skey].Index] = map[string]interface{}{
  2024. "key": skey,
  2025. "value": kv.Value,
  2026. }
  2027. }
  2028. }
  2029. }
  2030. }
  2031. tmp[k] = squalifies
  2032. continue
  2033. }
  2034. //预算-中标金额字段-特殊情况特殊处理
  2035. if k == "bidamount" || k == "budget" {
  2036. b, index := calculateAbnormalMoney(val)
  2037. if b {
  2038. new_v := val[index]
  2039. tmp[new_v.Field] = new_v.Value
  2040. fieldSource[new_v.Field] = map[string]interface{}{
  2041. "ext_type": new_v.Type,
  2042. "ext_from": new_v.ExtFrom,
  2043. }
  2044. tmp["is_dif_ratioMoney"] = true
  2045. continue
  2046. }
  2047. }
  2048. for _, v := range val { //取第一个非负数,项目名称除外//存0是否有效
  2049. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 {
  2050. tmp[v.Field] = v.Value
  2051. fieldSource[v.Field] = map[string]interface{}{
  2052. "ext_type": v.Type,
  2053. "ext_from": v.ExtFrom,
  2054. }
  2055. break
  2056. }
  2057. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  2058. tmp[v.Field] = v.Value
  2059. fieldSource[v.Field] = map[string]interface{}{
  2060. "ext_type": v.Type,
  2061. "ext_from": v.ExtFrom,
  2062. }
  2063. //中标单位~含字母判断~对比企业库
  2064. if (v.Field == "winner" || v.Field == "buyer") && letter_entity.MatchString(qu.ObjToString(v.SourceValue)) {
  2065. qyxy_data := make([]map[string]interface{}, 0)
  2066. ju.QyxySess.Find(map[string]interface{}{
  2067. "company_name": qu.ObjToString(v.SourceValue),
  2068. }).All(&qyxy_data)
  2069. if qyxy_data != nil && len(qyxy_data) > 0 {
  2070. tmp[v.Field] = v.SourceValue
  2071. }
  2072. }
  2073. break
  2074. }
  2075. }
  2076. }
  2077. tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",")
  2078. if len(j.PackageInfo) > 15 {
  2079. for k, v := range j.PackageInfo {
  2080. j.PackageInfo = map[string]map[string]interface{}{}
  2081. j.PackageInfo[k] = v
  2082. break
  2083. }
  2084. }
  2085. if len(j.PackageInfo) > 0 { //分包信息
  2086. tmp["package"] = j.PackageInfo
  2087. //包预算,中标金额合并大于抽取就覆盖
  2088. tmpBidamount, tmpBudget, tmpAgencyfee := qu.Float64All(0), qu.Float64All(0), qu.Float64All(0)
  2089. //s_winner逗号分隔拼接,分包中标人
  2090. var tmpstr, savewinner []string
  2091. //按包排序
  2092. for b, v := range j.PackageInfo {
  2093. if v["winner"] != nil && v["winner"] != "" {
  2094. tmpstr = append(tmpstr, b)
  2095. }
  2096. }
  2097. //包预算,中标金额合并大于抽取就覆盖
  2098. if len(j.PackageInfo) > 1 {
  2099. //包数大于1累加
  2100. for _, v := range j.PackageInfo {
  2101. if v["budget"] != nil {
  2102. tmpBudget = precisionAddFloat(tmpBudget, qu.Float64All(v["budget"]))
  2103. }
  2104. if v["bidamount"] != nil {
  2105. tmpBidamount = precisionAddFloat(tmpBidamount, qu.Float64All(v["bidamount"]))
  2106. }
  2107. if v["agencyfee"] != nil {
  2108. tmpAgencyfee = precisionAddFloat(tmpAgencyfee, qu.Float64All(v["agencyfee"]))
  2109. }
  2110. }
  2111. if qu.Float64All(tmp["budget"]) < tmpBudget {
  2112. fieldSource["budget"] = map[string]interface{}{
  2113. "ext_type": "",
  2114. "ext_from": "package",
  2115. }
  2116. tmp["budget"] = tmpBudget
  2117. }
  2118. if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee {
  2119. fieldSource["agencyfee"] = map[string]interface{}{
  2120. "ext_type": "",
  2121. "ext_from": "package",
  2122. }
  2123. tmp["agencyfee"] = tmpAgencyfee
  2124. }
  2125. if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
  2126. fieldSource["bidamount"] = map[string]interface{}{
  2127. "ext_type": "",
  2128. "ext_from": "package",
  2129. }
  2130. tmp["bidamount"] = tmpBidamount
  2131. } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  2132. fieldSource["bidamount"] = map[string]interface{}{
  2133. "ext_type": "",
  2134. "ext_from": "package",
  2135. }
  2136. tmp["bidamount"] = tmpBidamount
  2137. }
  2138. } else {
  2139. //包数等于1,tmp没有值取包里的值
  2140. if tmp["budget"] == nil || tmp["budget"] == 0 {
  2141. for _, v := range j.PackageInfo {
  2142. if v["budget"] != nil {
  2143. fieldSource["budget"] = map[string]interface{}{
  2144. "ext_type": "",
  2145. "ext_from": "package",
  2146. }
  2147. tmp["budget"] = v["budget"]
  2148. }
  2149. }
  2150. }
  2151. if tmp["agencyfee"] == nil || tmp["agencyfee"] == 0 {
  2152. for _, v := range j.PackageInfo {
  2153. if v["agencyfee"] != nil {
  2154. fieldSource["agencyfee"] = map[string]interface{}{
  2155. "ext_type": "",
  2156. "ext_from": "package",
  2157. }
  2158. tmp["agencyfee"] = v["agencyfee"]
  2159. }
  2160. }
  2161. }
  2162. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  2163. for _, v := range j.PackageInfo {
  2164. if v["bidamount"] != nil {
  2165. fieldSource["bidamount"] = map[string]interface{}{
  2166. "ext_type": "",
  2167. "ext_from": "package",
  2168. }
  2169. tmp["bidamount"] = v["bidamount"]
  2170. }
  2171. }
  2172. }
  2173. }
  2174. //s_winner逗号分隔拼接,分包中标人
  2175. sort.Strings(tmpstr)
  2176. for _, v := range tmpstr {
  2177. winner := qu.ObjToString(j.PackageInfo[v]["winner"])
  2178. new_winner := clearWinnerReg.ReplaceAllString(winner, "")
  2179. if new_winner == "" {
  2180. continue
  2181. }
  2182. //名称黑名单
  2183. if unPackageWinnerReg.MatchString(new_winner) {
  2184. continue
  2185. }
  2186. savewinner = append(savewinner, new_winner)
  2187. }
  2188. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  2189. tmp["s_winner"] = tmp["winner"]
  2190. fieldSource["s_winner"] = fieldSource["winner"]
  2191. } else if savewinner != nil {
  2192. if len(savewinner) == 1 && tmp["winner"] != nil {
  2193. tmp["s_winner"] = tmp["winner"]
  2194. fieldSource["s_winner"] = fieldSource["winner"]
  2195. } else {
  2196. savewinner = RemoveReplicaSliceString(savewinner)
  2197. tmp["s_winner"] = strings.Join(savewinner, ",")
  2198. fieldSource["s_winner"] = map[string]interface{}{
  2199. "ext_type": "",
  2200. "ext_from": "package",
  2201. }
  2202. }
  2203. }
  2204. } else if tmp["winner"] != nil {
  2205. //没有分包取winner
  2206. tmp["s_winner"] = tmp["winner"]
  2207. fieldSource["s_winner"] = fieldSource["winner"]
  2208. }
  2209. if len(j.Winnerorder) > 0 { //候选人信息
  2210. for i, v := range j.Winnerorder {
  2211. if v["price"] != nil {
  2212. tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2213. if tmpPrice[len(tmpPrice)-1].(bool) {
  2214. j.Winnerorder[i]["price"] = tmpPrice[0]
  2215. } else {
  2216. delete(j.Winnerorder[i], "price")
  2217. }
  2218. }
  2219. }
  2220. tmp["winnerorder"] = j.Winnerorder
  2221. }
  2222. //处理附件
  2223. var resultf map[string][]*ju.ExtField
  2224. ffield := map[string]interface{}{}
  2225. if jf != nil {
  2226. _, resultf, _ = funcAnalysis(jf, e)
  2227. for _, val := range resultf {
  2228. for _, v := range val { //取第一个非负数
  2229. if v.Score > -1 {
  2230. ffield[v.Field] = v.Value
  2231. if tmp[v.Field] == nil {
  2232. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
  2233. tmp[v.Field] = v.Value
  2234. fieldSource[v.Field] = map[string]interface{}{
  2235. "ext_type": v.Type,
  2236. "ext_from": "ff",
  2237. }
  2238. break
  2239. }
  2240. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  2241. if v.Field == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
  2242. break //此方法逻辑上已经不会达到这里 winner
  2243. }
  2244. tmp[v.Field] = v.Value
  2245. fieldSource[v.Field] = map[string]interface{}{
  2246. "ext_type": v.Type,
  2247. "ext_from": "ff",
  2248. }
  2249. //中标单位~含字母判断~对比企业库
  2250. if (v.Field == "winner" || v.Field == "buyer") && letter_entity.MatchString(qu.ObjToString(v.SourceValue)) {
  2251. qyxy_data := make([]map[string]interface{}, 0)
  2252. ju.QyxySess.Find(map[string]interface{}{
  2253. "company_name": qu.ObjToString(v.SourceValue),
  2254. }).All(&qyxy_data)
  2255. if qyxy_data != nil && len(qyxy_data) > 0 {
  2256. tmp[v.Field] = v.SourceValue
  2257. }
  2258. }
  2259. break
  2260. }
  2261. }
  2262. break
  2263. }
  2264. }
  2265. }
  2266. if len(jf.PackageInfo) > 0 { //分包信息
  2267. ffield["package"] = jf.PackageInfo
  2268. }
  2269. if len(jf.Winnerorder) > 0 { //候选人信息
  2270. ffield["winnerorder"] = jf.Winnerorder
  2271. }
  2272. }
  2273. //添加字段来源
  2274. tmp["field_source"] = fieldSource
  2275. //是否为不规则表格字段
  2276. if j.IsUnRulesTab {
  2277. tmp["is_UnRules_Tab"] = j.IsUnRulesTab
  2278. }
  2279. for k, v := range *doc {
  2280. if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
  2281. (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
  2282. }
  2283. //去重冗余字段
  2284. if delFiled(k) {
  2285. continue
  2286. }
  2287. if tmp[k] == nil {
  2288. tmp[k] = v
  2289. }
  2290. }
  2291. //质量审核
  2292. if ju.QualityAudit {
  2293. e.QualityAudit(tmp)
  2294. }
  2295. //落款识别
  2296. inscribeRecognize(&tmp, *j.Data)
  2297. //城市抽取
  2298. if e.IsExtractCity {
  2299. //e.NewExtractCity(j, &tmp)
  2300. e.ExtractRegionInfo(j, &tmp, true)
  2301. }
  2302. //品牌抽取
  2303. if ju.IsBrandGoods {
  2304. tmp["checkhas"] = map[string]int{
  2305. "hastable": j.HasTable,
  2306. "hasgoods": j.HasGoods,
  2307. "hasbrand": j.HasBrand,
  2308. "haskey": j.HasKey,
  2309. }
  2310. if len(j.BrandData) > 0 {
  2311. tmp["tablebrand"] = j.BrandData
  2312. }
  2313. }
  2314. //prince和number抽取
  2315. if ju.IsPriceNumber {
  2316. priceNumberLen := len(j.PriceNumberData)
  2317. if priceNumberLen > 1 { //table数据去重
  2318. tmpPriceNumberData := []map[string]interface{}{}
  2319. tableStrs := map[string]bool{}
  2320. for _, tb := range j.PriceNumberData {
  2321. has := false
  2322. bytes, _ := json.Marshal(tb)
  2323. str := string(bytes)
  2324. if len(tableStrs) > 0 && tableStrs[str] {
  2325. has = true
  2326. } else {
  2327. tableStrs[str] = true
  2328. }
  2329. if !has {
  2330. for _, data := range tb {
  2331. tmpPriceNumberData = append(tmpPriceNumberData, data)
  2332. }
  2333. }
  2334. }
  2335. tmp["pricenumber"] = tmpPriceNumberData
  2336. } else if priceNumberLen == 1 {
  2337. tmp["pricenumber"] = j.PriceNumberData[0]
  2338. }
  2339. }
  2340. //所有kv组成的字符串
  2341. var kvtext bytes.Buffer
  2342. blocks := make([]ju.BlockAndTag, 0)
  2343. for _, v := range j.Block {
  2344. //分包和标签
  2345. if ju.SaveBlock {
  2346. xx, _ := json.Marshal(v)
  2347. tmpblock := new(ju.TmpBlock)
  2348. err := json.Unmarshal(xx, &tmpblock)
  2349. if err != nil {
  2350. if v.BPackage != nil {
  2351. bpb, _ := json.Marshal(v.BPackage)
  2352. tmpblock.BPackage = string(bpb)
  2353. }
  2354. tmpblock = rangeBlockToJson(v, *tmpblock)
  2355. }
  2356. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  2357. }
  2358. //把所有kv组装成一个字符串,存库
  2359. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  2360. if jv == nil {
  2361. continue
  2362. }
  2363. for jv_k, jv_v := range jv.KvTags {
  2364. for _, jv_vv := range jv_v {
  2365. kvtext.WriteString(jv_k)
  2366. kvtext.WriteString(":")
  2367. kvtext.WriteString(jv_vv.Value)
  2368. kvtext.WriteString("\n")
  2369. }
  2370. }
  2371. }
  2372. }
  2373. if kvtext.Len() > 0 {
  2374. tmp["kvtext"] = kvtext.String()
  2375. }
  2376. if len(blocks) > 0 {
  2377. if blocksBytes, err := json.Marshal(blocks); err == nil {
  2378. if utf8.RuneCount(blocksBytes) < 100000 {
  2379. tmp["blocks"] = string(blocksBytes)
  2380. }
  2381. }
  2382. }
  2383. tmp["dataging"] = j.Dataging
  2384. /*for k, v := range *j.Data {
  2385. if f[k] {
  2386. tmp[k] = v
  2387. }
  2388. }
  2389. for k := range tmp {
  2390. if !f[k]{
  2391. delete(tmp,k)
  2392. }
  2393. }*/
  2394. //检查字段
  2395. tmp = checkFields(tmp, *j.Data)
  2396. if tmp["projectname"] == nil || tmp["projectname"] == "" {
  2397. tmp["projectname"] = j.Title
  2398. }
  2399. tmp["repeat"] = 0
  2400. if ju.Ffield {
  2401. if len(ffield) > 0 {
  2402. tmp["ffield"] = ffield
  2403. }
  2404. }
  2405. if e.TaskInfo.TestColl == "" {
  2406. if len(tmp) > 0 { //保存抽取结果
  2407. delete(tmp, "_id")
  2408. tmparr := []map[string]interface{}{
  2409. map[string]interface{}{
  2410. "_id": qu.StringTOBsonId(_id),
  2411. },
  2412. map[string]interface{}{"$set": tmp},
  2413. }
  2414. e.RWMutex.Lock()
  2415. e.BidArr = append(e.BidArr, tmparr)
  2416. e.BidTotal++
  2417. e.RWMutex.Unlock()
  2418. }
  2419. if ju.SaveResult {
  2420. id := tmp["_id"]
  2421. tmp["result"] = result
  2422. tmp["resultf"] = resultf
  2423. delete(tmp, "_id")
  2424. tmparr := []map[string]interface{}{
  2425. map[string]interface{}{
  2426. "_id": id,
  2427. },
  2428. map[string]interface{}{"$set": tmp},
  2429. }
  2430. e.RWMutex.Lock()
  2431. e.ResultArr = append(e.ResultArr, tmparr)
  2432. e.RWMutex.Unlock()
  2433. }
  2434. } else { //测试结果~结果追踪
  2435. delete(tmp, "_id")
  2436. delete(tmp, "fieldall")
  2437. if len(j.BlockPackage) > 0 { //分包详情
  2438. if len(j.BlockPackage) > 10 {
  2439. tmp["epackage"] = "分包异常"
  2440. } else {
  2441. bs, _ := json.Marshal(j.BlockPackage)
  2442. tmp["epackage"] = string(bs)
  2443. }
  2444. }
  2445. tmp["result"] = result
  2446. //tmp["resultf"] = resultf
  2447. //_,err :=db.Mgo.Get().DB("zhengkun").C("result_data").Upsert(`{"_id":"`+_id+`"}`,map[string]interface{}{"$set": tmp})
  2448. //log.Debug("save:",err)
  2449. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  2450. if !b {
  2451. log.Debug(e.TaskInfo.TestColl, _id)
  2452. }
  2453. }
  2454. }, func(err interface{}) {
  2455. log.Debug("AnalysisSaveResult err", err)
  2456. })
  2457. }
  2458. //检查字段-
  2459. func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
  2460. delete(tmp, "contenthtml")
  2461. delete(tmp, "detail")
  2462. //对于招标类信息~若winner没有值~过滤掉中标相关信息
  2463. if qu.ObjToString(tmp["toptype"]) == "招标" &&
  2464. qu.ObjToString(tmp["subtype"]) != "单一" {
  2465. delete(tmp, "winner")
  2466. delete(tmp, "s_winner")
  2467. delete(tmp, "bidamount")
  2468. delete(tmp, "winnerorder")
  2469. //if qu.ObjToString(tmp["winner"])=="" || qu.ObjToString(tmp["winner"])=="有限公司"{
  2470. // delete(tmp,"winner")
  2471. // delete(tmp,"s_winner")
  2472. // delete(tmp,"bidamount")
  2473. // delete(tmp,"winnerorder")
  2474. //}
  2475. }
  2476. tmp["repeat"] = 0
  2477. //指定爬虫-金额处理-预算-中标金额异常
  2478. if qu.ObjToString(tmp["spidercode"]) == "xz_xzzzqjzscjgycxxxpt_zbtzs" {
  2479. if budget, ok := tmp["budget"].(float64); ok && budget > 0 && budget < 1000000 {
  2480. tmp["budget"] = budget * 10000.0
  2481. }
  2482. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  2483. tmp["bidamount"] = bidamount / 10000.0
  2484. }
  2485. }
  2486. if qu.ObjToString(tmp["spidercode"]) == "js_jsszbtbw_zbhxrgs" {
  2487. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  2488. tmp["bidamount"] = bidamount / 10000.0
  2489. }
  2490. }
  2491. //金额比例异常-
  2492. if _, ok := tmp["bidamount"].(string); ok {
  2493. delete(tmp, "bidamount")
  2494. }
  2495. /*
  2496. else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/10 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
  2497. //比例限制打开
  2498. if fb > 1000.0 && fb < 100000000.0 {
  2499. } else {
  2500. delete(tmp, "bidamount")
  2501. }
  2502. }
  2503. */
  2504. if _, ok := tmp["budget"].(string); ok {
  2505. delete(tmp, "budget")
  2506. }
  2507. if _, ok := tmp["unitprice"].(string); ok {
  2508. delete(tmp, "unitprice")
  2509. }
  2510. if _, ok := tmp["bidopentime"].(string); ok {
  2511. delete(tmp, "bidopentime")
  2512. }
  2513. if _, ok := tmp["signaturedate"].(string); ok {
  2514. delete(tmp, "signaturedate")
  2515. }
  2516. if _, ok := tmp["supervisorrate"].(string); ok {
  2517. delete(tmp, "supervisorrate")
  2518. }
  2519. //快速过滤一遍特殊字段
  2520. for k, v := range tmp {
  2521. if k == "qualifies" {
  2522. continue
  2523. }
  2524. if k == "contract_guarantee" || k == "bid_guarantee" ||
  2525. k == "is_acquire_tender" {
  2526. if len(fmt.Sprint(v)) > 0 {
  2527. tmp[k] = true
  2528. } else {
  2529. delete(tmp, k)
  2530. }
  2531. }
  2532. if k == "is_joint_bidding" || k == "is_payment_deposit" {
  2533. if fmt.Sprint(v) == "true" {
  2534. tmp[k] = true
  2535. } else {
  2536. delete(tmp, k)
  2537. }
  2538. }
  2539. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
  2540. delete(tmp, k)
  2541. }
  2542. }
  2543. //特殊字段~根绝其他字段处理
  2544. bid_bond := qu.ObjToString(tmp["bid_bond"])
  2545. if bid_bond != "" && tmp["is_payment_deposit"] == nil {
  2546. if strings.Contains(bid_bond, "保证金") &&
  2547. !clearbondReg.MatchString(bid_bond) {
  2548. tmp["is_payment_deposit"] = true
  2549. }
  2550. }
  2551. //特殊字段~根绝其他字段处理
  2552. bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
  2553. if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
  2554. if utf8.RuneCountInString(bidopenaddress) > 5 {
  2555. tmp["bidopen_shape"] = "线下开标"
  2556. }
  2557. }
  2558. //项目周期-有效值
  2559. projectperiod := qu.ObjToString(tmp["projectperiod"])
  2560. if projectperiod != "" {
  2561. //项目周期包含日期,数字及日期单位可保留,其余可清洗
  2562. isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
  2563. if !isNeedValueReg.MatchString(projectperiod) {
  2564. delete(tmp, "projectperiod")
  2565. }
  2566. }
  2567. //工期单位是否有效-清理
  2568. if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
  2569. dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
  2570. if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit) > 4 {
  2571. delete(tmp, "project_timeunit")
  2572. }
  2573. //年-0 >5 删除
  2574. if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"]) == 0 || qu.Int64All(tmp["project_duration"]) > 5) {
  2575. delete(tmp, "project_timeunit")
  2576. }
  2577. }
  2578. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  2579. strwin := qu.ObjToString(tmp["winner"])
  2580. strwin_s := qu.ObjToString(tmp["s_winner"])
  2581. if !strings.Contains(strwin_s, strwin) {
  2582. tmp["s_winner"] = strwin
  2583. }
  2584. }
  2585. //budget bidamount
  2586. if bg, ok := tmp["budget"].(float64); ok {
  2587. if bg >= 50000000000 {
  2588. tmp["budget_max_err"] = bg
  2589. delete(tmp, "budget")
  2590. }
  2591. }
  2592. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
  2593. code := qu.ObjToString(tmp["spidercode"])
  2594. if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" &&
  2595. code != "js_jsszbtbw_zbhxrgs" {
  2596. tmp["bidamount_max_err"] = bg
  2597. delete(tmp, "bidamount")
  2598. }
  2599. }
  2600. //投标方式-
  2601. bidway := qu.IntAll(tmp["bidway"])
  2602. if bidway == 1 {
  2603. tmp["bidway"] = "纸质投标"
  2604. } else if bidway == 2 {
  2605. tmp["bidway"] = "电子投标"
  2606. } else {
  2607. delete(tmp, "bidway")
  2608. }
  2609. //折扣系数
  2610. discount := dealWithDiscountBid(tmp)
  2611. if discount > 0.0 {
  2612. tmp["biddiscount"] = discount
  2613. } else {
  2614. delete(tmp, "biddiscount")
  2615. }
  2616. delete(tmp, "biddiscount_up")
  2617. delete(tmp, "biddiscount_down")
  2618. //临时
  2619. //bidstarttime := qu.Int64All(tmp["bidstarttime"])
  2620. //docendtime := qu.Int64All(tmp["docendtime"])
  2621. //timeLayout := "2006-01-02 15:04:05"
  2622. //if bidstarttime>0 {
  2623. // time_1 := time.Unix(bidstarttime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
  2624. // tmp["bidstarttime"] = time_1
  2625. //}
  2626. //if docendtime>0 {
  2627. // time_2 := time.Unix(docendtime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
  2628. // tmp["docendtime"] = time_2
  2629. //}
  2630. jyhref := fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
  2631. tmp["jytest_href"] = jyhref
  2632. //检查剑鱼发布-爬虫
  2633. jyfb_data := *qu.ObjToMap(j_data["jyfb_data"])
  2634. if jyfb_data != nil {
  2635. for k, v := range jyfb_data {
  2636. if k == "area" {
  2637. delete(tmp, "district")
  2638. }
  2639. tmp[k] = v
  2640. }
  2641. }
  2642. //return tmp
  2643. //针对拟建单位~需要验证~各种字段优先级
  2644. if qu.ObjToString(tmp["toptype"]) == "拟建" &&
  2645. qu.ObjToString(tmp["subtype"]) == "拟建" {
  2646. nj_record := map[string]interface{}{}
  2647. for _, v := range NiJianField {
  2648. arr := strings.Split(v, "#")
  2649. k_type, k_field := "", ""
  2650. if len(arr) == 2 {
  2651. k_type, k_field = arr[0], arr[1]
  2652. } else {
  2653. continue
  2654. }
  2655. tmpValue := tmp[k_field]
  2656. is_use := false
  2657. if k_type == "string" {
  2658. if qu.ObjToString(j_data[k_field]) != "" {
  2659. is_use = true
  2660. tmp[k_field] = qu.ObjToString(j_data[k_field])
  2661. }
  2662. } else if k_type == "time" {
  2663. //开竣工日期~采集为字符串
  2664. if qu.ObjToString(j_data[k_field]) != "" {
  2665. //特殊~需要转换
  2666. new_data := clear.ObjToTimestamp([]interface{}{j_data[k_field]}, "")
  2667. if len(new_data) > 0 {
  2668. if qu.Int64All(new_data[0]) > 0 {
  2669. is_use = true
  2670. tmp[k_field] = qu.Int64All(new_data[0])
  2671. //记录历史日期值
  2672. new_k := "s_" + k_field
  2673. nj_record[new_k] = map[string]interface{}{
  2674. k_field: j_data[k_field],
  2675. }
  2676. }
  2677. }
  2678. }
  2679. } else if k_type == "map" {
  2680. p_info := *qu.ObjToMap(j_data["project_scale_info"])
  2681. if qu.ObjToString(p_info[k_field]) != "" {
  2682. is_use = true
  2683. tmp[k_field] = qu.ObjToString(p_info[k_field])
  2684. }
  2685. }
  2686. if tmpValue != nil {
  2687. nj_record[k_field] = map[string]interface{}{
  2688. k_field: tmpValue,
  2689. "is_use": is_use,
  2690. }
  2691. }
  2692. }
  2693. if len(nj_record) > 0 {
  2694. tmp["nj_record"] = nj_record
  2695. }
  2696. }
  2697. return tmp
  2698. }
  2699. //落款识别~采购单位
  2700. func inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
  2701. //落款实体
  2702. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
  2703. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  2704. new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]))
  2705. if new_buyer != "" {
  2706. (*tmp)["buyer"] = new_buyer
  2707. }
  2708. }
  2709. //拟建不能存buyer
  2710. if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
  2711. qu.ObjToString((*tmp)["subtype"]) == "拟建" {
  2712. delete((*tmp), "buyer")
  2713. }
  2714. }
  2715. func InscribeEntity(detail string) string {
  2716. //去除标签
  2717. new_str := ""
  2718. new_detail := pretreated.TextAfterRemoveTable(detail)
  2719. if len(new_detail) > 200 {
  2720. new_detail = detail[len(new_detail)-200:]
  2721. }
  2722. new_str = inscribe_entity_1.FindString(new_detail)
  2723. if new_str == "" {
  2724. new_str = inscribe_entity_2.FindString(new_detail)
  2725. if new_str != "" {
  2726. str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}")
  2727. str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}")
  2728. if str1 == str2 && str1 != "" {
  2729. new_str = str1
  2730. }
  2731. }
  2732. } else {
  2733. new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
  2734. }
  2735. if new_str != "" && exclude_entity.MatchString(new_str) {
  2736. new_str = ""
  2737. }
  2738. return new_str
  2739. }
  2740. //处理折扣系数-
  2741. func dealWithDiscountBid(tmp map[string]interface{}) float64 {
  2742. biddiscount := qu.Float64All(tmp["biddiscount"])
  2743. biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
  2744. biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
  2745. baseCount := float64(1)
  2746. if biddiscount_down > 0.0 {
  2747. num1 := decimal.NewFromFloat(baseCount)
  2748. num2 := decimal.NewFromFloat(biddiscount_down)
  2749. decimalValue := num1.Sub(num2)
  2750. res, _ := decimalValue.Float64()
  2751. return res
  2752. }
  2753. if biddiscount_up > 0.0 {
  2754. num1 := decimal.NewFromFloat(baseCount)
  2755. num2 := decimal.NewFromFloat(biddiscount_up)
  2756. decimalValue := num1.Add(num2)
  2757. res, _ := decimalValue.Float64()
  2758. //log.Debug("上浮后折扣系数:",res)
  2759. return res
  2760. }
  2761. if biddiscount > 0.0 {
  2762. if biddiscount > 1.0 && biddiscount <= 10.0 {
  2763. num1 := decimal.NewFromFloat(10.0)
  2764. num2 := decimal.NewFromFloat(biddiscount)
  2765. decimalValue := num2.Div(num1)
  2766. res, _ := decimalValue.Float64()
  2767. return res
  2768. } else if biddiscount > 10.0 {
  2769. num1 := decimal.NewFromFloat(100.0)
  2770. num2 := decimal.NewFromFloat(biddiscount)
  2771. decimalValue := num2.Div(num1)
  2772. res, _ := decimalValue.Float64()
  2773. //log.Debug("标准-⑩折扣系数:",res)
  2774. return res
  2775. } else {
  2776. //log.Debug("标准折扣系数:",biddiscount)
  2777. return biddiscount
  2778. }
  2779. }
  2780. return 0.0
  2781. }
  2782. //精度丢失-相加
  2783. func precisionAddFloat(tmp1, tmp2 float64) float64 {
  2784. num1 := decimal.NewFromFloat(tmp1)
  2785. num2 := decimal.NewFromFloat(tmp2)
  2786. decimalValue := num2.Add(num1)
  2787. res, _ := decimalValue.Float64()
  2788. return res
  2789. }
  2790. //保存其他
  2791. //kv、表格、块上的标签凡是新的标签都入库
  2792. //val type times firstid createtime 判定field
  2793. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  2794. now := time.Now().Unix()
  2795. coll := e.TaskInfo.TestColl
  2796. if coll == "" {
  2797. coll = "extract_tag_result"
  2798. } else {
  2799. coll += "_tag"
  2800. }
  2801. datas := []map[string]interface{}{}
  2802. kv := map[string]int{}
  2803. for _, v := range j.Block {
  2804. //
  2805. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  2806. if vv == nil || vv.KvTags == nil {
  2807. continue
  2808. }
  2809. for kkk, vvv := range vv.KvTags {
  2810. for _, vvvv := range vvv {
  2811. if vvvv.IsInvalid {
  2812. kv[kkk] = kv[kkk] + 1
  2813. break
  2814. }
  2815. }
  2816. }
  2817. }
  2818. for _, vv := range v.NotClassifyTitles {
  2819. datas = append(datas, map[string]interface{}{
  2820. "val": vv,
  2821. "times": 0,
  2822. "type": "block",
  2823. "firstid": j.SourceMid,
  2824. "createtime": now,
  2825. })
  2826. if len(datas) == saveLimit {
  2827. db.Mgo.SaveBulk(coll, datas...)
  2828. datas = []map[string]interface{}{}
  2829. }
  2830. }
  2831. }
  2832. for k, v := range kv {
  2833. datas = append(datas, map[string]interface{}{
  2834. "val": k,
  2835. "times": v,
  2836. "type": "kv",
  2837. "firstid": j.SourceMid,
  2838. "createtime": now,
  2839. })
  2840. if len(datas) == saveLimit {
  2841. db.Mgo.SaveBulk(coll, datas...)
  2842. datas = []map[string]interface{}{}
  2843. }
  2844. }
  2845. if len(datas) > 0 {
  2846. db.Mgo.SaveBulk(coll, datas...)
  2847. }
  2848. }
  2849. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2850. if j == nil {
  2851. return nil
  2852. }
  2853. if len(j.Block) > 0 {
  2854. for i, v := range j.Block {
  2855. rangetmp := new(ju.TmpBlock)
  2856. vb, _ := json.Marshal(v)
  2857. json.Unmarshal(vb, &rangetmp)
  2858. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2859. }
  2860. }
  2861. if j.ColonKV != nil {
  2862. cb, _ := json.Marshal(j.ColonKV)
  2863. tmpblock.ColonKV = string(cb)
  2864. }
  2865. if j.SpaceKV != nil {
  2866. sb, _ := json.Marshal(j.SpaceKV)
  2867. tmpblock.SpaceKV = string(sb)
  2868. }
  2869. if j.TableKV != nil {
  2870. tb, _ := json.Marshal(j.TableKV)
  2871. tmpblock.TableKV = string(tb)
  2872. }
  2873. return &tmpblock
  2874. }
  2875. //去重冗余字段
  2876. func delFiled(k string) bool {
  2877. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2878. }
// funcAnalysis scores and sorts the extraction candidates of a job and then
// reconciles them with the job's Jsondata.
//
// It returns j.Data, the candidate map returned by ScoreFields, and the string
// form of the document "_id". Panics are handed to the deferred qu.Catch.
//
// NOTE(review): the returned result is the value ScoreFields produced; j.Result
// may be reassigned further down by JsonDataMergeProcessing. Whether the two
// refer to the same map depends on those helpers — confirm.
func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
	defer qu.Catch()
	doc := j.Data
	result := j.Result
	_id := qu.BsonIdToSId((*doc)["_id"])
	result = ScoreFields(j, e.Tag) // score candidates with positive/negative words
	// Sort every field's candidate list.
	for _, val := range result {
		ju.Sort(val)
	}
	// Only reconcile when there are results and a non-empty Jsondata.
	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
		clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
		// Snapshot Jsondata through a JSON round-trip; it is put back at the
		// end of this branch so the deletions below do not persist on the job.
		marshalbt, _ := json.Marshal(j.Jsondata)
		tmpjddata := make(map[string]interface{})
		json.Unmarshal(marshalbt, &tmpjddata)
		for _, jdkey := range ju.JsonData {
			// Consider a key only if Jsondata has a non-empty value for it and
			// the field already has at least five candidates.
			if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
				// Compare the Jsondata value against the first five candidates.
				for tmpk, tmpv := range j.Result[jdkey][:5] {
					if jdkey == "budget" || jdkey == "bidamount" {
						// Money fields: run the configured clear functions on the
						// Jsondata value before comparing.
						lockclear.Lock()
						cfn := e.ClearFn[jdkey]
						lockclear.Unlock()
						if len(cfn) == 0 {
							continue
						}
						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney)
						if tmpv.Value == newNum[0] {
							// The cleaned Jsondata value agrees with a candidate: add it
							// as a score-100 candidate, re-sort, and drop the key from
							// Jsondata. IsTrue is taken from the last element of newNum.
							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
							j.Result[jdkey] = append(j.Result[jdkey], extField)
							ju.Sort(j.Result[jdkey])
							delete((*j.Jsondata), jdkey)
							break
						}
					} else {
						// Other fields: compare the raw Jsondata value directly.
						if (*j.Jsondata)[jdkey] == tmpv.Value {
							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
							j.Result[jdkey] = append(j.Result[jdkey], extField)
							ju.Sort(j.Result[jdkey])
							delete((*j.Jsondata), jdkey)
							break
						}
					}
				}
			}
		}
		// Keys still left in Jsondata were not matched above; hand them to the
		// merge step.
		if len(*j.Jsondata) > 0 {
			j.Result = JsonDataMergeProcessing(j, e)
		}
		// Put the snapshot taken before the deletions back on the job.
		j.Jsondata = &tmpjddata
	}
	return doc, result, _id
}
  2932. //辅助信息,如果没有排序先排序
  2933. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2934. fieldalls := map[string][]map[string]interface{}{}
  2935. if j == nil {
  2936. return fieldalls
  2937. }
  2938. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2939. defer qykredis.Close()
  2940. db := 0
  2941. for field, val := range j.Result {
  2942. //ju.Sort(val)
  2943. if field == "buyer" {
  2944. db = ju.BuyerDB
  2945. } else if field == "winner" {
  2946. db = ju.WinnerDB
  2947. } else if field == "agency" {
  2948. db = ju.AgencyDB
  2949. }
  2950. sfields := []map[string]interface{}{}
  2951. for _, v := range val {
  2952. standardized := false
  2953. if _, err := qykredis.Do("SELECT", db); err != nil {
  2954. fmt.Println("redis select err", err)
  2955. } else {
  2956. rep, err := qykredis.Do("GET", v.Value)
  2957. if rep != nil && err == nil {
  2958. standardized = true
  2959. }
  2960. }
  2961. if field == "budget" || field == "bidamount" {
  2962. if !v.IsTrue {
  2963. continue
  2964. }
  2965. }
  2966. sfield := map[string]interface{}{
  2967. "val": v.Value,
  2968. "type": v.Type,
  2969. "score": v.Score,
  2970. "blocktag": v.BlockTag,
  2971. "sourceval": v.SourceValue,
  2972. "standardized": standardized,
  2973. }
  2974. sfields = append(sfields, sfield)
  2975. }
  2976. fieldalls[field] = sfields
  2977. }
  2978. return fieldalls
  2979. }
  2980. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2981. defer qu.Catch()
  2982. //获取审核字段
  2983. for _, field := range e.AuditFields {
  2984. //1.分包
  2985. if resulttmp["package"] != nil {
  2986. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2987. for _, val := range packagedata {
  2988. if val[field] != nil {
  2989. fv := qu.ObjToString(val[field])
  2990. if fv != "" {
  2991. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2992. e.RedisMatch(field, fv, val) //redis匹配
  2993. } else { //除了buyer和winner,其他字段走规则匹配
  2994. e.RuleMatch(field, fv, val)
  2995. }
  2996. }
  2997. }
  2998. }
  2999. }
  3000. //2.外围
  3001. if resulttmp[field] != nil {
  3002. fv := qu.ObjToString(resulttmp[field])
  3003. if fv != "" {
  3004. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  3005. e.RedisMatch(field, fv, resulttmp) //redis匹配
  3006. } else { //除了buyer和winner,其他字段走规则匹配
  3007. e.RuleMatch(field, fv, resulttmp)
  3008. }
  3009. }
  3010. }
  3011. }
  3012. }
  3013. //Redis匹配
  3014. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  3015. defer qu.Catch()
  3016. i := redis.GetInt(field, field+"_"+fv) //查找redis
  3017. if i == 0 { //reids未找到,执行规则匹配
  3018. val[field+"_isredis"] = false
  3019. e.RuleMatch(field, fv, val) //规则匹配
  3020. } else { //redis找到,打标识存库
  3021. val[field+"_isredis"] = true
  3022. }
  3023. }
  3024. //规则匹配
  3025. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  3026. defer qu.Catch()
  3027. if fieldval != "" {
  3028. SMap := e.StartMatch(field, fieldval)
  3029. //SMap.AddKey(field+"_isaudit", false)
  3030. for _, k := range SMap.Keys {
  3031. tmpMap[k] = SMap.Map[k]
  3032. }
  3033. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  3034. }
  3035. }
  3036. //开始规则匹配
  3037. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  3038. defer qu.Catch()
  3039. SMap := pretreated.NewSortMap()
  3040. lock.Lock()
  3041. f := e.RecogFieldMap[field]
  3042. lock.Unlock()
  3043. if len(f) > 0 {
  3044. fid := qu.BsonIdToSId(f["_id"])
  3045. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  3046. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  3047. if textAfterRecogFieldPrerule != "" {
  3048. lock.Lock()
  3049. classMap := e.FidClassMap[fid]
  3050. lock.Unlock()
  3051. L:
  3052. for _, c := range classMap { //class
  3053. classid := qu.BsonIdToSId(c["_id"])
  3054. classPrerule := qu.ObjToString(c["s_class_prerule"])
  3055. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  3056. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  3057. if textAfterClassPrerule != "" {
  3058. lock.Lock()
  3059. ruleMap := e.CidRuleMap[classid]
  3060. lock.Unlock()
  3061. for _, r := range ruleMap { //rule
  3062. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  3063. s_name := qu.ObjToString(r["s_name"])
  3064. rule := r["rule"].([]interface{})
  3065. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  3066. if textAfterRulePrerule != "" {
  3067. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  3068. if b { //匹配到一个分类下某个规则时,不再继续匹配
  3069. if savefield != "" { //保存字段不为空,存储代码信息
  3070. SMap.AddKey(field+"_"+savefield, s_name)
  3071. }
  3072. break L
  3073. }
  3074. }
  3075. }
  3076. }
  3077. }
  3078. }
  3079. }
  3080. return SMap
  3081. }
  3082. //筛选重复候选人-相关
  3083. func filterRepeatWinArr(j *ju.Job) {
  3084. if j.SpiderCode == "sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
  3085. sort_WinOrder_Arr := make([][]map[string]interface{}, 0)
  3086. sort_arr := make([]map[string]interface{}, 0)
  3087. for _, v := range j.Winnerorder {
  3088. sort := qu.IntAll(v["sort"])
  3089. if sort == 1 { //为一组
  3090. if len(sort_arr) > 0 {
  3091. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  3092. }
  3093. sort_arr = make([]map[string]interface{}, 0)
  3094. }
  3095. sort_arr = append(sort_arr, v)
  3096. }
  3097. if len(sort_arr) > 0 {
  3098. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  3099. }
  3100. if len(sort_WinOrder_Arr) > 0 { //有重复排序组-开始筛选清理
  3101. isIndex := 0
  3102. for index, winArr := range sort_WinOrder_Arr {
  3103. if len(winArr) > 0 {
  3104. if qu.ObjToString(winArr[0]["price"]) != "" &&
  3105. qu.ObjToString(winArr[0]["entname"]) != "" {
  3106. isIndex = index
  3107. break
  3108. }
  3109. }
  3110. }
  3111. j.Winnerorder = sort_WinOrder_Arr[isIndex]
  3112. }
  3113. }
  3114. }
  3115. //中标候选人经过清理之后,重新取出赋值
  3116. func resetWinnerorder(j *ju.Job) {
  3117. if len(j.Winnerorder) == 0 {
  3118. return
  3119. }
  3120. maxlen := len(j.Winnerorder) - 1
  3121. //中标单位
  3122. //i := 0
  3123. winners := []*ju.ExtField{}
  3124. bidamounts := []*ju.ExtField{}
  3125. if maxlen > 0 {
  3126. //新增-指定爬虫中标候选人过滤
  3127. filterRepeatWinArr(j)
  3128. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  3129. return
  3130. }
  3131. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  3132. if j.Winnerorder[0]["price"] != nil {
  3133. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  3134. if tmpPrice[len(tmpPrice)-1].(bool) {
  3135. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  3136. }
  3137. }
  3138. }
  3139. if j.Result["winner"] == nil && len(winners) > 0 {
  3140. j.Result["winner"] = winners
  3141. } else if len(winners) > 0 {
  3142. j.Result["winner"] = append(j.Result["winner"], winners...)
  3143. }
  3144. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  3145. j.Result["bidamount"] = bidamounts
  3146. } else if len(bidamounts) > 0 {
  3147. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  3148. }
  3149. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  3150. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  3151. j.Result["winner"] = winners
  3152. if j.Winnerorder[0]["price"] != nil {
  3153. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  3154. if tmpPrice[len(tmpPrice)-1].(bool) {
  3155. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  3156. }
  3157. j.Result["bidamount"] = bidamounts
  3158. }
  3159. }
  3160. }
  3161. func RemoveReplicaSliceString(slc []string) []string {
  3162. result := make([]string, 0)
  3163. tempMap := make(map[string]bool, len(slc))
  3164. for _, e := range slc {
  3165. if tempMap[e] == false {
  3166. tempMap[e] = true
  3167. result = append(result, e)
  3168. }
  3169. }
  3170. return result
  3171. }
// scoreIndex pairs a floating-point score with an integer index.
// NOTE(review): presumably Index is a position in a candidate slice so that
// entries can be ordered by Score — confirm against the usages outside this
// chunk.
type scoreIndex struct {
	Score float64 // score value
	Index int     // index associated with the score
}