extract.go 79 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
// Package-level state shared by the extraction tasks in this file.
var (
	// Package-wide read/write locks. In the visible code lockrule is held while
	// ExtractDetail copies rule tables and lockclear while it looks up clean
	// functions; the users of lock, locktag and blocktag are outside this view.
	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
	// cut fetches the body text and cleans it (PreInfo calls cut.ClearHtml).
	cut = ju.NewCut()
	// ExtLogs holds extraction log entries per task info.
	ExtLogs map[*TaskInfo][]map[string]interface{}
	// TaskList is the registry of extraction tasks keyed by task id.
	TaskList map[string]*ExtractTask
	// ClearTaskList is the registry of cleanup tasks.
	ClearTaskList map[string]*ClearTask
	// saveLimit is the batch size used when saving extraction logs.
	saveLimit = 100
	// PageSize is the page size used when querying source documents.
	PageSize = 5000
	// Fields is the field projection passed to Find when loading source documents.
	Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
	//Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
	// Fields2 is a second, smaller projection; its users are outside this view.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
	/*f = map[string]bool{
	"T": true,
	"_d": true,
	"area": true,
	"channel": true,
	"comeintime": true,
	"competehref": true,
	"href": true,
	"l_np_publishtime": true,
	"publishtime": true,
	"sendflag": true,
	"site": true,
	"spidercode": true,
	"title": true,
	"projectname": true,
	}*/
	/*f = map[string]bool{
	"contentid": true,
	"progName": true,
	"updateTime": true,
	"url": true,
	"areaId": true,
	"areaName": true,
	"popTitle": true,
	"showTitle": true,
	"progId": true,
	"catid": true,
	"isConcern": true,
	"followCount": true,
	"followSuggestion": true,
	"isBoutique": true,
	"canTj": true,
	"tenderAmountNumber": true,
	"tenderAmountUnit": true,
	"bidderAmountNumber": true,
	"bidderAmountUnit": true,
	"registrationBeginTime": true,
	"registrationEndTime": true,
	"starNum": true,
	"title": true,
	"proInvested": true,
	"projectname": true,
	}*/
	// spidercode lists spider codes whose documents RunExtractTestTask skips
	// (marked there as temporary bid-opening records).
	spidercode = map[string]bool{
		"gd_zhsggzyjyzx_jsgc_fjczbgg":     true,
		"js_szgyyqggzyjyzx_jsgc_zjfbgs":   true,
		"zj_tzsyhggzyjyzx_jsgc_kbqk":      true,
		"hb_tmsggzyjyxxw_jsgc_kbqk":       true,
		"zj_nbsyyggzyjyw_jsgc_kbqk":       true,
		"zj_zjsggzyjyzx_jyxx_kbjg":        true,
		"zj_zjzdgcjyw_ztbjglxx_kbjg":      true,
		"zj_lssggzyjyw_jsgc_kbsk":         true,
		"zj_qzslyxggzyjyzx_gggs_xkbjl":    true,
		"sc_mssggzydzjypt_jsgc_kbjl":      true,
		"sc_pzhsggzyjyfwzx_jsgc_kbylb":    true,
		"a_zgzbtbggfwpt_wasjgf_ss_kbjl":   true,
		"a_hbszbtbggfwpt_kbjl":            true,
		"a_szsjsgcjyfwzxbafzx_kbqkgs":     true,
		"a_szldzbyxgs_kbxx":               true,
		"zj_zssssxggzyjyw_gcjs_kbjggs":    true,
		"gd_szszfhjsj_kbqkgs":             true,
		"a_gjggzyjypt_gcjs_kbjl":          true,
		"a_gjggzyjypt_gcjs_kbjl_new":      true,
		"zj_tzsyhggzyjyzx_kbjggg":         true,
		"a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
		"ah_czsggzyjyw_jsgc_kbjl":         true,
		"ah_czsggzyjyw_zfcg_kbxx":         true,
		"ah_whsggzyjyfww_kbxx_cgxm":       true,
		"ah_whsggzyjyfww_kbxx_gcxm":       true,
	}
)
  104. //启动测试抽取
  105. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  106. defer qu.Catch()
  107. ext := TaskList[taskId]
  108. if ext == nil {
  109. ext = &ExtractTask{}
  110. ext.Id = taskId
  111. ext.InitTestTaskInfo(resultcoll, trackcoll)
  112. ext.IsRun = true
  113. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  114. }
  115. ext.InitSite()
  116. ext.InitRulePres()
  117. ext.InitRuleBacks(false)
  118. ext.InitRuleBacks(true)
  119. ext.InitRuleCore(false)
  120. ext.InitRuleCore(true)
  121. ext.InitPkgCore()
  122. ext.InitBlockRule()
  123. ext.InfoTypeList()
  124. ext.InitTag(false)
  125. ext.InitTag(true)
  126. ext.InitClearFn(false)
  127. ext.InitClearFn(true)
  128. ext.Lock()
  129. ext.IsExtractCity = false
  130. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  131. //初始化城市DFA信息
  132. ext.InitCityInfo()
  133. //ext.InitCityDFA()
  134. ext.InitAreaCode()
  135. ext.InitPostCode()
  136. }
  137. ext.Unlock()
  138. //质量审核
  139. ext.InitAuditFields()
  140. ext.InitAuditRule()
  141. ext.InitAuditClass()
  142. ext.InitAuditRecogField()
  143. //品牌抽取是否开启
  144. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  145. //价格个数抽取是否开启
  146. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  147. //附件抽取是否开启
  148. ext.InitFile()
  149. ext.TaskInfo.TestColl = resultcoll
  150. TaskList[taskId] = ext
  151. return RunExtractTestTask(ext, startId, num)
  152. }
  153. func IdTrans(startId string) bson.ObjectId {
  154. defer qu.Catch()
  155. return bson.ObjectIdHex(startId)
  156. }
  157. //开始测试任务抽取
  158. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  159. n, _ := strconv.Atoi(num)
  160. id := IdTrans(startId)
  161. if id.Valid() {
  162. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  163. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  164. for _, v := range *list {
  165. //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
  166. // continue
  167. //}
  168. if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
  169. continue
  170. }
  171. var j, jf *ju.Job
  172. var isSite bool
  173. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  174. v["isextFile"] = true
  175. j, jf, isSite = ext.PreInfo(v)
  176. } else {//无附件
  177. j, _, isSite = ext.PreInfo(v)
  178. }
  179. go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
  180. ext.TaskInfo.ProcessPool <- true
  181. }
  182. return true
  183. } else {
  184. return false
  185. }
  186. }
  187. //启动抽取
  188. func StartExtractTaskId(taskId string) bool {
  189. defer qu.Catch()
  190. isgo := false
  191. ext := TaskList[taskId]
  192. if ext == nil {
  193. ext = &ExtractTask{}
  194. ext.Id = taskId
  195. ext.InitTaskInfo()
  196. isgo = true
  197. } else {
  198. ext.Id = taskId
  199. ext.InitTaskInfo()
  200. }
  201. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  202. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  203. ext.InitSite()
  204. ext.InitRulePres()
  205. ext.InitRuleBacks(false)
  206. ext.InitRuleBacks(true)
  207. ext.InitRuleCore(false)
  208. ext.InitRuleCore(true)
  209. ext.InitPkgCore()
  210. ext.InitBlockRule()
  211. ext.InfoTypeList()
  212. ext.InitTag(false)
  213. ext.InitTag(true)
  214. ext.InitClearFn(false)
  215. ext.InitClearFn(true)
  216. ext.Lock()
  217. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  218. //初始化城市DFA信息
  219. //ext.InitCityDFA()
  220. ext.InitCityInfo()
  221. ext.InitAreaCode()
  222. ext.InitPostCode()
  223. }
  224. ext.Unlock()
  225. //质量审核
  226. ext.InitAuditFields()
  227. ext.InitAuditRule()
  228. ext.InitAuditClass()
  229. ext.InitAuditRecogField()
  230. //品牌抽取是否开启
  231. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  232. //价格个数抽取是否开启
  233. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  234. //附件抽取是否开启
  235. ext.InitFile()
  236. ext.IsRun = true
  237. go ext.ResultSave(true)
  238. go ext.BidSave(true)
  239. if isgo {
  240. go RunExtractTask(taskId)
  241. }
  242. TaskList[taskId] = ext
  243. return true
  244. }
  245. //停止抽取
  246. func StopExtractTaskId(taskId string) bool {
  247. defer qu.Catch()
  248. ext := TaskList[taskId]
  249. if ext != nil {
  250. ext.IsRun = false
  251. TaskList[taskId] = ext
  252. }
  253. //更新task.s_extlastid
  254. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  255. return true
  256. }
  257. //开始抽取
  258. func RunExtractTask(taskId string) {
  259. defer qu.Catch()
  260. ext := TaskList[taskId]
  261. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  262. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  263. pageNum := (count + PageSize - 1) / PageSize
  264. limit := PageSize
  265. if count < PageSize {
  266. limit = count
  267. }
  268. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  269. for i := 0; i < pageNum; i++ {
  270. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  271. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  272. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  273. for _, v := range *list {
  274. //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  275. // continue
  276. //}
  277. //根据标题判断是否抽取
  278. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  279. if !b {
  280. continue
  281. }
  282. _id := qu.BsonIdToSId(v["_id"])
  283. //log.Debug(_id)
  284. if !ext.IsRun {
  285. break
  286. }
  287. var j, jf *ju.Job
  288. var isSite bool
  289. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  290. v["isextFile"] = true
  291. j, jf, isSite = ext.PreInfo(v)
  292. } else {
  293. j, _, isSite = ext.PreInfo(v)
  294. }
  295. go ext.ExtractProcess(j, jf, isSite)
  296. ext.TaskInfo.LastExtId = _id
  297. ext.TaskInfo.ProcessPool <- true
  298. }
  299. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  300. if !ext.IsRun {
  301. break
  302. }
  303. }
  304. //更新task.s_extlastid
  305. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  306. }
  307. //信息预处理-不和版本关联,取最新版本的配置项
  308. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  309. return (&ExtractTask{}).PreInfo(doc)
  310. }
  311. var clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  312. //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
  313. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  314. defer qu.Catch()
  315. //判断是否有附件这个字段
  316. var isextFile bool
  317. if doc["isextFile"] != nil {
  318. isextFile = doc["isextFile"].(bool)
  319. }
  320. detail := ""
  321. d1, _ := doc["detail"].(string)
  322. d2, _ := doc["contenthtml"].(string)
  323. if len(d1) >= len(d2) || d2 == "" {
  324. detail = d1
  325. } else {
  326. detail = d2
  327. }
  328. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  329. d3, _ := doc["summary"].(string)
  330. //全文的需要修复表格
  331. detail = pretreated.RepairCon(detail)
  332. detail = ju.CutLableStr(d3 + "\n" + detail)
  333. detail = cut.ClearHtml(d3 + "\n" + detail)
  334. doc["detail"] = detail
  335. isClearnMoney := !clearMoneyReg.MatchString(detail)
  336. if isClearnMoney {
  337. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  338. }
  339. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  340. if isextFile {
  341. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  342. }
  343. //正文小于200个字,有附件把附件内容加到正文
  344. //tmpDeatil := detail
  345. //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  346. //if err == nil {
  347. // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  348. // if conlen < 2000 {
  349. // if isextFile {
  350. // detail += qu.ObjToString(doc["detailfile"])
  351. // doc["detail"] = detail
  352. // }
  353. // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
  354. // //防止文本过长,造成抽取阻塞
  355. // log.Debug("文本太长", doc["_id"], conlen)
  356. // doc["detail"] = d3
  357. // }
  358. //}
  359. toptype := qu.ObjToString(doc["toptype"])
  360. subtype := qu.ObjToString(doc["subtype"])
  361. if qu.ObjToString(doc["type"]) == "bid" {
  362. toptype = "结果"
  363. }
  364. if toptype == "" {
  365. toptype = "all"
  366. }
  367. if subtype == "" {
  368. subtype = "all"
  369. }
  370. if subtype == "其他" {
  371. subtype = "其它"
  372. }
  373. toMap := qu.ObjToMap(doc["jsondata"])
  374. //log.Debug("toMap", toMap)
  375. if (*toMap) != nil {
  376. if (*toMap)["extweight"] == nil {
  377. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  378. }
  379. if (*toMap)["jsoncontent"] != nil {
  380. delete(*toMap, "jsoncontent")
  381. }
  382. for k, v := range *toMap {
  383. if _, ok := v.(float64); ok {
  384. continue
  385. } else if _, ok := v.(int64); ok {
  386. continue
  387. } else if _, ok2 := v.(string); ok2 {
  388. continue
  389. } else {
  390. delete(*toMap, k)
  391. }
  392. }
  393. }
  394. j = &ju.Job{
  395. SourceMid: qu.BsonIdToSId(doc["_id"]),
  396. Category: toptype,
  397. CategorySecond: subtype,
  398. Content: qu.ObjToString(doc["detail"]),
  399. SpiderCode: qu.ObjToString(doc["spidercode"]),
  400. Site: qu.ObjToString(doc["site"]),
  401. //Domain: qu.ObjToString(doc["domain"]),
  402. //Href: qu.ObjToString(doc["href"]),
  403. Title: qu.ObjToString(doc["title"]),
  404. Data: &doc,
  405. City: qu.ObjToString(doc["city"]),
  406. Province: qu.ObjToString(doc["area"]),
  407. Jsondata: toMap,
  408. Result: map[string][]*ju.ExtField{},
  409. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  410. RuleBlock: e.RuleBlock,
  411. Dataging: qu.IntAll(doc["dataging"]),
  412. IsClearnMoney: isClearnMoneystr,
  413. }
  414. if isextFile {
  415. jf = &ju.Job{
  416. SourceMid: qu.BsonIdToSId(doc["_id"]),
  417. Category: toptype,
  418. CategorySecond: subtype,
  419. Content: qu.ObjToString(doc["detailfile"]),
  420. SpiderCode: qu.ObjToString(doc["spidercode"]),
  421. Site: qu.ObjToString(doc["site"]),
  422. Title: qu.ObjToString(doc["title"]),
  423. Data: &doc,
  424. City: qu.ObjToString(doc["city"]),
  425. Province: qu.ObjToString(doc["area"]),
  426. Jsondata: toMap,
  427. Result: map[string][]*ju.ExtField{},
  428. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  429. RuleBlock: e.RuleBlock,
  430. IsFile: isextFile,
  431. Dataging: qu.IntAll(doc["dataging"]),
  432. IsClearnMoney: isClearnMoneystr,
  433. }
  434. }
  435. codeSite := j.SpiderCode
  436. //是否启用站点
  437. if value, ok := e.SiteMerge.Load(codeSite); ok {
  438. isSite = value.(bool)
  439. }
  440. if isSite {
  441. //是否配置站点
  442. exp, isSite := e.Luacodes.Load(codeSite)
  443. if isSite {
  444. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  445. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  446. }
  447. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  448. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  449. }
  450. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  451. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  452. }
  453. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  454. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  455. }
  456. }
  457. }
  458. qu.Try(func() {
  459. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  460. if isextFile && strings.TrimSpace(jf.Content) != "" {
  461. pretreated.AnalyStart(jf, isSite, codeSite)
  462. }
  463. }, func(err interface{}) {
  464. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  465. })
  466. return j, jf, isSite
  467. }
  468. var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  469. var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  470. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  471. func file2text(doc *map[string]interface{}) {
  472. mnameone := map[string]bool{}
  473. mname := map[string]bool{}
  474. murl := map[string]string{}
  475. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  476. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  477. for _, attachs := range attach_text {
  478. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  479. for _, fileinfo := range fileinfos {
  480. if ff, ok := fileinfo.(map[string]interface{}); ok {
  481. attach_url := qu.ObjToString(ff["attach_url"])
  482. ffname := qu.ObjToString(ff["file_name"])
  483. if clearStrReg.MatchString(ffname) {
  484. continue
  485. }
  486. mname[ffname] = true
  487. murl[ffname] = attach_url
  488. if sortStrReg.MatchString(ffname) {
  489. mnameone[ffname] = true
  490. }
  491. }
  492. }
  493. }
  494. }
  495. }
  496. tmpstr := ""
  497. for k := range mnameone {
  498. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  499. (*doc)["detailfile"] = tmpstr
  500. return
  501. }
  502. bs := ju.OssGetObject(murl[k])
  503. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  504. tmpstr += bs + "\n"
  505. }
  506. }
  507. for k := range mname {
  508. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  509. (*doc)["detailfile"] = tmpstr
  510. return
  511. }
  512. bs := ju.OssGetObject(murl[k])
  513. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  514. tmpstr += bs + "\n"
  515. }
  516. }
  517. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  518. }
  519. //抽取-正文
  520. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  521. e.ExtractDetail(j, isSite, j.SpiderCode) //正文-抽取属性
  522. if jf != nil && jf.IsFile { //附件jf → j 合并
  523. e.ExtractDetail(jf, isSite, j.SpiderCode)
  524. for tmpk, xs := range jf.Result {
  525. if len(j.Result[tmpk]) == 0 {
  526. if tmpk == "budget" || tmpk == "bidamount" {
  527. for _, v := range xs {
  528. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  529. j.Result[tmpk] = append(j.Result[tmpk], v)
  530. }
  531. }
  532. } else {
  533. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  534. }
  535. }
  536. }
  537. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  538. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  539. }
  540. }
  541. if isSite {
  542. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  543. if ok && ismerge.(bool) {
  544. tmpj := &ju.Job{
  545. SourceMid: j.SourceMid,
  546. Category: j.Category,
  547. CategorySecond: j.CategorySecond,
  548. Content: j.Content,
  549. SpiderCode: j.SpiderCode,
  550. //Domain: qu.ObjToString(doc["domain"]),
  551. //Href: qu.ObjToString(doc["href"]),
  552. Title: j.Title,
  553. Data: j.Data,
  554. City: j.City,
  555. Province: j.Province,
  556. Jsondata: j.Jsondata,
  557. Result: map[string][]*ju.ExtField{},
  558. BuyerAddr: j.BuyerAddr,
  559. RuleBlock: e.RuleBlock,
  560. }
  561. qu.Try(func() {
  562. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  563. }, func(err interface{}) {
  564. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  565. })
  566. e.ExtractDetail(tmpj, false, "")
  567. //if jf != nil && jf.IsFile {
  568. // e.ExtractFile(jf, false, "")
  569. //}
  570. //合并数据
  571. j.Block = append(j.Block, tmpj.Block...)
  572. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  573. for tmpk, _ := range j.Result {
  574. if len(tmpj.Result[tmpk]) > 0 {
  575. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  576. }
  577. }
  578. for tmpk, _ := range tmpj.Result {
  579. if len(j.Result[tmpk]) == 0 {
  580. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  581. }
  582. }
  583. }
  584. }
  585. //分析抽取结果并保存
  586. AnalysisSaveResult(j, jf, e)
  587. <-e.TaskInfo.ProcessPool
  588. }
  589. //抽取-正文-规则等 detail
  590. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  591. qu.Try(func() {
  592. doc := *j.Data
  593. //全局前置规则,结果覆盖doc属性
  594. //for _, v := range e.RulePres {
  595. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  596. //}
  597. tmprules := map[string][]*RuleCore{}
  598. lockrule.Lock()
  599. //加载分类抽取配置
  600. if j.Category == "all" || j.CategorySecond == "all" {
  601. if isSite {
  602. for k, vc1 := range e.SiteRuleCores["all_all"] {
  603. tmprules[k] = vc1
  604. }
  605. } else {
  606. for k, vc1 := range e.RuleCores["all_all"] {
  607. tmprules[k] = vc1
  608. }
  609. }
  610. } else {
  611. if isSite {
  612. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  613. tmprules[k] = vc1
  614. }
  615. } else {
  616. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  617. tmprules[k] = vc1
  618. }
  619. }
  620. }
  621. if len(tmprules) < 1 { //分类未覆盖部分
  622. if isSite {
  623. for k, vc1 := range e.RuleCores["all_all"] {
  624. tmprules[k] = vc1
  625. }
  626. } else {
  627. for k, vc1 := range e.SiteRuleCores["all_all"] {
  628. tmprules[k] = vc1
  629. }
  630. }
  631. }
  632. lockrule.Unlock()
  633. //抽取规则
  634. for _, vc1 := range tmprules {
  635. for _, vc := range vc1 {
  636. tmp := ju.DeepCopy(doc).(map[string]interface{})
  637. //是否进入逻辑
  638. if !ju.Logic(vc.LuaLogic, tmp) {
  639. continue
  640. }
  641. ////抽取-前置规则
  642. //for _, v := range vc.RulePres {
  643. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  644. //}
  645. // log.Debug("抽取-前置规则", tmp)
  646. //抽取-规则
  647. ExtRuleCore(tmp, e, vc, j, isSite)
  648. // log.Debug("抽取-规则", tmp)
  649. //抽取-后置规则
  650. for _, v := range vc.RuleBacks {
  651. ExtRegBack(j, v, e.TaskInfo, vc)
  652. }
  653. //kv规则
  654. for _, v := range vc.KVRuleCores {
  655. ExtRuleKV(j, v, e.TaskInfo)
  656. }
  657. // log.Debug("抽取-后置规则", tmp)
  658. //项目名称未能抽取到,标题来凑
  659. if vc.Field == "projectname" {
  660. if vc.ExtFrom == "title" {
  661. isextitle := true
  662. for _, v := range j.Result[vc.Field] {
  663. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  664. isextitle = false
  665. break
  666. }
  667. }
  668. if isextitle { //标题加入选举
  669. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  670. if isSite {
  671. field.Score = 1
  672. }
  673. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  674. }
  675. }
  676. for i := 0; i < 3; i++ {
  677. for _, v := range vc.RuleBacks {
  678. ExtRegBack(j, v, e.TaskInfo, vc)
  679. }
  680. }
  681. }
  682. }
  683. }
  684. //全局后置规则
  685. if isSite {
  686. for _, v := range e.SiteRuleBacks {
  687. ExtRegBack(j, v, e.TaskInfo, nil)
  688. }
  689. } else {
  690. for _, v := range e.RuleBacks {
  691. ExtRegBack(j, v, e.TaskInfo, nil)
  692. }
  693. }
  694. //函数清理
  695. for key, val := range j.Result {
  696. for i, v := range val {
  697. if v.Field == "project_duration" {
  698. arr := clear.ObjToMoney([]interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  699. if len(arr) > 0 {
  700. v.Value = arr[0]
  701. }
  702. }
  703. if v.Field == "projectname" && v.Type == "table" {
  704. break
  705. }
  706. if key == "budget" || key == "bidamount" {
  707. if _, ok := v.Value.(float64); ok && !v.IsTrue {
  708. continue
  709. }
  710. }
  711. lockclear.Lock()
  712. var cfn = []string{}
  713. if isSite {
  714. cfn = e.SiteClearFn[key]
  715. if len(cfn) == 0 {
  716. cfn = e.ClearFn[key]
  717. }
  718. } else {
  719. cfn = e.ClearFn[key]
  720. }
  721. lockclear.Unlock()
  722. if len(cfn) == 0 {
  723. continue
  724. }
  725. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  726. if key == "budget" || key == "bidamount" {
  727. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  728. j.Result[key][i].IsTrue = true
  729. } else {
  730. j.Result[key][i].Value = data[0]
  731. continue
  732. }
  733. }
  734. before, _ := v.Value.(string)
  735. v.Value = data[0]
  736. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  737. //添加行数清理的日志
  738. //清理特殊符号
  739. lockclear.Lock()
  740. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  741. text := qu.ObjToString(v.Value)
  742. before = text
  743. v.Value = clear.OtherClean(key, text)
  744. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  745. }
  746. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  747. lockclear.Unlock()
  748. }
  749. }
  750. PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
  751. // bs, _ := json.Marshal(j.Result)
  752. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  753. }, func(err interface{}) {
  754. log.Debug("ExtractProcess err", err, j.SourceMid)
  755. })
  756. }
  757. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  758. qu.Try(func() {
  759. doc := *j.Data
  760. //全局前置规则,结果覆盖doc属性
  761. // for _, v := range e.RulePres {
  762. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  763. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  764. // }
  765. // }
  766. //抽取规则
  767. tmprules := map[string][]*RuleCore{}
  768. lockrule.Lock()
  769. if j.Category == "all" || j.CategorySecond == "all" {
  770. for k, vc1 := range e.RuleCores["all_all"] {
  771. tmprules[k] = vc1
  772. }
  773. } else {
  774. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  775. tmprules[k] = vc1
  776. }
  777. }
  778. lockrule.Unlock()
  779. for _, vc1 := range tmprules {
  780. for _, vc := range vc1 {
  781. tmp := ju.DeepCopy(doc).(map[string]interface{})
  782. //是否进入逻辑
  783. if !ju.Logic(vc.LuaLogic, tmp) {
  784. continue
  785. }
  786. //抽取-前置规则
  787. // for _, v := range vc.RulePres {
  788. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  789. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  790. // }
  791. // }
  792. // log.Debug("抽取-前置规则", tmp)
  793. //抽取-规则
  794. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  795. ExtRuleCore(tmp, e, vc, j, isSite)
  796. }
  797. // log.Debug("抽取-规则", tmp)
  798. //抽取-后置规则
  799. for _, v := range vc.RuleBacks {
  800. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  801. ExtRegBack(j, v, e.TaskInfo, vc)
  802. }
  803. }
  804. // log.Debug("抽取-后置规则", tmp)
  805. }
  806. }
  807. //全局后置规则
  808. for _, v := range e.RuleBacks {
  809. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  810. ExtRegBack(j, v, e.TaskInfo, nil)
  811. }
  812. }
  813. //函数清理
  814. for key, val := range j.Result {
  815. for _, v := range val {
  816. lockclear.Lock()
  817. var cfn = []string{}
  818. if isSite {
  819. cfn = e.SiteClearFn[key]
  820. if len(cfn) == 0 {
  821. cfn = e.ClearFn[key]
  822. }
  823. } else {
  824. cfn = e.ClearFn[key]
  825. }
  826. lockclear.Unlock()
  827. if len(cfn) == 0 {
  828. continue
  829. }
  830. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  831. v.Value = data[0]
  832. //清理特殊符号
  833. lockclear.Lock()
  834. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  835. clear.MesField[key] != nil {
  836. text := qu.ObjToString(v.Value)
  837. text = clear.OtherClean(key, text)
  838. v.Value = text
  839. }
  840. lockclear.Unlock()
  841. }
  842. }
  843. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  844. // bs, _ := json.Marshal(j.Result)
  845. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  846. }, func(err interface{}) {
  847. log.Debug("ExtractProcess err", err)
  848. })
  849. }
  850. //前置过滤
  851. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  852. defer qu.Catch()
  853. before := ju.DeepCopy(doc).(map[string]interface{})
  854. extinfo := map[string]interface{}{}
  855. if in.IsLua {
  856. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  857. if j != nil {
  858. lua.Block = j.Block
  859. }
  860. extinfo = lua.RunScript("pre")
  861. for k, v := range extinfo { //结果覆盖原doc
  862. doc[k] = v
  863. }
  864. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  865. } else {
  866. var key string
  867. if !j.IsFile {
  868. key = qu.If(in.Field == "", "detail", in.Field).(string)
  869. } else {
  870. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  871. }
  872. text := qu.ObjToString(doc[key])
  873. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  874. doc[key] = extinfo[key] //结果覆盖原doc
  875. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  876. }
  877. return doc
  878. }
  879. //抽取-规则
  880. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  881. //候选人加入
  882. var kvMap map[string][]map[string]interface{}
  883. extByReg := true
  884. if vc.ExtFrom != "title" {
  885. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  886. }
  887. for _, v := range vc.RuleCores {
  888. if v.IsLua {
  889. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  890. } else if extByReg {
  891. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  892. }
  893. }
  894. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  895. if vc.Field == "budget" && len(kvMap) == 0 {
  896. if len(j.BlockPackage) == 1 {
  897. for _, bp := range j.BlockPackage {
  898. for fieldname, field := range vc.LFields {
  899. if field != vc.Field {
  900. continue
  901. }
  902. tp := ""
  903. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  904. if k == 0 {
  905. tp = "colon"
  906. } else if k == 1 {
  907. tp = "space"
  908. } else if k == 2 {
  909. tp = "table"
  910. }
  911. if v == nil || v.KvTags == nil {
  912. continue
  913. }
  914. for _, vv := range v.KvTags[fieldname] {
  915. text := ju.TrimLRSpace(vv.Value, "")
  916. if text != "" {
  917. tmp := &ju.ExtField{
  918. ExtFrom: "package",
  919. Field: vc.Field,
  920. Code: "CL_分包",
  921. Type: tp,
  922. MatchType: "package",
  923. RuleText: bp.Text,
  924. SourceValue: vv.Key,
  925. Value: text,
  926. }
  927. if isSite {
  928. tmp.Score = 1
  929. }
  930. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  931. }
  932. }
  933. }
  934. }
  935. break
  936. }
  937. }
  938. } else {
  939. for k, v := range kvMap {
  940. if j.Result[k] == nil {
  941. j.Result[k] = [](*ju.ExtField){}
  942. }
  943. for _, tmp := range v {
  944. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  945. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  946. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  947. MatchType: qu.ObjToString(tmp["matchtype"]),
  948. RuleText: qu.ObjToString(tmp["ruletext"]),
  949. SourceValue: tmp["sourcevalue"],
  950. Value: tmp["value"]}
  951. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  952. field.Score = 1
  953. }
  954. if isSite {
  955. field.Score = 1
  956. }
  957. if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" {
  958. moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
  959. if len(moneys) > 0 {
  960. if vf, ok := moneys[0].(float64); ok {
  961. field.Value = vf
  962. field.IsTrue = moneys[len(moneys)-1].(bool)
  963. } else if vi, ok := moneys[0].(int); ok {
  964. field.Value = float64(vi)
  965. field.IsTrue = moneys[len(moneys)-1].(bool)
  966. }
  967. }
  968. }
  969. if tmp["blocktag"] != nil {
  970. btag := make(map[string]string)
  971. for k := range tmp["blocktag"].(map[string]bool) {
  972. blocktag.Lock()
  973. if TagConfigDesc[k] != "" {
  974. btag[k] = TagConfigDesc[k]
  975. }
  976. blocktag.Unlock()
  977. }
  978. field.BlockTag = btag
  979. }
  980. j.Result[k] = append(j.Result[k], field)
  981. }
  982. }
  983. }
  984. }
  985. //抽取-规则-kv
  986. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  987. defer qu.Catch()
  988. if extfrom == "title" || !in.IsLua {
  989. return
  990. }
  991. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  992. lua.KvMap = *kvMap
  993. lua.Block = j.Block
  994. extinfo := lua.RunScript("core")
  995. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  996. for _, v := range tmps {
  997. v["core"] = in.Code
  998. }
  999. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  1000. }
  1001. if len(extinfo) > 0 {
  1002. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1003. }
  1004. }
  1005. //抽取-规则-正则
  1006. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  1007. defer qu.Catch()
  1008. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1009. b := IsExtract(in.Field, j.Title, j.Content)
  1010. if !b {
  1011. return
  1012. }
  1013. //全文正则
  1014. //text := qu.ObjToString(doc[extfrom])
  1015. //if in.Field != "" {
  1016. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  1017. // if len(extinfo) > 0 {
  1018. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1019. // }
  1020. //}
  1021. //块抽取
  1022. if in.Field != "" {
  1023. if extfrom == "title" {
  1024. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  1025. if len(extinfo) > 0 {
  1026. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1027. }
  1028. } else if in.Field == "qualifies" {
  1029. extinfo := extRegCoreToResult(extfrom, pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{}, j, in, isSite)
  1030. if len(extinfo) > 0 {
  1031. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1032. }
  1033. } else {
  1034. for _, v := range j.Block {
  1035. btag := make(map[string]string)
  1036. for k := range v.Classify {
  1037. blocktag.Lock()
  1038. btag[k] = TagConfigDesc[k]
  1039. blocktag.Unlock()
  1040. }
  1041. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  1042. if len(extinfo) > 0 {
  1043. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  1044. }
  1045. }
  1046. }
  1047. }
  1048. }
  1049. //pkg抽取-规则-正则
  1050. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  1051. defer qu.Catch()
  1052. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1053. b := IsExtract(in.Field, j.Title, j.Content)
  1054. if !b {
  1055. return
  1056. }
  1057. //块抽取
  1058. if in.Field != "" {
  1059. for k, vbpkg := range j.BlockPackage {
  1060. rep := map[string]string{}
  1061. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1062. if in.Field == "budget" && vbpkg.Budget > 0 {
  1063. continue
  1064. }
  1065. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  1066. continue
  1067. }
  1068. if in.Field == "winner" && vbpkg.Winner != "" {
  1069. continue
  1070. }
  1071. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  1072. continue
  1073. }
  1074. if in.Field == "projectname" && vbpkg.Name != "" {
  1075. continue
  1076. }
  1077. if in.Field == "winner" && vbpkg.Winner != "" {
  1078. continue
  1079. }
  1080. if in.Field == "winnerperson" {
  1081. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  1082. continue
  1083. }
  1084. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  1085. continue
  1086. }
  1087. }
  1088. if in.Field == "winnertel" {
  1089. if vbpkg.WinnerPerson == "" {
  1090. continue
  1091. }
  1092. }
  1093. //处理正负数修正
  1094. ptmp := strings.Split(in.RuleText, "#")
  1095. sign := 0
  1096. if len(ptmp) == 2 {
  1097. if ptmp[1] == "正" {
  1098. sign = 1
  1099. } else if ptmp[1] == "负" {
  1100. sign = -1
  1101. }
  1102. }
  1103. tmp := strings.Split(ptmp[0], "__")
  1104. if len(tmp) == 2 {
  1105. epos := strings.Split(tmp[1], ",")
  1106. posm := map[string]int{}
  1107. for _, v := range epos {
  1108. ks := strings.Split(v, ":")
  1109. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1110. posm[ks[1]] = qu.IntAll(ks[0])
  1111. } else {
  1112. posm[in.Field] = qu.IntAll(ks[0])
  1113. }
  1114. }
  1115. var pattern string
  1116. if strings.Contains(tmp[0], "\\u") {
  1117. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1118. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1119. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1120. } else {
  1121. pattern = tmp[0]
  1122. }
  1123. //log.Debug("pattern", pattern)
  1124. //fmt.Println(text)
  1125. reg := regexp.MustCompile(pattern)
  1126. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  1127. for i, _ := range apos {
  1128. pos := apos[i]
  1129. for k, p := range posm {
  1130. if len(pos) > p {
  1131. if pos[p] == -1 || pos[p+1] == -1 {
  1132. continue
  1133. }
  1134. val := vbpkg.Text[pos[p]:pos[p+1]]
  1135. if string(val) == "" {
  1136. continue
  1137. }
  1138. if sign == -1 {
  1139. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1140. } else {
  1141. rep[k+"_"+fmt.Sprint(i)] = val
  1142. }
  1143. }
  1144. }
  1145. }
  1146. //fmt.Println(text)
  1147. for i := 0; i < len(apos); i++ {
  1148. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  1149. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1150. lock.Lock()
  1151. cfn := e.ClearFn[in.Field]
  1152. lock.Unlock()
  1153. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1154. if data[len(data)-1].(bool) {
  1155. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1156. j.BlockPackage[k].IsTrueBudget = true
  1157. }
  1158. break
  1159. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1160. lock.Lock()
  1161. cfn := e.ClearFn[in.Field]
  1162. lock.Unlock()
  1163. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1164. if data[len(data)-1].(bool) {
  1165. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1166. j.BlockPackage[k].IsTrueBidamount = true
  1167. }
  1168. break
  1169. } else if in.Field == "winner" {
  1170. if j.BlockPackage[k].Winner == "" {
  1171. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  1172. break
  1173. }
  1174. } else if in.Field == "winnertel" {
  1175. if j.BlockPackage[k].WinnerTel == "" {
  1176. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1177. break
  1178. }
  1179. } else if in.Field == "winnerperson" {
  1180. if j.BlockPackage[k].WinnerPerson == "" {
  1181. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1182. break
  1183. }
  1184. } else if in.Field == "bidstatus" {
  1185. if j.BlockPackage[k].BidStatus == "" {
  1186. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1187. break
  1188. }
  1189. } else if in.Field == "projectname" {
  1190. if j.BlockPackage[k].Name == "" {
  1191. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1192. break
  1193. }
  1194. } else if in.Field == "winnerperson" {
  1195. if j.BlockPackage[k].WinnerPerson == "" {
  1196. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1197. break
  1198. }
  1199. } else if in.Field == "winnertel" {
  1200. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1201. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1202. break
  1203. }
  1204. }
  1205. }
  1206. }
  1207. }
  1208. } else {
  1209. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1210. val := ""
  1211. if len(pos) == 2 {
  1212. //"text" = "text"[pos[1]:]
  1213. val = "text"[pos[1]:]
  1214. rs := regexp.MustCompile("[^\r\n\t]+")
  1215. tmp := rs.FindAllString("text", -1)
  1216. if len(tmp) > 0 {
  1217. val = tmp[0]
  1218. }
  1219. }
  1220. if val != "" {
  1221. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1222. lock.Lock()
  1223. cfn := e.ClearFn[in.Field]
  1224. lock.Unlock()
  1225. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1226. if data[len(data)-1].(bool) {
  1227. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1228. j.BlockPackage[k].IsTrueBudget = true
  1229. }
  1230. break
  1231. }
  1232. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1233. lock.Lock()
  1234. cfn := e.ClearFn[in.Field]
  1235. lock.Unlock()
  1236. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1237. if data[len(data)-1].(bool) {
  1238. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1239. j.BlockPackage[k].IsTrueBidamount = true
  1240. }
  1241. break
  1242. } else if in.Field == "bidstatus" {
  1243. if j.BlockPackage[k].BidStatus == "" {
  1244. j.BlockPackage[k].BidStatus = val
  1245. break
  1246. }
  1247. } else if in.Field == "projectname" {
  1248. if j.BlockPackage[k].Name == "" {
  1249. j.BlockPackage[k].Name = val
  1250. break
  1251. }
  1252. }
  1253. }
  1254. }
  1255. }
  1256. }
  1257. }
  1258. //lua脚本根据属性设置提取kv值
  1259. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1260. kvmap := map[string][]map[string]interface{}{}
  1261. if len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  1262. if vc.Field == "bidamount" {
  1263. for _, v := range j.Winnerorder {
  1264. if v["price"] == nil {
  1265. continue
  1266. }
  1267. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1268. "code": "winnerorder",
  1269. "field": vc.Field,
  1270. "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]),
  1271. "extfrom": v["sortstr"],
  1272. "sourcevalue": v["price"],
  1273. "value": v["price"],
  1274. "type": "winnerorder",
  1275. "matchtype": "winnerorder",
  1276. })
  1277. return kvmap, false
  1278. }
  1279. //候选人中标金额
  1280. if price := j.Winnerorder[0]["price"]; price != nil {
  1281. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1282. "code": "CL_中标候选人",
  1283. "field": vc.Field,
  1284. "ruletext": "中标候选人",
  1285. "extfrom": j.Winnerorder[0]["sortstr"],
  1286. "sourcevalue": price,
  1287. "value": price,
  1288. "type": "winnerorder",
  1289. "matchtype": "winnerorder",
  1290. })
  1291. return kvmap, false
  1292. }
  1293. }
  1294. }
  1295. for fieldname, field := range vc.LFields {
  1296. if field != vc.Field {
  1297. continue
  1298. }
  1299. extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category)
  1300. }
  1301. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1302. return kvmap, true
  1303. }
  1304. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) {
  1305. //qu.Debug("fieldname+++", fieldname)
  1306. for _, bl := range blocks {
  1307. tp := ""
  1308. if strings.Contains(bl.Title, "保证金") && (field == "bid_bond" || field == "contract_bond") {
  1309. if text := ju.TrimLRSpace(bl.Text, ""); text != "" {
  1310. if Category == "招标" || Category == "拟建" || Category == "预告" {
  1311. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1312. "code": "CL_块内容",
  1313. "field": field,
  1314. "ruletext": "投标保证金",
  1315. "extfrom": "投标保证金_块内容",
  1316. "sourcevalue": bl.Text,
  1317. "value": text,
  1318. "type": "投标保证金_块内容",
  1319. "matchtype": "tag_string",
  1320. "blocktag": bl.Classify,
  1321. "weight": 0,
  1322. })
  1323. } else if Category == "结果" {
  1324. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1325. "code": "CL_",
  1326. "field": field,
  1327. "ruletext": "履约保证金",
  1328. "extfrom": "履约保证金_块内容",
  1329. "sourcevalue": bl.Text,
  1330. "value": text,
  1331. "type": "履约保证金_块内容",
  1332. "matchtype": "tag_string",
  1333. "blocktag": bl.Classify,
  1334. "weight": 0,
  1335. })
  1336. }
  1337. }
  1338. return
  1339. }
  1340. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1341. if k == 0 {
  1342. tp = "colon"
  1343. } else if k == 1 {
  1344. tp = "space"
  1345. } else if k == 2 {
  1346. tp = "table"
  1347. }
  1348. if v == nil || v.KvTags == nil {
  1349. continue
  1350. }
  1351. for _, vv := range v.KvTags[fieldname] {
  1352. text := ju.TrimLRSpace(vv.Value, "")
  1353. if text != "" {
  1354. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1355. "code": "CL_" + vv.Key,
  1356. "field": field,
  1357. "ruletext": vv.Key,
  1358. "extfrom": vc.ExtFrom,
  1359. "sourcevalue": text,
  1360. "value": text,
  1361. "type": tp,
  1362. "matchtype": "tag_string",
  1363. "blocktag": bl.Classify,
  1364. "weight": vv.Weight,
  1365. })
  1366. //if field != "winnertel" && field != "winnerperson" {
  1367. // //break //暂定取第一个
  1368. //}
  1369. }
  1370. }
  1371. }
  1372. if len(kvmap[field]) == 0 {
  1373. extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category)
  1374. }
  1375. }
  1376. }
  1377. //正则提取结果
  1378. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1379. defer qu.Catch()
  1380. var score float64
  1381. score = vre.Score
  1382. if isSite {
  1383. score = score + 1.0
  1384. }
  1385. extinfo := map[string][]map[string]interface{}{}
  1386. rep := map[string]string{}
  1387. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1388. //处理正负数修正
  1389. ptmp := strings.Split(vre.RuleText, "#")
  1390. sign := 0
  1391. if len(ptmp) == 2 {
  1392. if ptmp[1] == "正" {
  1393. sign = 1
  1394. } else if ptmp[1] == "负" {
  1395. sign = -1
  1396. }
  1397. }
  1398. tmp := strings.Split(ptmp[0], "__")
  1399. if len(tmp) == 2 {
  1400. epos := strings.Split(tmp[1], ",")
  1401. posm := map[string]int{}
  1402. for _, v := range epos {
  1403. ks := strings.Split(v, ":")
  1404. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1405. posm[ks[1]] = qu.IntAll(ks[0])
  1406. } else {
  1407. posm[vre.Field] = qu.IntAll(ks[0])
  1408. }
  1409. }
  1410. var pattern string
  1411. if strings.Contains(tmp[0], "\\u") {
  1412. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1413. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1414. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1415. } else {
  1416. pattern = tmp[0]
  1417. }
  1418. //log.Debug("pattern", pattern)
  1419. //fmt.Println(text)
  1420. reg := regexp.MustCompile(pattern)
  1421. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1422. for i, _ := range apos {
  1423. pos := apos[i]
  1424. for k, p := range posm {
  1425. if len(pos) > p {
  1426. if pos[p] == -1 || pos[p+1] == -1 {
  1427. continue
  1428. }
  1429. val := text[pos[p]:pos[p+1]]
  1430. if string(val) == "" {
  1431. continue
  1432. }
  1433. if sign == -1 {
  1434. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1435. } else {
  1436. rep[k+"_"+fmt.Sprint(i)] = val
  1437. }
  1438. }
  1439. }
  1440. }
  1441. tmps := []map[string]interface{}{}
  1442. for i := 0; i < len(apos); i++ {
  1443. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1444. tmp := map[string]interface{}{
  1445. "field": vre.Field,
  1446. "code": vre.Code,
  1447. "ruletext": vre.RuleText,
  1448. "extfrom": text,
  1449. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1450. "type": "regexp",
  1451. "matchtype": "regcontent",
  1452. "blocktag": *tag,
  1453. "score": score,
  1454. }
  1455. exfield := ju.ExtField{
  1456. BlockTag: *tag,
  1457. Field: vre.Field,
  1458. Code: vre.Code,
  1459. RuleText: vre.RuleText,
  1460. Type: "regexp",
  1461. MatchType: "regcontent",
  1462. ExtFrom: extfrom,
  1463. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1464. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1465. Score: score,
  1466. }
  1467. if vre.Field == "qualifies" {
  1468. if len(rep) >= 2 {
  1469. tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1470. exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1471. }
  1472. }
  1473. tmps = append(tmps, tmp)
  1474. if tmp["blocktag"] != nil {
  1475. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1476. }
  1477. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1478. }
  1479. }
  1480. if len(tmps) > 0 {
  1481. //fmt.Println(tmps)
  1482. extinfo[vre.Field] = tmps
  1483. }
  1484. }
  1485. } else {
  1486. pos := vre.RegCore.Reg.FindStringIndex(text)
  1487. val := ""
  1488. if len(pos) == 2 {
  1489. text = text[pos[1]:]
  1490. rs := regexp.MustCompile("[^\r\n\t]+")
  1491. tmp := rs.FindAllString(text, -1)
  1492. if len(tmp) > 0 {
  1493. val = tmp[0]
  1494. }
  1495. }
  1496. if val != "" {
  1497. tmps := []map[string]interface{}{}
  1498. tmp := map[string]interface{}{
  1499. "field": vre.Field,
  1500. "code": vre.Code,
  1501. "ruletext": vre.RuleText,
  1502. "extfrom": text,
  1503. "value": val,
  1504. "type": "regexp",
  1505. "matchtype": "regcontent",
  1506. "blocktag": *tag,
  1507. "score": score,
  1508. }
  1509. tmps = append(tmps, tmp)
  1510. extinfo[vre.Field] = tmps
  1511. if j.Result[vre.Field] == nil {
  1512. j.Result[vre.Field] = [](*ju.ExtField){}
  1513. }
  1514. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1515. Value: val,
  1516. Score: score}
  1517. if tmp["blocktag"] != nil {
  1518. field.BlockTag = tmp["blocktag"].(map[string]string)
  1519. }
  1520. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1521. }
  1522. }
  1523. return extinfo
  1524. }
  1525. //后置过滤
  1526. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1527. defer qu.Catch()
  1528. if in.IsLua {
  1529. result := GetResultMapForLua(j)
  1530. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1531. if j != nil {
  1532. lua.Block = j.Block
  1533. }
  1534. extinfo := lua.RunScript("back")
  1535. for k, v := range extinfo {
  1536. if tmps, ok := v.([]map[string]interface{}); ok {
  1537. j.Result[k] = [](*ju.ExtField){}
  1538. for _, tmp := range tmps {
  1539. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1540. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1541. Value: tmp["value"]}
  1542. if tmp["blocktag"] != nil {
  1543. field.BlockTag = tmp["blocktag"].(map[string]string)
  1544. }
  1545. j.Result[k] = append(j.Result[k], field)
  1546. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1547. }
  1548. }
  1549. }
  1550. if len(extinfo) > 0 {
  1551. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1552. }
  1553. } else {
  1554. extinfo := map[string]interface{}{}
  1555. if in.Field != "" {
  1556. clearByTitle := false
  1557. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1558. clearByTitle = true
  1559. }
  1560. if j.Result[in.Field] != nil {
  1561. tmp := j.Result[in.Field]
  1562. exts := []interface{}{}
  1563. for k, v := range tmp {
  1564. if clearByTitle && v.ExtFrom != "title" {
  1565. continue
  1566. }
  1567. //table抽取到的数据不清理
  1568. if v.Type == "table" && v.Field == "projectname" {
  1569. return
  1570. }
  1571. text := qu.ObjToString(v.Value)
  1572. if v.Field == "bidamount" || v.Field == "budget" {
  1573. if strings.Contains(qu.ObjToString(v.SourceValue), "费率") {
  1574. j.Result[in.Field][k].IsTrue = false
  1575. continue
  1576. }
  1577. }
  1578. if text != "" {
  1579. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1580. }
  1581. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1582. continue
  1583. }
  1584. j.Result[in.Field][k].Value = text
  1585. exts = append(exts, map[string]interface{}{
  1586. "field": v.Field,
  1587. "code": v.Code,
  1588. "ruletext": v.RuleText,
  1589. "type": v.Type,
  1590. "matchtype": v.MatchType,
  1591. "extfrom": v.ExtFrom,
  1592. "value": text,
  1593. })
  1594. }
  1595. if len(exts) > 0 {
  1596. extinfo[in.Field] = exts
  1597. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1598. }
  1599. }
  1600. } else {
  1601. for key, tmp := range j.Result {
  1602. exts := []interface{}{}
  1603. for k, v := range tmp {
  1604. //table抽取到的数据不清理
  1605. if v.Type == "table" && v.Field == "projectname" {
  1606. return
  1607. }
  1608. text := qu.ObjToString(v.Value)
  1609. if text != "" {
  1610. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1611. }
  1612. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1613. continue
  1614. }
  1615. j.Result[key][k].Value = text
  1616. exts = append(exts, map[string]interface{}{
  1617. "field": v.Field,
  1618. "code": v.Code,
  1619. "ruletext": v.RuleText,
  1620. "type": v.Type,
  1621. "matchtype": v.MatchType,
  1622. "extfrom": v.ExtFrom,
  1623. "value": text,
  1624. })
  1625. }
  1626. if len(exts) > 0 {
  1627. extinfo[key] = exts
  1628. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1629. }
  1630. }
  1631. }
  1632. }
  1633. }
  1634. //后置过滤
  1635. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1636. defer qu.Catch()
  1637. for k, v := range j.BlockPackage {
  1638. if in.Field == "winner" {
  1639. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1640. } else if in.Field == "bidstatus" {
  1641. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1642. } else if in.Field == "" {
  1643. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1644. } else if in.Field == "projectname" {
  1645. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1646. } else if in.Field == "winnerperson" {
  1647. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1648. } else if in.Field == "winnertel" {
  1649. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1650. }
  1651. }
  1652. }
  1653. //KV过滤
  1654. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1655. defer qu.Catch()
  1656. extinfo := map[string]interface{}{}
  1657. if in.Field != "" {
  1658. if j.Result[in.Field] != nil {
  1659. tmp := j.Result[in.Field]
  1660. exts := []interface{}{}
  1661. for k, v := range tmp {
  1662. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1663. continue
  1664. }
  1665. text := qu.ObjToString(v.Value)
  1666. if text != "" {
  1667. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1668. }
  1669. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1670. continue
  1671. }
  1672. j.Result[in.Field][k].Value = text
  1673. exts = append(exts, map[string]interface{}{
  1674. "field": v.Field,
  1675. "code": v.Code,
  1676. "ruletext": v.RuleText,
  1677. "type": v.Type,
  1678. "matchtype": v.MatchType,
  1679. "extfrom": v.ExtFrom,
  1680. "value": text,
  1681. })
  1682. }
  1683. if len(exts) > 0 {
  1684. extinfo[in.Field] = exts
  1685. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1686. }
  1687. }
  1688. }
  1689. }
  1690. //获取抽取结果map[string][]interface{},lua脚本使用
  1691. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1692. defer qu.Catch()
  1693. result := map[string][]map[string]interface{}{}
  1694. for key, val := range j.Result {
  1695. if result[key] == nil {
  1696. result[key] = []map[string]interface{}{}
  1697. }
  1698. for _, v := range val {
  1699. tmp := map[string]interface{}{
  1700. "field": v.Field,
  1701. "code": v.Code,
  1702. "ruletext": v.RuleText,
  1703. "value": v.Value,
  1704. "type": v.Type,
  1705. "matchtype": v.MatchType,
  1706. "extfrom": v.ExtFrom,
  1707. }
  1708. result[key] = append(result[key], tmp)
  1709. }
  1710. }
  1711. return result
  1712. }
  1713. //抽取日志
  1714. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1715. defer qu.Catch()
  1716. if !t.IsEtxLog {
  1717. return
  1718. }
  1719. logdata := map[string]interface{}{
  1720. "code": qu.If(v.Code == "", "kv", v.Code),
  1721. "name": v.Name,
  1722. "type": ftype,
  1723. "ruletext": v.RuleText,
  1724. "islua": v.IsLua,
  1725. "field": v.Field,
  1726. "version": t.Version,
  1727. "taskname": t.Name,
  1728. "before": before,
  1729. "extinfo": extinfo,
  1730. "sid": sid,
  1731. "comeintime": time.Now().Unix(),
  1732. }
  1733. lock.Lock()
  1734. ExtLogs[t] = append(ExtLogs[t], logdata)
  1735. lock.Unlock()
  1736. }
  1737. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1738. exts := []map[string]interface{}{}
  1739. exts = append(exts, map[string]interface{}{
  1740. "field": ext.Field,
  1741. "code": ext.Code,
  1742. "type": ftype,
  1743. "matchtype": matchtype,
  1744. "extfrom": ext.ExtFrom,
  1745. "value": ext.Value,
  1746. })
  1747. extinfo := map[string]interface{}{
  1748. ext.Field: exts,
  1749. }
  1750. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1751. }
  1752. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1753. defer qu.Catch()
  1754. if !t.IsEtxLog {
  1755. return
  1756. }
  1757. logdata := map[string]interface{}{
  1758. "code": code,
  1759. "name": name,
  1760. "type": ftype,
  1761. "ruletext": "",
  1762. "islua": false,
  1763. "field": field,
  1764. "version": t.Version,
  1765. "taskname": t.Name,
  1766. "before": before,
  1767. "extinfo": extinfo,
  1768. "sid": sid,
  1769. "comeintime": time.Now().Unix(),
  1770. }
  1771. lock.Lock()
  1772. ExtLogs[t] = append(ExtLogs[t], logdata)
  1773. lock.Unlock()
  1774. }
  1775. //保存抽取日志
  1776. func SaveExtLog() {
  1777. defer qu.Catch()
  1778. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1779. lock.Lock()
  1780. tmpLogs = ExtLogs
  1781. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1782. lock.Unlock()
  1783. for k, v := range tmpLogs {
  1784. if len(v) < saveLimit {
  1785. db.Mgo.SaveBulk(k.TrackColl, v...)
  1786. } else {
  1787. for {
  1788. if len(v) > saveLimit {
  1789. tmp := v[:saveLimit]
  1790. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1791. v = v[saveLimit:]
  1792. } else {
  1793. db.Mgo.SaveBulk(k.TrackColl, v...)
  1794. break
  1795. }
  1796. }
  1797. }
  1798. }
  1799. time.AfterFunc(10*time.Second, SaveExtLog)
  1800. }
// FieldValue pairs a value with a counter.
// NOTE(review): not referenced in this part of the file; presumably Count
// tallies how often Value was seen — confirm against the callers.
type FieldValue struct {
	Value interface{}
	Count int
}
// clearWinnerReg matches the label fragments that AnalysisSaveResult removes
// from per-package winner names before joining them into "s_winner".
var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1806. //分析抽取结果并保存
  1807. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1808. qu.Try(func() {
  1809. if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
  1810. if j.CategorySecond != "单一" {
  1811. delete(j.Result, "winner")
  1812. delete(j.Result, "bidamount")
  1813. for _, v := range j.BlockPackage {
  1814. v.Bidamount = 0
  1815. v.IsTrueBidamount = false
  1816. if v.Winner != "" {
  1817. v.Winner = ""
  1818. if v.SpaceKV != nil {
  1819. delete(v.SpaceKV.KvTags, "中标单位")
  1820. }
  1821. if v.TableKV != nil {
  1822. delete(v.TableKV.KvTags, "中标单位")
  1823. }
  1824. if v.ColonKV != nil {
  1825. delete(v.ColonKV.KvTags, "中标单位")
  1826. }
  1827. }
  1828. }
  1829. for _, v := range j.PackageInfo {
  1830. delete(v, "winner")
  1831. delete(v, "bidamount")
  1832. }
  1833. j.Winnerorder = nil
  1834. if jf != nil && jf.Winnerorder != nil {
  1835. jf.Winnerorder = nil
  1836. }
  1837. }
  1838. }
  1839. //重新取出清理过后的中标候选人
  1840. resetWinnerorder(j)
  1841. //打分
  1842. doc, result, _id := funcAnalysis(j, e)
  1843. //_, result, _id := funcAnalysis(j, e)
  1844. if ju.IsSaveTag {
  1845. go otherNeedSave(j, result, e)
  1846. }
  1847. //从排序结果中取值
  1848. tmp := map[string]interface{}{} //抽取值
  1849. tmp["spidercode"] = j.SpiderCode
  1850. tmp["site"] = j.Site
  1851. if len(*j.Jsondata) > 0 {
  1852. tmp["jsondata"] = j.Jsondata
  1853. }
  1854. for k, val := range result {
  1855. if k == "qualifies" {
  1856. squalifies := make([]interface{}, 0)
  1857. squalifiesMap := make(map[string]*scoreIndex, 0)
  1858. for _, kv := range val {
  1859. skey := kv.RuleText
  1860. if kv.Score > 0 {
  1861. if squalifiesMap[skey] == nil {
  1862. squalifiesMap = map[string]*scoreIndex{
  1863. skey: &scoreIndex{
  1864. Score: kv.Score,
  1865. Index: len(squalifies),
  1866. },
  1867. }
  1868. squalifies = append(squalifies, map[string]interface{}{
  1869. "key": skey,
  1870. "value": kv.Value,
  1871. })
  1872. } else {
  1873. if squalifiesMap[skey].Score < kv.Score {
  1874. squalifies[squalifiesMap[skey].Index] = map[string]interface{}{
  1875. "key": skey,
  1876. "value": kv.Value,
  1877. }
  1878. }
  1879. }
  1880. }
  1881. }
  1882. tmp[k] = squalifies
  1883. continue
  1884. }
  1885. for _, v := range val { //取第一个非负数,项目名称除外
  1886. //存0是否有效
  1887. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 {
  1888. tmp[v.Field] = v.Value
  1889. break
  1890. }
  1891. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  1892. tmp[v.Field] = v.Value
  1893. break
  1894. }
  1895. }
  1896. }
  1897. tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",")
  1898. if len(j.PackageInfo) > 15 {
  1899. for k, v := range j.PackageInfo {
  1900. j.PackageInfo = map[string]map[string]interface{}{}
  1901. j.PackageInfo[k] = v
  1902. break
  1903. }
  1904. }
  1905. if len(j.PackageInfo) > 0 { //分包信息
  1906. tmp["package"] = j.PackageInfo
  1907. //包预算,中标金额合并大于抽取就覆盖
  1908. var tmpBidamount, tmpBudget float64
  1909. //s_winner逗号分隔拼接,分包中标人
  1910. var tmpstr, savewinner []string
  1911. //按包排序
  1912. for b, v := range j.PackageInfo {
  1913. if v["winner"] != nil && v["winner"] != "" {
  1914. tmpstr = append(tmpstr, b)
  1915. }
  1916. }
  1917. //包预算,中标金额合并大于抽取就覆盖
  1918. if len(j.PackageInfo) >= 1 {
  1919. //包数大于1累加
  1920. for _, v := range j.PackageInfo {
  1921. if v["budget"] != nil {
  1922. tmpBudget += qu.Float64All(v["budget"])
  1923. }
  1924. if v["bidamount"] != nil {
  1925. tmpBidamount += qu.Float64All(v["bidamount"])
  1926. }
  1927. }
  1928. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1929. tmp["budget"] = tmpBudget
  1930. }
  1931. if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
  1932. tmp["bidamount"] = tmpBidamount
  1933. } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1934. tmp["bidamount"] = tmpBidamount
  1935. }
  1936. } else {
  1937. //包数等于1,tmp没有值取包里的值
  1938. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1939. for _, v := range j.PackageInfo {
  1940. if v["budget"] != nil {
  1941. tmp["budget"] = v["budget"]
  1942. }
  1943. }
  1944. }
  1945. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1946. for _, v := range j.PackageInfo {
  1947. if v["bidamount"] != nil {
  1948. tmp["bidamount"] = v["bidamount"]
  1949. }
  1950. }
  1951. }
  1952. }
  1953. //s_winner逗号分隔拼接,分包中标人
  1954. sort.Strings(tmpstr)
  1955. for _, v := range tmpstr {
  1956. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1957. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1958. if savevvv == "" {
  1959. continue
  1960. }
  1961. savewinner = append(savewinner, savevvv)
  1962. }
  1963. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1964. tmp["s_winner"] = tmp["winner"]
  1965. } else if savewinner != nil {
  1966. savewinner = RemoveReplicaSliceString(savewinner)
  1967. tmp["s_winner"] = strings.Join(savewinner, ",")
  1968. }
  1969. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1970. //没有分包取winner
  1971. tmp["s_winner"] = tmp["winner"]
  1972. }
  1973. if len(j.Winnerorder) > 0 { //候选人信息
  1974. for i, v := range j.Winnerorder {
  1975. if v["price"] != nil {
  1976. tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  1977. if tmpPrice[len(tmpPrice)-1].(bool) {
  1978. j.Winnerorder[i]["price"] = tmpPrice[0]
  1979. } else {
  1980. delete(j.Winnerorder[i], "price")
  1981. }
  1982. }
  1983. }
  1984. tmp["winnerorder"] = j.Winnerorder
  1985. }
  1986. //处理附件
  1987. var resultf map[string][]*ju.ExtField
  1988. ffield := map[string]interface{}{}
  1989. if jf != nil {
  1990. _, resultf, _ = funcAnalysis(jf, e)
  1991. for _, val := range resultf {
  1992. for _, v := range val { //取第一个非负数
  1993. if v.Score > -1 {
  1994. ffield[v.Field] = v.Value
  1995. if tmp[v.Field] == nil {
  1996. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
  1997. tmp[v.Field] = v.Value
  1998. break
  1999. }
  2000. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  2001. tmp[v.Field] = v.Value
  2002. break
  2003. }
  2004. }
  2005. break
  2006. }
  2007. }
  2008. }
  2009. if len(jf.PackageInfo) > 0 { //分包信息
  2010. ffield["package"] = jf.PackageInfo
  2011. }
  2012. if len(jf.Winnerorder) > 0 { //候选人信息
  2013. ffield["winnerorder"] = jf.Winnerorder
  2014. }
  2015. }
  2016. for k, v := range *doc {
  2017. if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
  2018. (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
  2019. }
  2020. //去重冗余字段
  2021. if delFiled(k) {
  2022. continue
  2023. }
  2024. if tmp[k] == nil {
  2025. tmp[k] = v
  2026. }
  2027. }
  2028. //质量审核
  2029. if ju.QualityAudit {
  2030. e.QualityAudit(tmp)
  2031. }
  2032. //城市抽取
  2033. if e.IsExtractCity {
  2034. e.NewExtractCity(j, &tmp, _id)
  2035. }
  2036. //品牌抽取
  2037. if ju.IsBrandGoods {
  2038. tmp["checkhas"] = map[string]int{
  2039. "hastable": j.HasTable,
  2040. "hasgoods": j.HasGoods,
  2041. "hasbrand": j.HasBrand,
  2042. "haskey": j.HasKey,
  2043. }
  2044. if len(j.BrandData) > 0 {
  2045. tmp["tablebrand"] = j.BrandData
  2046. }
  2047. }
  2048. //prince和number抽取
  2049. if ju.IsPriceNumber {
  2050. priceNumberLen := len(j.PriceNumberData)
  2051. if priceNumberLen > 1 { //table数据去重
  2052. tmpPriceNumberData := []map[string]interface{}{}
  2053. tableStrs := map[string]bool{}
  2054. for _, tb := range j.PriceNumberData {
  2055. has := false
  2056. bytes, _ := json.Marshal(tb)
  2057. str := string(bytes)
  2058. if len(tableStrs) > 0 && tableStrs[str] {
  2059. has = true
  2060. } else {
  2061. tableStrs[str] = true
  2062. }
  2063. if !has {
  2064. for _, data := range tb {
  2065. tmpPriceNumberData = append(tmpPriceNumberData, data)
  2066. }
  2067. }
  2068. }
  2069. tmp["pricenumber"] = tmpPriceNumberData
  2070. } else if priceNumberLen == 1 {
  2071. tmp["pricenumber"] = j.PriceNumberData[0]
  2072. }
  2073. }
  2074. //所有kv组成的字符串
  2075. var kvtext bytes.Buffer
  2076. blocks := make([]ju.BlockAndTag, 0)
  2077. for _, v := range j.Block {
  2078. //分包和标签
  2079. if ju.SaveBlock {
  2080. xx, _ := json.Marshal(v)
  2081. tmpblock := new(ju.TmpBlock)
  2082. err := json.Unmarshal(xx, &tmpblock)
  2083. if err != nil {
  2084. if v.BPackage != nil {
  2085. bpb, _ := json.Marshal(v.BPackage)
  2086. tmpblock.BPackage = string(bpb)
  2087. }
  2088. tmpblock = rangeBlockToJson(v, *tmpblock)
  2089. }
  2090. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  2091. }
  2092. //把所有kv组装成一个字符串,存库
  2093. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  2094. if jv == nil {
  2095. continue
  2096. }
  2097. for jv_k, jv_v := range jv.KvTags {
  2098. for _, jv_vv := range jv_v {
  2099. kvtext.WriteString(jv_k)
  2100. kvtext.WriteString(":")
  2101. kvtext.WriteString(jv_vv.Value)
  2102. kvtext.WriteString("\n")
  2103. }
  2104. }
  2105. }
  2106. }
  2107. if kvtext.Len() > 0 {
  2108. tmp["kvtext"] = kvtext.String()
  2109. }
  2110. if len(blocks) > 0 {
  2111. if blocksBytes, err := json.Marshal(blocks); err == nil {
  2112. if utf8.RuneCount(blocksBytes) < 100000 {
  2113. tmp["blocks"] = string(blocksBytes)
  2114. }
  2115. }
  2116. }
  2117. tmp["dataging"] = j.Dataging
  2118. /*for k, v := range *j.Data {
  2119. if f[k] {
  2120. tmp[k] = v
  2121. }
  2122. }
  2123. for k := range tmp {
  2124. if !f[k]{
  2125. delete(tmp,k)
  2126. }
  2127. }*/
  2128. //检查字段
  2129. tmp = checkFields(tmp)
  2130. if tmp["projectname"] == nil || tmp["projectname"] == "" {
  2131. tmp["projectname"] = j.Title
  2132. }
  2133. tmp["repeat"] = 0
  2134. if ju.Ffield {
  2135. if len(ffield) > 0 {
  2136. tmp["ffield"] = ffield
  2137. }
  2138. }
  2139. if e.TaskInfo.TestColl == "" {
  2140. if len(tmp) > 0 { //保存抽取结果
  2141. tmparr := []map[string]interface{}{
  2142. map[string]interface{}{
  2143. "_id": qu.StringTOBsonId(_id),
  2144. },
  2145. map[string]interface{}{"$set": tmp},
  2146. }
  2147. e.RWMutex.Lock()
  2148. e.BidArr = append(e.BidArr, tmparr)
  2149. e.BidTotal++
  2150. e.RWMutex.Unlock()
  2151. }
  2152. if ju.SaveResult {
  2153. id := tmp["_id"]
  2154. tmp["result"] = result
  2155. tmp["resultf"] = resultf
  2156. delete(tmp, "_id")
  2157. tmparr := []map[string]interface{}{
  2158. map[string]interface{}{
  2159. "_id": id,
  2160. },
  2161. map[string]interface{}{"$set": tmp},
  2162. }
  2163. e.RWMutex.Lock()
  2164. e.ResultArr = append(e.ResultArr, tmparr)
  2165. e.RWMutex.Unlock()
  2166. }
  2167. } else { //测试结果
  2168. delete(tmp, "_id")
  2169. delete(tmp, "fieldall")
  2170. if len(j.BlockPackage) > 0 { //分包详情
  2171. if len(j.BlockPackage) > 10 {
  2172. tmp["epackage"] = "分包异常"
  2173. } else {
  2174. bs, _ := json.Marshal(j.BlockPackage)
  2175. tmp["epackage"] = string(bs)
  2176. }
  2177. }
  2178. tmp["result"] = result
  2179. //tmp["resultf"] = resultf
  2180. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  2181. if !b {
  2182. log.Debug(e.TaskInfo.TestColl, _id)
  2183. }
  2184. }
  2185. }, func(err interface{}) {
  2186. log.Debug("AnalysisSaveResult err", err)
  2187. })
  2188. }
  2189. //检查字段-
  2190. func checkFields(tmp map[string]interface{}) map[string]interface{} {
  2191. delete(tmp, "contenthtml")
  2192. delete(tmp, "detail")
  2193. //delete(tmp, "toptype")
  2194. //delete(tmp, "subtype")
  2195. if _, ok := tmp["bidamount"].(string); ok {
  2196. delete(tmp, "bidamount")
  2197. } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/5 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
  2198. delete(tmp, "bidamount")
  2199. }
  2200. if _, ok := tmp["budget"].(string); ok {
  2201. delete(tmp, "budget")
  2202. }
  2203. if _, ok := tmp["unitprice"].(string); ok {
  2204. delete(tmp, "unitprice")
  2205. }
  2206. if _, ok := tmp["bidopentime"].(string); ok {
  2207. delete(tmp, "bidopentime")
  2208. }
  2209. if _, ok := tmp["signaturedate"].(string); ok {
  2210. delete(tmp, "signaturedate")
  2211. }
  2212. if _, ok := tmp["supervisorrate"].(string); ok {
  2213. delete(tmp, "supervisorrate")
  2214. }
  2215. for k, v := range tmp {
  2216. if k == "qualifies" {
  2217. continue
  2218. }
  2219. if k == "contract_guarantee" || k == "bid_guarantee" {
  2220. if len(fmt.Sprint(v)) > 0 {
  2221. tmp[k] = true
  2222. } else {
  2223. delete(tmp, k)
  2224. }
  2225. }
  2226. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") {
  2227. delete(tmp, k)
  2228. }
  2229. }
  2230. //工期单位-清理
  2231. if tmp["project_timeunit"] == "年" && tmp["project_duration"] == nil {
  2232. delete(tmp, "project_timeunit")
  2233. }
  2234. tmp["repeat"] = 0
  2235. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  2236. strwin := qu.ObjToString(tmp["winner"])
  2237. strwin_s := qu.ObjToString(tmp["s_winner"])
  2238. if !strings.Contains(strwin_s, strwin) {
  2239. tmp["s_winner"] = strwin
  2240. }
  2241. }
  2242. //budget bidamount
  2243. if bg, ok := tmp["budget"].(float64); ok {
  2244. if bg >= 50000000000 {
  2245. tmp["budget_max_err"] = bg
  2246. delete(tmp, "budget")
  2247. } /*else if bg > 0 && bg < 1000 {
  2248. tmp["budget_min_err"] = bg
  2249. delete(tmp, "budget")
  2250. }*/
  2251. }
  2252. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
  2253. if bg >= 50000000000 {
  2254. tmp["bidamount_max_err"] = bg
  2255. delete(tmp, "bidamount")
  2256. } /*else if bg > 0 && bg < 1000 {
  2257. tmp["bidamount_min_err"] = bg
  2258. delete(tmp, "bidamount")
  2259. }*/
  2260. }
  2261. return tmp
  2262. }
  2263. //保存其他
  2264. //kv、表格、块上的标签凡是新的标签都入库
  2265. //val type times firstid createtime 判定field
  2266. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  2267. now := time.Now().Unix()
  2268. coll := e.TaskInfo.TestColl
  2269. if coll == "" {
  2270. coll = "extract_tag_result"
  2271. } else {
  2272. coll += "_tag"
  2273. }
  2274. datas := []map[string]interface{}{}
  2275. kv := map[string]int{}
  2276. for _, v := range j.Block {
  2277. //
  2278. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  2279. if vv == nil || vv.KvTags == nil {
  2280. continue
  2281. }
  2282. for kkk, vvv := range vv.KvTags {
  2283. for _, vvvv := range vvv {
  2284. if vvvv.IsInvalid {
  2285. kv[kkk] = kv[kkk] + 1
  2286. break
  2287. }
  2288. }
  2289. }
  2290. }
  2291. for _, vv := range v.NotClassifyTitles {
  2292. datas = append(datas, map[string]interface{}{
  2293. "val": vv,
  2294. "times": 0,
  2295. "type": "block",
  2296. "firstid": j.SourceMid,
  2297. "createtime": now,
  2298. })
  2299. if len(datas) == saveLimit {
  2300. db.Mgo.SaveBulk(coll, datas...)
  2301. datas = []map[string]interface{}{}
  2302. }
  2303. }
  2304. }
  2305. for k, v := range kv {
  2306. datas = append(datas, map[string]interface{}{
  2307. "val": k,
  2308. "times": v,
  2309. "type": "kv",
  2310. "firstid": j.SourceMid,
  2311. "createtime": now,
  2312. })
  2313. if len(datas) == saveLimit {
  2314. db.Mgo.SaveBulk(coll, datas...)
  2315. datas = []map[string]interface{}{}
  2316. }
  2317. }
  2318. if len(datas) > 0 {
  2319. db.Mgo.SaveBulk(coll, datas...)
  2320. }
  2321. }
  2322. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2323. if j == nil {
  2324. return nil
  2325. }
  2326. if len(j.Block) > 0 {
  2327. for i, v := range j.Block {
  2328. rangetmp := new(ju.TmpBlock)
  2329. vb, _ := json.Marshal(v)
  2330. json.Unmarshal(vb, &rangetmp)
  2331. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2332. }
  2333. }
  2334. if j.ColonKV != nil {
  2335. cb, _ := json.Marshal(j.ColonKV)
  2336. tmpblock.ColonKV = string(cb)
  2337. }
  2338. if j.SpaceKV != nil {
  2339. sb, _ := json.Marshal(j.SpaceKV)
  2340. tmpblock.SpaceKV = string(sb)
  2341. }
  2342. if j.TableKV != nil {
  2343. tb, _ := json.Marshal(j.TableKV)
  2344. tmpblock.TableKV = string(tb)
  2345. }
  2346. return &tmpblock
  2347. }
  2348. //去重冗余字段
  2349. func delFiled(k string) bool {
  2350. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2351. }
  2352. //分析-打分排序
  2353. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2354. defer qu.Catch()
  2355. doc := j.Data
  2356. result := j.Result
  2357. _id := qu.BsonIdToSId((*doc)["_id"])
  2358. result = ScoreFields(j, e.Tag) //正负面词打分
  2359. //结果排序
  2360. for _, val := range result {
  2361. ju.Sort(val)
  2362. }
  2363. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2364. clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
  2365. marshalbt, _ := json.Marshal(j.Jsondata)
  2366. tmpjddata := make(map[string]interface{})
  2367. json.Unmarshal(marshalbt, &tmpjddata)
  2368. for _, jdkey := range ju.JsonData {
  2369. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2370. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2371. if jdkey == "budget" || jdkey == "bidamount" {
  2372. lockclear.Lock()
  2373. cfn := e.ClearFn[jdkey]
  2374. lockclear.Unlock()
  2375. if len(cfn) == 0 {
  2376. continue
  2377. }
  2378. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney)
  2379. if tmpv.Value == newNum[0] {
  2380. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2381. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2382. ju.Sort(j.Result[jdkey])
  2383. delete((*j.Jsondata), jdkey)
  2384. break
  2385. }
  2386. } else {
  2387. if (*j.Jsondata)[jdkey] == tmpv.Value {
  2388. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2389. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2390. ju.Sort(j.Result[jdkey])
  2391. delete((*j.Jsondata), jdkey)
  2392. break
  2393. }
  2394. }
  2395. }
  2396. }
  2397. }
  2398. if len(*j.Jsondata) > 0 {
  2399. j.Result = JsonDataMergeProcessing(j, e)
  2400. }
  2401. j.Jsondata = &tmpjddata
  2402. }
  2403. return doc, result, _id
  2404. }
  2405. //辅助信息,如果没有排序先排序
  2406. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2407. fieldalls := map[string][]map[string]interface{}{}
  2408. if j == nil {
  2409. return fieldalls
  2410. }
  2411. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2412. defer qykredis.Close()
  2413. db := 0
  2414. for field, val := range j.Result {
  2415. //ju.Sort(val)
  2416. if field == "buyer" {
  2417. db = ju.BuyerDB
  2418. } else if field == "winner" {
  2419. db = ju.WinnerDB
  2420. } else if field == "agency" {
  2421. db = ju.AgencyDB
  2422. }
  2423. sfields := []map[string]interface{}{}
  2424. for _, v := range val {
  2425. standardized := false
  2426. if _, err := qykredis.Do("SELECT", db); err != nil {
  2427. fmt.Println("redis select err", err)
  2428. } else {
  2429. rep, err := qykredis.Do("GET", v.Value)
  2430. if rep != nil && err == nil {
  2431. standardized = true
  2432. }
  2433. }
  2434. if field == "budget" || field == "bidamount" {
  2435. if !v.IsTrue {
  2436. continue
  2437. }
  2438. }
  2439. sfield := map[string]interface{}{
  2440. "val": v.Value,
  2441. "type": v.Type,
  2442. "score": v.Score,
  2443. "blocktag": v.BlockTag,
  2444. "sourceval": v.SourceValue,
  2445. "standardized": standardized,
  2446. }
  2447. sfields = append(sfields, sfield)
  2448. }
  2449. fieldalls[field] = sfields
  2450. }
  2451. return fieldalls
  2452. }
  2453. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2454. defer qu.Catch()
  2455. //获取审核字段
  2456. for _, field := range e.AuditFields {
  2457. //1.分包
  2458. if resulttmp["package"] != nil {
  2459. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2460. for _, val := range packagedata {
  2461. if val[field] != nil {
  2462. fv := qu.ObjToString(val[field])
  2463. if fv != "" {
  2464. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2465. e.RedisMatch(field, fv, val) //redis匹配
  2466. } else { //除了buyer和winner,其他字段走规则匹配
  2467. e.RuleMatch(field, fv, val)
  2468. }
  2469. }
  2470. }
  2471. }
  2472. }
  2473. //2.外围
  2474. if resulttmp[field] != nil {
  2475. fv := qu.ObjToString(resulttmp[field])
  2476. if fv != "" {
  2477. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2478. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2479. } else { //除了buyer和winner,其他字段走规则匹配
  2480. e.RuleMatch(field, fv, resulttmp)
  2481. }
  2482. }
  2483. }
  2484. }
  2485. }
  2486. //Redis匹配
  2487. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2488. defer qu.Catch()
  2489. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2490. if i == 0 { //reids未找到,执行规则匹配
  2491. val[field+"_isredis"] = false
  2492. e.RuleMatch(field, fv, val) //规则匹配
  2493. } else { //redis找到,打标识存库
  2494. val[field+"_isredis"] = true
  2495. }
  2496. }
  2497. //规则匹配
  2498. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2499. defer qu.Catch()
  2500. if fieldval != "" {
  2501. SMap := e.StartMatch(field, fieldval)
  2502. //SMap.AddKey(field+"_isaudit", false)
  2503. for _, k := range SMap.Keys {
  2504. tmpMap[k] = SMap.Map[k]
  2505. }
  2506. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2507. }
  2508. }
  2509. //开始规则匹配
  2510. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2511. defer qu.Catch()
  2512. SMap := pretreated.NewSortMap()
  2513. lock.Lock()
  2514. f := e.RecogFieldMap[field]
  2515. lock.Unlock()
  2516. if len(f) > 0 {
  2517. fid := qu.BsonIdToSId(f["_id"])
  2518. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2519. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2520. if textAfterRecogFieldPrerule != "" {
  2521. lock.Lock()
  2522. classMap := e.FidClassMap[fid]
  2523. lock.Unlock()
  2524. L:
  2525. for _, c := range classMap { //class
  2526. classid := qu.BsonIdToSId(c["_id"])
  2527. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2528. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2529. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2530. if textAfterClassPrerule != "" {
  2531. lock.Lock()
  2532. ruleMap := e.CidRuleMap[classid]
  2533. lock.Unlock()
  2534. for _, r := range ruleMap { //rule
  2535. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2536. s_name := qu.ObjToString(r["s_name"])
  2537. rule := r["rule"].([]interface{})
  2538. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2539. if textAfterRulePrerule != "" {
  2540. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2541. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2542. if savefield != "" { //保存字段不为空,存储代码信息
  2543. SMap.AddKey(field+"_"+savefield, s_name)
  2544. }
  2545. break L
  2546. }
  2547. }
  2548. }
  2549. }
  2550. }
  2551. }
  2552. }
  2553. return SMap
  2554. }
  2555. //中标候选人经过清理之后,重新取出赋值
  2556. func resetWinnerorder(j *ju.Job) {
  2557. if len(j.Winnerorder) == 0 {
  2558. return
  2559. }
  2560. maxlen := len(j.Winnerorder) - 1
  2561. //中标单位
  2562. //i := 0
  2563. winners := []*ju.ExtField{}
  2564. bidamounts := []*ju.ExtField{}
  2565. if maxlen > 0 {
  2566. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  2567. return
  2568. }
  2569. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2570. if j.Winnerorder[0]["price"] != nil {
  2571. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2572. if tmpPrice[len(tmpPrice)-1].(bool) {
  2573. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2574. }
  2575. }
  2576. }
  2577. if j.Result["winner"] == nil && len(winners) > 0 {
  2578. j.Result["winner"] = winners
  2579. } else if len(winners) > 0 {
  2580. j.Result["winner"] = append(j.Result["winner"], winners...)
  2581. }
  2582. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2583. j.Result["bidamount"] = bidamounts
  2584. } else if len(bidamounts) > 0 {
  2585. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2586. }
  2587. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  2588. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2589. j.Result["winner"] = winners
  2590. if j.Winnerorder[0]["price"] != nil {
  2591. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2592. if tmpPrice[len(tmpPrice)-1].(bool) {
  2593. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2594. }
  2595. j.Result["bidamount"] = bidamounts
  2596. }
  2597. }
  2598. }
  2599. func RemoveReplicaSliceString(slc []string) []string {
  2600. result := make([]string, 0)
  2601. tempMap := make(map[string]bool, len(slc))
  2602. for _, e := range slc {
  2603. if tempMap[e] == false {
  2604. tempMap[e] = true
  2605. result = append(result, e)
  2606. }
  2607. }
  2608. return result
  2609. }
// scoreIndex is the bookkeeping entry AnalysisSaveResult keeps per rule text
// while de-duplicating "qualifies" results.
type scoreIndex struct {
	Score float64 // score recorded for the rule text
	Index int     // position of the rule text's entry in the output slice
}