extract.go 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. log "github.com/donnie4w/go-logger/logger"
  20. "gopkg.in/mgo.v2/bson"
  21. )
  // Package-level state shared by the extraction tasks in this file.
  22. var (
  23. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
  24. cut = ju.NewCut() // obtains the body text and cleans it
  25. ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
  26. TaskList map[string]*ExtractTask // task list, keyed by task id
  27. ClearTaskList map[string]*ClearTask // cleanup task list
  28. saveLimit = 100 // batch size for saving extraction logs
  29. PageSize = 5000 // query page size
  // Fields is the projection passed to Find when loading source documents.
  30. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
  31. //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
  32. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  // spidercode lists the spider codes whose documents RunExtractTestTask skips
  // (marked there as temporary bid-opening records).
  33. spidercode = map[string]bool{
  34. "gd_zhsggzyjyzx_jsgc_fjczbgg":true,
  35. "js_szgyyqggzyjyzx_jsgc_zjfbgs":true,
  36. "zj_tzsyhggzyjyzx_jsgc_kbqk":true,
  37. "hb_tmsggzyjyxxw_jsgc_kbqk":true,
  38. "zj_nbsyyggzyjyw_jsgc_kbqk":true,
  39. "zj_zjsggzyjyzx_jyxx_kbjg":true,
  40. "zj_zjzdgcjyw_ztbjglxx_kbjg":true,
  41. "zj_lssggzyjyw_jsgc_kbsk":true,
  42. "zj_qzslyxggzyjyzx_gggs_xkbjl":true,
  43. "sc_mssggzydzjypt_jsgc_kbjl":true,
  44. "sc_pzhsggzyjyfwzx_jsgc_kbylb":true,
  45. "a_zgzbtbggfwpt_wasjgf_ss_kbjl":true,
  46. "a_hbszbtbggfwpt_kbjl":true,
  47. "a_szsjsgcjyfwzxbafzx_kbqkgs":true,
  48. "a_szldzbyxgs_kbxx":true,
  49. "zj_zssssxggzyjyw_gcjs_kbjggs":true,
  50. "gd_szszfhjsj_kbqkgs":true,
  51. "a_gjggzyjypt_gcjs_kbjl":true,
  52. "a_gjggzyjypt_gcjs_kbjl_new":true,
  53. "zj_tzsyhggzyjyzx_kbjggg":true,
  54. "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl":true,
  55. "ah_czsggzyjyw_jsgc_kbjl":true,
  56. "ah_czsggzyjyw_zfcg_kbxx":true,
  57. "ah_whsggzyjyfww_kbxx_cgxm":true,
  58. "ah_whsggzyjyfww_kbxx_gcxm":true,
  59. }
  60. )
  61. //启动测试抽取
  62. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  63. defer qu.Catch()
  64. ext := TaskList[taskId]
  65. if ext == nil {
  66. ext = &ExtractTask{}
  67. ext.Id = taskId
  68. ext.InitTestTaskInfo(resultcoll, trackcoll)
  69. ext.IsRun = true
  70. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  71. }
  72. ext.InitSite()
  73. ext.InitRulePres()
  74. ext.InitRuleBacks(false)
  75. ext.InitRuleBacks(true)
  76. ext.InitRuleCore(false)
  77. ext.InitRuleCore(true)
  78. ext.InitPkgCore()
  79. ext.InitBlockRule()
  80. ext.InfoTypeList()
  81. ext.InitTag(false)
  82. ext.InitTag(true)
  83. ext.InitClearFn(false)
  84. ext.InitClearFn(true)
  85. ext.Lock()
  86. //ext.IsExtractCity = false
  87. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  88. //初始化城市DFA信息
  89. ext.InitCityInfo()
  90. //ext.InitCityDFA()
  91. ext.InitAreaCode()
  92. ext.InitPostCode()
  93. }
  94. ext.Unlock()
  95. //质量审核
  96. ext.InitAuditFields()
  97. ext.InitAuditRule()
  98. ext.InitAuditClass()
  99. ext.InitAuditRecogField()
  100. //品牌抽取是否开启
  101. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  102. //价格个数抽取是否开启
  103. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  104. //附件抽取是否开启
  105. ext.InitFile()
  106. ext.TaskInfo.TestColl = resultcoll
  107. TaskList[taskId] = ext
  108. return RunExtractTestTask(ext, startId, num)
  109. }
  110. func IdTrans(startId string) bson.ObjectId {
  111. defer qu.Catch()
  112. return bson.ObjectIdHex(startId)
  113. }
  114. //开始测试任务抽取
  115. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  116. n, _ := strconv.Atoi(num)
  117. id := IdTrans(startId)
  118. if id.Valid() {
  119. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  120. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  121. for _, v := range *list {
  122. //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
  123. // continue
  124. //}
  125. if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
  126. continue
  127. }
  128. var j, jf *ju.Job
  129. var isSite bool
  130. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  131. v["isextFile"] = true
  132. j, jf, isSite = ext.PreInfo(v)
  133. } else {
  134. j, _, isSite = ext.PreInfo(v)
  135. }
  136. go ext.ExtractProcess(j, jf, isSite)
  137. ext.TaskInfo.ProcessPool <- true
  138. }
  139. return true
  140. } else {
  141. return false
  142. }
  143. }
  144. //启动抽取
  145. func StartExtractTaskId(taskId string) bool {
  146. defer qu.Catch()
  147. isgo := false
  148. ext := TaskList[taskId]
  149. if ext == nil {
  150. ext = &ExtractTask{}
  151. ext.Id = taskId
  152. ext.InitTaskInfo()
  153. isgo = true
  154. } else {
  155. ext.Id = taskId
  156. ext.InitTaskInfo()
  157. }
  158. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  159. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  160. ext.InitSite()
  161. ext.InitRulePres()
  162. ext.InitRuleBacks(false)
  163. ext.InitRuleBacks(true)
  164. ext.InitRuleCore(false)
  165. ext.InitRuleCore(true)
  166. ext.InitPkgCore()
  167. ext.InitBlockRule()
  168. ext.InfoTypeList()
  169. ext.InitTag(false)
  170. ext.InitTag(true)
  171. ext.InitClearFn(false)
  172. ext.InitClearFn(true)
  173. ext.Lock()
  174. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  175. //初始化城市DFA信息
  176. //ext.InitCityDFA()
  177. ext.InitCityInfo()
  178. ext.InitAreaCode()
  179. ext.InitPostCode()
  180. }
  181. ext.Unlock()
  182. //质量审核
  183. ext.InitAuditFields()
  184. ext.InitAuditRule()
  185. ext.InitAuditClass()
  186. ext.InitAuditRecogField()
  187. //品牌抽取是否开启
  188. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  189. //价格个数抽取是否开启
  190. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  191. //附件抽取是否开启
  192. ext.InitFile()
  193. ext.IsRun = true
  194. go ext.ResultSave(true)
  195. go ext.BidSave(true)
  196. if isgo {
  197. go RunExtractTask(taskId)
  198. }
  199. TaskList[taskId] = ext
  200. return true
  201. }
  202. //停止抽取
  203. func StopExtractTaskId(taskId string) bool {
  204. defer qu.Catch()
  205. ext := TaskList[taskId]
  206. if ext != nil {
  207. ext.IsRun = false
  208. TaskList[taskId] = ext
  209. }
  210. //更新task.s_extlastid
  211. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  212. return true
  213. }
  214. //开始抽取
  215. func RunExtractTask(taskId string) {
  216. defer qu.Catch()
  217. ext := TaskList[taskId]
  218. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  219. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  220. pageNum := (count + PageSize - 1) / PageSize
  221. limit := PageSize
  222. if count < PageSize {
  223. limit = count
  224. }
  225. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  226. for i := 0; i < pageNum; i++ {
  227. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  228. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  229. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  230. for _, v := range *list {
  231. //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  232. // continue
  233. //}
  234. //根据标题判断是否抽取
  235. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  236. if !b {
  237. continue
  238. }
  239. _id := qu.BsonIdToSId(v["_id"])
  240. //log.Debug(_id)
  241. if !ext.IsRun {
  242. break
  243. }
  244. var j, jf *ju.Job
  245. var isSite bool
  246. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  247. v["isextFile"] = true
  248. j, jf, isSite = ext.PreInfo(v)
  249. } else {
  250. j, _, isSite = ext.PreInfo(v)
  251. }
  252. go ext.ExtractProcess(j, jf, isSite)
  253. ext.TaskInfo.LastExtId = _id
  254. ext.TaskInfo.ProcessPool <- true
  255. }
  256. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  257. if !ext.IsRun {
  258. break
  259. }
  260. }
  261. //更新task.s_extlastid
  262. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  263. }
  264. //信息预处理-不和版本关联,取最新版本的配置项
  265. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  266. return (&ExtractTask{}).PreInfo(doc)
  267. }
  // clearMoneyReg matches project keywords (PPP projects, shanty-town
  // renovation, expressways, ...). PreInfo sets a job's IsClearnMoney marker
  // to "T" only when neither the body text nor the title matches it.
  var clearMoneyReg = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  269. //信息预处理-和版本关联
  270. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  271. defer qu.Catch()
  272. //判断是否有附件这个字段
  273. var isextFile bool
  274. if doc["isextFile"] != nil {
  275. isextFile = doc["isextFile"].(bool)
  276. }
  277. detail := ""
  278. d1, _ := doc["detail"].(string)
  279. d2, _ := doc["contenthtml"].(string)
  280. if len(d1) >= len(d2) || d2 == "" {
  281. detail = d1
  282. } else {
  283. detail = d2
  284. }
  285. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  286. d3, _ := doc["summary"].(string)
  287. //全文的需要修复表格
  288. detail = pretreated.RepairCon(detail)
  289. detail = ju.CutLableStr(d3 + "\n" + detail)
  290. detail = cut.ClearHtml(d3 + "\n" + detail)
  291. doc["detail"] = detail
  292. isClearnMoney := !clearMoneyReg.MatchString(detail)
  293. if isClearnMoney {
  294. isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
  295. }
  296. isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
  297. if isextFile {
  298. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  299. }
  300. //正文小于200个字,有附件把附件内容加到正文
  301. //tmpDeatil := detail
  302. //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  303. //if err == nil {
  304. // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  305. // if conlen < 2000 {
  306. // if isextFile {
  307. // detail += qu.ObjToString(doc["detailfile"])
  308. // doc["detail"] = detail
  309. // }
  310. // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
  311. // //防止文本过长,造成抽取阻塞
  312. // log.Debug("文本太长", doc["_id"], conlen)
  313. // doc["detail"] = d3
  314. // }
  315. //}
  316. toptype := qu.ObjToString(doc["toptype"])
  317. subtype := qu.ObjToString(doc["subtype"])
  318. if qu.ObjToString(doc["type"]) == "bid" {
  319. toptype = "结果"
  320. }
  321. if toptype == "" {
  322. toptype = "all"
  323. }
  324. if subtype == "" {
  325. subtype = "all"
  326. }
  327. if subtype == "其他" {
  328. subtype = "其它"
  329. }
  330. toMap := qu.ObjToMap(doc["jsondata"])
  331. //log.Debug("toMap", toMap)
  332. if (*toMap) != nil {
  333. if (*toMap)["extweight"] == nil {
  334. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  335. }
  336. if (*toMap)["jsoncontent"] != nil {
  337. delete(*toMap, "jsoncontent")
  338. }
  339. for k, v := range *toMap {
  340. if _, ok := v.(float64); ok {
  341. continue
  342. } else if _, ok := v.(int64); ok {
  343. continue
  344. } else if _, ok2 := v.(string); ok2 {
  345. continue
  346. } else {
  347. delete(*toMap, k)
  348. }
  349. }
  350. }
  351. j = &ju.Job{
  352. SourceMid: qu.BsonIdToSId(doc["_id"]),
  353. Category: toptype,
  354. CategorySecond: subtype,
  355. Content: qu.ObjToString(doc["detail"]),
  356. SpiderCode: qu.ObjToString(doc["spidercode"]),
  357. Site: qu.ObjToString(doc["site"]),
  358. //Domain: qu.ObjToString(doc["domain"]),
  359. //Href: qu.ObjToString(doc["href"]),
  360. Title: qu.ObjToString(doc["title"]),
  361. Data: &doc,
  362. City: qu.ObjToString(doc["city"]),
  363. Province: qu.ObjToString(doc["area"]),
  364. Jsondata: toMap,
  365. Result: map[string][]*ju.ExtField{},
  366. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  367. RuleBlock: e.RuleBlock,
  368. Dataging: qu.IntAll(doc["dataging"]),
  369. IsClearnMoney: isClearnMoneystr,
  370. }
  371. if isextFile {
  372. jf = &ju.Job{
  373. SourceMid: qu.BsonIdToSId(doc["_id"]),
  374. Category: toptype,
  375. CategorySecond: subtype,
  376. Content: qu.ObjToString(doc["detailfile"]),
  377. SpiderCode: qu.ObjToString(doc["spidercode"]),
  378. Site: qu.ObjToString(doc["site"]),
  379. Title: qu.ObjToString(doc["title"]),
  380. Data: &doc,
  381. City: qu.ObjToString(doc["city"]),
  382. Province: qu.ObjToString(doc["area"]),
  383. Jsondata: toMap,
  384. Result: map[string][]*ju.ExtField{},
  385. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  386. RuleBlock: e.RuleBlock,
  387. IsFile: isextFile,
  388. Dataging: qu.IntAll(doc["dataging"]),
  389. IsClearnMoney: isClearnMoneystr,
  390. }
  391. }
  392. codeSite := j.SpiderCode
  393. //是否启用站点
  394. if value, ok := e.SiteMerge.Load(codeSite); ok {
  395. isSite = value.(bool)
  396. }
  397. if isSite {
  398. //是否配置站点
  399. exp, isSite := e.Luacodes.Load(codeSite)
  400. if isSite {
  401. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  402. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  403. }
  404. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  405. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  406. }
  407. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  408. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  409. }
  410. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  411. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  412. }
  413. }
  414. }
  415. qu.Try(func() {
  416. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  417. if isextFile && strings.TrimSpace(jf.Content) != "" {
  418. pretreated.AnalyStart(jf, isSite, codeSite)
  419. }
  420. }, func(err interface{}) {
  421. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  422. })
  423. return j, jf, isSite
  424. }
  // Attachment file-name filters used by file2text.
  var (
  	// sortStrReg marks attachment names that file2text reads first.
  	sortStrReg = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  	// clearStrReg marks attachment names that file2text ignores.
  	clearStrReg = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  )
  427. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  428. func file2text(doc *map[string]interface{}) {
  429. mnameone := map[string]bool{}
  430. mname := map[string]bool{}
  431. murl := map[string]string{}
  432. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  433. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  434. for _, attachs := range attach_text {
  435. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  436. for _, fileinfo := range fileinfos {
  437. if ff, ok := fileinfo.(map[string]interface{}); ok {
  438. attach_url := qu.ObjToString(ff["attach_url"])
  439. ffname := qu.ObjToString(ff["file_name"])
  440. if clearStrReg.MatchString(ffname) {
  441. continue
  442. }
  443. mname[ffname] = true
  444. murl[ffname] = attach_url
  445. if sortStrReg.MatchString(ffname) {
  446. mnameone[ffname] = true
  447. }
  448. }
  449. }
  450. }
  451. }
  452. }
  453. tmpstr := ""
  454. for k := range mnameone {
  455. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  456. (*doc)["detailfile"] = tmpstr
  457. return
  458. }
  459. bs := ju.OssGetObject(murl[k])
  460. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  461. tmpstr += bs + "\n"
  462. }
  463. }
  464. for k := range mname {
  465. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  466. (*doc)["detailfile"] = tmpstr
  467. return
  468. }
  469. bs := ju.OssGetObject(murl[k])
  470. if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
  471. tmpstr += bs + "\n"
  472. }
  473. }
  474. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  475. }
  476. //抽取
  477. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  478. e.ExtractDetail(j, isSite, j.SpiderCode)
  479. if jf != nil && jf.IsFile {
  480. e.ExtractDetail(jf, isSite, j.SpiderCode)
  481. for tmpk, xs := range jf.Result {
  482. if len(j.Result[tmpk]) == 0 {
  483. if tmpk == "budget" || tmpk == "bidamount" {
  484. for _, v := range xs {
  485. if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
  486. j.Result[tmpk] = append(j.Result[tmpk], v)
  487. }
  488. }
  489. } else {
  490. j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
  491. }
  492. }
  493. }
  494. if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
  495. j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
  496. }
  497. }
  498. if isSite {
  499. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  500. if ok && ismerge.(bool) {
  501. tmpj := &ju.Job{
  502. SourceMid: j.SourceMid,
  503. Category: j.Category,
  504. CategorySecond: j.CategorySecond,
  505. Content: j.Content,
  506. SpiderCode: j.SpiderCode,
  507. //Domain: qu.ObjToString(doc["domain"]),
  508. //Href: qu.ObjToString(doc["href"]),
  509. Title: j.Title,
  510. Data: j.Data,
  511. City: j.City,
  512. Province: j.Province,
  513. Jsondata: j.Jsondata,
  514. Result: map[string][]*ju.ExtField{},
  515. BuyerAddr: j.BuyerAddr,
  516. RuleBlock: e.RuleBlock,
  517. }
  518. qu.Try(func() {
  519. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  520. }, func(err interface{}) {
  521. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  522. })
  523. e.ExtractDetail(tmpj, false, "")
  524. //if jf != nil && jf.IsFile {
  525. // e.ExtractFile(jf, false, "")
  526. //}
  527. //合并数据
  528. j.Block = append(j.Block, tmpj.Block...)
  529. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  530. for tmpk, _ := range j.Result {
  531. if len(tmpj.Result[tmpk]) > 0 {
  532. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  533. }
  534. }
  535. for tmpk, _ := range tmpj.Result {
  536. if len(j.Result[tmpk]) == 0 {
  537. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  538. }
  539. }
  540. }
  541. }
  542. //分析抽取结果并保存
  543. AnalysisSaveResult(j, jf, e)
  544. <-e.TaskInfo.ProcessPool
  545. }
  // ExtractDetail runs field extraction for job j; the whole body executes
  // inside qu.Try, whose handler logs the error together with j.SourceMid.
  //
  // Steps, in order:
  //  1. pick the rule cores for the job's category pair (the "all_all" set
  //     when either category is "all"), from e.SiteRuleCores when isSite is
  //     set and from e.RuleCores otherwise;
  //  2. for every rule core whose ju.Logic check passes, run ExtRuleCore, the
  //     core's post-rules and its KV rules, and for "projectname" cores add
  //     the title as a candidate when no extracted value is longer than 5 runes;
  //  3. run the global post-rules (site or general);
  //  4. clean every result value with the clear functions configured for its
  //     field, then with clear.OtherClean where the field is registered;
  //  5. call PackageDetail with codeSite.
  546. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  547. qu.Try(func() {
  548. doc := *j.Data
  549. // global pre-rules; results overwrite doc attributes (disabled)
  550. //for _, v := range e.RulePres {
  551. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  552. //}
  553. tmprules := map[string][]*RuleCore{}
  554. lockrule.Lock()
  555. if j.Category == "all" || j.CategorySecond == "all" {
  556. if isSite {
  557. for k, vc1 := range e.SiteRuleCores["all_all"] {
  558. tmprules[k] = vc1
  559. }
  560. } else {
  561. for k, vc1 := range e.RuleCores["all_all"] {
  562. tmprules[k] = vc1
  563. }
  564. }
  565. } else {
  566. if isSite {
  567. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  568. tmprules[k] = vc1
  569. }
  570. } else {
  571. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  572. tmprules[k] = vc1
  573. }
  574. }
  575. }
  // NOTE(review): the fallback below crosses over - site jobs fall back to the
  // general "all_all" rules, non-site jobs to the SITE "all_all" rules. Confirm
  // the non-site branch is intended.
  576. if len(tmprules) < 1 { // category not covered
  577. if isSite {
  578. for k, vc1 := range e.RuleCores["all_all"] {
  579. tmprules[k] = vc1
  580. }
  581. } else {
  582. for k, vc1 := range e.SiteRuleCores["all_all"] {
  583. tmprules[k] = vc1
  584. }
  585. }
  586. }
  587. lockrule.Unlock()
  588. // extraction rules
  589. for _, vc1 := range tmprules {
  590. for _, vc := range vc1 {
  591. tmp := ju.DeepCopy(doc).(map[string]interface{})
  592. // whether this rule core's logic applies to the document
  593. if !ju.Logic(vc.LuaLogic, tmp) {
  594. continue
  595. }
  596. //// extraction: pre-rules
  597. //for _, v := range vc.RulePres {
  598. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  599. //}
  600. // log.Debug("抽取-前置规则", tmp)
  601. // extraction: rules
  602. ExtRuleCore(tmp, e, vc, j, isSite)
  603. // log.Debug("抽取-规则", tmp)
  604. // extraction: post-rules
  605. for _, v := range vc.RuleBacks {
  606. ExtRegBack(j, v, e.TaskInfo, vc)
  607. }
  608. // kv rules
  609. for _, v := range vc.KVRuleCores {
  610. ExtRuleKV(j, v, e.TaskInfo)
  611. }
  612. // log.Debug("抽取-后置规则", tmp)
  613. // the project name could not be extracted, so fall back to the title
  614. if vc.Field == "projectname" {
  615. if vc.ExtFrom == "title" {
  616. isextitle := true
  617. for _, v := range j.Result[vc.Field] {
  618. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  619. isextitle = false
  620. break
  621. }
  622. }
  623. if isextitle { // the title joins the candidates
  624. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  625. if isSite {
  626. field.Score = 1
  627. }
  628. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  629. }
  630. }
  // the core's post-rules are applied three more times for projectname
  631. for i := 0; i < 3; i++ {
  632. for _, v := range vc.RuleBacks {
  633. ExtRegBack(j, v, e.TaskInfo, vc)
  634. }
  635. }
  636. }
  637. }
  638. }
  639. // global post-rules
  640. if isSite {
  641. for _, v := range e.SiteRuleBacks {
  642. ExtRegBack(j, v, e.TaskInfo, nil)
  643. }
  644. } else {
  645. for _, v := range e.RuleBacks {
  646. ExtRegBack(j, v, e.TaskInfo, nil)
  647. }
  648. }
  649. // function-based cleaning
  650. for key, val := range j.Result {
  651. for i, v := range val {
  652. if v.Field == "project_duration" {
  653. arr := clear.ObjToMoney([]interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  654. if len(arr) > 0 {
  655. v.Value = arr[0]
  656. }
  657. }
  // a table-sourced projectname ends cleaning for the rest of this field's values
  658. if v.Field == "projectname" && v.Type == "table" {
  659. break
  660. }
  661. if key == "budget" || key == "bidamount" {
  // skip amounts that already hold a float64 but are not flagged IsTrue
  662. if _, ok := v.Value.(float64); ok && !v.IsTrue {
  663. continue
  664. }
  665. }
  666. lockclear.Lock()
  667. var cfn = []string{}
  668. if isSite {
  // site clear functions take precedence; fall back to the general ones
  669. cfn = e.SiteClearFn[key]
  670. if len(cfn) == 0 {
  671. cfn = e.ClearFn[key]
  672. }
  673. } else {
  674. cfn = e.ClearFn[key]
  675. }
  676. lockclear.Unlock()
  677. if len(cfn) == 0 {
  678. continue
  679. }
  680. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  // NOTE(review): data is indexed without a length check; this assumes
  // DoClearFn never returns an empty slice - confirm.
  681. if key == "budget" || key == "bidamount" {
  682. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  683. j.Result[key][i].IsTrue = true
  684. } else {
  685. j.Result[key][i].Value = data[0]
  686. continue
  687. }
  688. }
  689. before, _ := v.Value.(string)
  690. v.Value = data[0]
  691. BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
  692. // the call above records the function-cleaning log entry
  693. // clean special symbols
  694. lockclear.Lock()
  695. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  696. text := qu.ObjToString(v.Value)
  697. before = text
  698. v.Value = clear.OtherClean(key, text)
  699. BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
  700. }
  701. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  702. lockclear.Unlock()
  703. }
  704. }
  705. PackageDetail(j, e, isSite, codeSite) // handle package (sub-lot) information
  706. // bs, _ := json.Marshal(j.Result)
  707. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  708. }, func(err interface{}) {
  709. log.Debug("ExtractProcess err", err, j.SourceMid)
  710. })
  711. }
  712. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  713. qu.Try(func() {
  714. doc := *j.Data
  715. //全局前置规则,结果覆盖doc属性
  716. // for _, v := range e.RulePres {
  717. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  718. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  719. // }
  720. // }
  721. //抽取规则
  722. tmprules := map[string][]*RuleCore{}
  723. lockrule.Lock()
  724. if j.Category == "all" || j.CategorySecond == "all" {
  725. for k, vc1 := range e.RuleCores["all_all"] {
  726. tmprules[k] = vc1
  727. }
  728. } else {
  729. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  730. tmprules[k] = vc1
  731. }
  732. }
  733. lockrule.Unlock()
  734. for _, vc1 := range tmprules {
  735. for _, vc := range vc1 {
  736. tmp := ju.DeepCopy(doc).(map[string]interface{})
  737. //是否进入逻辑
  738. if !ju.Logic(vc.LuaLogic, tmp) {
  739. continue
  740. }
  741. //抽取-前置规则
  742. // for _, v := range vc.RulePres {
  743. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  744. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  745. // }
  746. // }
  747. // log.Debug("抽取-前置规则", tmp)
  748. //抽取-规则
  749. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  750. ExtRuleCore(tmp, e, vc, j, isSite)
  751. }
  752. // log.Debug("抽取-规则", tmp)
  753. //抽取-后置规则
  754. for _, v := range vc.RuleBacks {
  755. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  756. ExtRegBack(j, v, e.TaskInfo, vc)
  757. }
  758. }
  759. // log.Debug("抽取-后置规则", tmp)
  760. }
  761. }
  762. //全局后置规则
  763. for _, v := range e.RuleBacks {
  764. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  765. ExtRegBack(j, v, e.TaskInfo, nil)
  766. }
  767. }
  768. //函数清理
  769. for key, val := range j.Result {
  770. for _, v := range val {
  771. lockclear.Lock()
  772. var cfn = []string{}
  773. if isSite {
  774. cfn = e.SiteClearFn[key]
  775. if len(cfn) == 0 {
  776. cfn = e.ClearFn[key]
  777. }
  778. } else {
  779. cfn = e.ClearFn[key]
  780. }
  781. lockclear.Unlock()
  782. if len(cfn) == 0 {
  783. continue
  784. }
  785. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
  786. v.Value = data[0]
  787. //清理特殊符号
  788. lockclear.Lock()
  789. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  790. clear.MesField[key] != nil {
  791. text := qu.ObjToString(v.Value)
  792. text = clear.OtherClean(key, text)
  793. v.Value = text
  794. }
  795. lockclear.Unlock()
  796. }
  797. }
  798. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  799. // bs, _ := json.Marshal(j.Result)
  800. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  801. }, func(err interface{}) {
  802. log.Debug("ExtractProcess err", err)
  803. })
  804. }
  805. //前置过滤
  806. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  807. defer qu.Catch()
  808. before := ju.DeepCopy(doc).(map[string]interface{})
  809. extinfo := map[string]interface{}{}
  810. if in.IsLua {
  811. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  812. if j != nil {
  813. lua.Block = j.Block
  814. }
  815. extinfo = lua.RunScript("pre")
  816. for k, v := range extinfo { //结果覆盖原doc
  817. doc[k] = v
  818. }
  819. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  820. } else {
  821. var key string
  822. if !j.IsFile {
  823. key = qu.If(in.Field == "", "detail", in.Field).(string)
  824. } else {
  825. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  826. }
  827. text := qu.ObjToString(doc[key])
  828. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  829. doc[key] = extinfo[key] //结果覆盖原doc
  830. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  831. }
  832. return doc
  833. }
  834. //抽取-规则
  835. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  836. //候选人加入
  837. var kvMap map[string][]map[string]interface{}
  838. extByReg := true
  839. if vc.ExtFrom != "title" {
  840. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  841. }
  842. for _, v := range vc.RuleCores {
  843. if v.IsLua {
  844. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  845. } else if extByReg {
  846. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  847. }
  848. }
  849. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  850. if vc.Field == "budget" && len(kvMap) == 0 {
  851. if len(j.BlockPackage) == 1 {
  852. for _, bp := range j.BlockPackage {
  853. for fieldname, field := range vc.LFields {
  854. if field != vc.Field {
  855. continue
  856. }
  857. tp := ""
  858. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  859. if k == 0 {
  860. tp = "colon"
  861. } else if k == 1 {
  862. tp = "space"
  863. } else if k == 2 {
  864. tp = "table"
  865. }
  866. if v == nil || v.KvTags == nil {
  867. continue
  868. }
  869. for _, vv := range v.KvTags[fieldname] {
  870. text := ju.TrimLRSpace(vv.Value, "")
  871. if text != "" {
  872. tmp := &ju.ExtField{
  873. ExtFrom: "package",
  874. Field: vc.Field,
  875. Code: "CL_分包",
  876. Type: tp,
  877. MatchType: "package",
  878. RuleText: bp.Text,
  879. SourceValue: vv.Key,
  880. Value: text,
  881. }
  882. if isSite {
  883. tmp.Score = 1
  884. }
  885. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  886. }
  887. }
  888. }
  889. }
  890. break
  891. }
  892. }
  893. } else {
  894. for k, v := range kvMap {
  895. if j.Result[k] == nil {
  896. j.Result[k] = [](*ju.ExtField){}
  897. }
  898. for _, tmp := range v {
  899. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  900. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  901. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  902. MatchType: qu.ObjToString(tmp["matchtype"]),
  903. RuleText: qu.ObjToString(tmp["ruletext"]),
  904. SourceValue: tmp["sourcevalue"],
  905. Value: tmp["value"]}
  906. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  907. field.Score = 1
  908. }
  909. if isSite {
  910. field.Score = 1
  911. }
  912. if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" {
  913. moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
  914. if len(moneys) > 0 {
  915. if vf, ok := moneys[0].(float64); ok {
  916. field.Value = vf
  917. field.IsTrue = moneys[len(moneys)-1].(bool)
  918. } else if vi, ok := moneys[0].(int); ok {
  919. field.Value = float64(vi)
  920. field.IsTrue = moneys[len(moneys)-1].(bool)
  921. }
  922. }
  923. }
  924. if tmp["blocktag"] != nil {
  925. btag := make(map[string]string)
  926. for k := range tmp["blocktag"].(map[string]bool) {
  927. blocktag.Lock()
  928. if TagConfigDesc[k] != "" {
  929. btag[k] = TagConfigDesc[k]
  930. }
  931. blocktag.Unlock()
  932. }
  933. field.BlockTag = btag
  934. }
  935. j.Result[k] = append(j.Result[k], field)
  936. }
  937. }
  938. }
  939. }
  940. //抽取-规则-kv
  941. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  942. defer qu.Catch()
  943. if extfrom == "title" || !in.IsLua {
  944. return
  945. }
  946. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  947. lua.KvMap = *kvMap
  948. lua.Block = j.Block
  949. extinfo := lua.RunScript("core")
  950. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  951. for _, v := range tmps {
  952. v["core"] = in.Code
  953. }
  954. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  955. }
  956. if len(extinfo) > 0 {
  957. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  958. }
  959. }
  960. //抽取-规则-正则
  961. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  962. defer qu.Catch()
  963. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  964. b := IsExtract(in.Field, j.Title, j.Content)
  965. if !b {
  966. return
  967. }
  968. //全文正则
  969. //text := qu.ObjToString(doc[extfrom])
  970. //if in.Field != "" {
  971. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  972. // if len(extinfo) > 0 {
  973. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  974. // }
  975. //}
  976. //块抽取
  977. if in.Field != "" {
  978. if extfrom == "title" {
  979. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  980. if len(extinfo) > 0 {
  981. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  982. }
  983. } else if in.Field == "qualifies" {
  984. extinfo := extRegCoreToResult(extfrom, pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{}, j, in, isSite)
  985. if len(extinfo) > 0 {
  986. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  987. }
  988. } else {
  989. for _, v := range j.Block {
  990. btag := make(map[string]string)
  991. for k := range v.Classify {
  992. blocktag.Lock()
  993. btag[k] = TagConfigDesc[k]
  994. blocktag.Unlock()
  995. }
  996. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  997. if len(extinfo) > 0 {
  998. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  999. }
  1000. }
  1001. }
  1002. }
  1003. }
  1004. //pkg抽取-规则-正则
  1005. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  1006. defer qu.Catch()
  1007. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  1008. b := IsExtract(in.Field, j.Title, j.Content)
  1009. if !b {
  1010. return
  1011. }
  1012. //块抽取
  1013. if in.Field != "" {
  1014. for k, vbpkg := range j.BlockPackage {
  1015. rep := map[string]string{}
  1016. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1017. if in.Field == "budget" && vbpkg.Budget > 0 {
  1018. continue
  1019. }
  1020. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  1021. continue
  1022. }
  1023. if in.Field == "winner" && vbpkg.Winner != "" {
  1024. continue
  1025. }
  1026. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  1027. continue
  1028. }
  1029. if in.Field == "projectname" && vbpkg.Name != "" {
  1030. continue
  1031. }
  1032. if in.Field == "winner" && vbpkg.Winner != "" {
  1033. continue
  1034. }
  1035. if in.Field == "winnerperson" {
  1036. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  1037. continue
  1038. }
  1039. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  1040. continue
  1041. }
  1042. }
  1043. if in.Field == "winnertel" {
  1044. if vbpkg.WinnerPerson == "" {
  1045. continue
  1046. }
  1047. }
  1048. //处理正负数修正
  1049. ptmp := strings.Split(in.RuleText, "#")
  1050. sign := 0
  1051. if len(ptmp) == 2 {
  1052. if ptmp[1] == "正" {
  1053. sign = 1
  1054. } else if ptmp[1] == "负" {
  1055. sign = -1
  1056. }
  1057. }
  1058. tmp := strings.Split(ptmp[0], "__")
  1059. if len(tmp) == 2 {
  1060. epos := strings.Split(tmp[1], ",")
  1061. posm := map[string]int{}
  1062. for _, v := range epos {
  1063. ks := strings.Split(v, ":")
  1064. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1065. posm[ks[1]] = qu.IntAll(ks[0])
  1066. } else {
  1067. posm[in.Field] = qu.IntAll(ks[0])
  1068. }
  1069. }
  1070. var pattern string
  1071. if strings.Contains(tmp[0], "\\u") {
  1072. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1073. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1074. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1075. } else {
  1076. pattern = tmp[0]
  1077. }
  1078. //log.Debug("pattern", pattern)
  1079. //fmt.Println(text)
  1080. reg := regexp.MustCompile(pattern)
  1081. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  1082. for i, _ := range apos {
  1083. pos := apos[i]
  1084. for k, p := range posm {
  1085. if len(pos) > p {
  1086. if pos[p] == -1 || pos[p+1] == -1 {
  1087. continue
  1088. }
  1089. val := vbpkg.Text[pos[p]:pos[p+1]]
  1090. if string(val) == "" {
  1091. continue
  1092. }
  1093. if sign == -1 {
  1094. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1095. } else {
  1096. rep[k+"_"+fmt.Sprint(i)] = val
  1097. }
  1098. }
  1099. }
  1100. }
  1101. //fmt.Println(text)
  1102. for i := 0; i < len(apos); i++ {
  1103. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  1104. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1105. lock.Lock()
  1106. cfn := e.ClearFn[in.Field]
  1107. lock.Unlock()
  1108. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1109. if data[len(data)-1].(bool) {
  1110. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1111. j.BlockPackage[k].IsTrueBudget = true
  1112. }
  1113. break
  1114. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1115. lock.Lock()
  1116. cfn := e.ClearFn[in.Field]
  1117. lock.Unlock()
  1118. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
  1119. if data[len(data)-1].(bool) {
  1120. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1121. j.BlockPackage[k].IsTrueBidamount = true
  1122. }
  1123. break
  1124. } else if in.Field == "winner" {
  1125. if j.BlockPackage[k].Winner == "" {
  1126. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  1127. break
  1128. }
  1129. } else if in.Field == "winnertel" {
  1130. if j.BlockPackage[k].WinnerTel == "" {
  1131. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1132. break
  1133. }
  1134. } else if in.Field == "winnerperson" {
  1135. if j.BlockPackage[k].WinnerPerson == "" {
  1136. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1137. break
  1138. }
  1139. } else if in.Field == "bidstatus" {
  1140. if j.BlockPackage[k].BidStatus == "" {
  1141. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1142. break
  1143. }
  1144. } else if in.Field == "projectname" {
  1145. if j.BlockPackage[k].Name == "" {
  1146. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1147. break
  1148. }
  1149. } else if in.Field == "winnerperson" {
  1150. if j.BlockPackage[k].WinnerPerson == "" {
  1151. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1152. break
  1153. }
  1154. } else if in.Field == "winnertel" {
  1155. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1156. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1157. break
  1158. }
  1159. }
  1160. }
  1161. }
  1162. }
  1163. } else {
  1164. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1165. val := ""
  1166. if len(pos) == 2 {
  1167. //"text" = "text"[pos[1]:]
  1168. val = "text"[pos[1]:]
  1169. rs := regexp.MustCompile("[^\r\n\t]+")
  1170. tmp := rs.FindAllString("text", -1)
  1171. if len(tmp) > 0 {
  1172. val = tmp[0]
  1173. }
  1174. }
  1175. if val != "" {
  1176. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1177. lock.Lock()
  1178. cfn := e.ClearFn[in.Field]
  1179. lock.Unlock()
  1180. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1181. if data[len(data)-1].(bool) {
  1182. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1183. j.BlockPackage[k].IsTrueBudget = true
  1184. }
  1185. break
  1186. }
  1187. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1188. lock.Lock()
  1189. cfn := e.ClearFn[in.Field]
  1190. lock.Unlock()
  1191. data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
  1192. if data[len(data)-1].(bool) {
  1193. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1194. j.BlockPackage[k].IsTrueBidamount = true
  1195. }
  1196. break
  1197. } else if in.Field == "bidstatus" {
  1198. if j.BlockPackage[k].BidStatus == "" {
  1199. j.BlockPackage[k].BidStatus = val
  1200. break
  1201. }
  1202. } else if in.Field == "projectname" {
  1203. if j.BlockPackage[k].Name == "" {
  1204. j.BlockPackage[k].Name = val
  1205. break
  1206. }
  1207. }
  1208. }
  1209. }
  1210. }
  1211. }
  1212. }
  1213. //lua脚本根据属性设置提取kv值
  1214. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1215. kvmap := map[string][]map[string]interface{}{}
  1216. if len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  1217. if vc.Field == "bidamount" {
  1218. for _, v := range j.Winnerorder {
  1219. if v["price"] == nil {
  1220. continue
  1221. }
  1222. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1223. "code": "winnerorder",
  1224. "field": vc.Field,
  1225. "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]),
  1226. "extfrom": v["sortstr"],
  1227. "sourcevalue": v["price"],
  1228. "value": v["price"],
  1229. "type": "winnerorder",
  1230. "matchtype": "winnerorder",
  1231. })
  1232. return kvmap, false
  1233. }
  1234. //候选人中标金额
  1235. if price := j.Winnerorder[0]["price"]; price != nil {
  1236. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1237. "code": "CL_中标候选人",
  1238. "field": vc.Field,
  1239. "ruletext": "中标候选人",
  1240. "extfrom": j.Winnerorder[0]["sortstr"],
  1241. "sourcevalue": price,
  1242. "value": price,
  1243. "type": "winnerorder",
  1244. "matchtype": "winnerorder",
  1245. })
  1246. return kvmap, false
  1247. }
  1248. }
  1249. }
  1250. for fieldname, field := range vc.LFields {
  1251. if field != vc.Field {
  1252. continue
  1253. }
  1254. extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category)
  1255. }
  1256. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1257. return kvmap, true
  1258. }
  1259. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) {
  1260. //qu.Debug("fieldname+++", fieldname)
  1261. for _, bl := range blocks {
  1262. tp := ""
  1263. if strings.Contains(bl.Title, "保证金") && (field == "bid_bond" || field == "contract_bond") {
  1264. if text := ju.TrimLRSpace(bl.Text, ""); text != "" {
  1265. if Category == "招标" || Category == "拟建" || Category == "预告" {
  1266. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1267. "code": "CL_块内容",
  1268. "field": field,
  1269. "ruletext": "投标保证金",
  1270. "extfrom": "投标保证金_块内容",
  1271. "sourcevalue": bl.Text,
  1272. "value": text,
  1273. "type": "投标保证金_块内容",
  1274. "matchtype": "tag_string",
  1275. "blocktag": bl.Classify,
  1276. "weight": 0,
  1277. })
  1278. } else if Category == "结果" {
  1279. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1280. "code": "CL_",
  1281. "field": field,
  1282. "ruletext": "履约保证金",
  1283. "extfrom": "履约保证金_块内容",
  1284. "sourcevalue": bl.Text,
  1285. "value": text,
  1286. "type": "履约保证金_块内容",
  1287. "matchtype": "tag_string",
  1288. "blocktag": bl.Classify,
  1289. "weight": 0,
  1290. })
  1291. }
  1292. }
  1293. return
  1294. }
  1295. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1296. if k == 0 {
  1297. tp = "colon"
  1298. } else if k == 1 {
  1299. tp = "space"
  1300. } else if k == 2 {
  1301. tp = "table"
  1302. }
  1303. if v == nil || v.KvTags == nil {
  1304. continue
  1305. }
  1306. for _, vv := range v.KvTags[fieldname] {
  1307. text := ju.TrimLRSpace(vv.Value, "")
  1308. if text != "" {
  1309. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1310. "code": "CL_" + vv.Key,
  1311. "field": field,
  1312. "ruletext": vv.Key,
  1313. "extfrom": vc.ExtFrom,
  1314. "sourcevalue": text,
  1315. "value": text,
  1316. "type": tp,
  1317. "matchtype": "tag_string",
  1318. "blocktag": bl.Classify,
  1319. "weight": vv.Weight,
  1320. })
  1321. //if field != "winnertel" && field != "winnerperson" {
  1322. // //break //暂定取第一个
  1323. //}
  1324. }
  1325. }
  1326. }
  1327. if len(kvmap[field]) == 0 {
  1328. extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category)
  1329. }
  1330. }
  1331. }
  1332. //正则提取结果
  1333. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1334. defer qu.Catch()
  1335. var score float64
  1336. score = vre.Score
  1337. if isSite {
  1338. score = score + 1.0
  1339. }
  1340. extinfo := map[string][]map[string]interface{}{}
  1341. rep := map[string]string{}
  1342. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1343. //处理正负数修正
  1344. ptmp := strings.Split(vre.RuleText, "#")
  1345. sign := 0
  1346. if len(ptmp) == 2 {
  1347. if ptmp[1] == "正" {
  1348. sign = 1
  1349. } else if ptmp[1] == "负" {
  1350. sign = -1
  1351. }
  1352. }
  1353. tmp := strings.Split(ptmp[0], "__")
  1354. if len(tmp) == 2 {
  1355. epos := strings.Split(tmp[1], ",")
  1356. posm := map[string]int{}
  1357. for _, v := range epos {
  1358. ks := strings.Split(v, ":")
  1359. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1360. posm[ks[1]] = qu.IntAll(ks[0])
  1361. } else {
  1362. posm[vre.Field] = qu.IntAll(ks[0])
  1363. }
  1364. }
  1365. var pattern string
  1366. if strings.Contains(tmp[0], "\\u") {
  1367. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1368. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1369. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1370. } else {
  1371. pattern = tmp[0]
  1372. }
  1373. //log.Debug("pattern", pattern)
  1374. //fmt.Println(text)
  1375. reg := regexp.MustCompile(pattern)
  1376. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1377. for i, _ := range apos {
  1378. pos := apos[i]
  1379. for k, p := range posm {
  1380. if len(pos) > p {
  1381. if pos[p] == -1 || pos[p+1] == -1 {
  1382. continue
  1383. }
  1384. val := text[pos[p]:pos[p+1]]
  1385. if string(val) == "" {
  1386. continue
  1387. }
  1388. if sign == -1 {
  1389. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1390. } else {
  1391. rep[k+"_"+fmt.Sprint(i)] = val
  1392. }
  1393. }
  1394. }
  1395. }
  1396. tmps := []map[string]interface{}{}
  1397. for i := 0; i < len(apos); i++ {
  1398. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1399. tmp := map[string]interface{}{
  1400. "field": vre.Field,
  1401. "code": vre.Code,
  1402. "ruletext": vre.RuleText,
  1403. "extfrom": text,
  1404. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1405. "type": "regexp",
  1406. "matchtype": "regcontent",
  1407. "blocktag": *tag,
  1408. "score": score,
  1409. }
  1410. exfield := ju.ExtField{
  1411. BlockTag: *tag,
  1412. Field: vre.Field,
  1413. Code: vre.Code,
  1414. RuleText: vre.RuleText,
  1415. Type: "regexp",
  1416. MatchType: "regcontent",
  1417. ExtFrom: extfrom,
  1418. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1419. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1420. Score: score,
  1421. }
  1422. if vre.Field == "qualifies" {
  1423. if len(rep) >= 2 {
  1424. tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1425. exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)]
  1426. }
  1427. }
  1428. tmps = append(tmps, tmp)
  1429. if tmp["blocktag"] != nil {
  1430. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1431. }
  1432. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1433. }
  1434. }
  1435. if len(tmps) > 0 {
  1436. //fmt.Println(tmps)
  1437. extinfo[vre.Field] = tmps
  1438. }
  1439. }
  1440. } else {
  1441. pos := vre.RegCore.Reg.FindStringIndex(text)
  1442. val := ""
  1443. if len(pos) == 2 {
  1444. text = text[pos[1]:]
  1445. rs := regexp.MustCompile("[^\r\n\t]+")
  1446. tmp := rs.FindAllString(text, -1)
  1447. if len(tmp) > 0 {
  1448. val = tmp[0]
  1449. }
  1450. }
  1451. if val != "" {
  1452. tmps := []map[string]interface{}{}
  1453. tmp := map[string]interface{}{
  1454. "field": vre.Field,
  1455. "code": vre.Code,
  1456. "ruletext": vre.RuleText,
  1457. "extfrom": text,
  1458. "value": val,
  1459. "type": "regexp",
  1460. "matchtype": "regcontent",
  1461. "blocktag": *tag,
  1462. "score": score,
  1463. }
  1464. tmps = append(tmps, tmp)
  1465. extinfo[vre.Field] = tmps
  1466. if j.Result[vre.Field] == nil {
  1467. j.Result[vre.Field] = [](*ju.ExtField){}
  1468. }
  1469. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1470. Value: val,
  1471. Score: score}
  1472. if tmp["blocktag"] != nil {
  1473. field.BlockTag = tmp["blocktag"].(map[string]string)
  1474. }
  1475. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1476. }
  1477. }
  1478. return extinfo
  1479. }
  1480. //后置过滤
  1481. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1482. defer qu.Catch()
  1483. if in.IsLua {
  1484. result := GetResultMapForLua(j)
  1485. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1486. if j != nil {
  1487. lua.Block = j.Block
  1488. }
  1489. extinfo := lua.RunScript("back")
  1490. for k, v := range extinfo {
  1491. if tmps, ok := v.([]map[string]interface{}); ok {
  1492. j.Result[k] = [](*ju.ExtField){}
  1493. for _, tmp := range tmps {
  1494. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1495. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1496. Value: tmp["value"]}
  1497. if tmp["blocktag"] != nil {
  1498. field.BlockTag = tmp["blocktag"].(map[string]string)
  1499. }
  1500. j.Result[k] = append(j.Result[k], field)
  1501. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1502. }
  1503. }
  1504. }
  1505. if len(extinfo) > 0 {
  1506. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1507. }
  1508. } else {
  1509. extinfo := map[string]interface{}{}
  1510. if in.Field != "" {
  1511. clearByTitle := false
  1512. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1513. clearByTitle = true
  1514. }
  1515. if j.Result[in.Field] != nil {
  1516. tmp := j.Result[in.Field]
  1517. exts := []interface{}{}
  1518. for k, v := range tmp {
  1519. if clearByTitle && v.ExtFrom != "title" {
  1520. continue
  1521. }
  1522. //table抽取到的数据不清理
  1523. if v.Type == "table" && v.Field == "projectname" {
  1524. return
  1525. }
  1526. text := qu.ObjToString(v.Value)
  1527. if v.Field == "bidamount" || v.Field == "budget" {
  1528. if strings.Contains(qu.ObjToString(v.SourceValue), "费率") {
  1529. j.Result[in.Field][k].IsTrue = false
  1530. continue
  1531. }
  1532. }
  1533. if text != "" {
  1534. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1535. }
  1536. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1537. continue
  1538. }
  1539. j.Result[in.Field][k].Value = text
  1540. exts = append(exts, map[string]interface{}{
  1541. "field": v.Field,
  1542. "code": v.Code,
  1543. "ruletext": v.RuleText,
  1544. "type": v.Type,
  1545. "matchtype": v.MatchType,
  1546. "extfrom": v.ExtFrom,
  1547. "value": text,
  1548. })
  1549. }
  1550. if len(exts) > 0 {
  1551. extinfo[in.Field] = exts
  1552. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1553. }
  1554. }
  1555. } else {
  1556. for key, tmp := range j.Result {
  1557. exts := []interface{}{}
  1558. for k, v := range tmp {
  1559. //table抽取到的数据不清理
  1560. if v.Type == "table" && v.Field == "projectname" {
  1561. return
  1562. }
  1563. text := qu.ObjToString(v.Value)
  1564. if text != "" {
  1565. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1566. }
  1567. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1568. continue
  1569. }
  1570. j.Result[key][k].Value = text
  1571. exts = append(exts, map[string]interface{}{
  1572. "field": v.Field,
  1573. "code": v.Code,
  1574. "ruletext": v.RuleText,
  1575. "type": v.Type,
  1576. "matchtype": v.MatchType,
  1577. "extfrom": v.ExtFrom,
  1578. "value": text,
  1579. })
  1580. }
  1581. if len(exts) > 0 {
  1582. extinfo[key] = exts
  1583. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1584. }
  1585. }
  1586. }
  1587. }
  1588. }
  1589. //后置过滤
  1590. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1591. defer qu.Catch()
  1592. for k, v := range j.BlockPackage {
  1593. if in.Field == "winner" {
  1594. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1595. } else if in.Field == "bidstatus" {
  1596. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1597. } else if in.Field == "" {
  1598. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1599. } else if in.Field == "projectname" {
  1600. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1601. } else if in.Field == "winnerperson" {
  1602. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1603. } else if in.Field == "winnertel" {
  1604. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1605. }
  1606. }
  1607. }
  1608. //KV过滤
  1609. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1610. defer qu.Catch()
  1611. extinfo := map[string]interface{}{}
  1612. if in.Field != "" {
  1613. if j.Result[in.Field] != nil {
  1614. tmp := j.Result[in.Field]
  1615. exts := []interface{}{}
  1616. for k, v := range tmp {
  1617. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1618. continue
  1619. }
  1620. text := qu.ObjToString(v.Value)
  1621. if text != "" {
  1622. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1623. }
  1624. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1625. continue
  1626. }
  1627. j.Result[in.Field][k].Value = text
  1628. exts = append(exts, map[string]interface{}{
  1629. "field": v.Field,
  1630. "code": v.Code,
  1631. "ruletext": v.RuleText,
  1632. "type": v.Type,
  1633. "matchtype": v.MatchType,
  1634. "extfrom": v.ExtFrom,
  1635. "value": text,
  1636. })
  1637. }
  1638. if len(exts) > 0 {
  1639. extinfo[in.Field] = exts
  1640. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1641. }
  1642. }
  1643. }
  1644. }
  1645. //获取抽取结果map[string][]interface{},lua脚本使用
  1646. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1647. defer qu.Catch()
  1648. result := map[string][]map[string]interface{}{}
  1649. for key, val := range j.Result {
  1650. if result[key] == nil {
  1651. result[key] = []map[string]interface{}{}
  1652. }
  1653. for _, v := range val {
  1654. tmp := map[string]interface{}{
  1655. "field": v.Field,
  1656. "code": v.Code,
  1657. "ruletext": v.RuleText,
  1658. "value": v.Value,
  1659. "type": v.Type,
  1660. "matchtype": v.MatchType,
  1661. "extfrom": v.ExtFrom,
  1662. }
  1663. result[key] = append(result[key], tmp)
  1664. }
  1665. }
  1666. return result
  1667. }
  1668. //抽取日志
  1669. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1670. defer qu.Catch()
  1671. if !t.IsEtxLog {
  1672. return
  1673. }
  1674. logdata := map[string]interface{}{
  1675. "code": qu.If(v.Code == "", "kv", v.Code),
  1676. "name": v.Name,
  1677. "type": ftype,
  1678. "ruletext": v.RuleText,
  1679. "islua": v.IsLua,
  1680. "field": v.Field,
  1681. "version": t.Version,
  1682. "taskname": t.Name,
  1683. "before": before,
  1684. "extinfo": extinfo,
  1685. "sid": sid,
  1686. "comeintime": time.Now().Unix(),
  1687. }
  1688. lock.Lock()
  1689. ExtLogs[t] = append(ExtLogs[t], logdata)
  1690. lock.Unlock()
  1691. }
  1692. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1693. exts := []map[string]interface{}{}
  1694. exts = append(exts, map[string]interface{}{
  1695. "field": ext.Field,
  1696. "code": ext.Code,
  1697. "type": ftype,
  1698. "matchtype": matchtype,
  1699. "extfrom": ext.ExtFrom,
  1700. "value": ext.Value,
  1701. })
  1702. extinfo := map[string]interface{}{
  1703. ext.Field: exts,
  1704. }
  1705. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1706. }
  1707. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1708. defer qu.Catch()
  1709. if !t.IsEtxLog {
  1710. return
  1711. }
  1712. logdata := map[string]interface{}{
  1713. "code": code,
  1714. "name": name,
  1715. "type": ftype,
  1716. "ruletext": "",
  1717. "islua": false,
  1718. "field": field,
  1719. "version": t.Version,
  1720. "taskname": t.Name,
  1721. "before": before,
  1722. "extinfo": extinfo,
  1723. "sid": sid,
  1724. "comeintime": time.Now().Unix(),
  1725. }
  1726. lock.Lock()
  1727. ExtLogs[t] = append(ExtLogs[t], logdata)
  1728. lock.Unlock()
  1729. }
  1730. //保存抽取日志
  1731. func SaveExtLog() {
  1732. defer qu.Catch()
  1733. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1734. lock.Lock()
  1735. tmpLogs = ExtLogs
  1736. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1737. lock.Unlock()
  1738. for k, v := range tmpLogs {
  1739. if len(v) < saveLimit {
  1740. db.Mgo.SaveBulk(k.TrackColl, v...)
  1741. } else {
  1742. for {
  1743. if len(v) > saveLimit {
  1744. tmp := v[:saveLimit]
  1745. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1746. v = v[saveLimit:]
  1747. } else {
  1748. db.Mgo.SaveBulk(k.TrackColl, v...)
  1749. break
  1750. }
  1751. }
  1752. }
  1753. }
  1754. time.AfterFunc(10*time.Second, SaveExtLog)
  1755. }
// FieldValue pairs a value with an integer count.
// NOTE(review): no use of this type is visible in this part of the file;
// confirm with its callers what Count tallies.
type FieldValue struct {
	Value interface{}
	Count int
}
// clearWinnerReg matches label fragments and colons; AnalysisSaveResult
// removes every match from a package's winner text before joining the
// winners into s_winner.
var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1761. //分析抽取结果并保存
  1762. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1763. qu.Try(func() {
  1764. if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
  1765. if j.CategorySecond != "单一" {
  1766. delete(j.Result, "winner")
  1767. delete(j.Result, "bidamount")
  1768. for _, v := range j.BlockPackage {
  1769. v.Bidamount = 0
  1770. v.IsTrueBidamount = false
  1771. if v.Winner != "" {
  1772. v.Winner = ""
  1773. if v.SpaceKV != nil {
  1774. delete(v.SpaceKV.KvTags, "中标单位")
  1775. }
  1776. if v.TableKV != nil {
  1777. delete(v.TableKV.KvTags, "中标单位")
  1778. }
  1779. if v.ColonKV != nil {
  1780. delete(v.ColonKV.KvTags, "中标单位")
  1781. }
  1782. }
  1783. }
  1784. for _, v := range j.PackageInfo {
  1785. delete(v, "winner")
  1786. delete(v, "bidamount")
  1787. }
  1788. j.Winnerorder = nil
  1789. if jf != nil && jf.Winnerorder != nil {
  1790. jf.Winnerorder = nil
  1791. }
  1792. }
  1793. }
  1794. //重新取出清理过后的中标候选人
  1795. resetWinnerorder(j)
  1796. doc, result, _id := funcAnalysis(j, e)
  1797. if ju.IsSaveTag {
  1798. go otherNeedSave(j, result, e)
  1799. }
  1800. //从排序结果中取值
  1801. tmp := map[string]interface{}{} //抽取值
  1802. tmp["spidercode"] = j.SpiderCode
  1803. tmp["site"] = j.Site
  1804. if len(*j.Jsondata) > 0 {
  1805. tmp["jsondata"] = j.Jsondata
  1806. }
  1807. for k, val := range result {
  1808. if k == "qualifies" {
  1809. squalifies := make([]interface{}, 0)
  1810. squalifiesMap := make(map[string]*scoreIndex, 0)
  1811. for _, kv := range val {
  1812. skey := kv.RuleText
  1813. if kv.Score > 0 {
  1814. if squalifiesMap[skey] == nil {
  1815. squalifiesMap = map[string]*scoreIndex{
  1816. skey: &scoreIndex{
  1817. Score: kv.Score,
  1818. Index: len(squalifies),
  1819. },
  1820. }
  1821. squalifies = append(squalifies, map[string]interface{}{
  1822. "key": skey,
  1823. "value": kv.Value,
  1824. })
  1825. } else {
  1826. if squalifiesMap[skey].Score < kv.Score {
  1827. squalifies[squalifiesMap[skey].Index] = map[string]interface{}{
  1828. "key": skey,
  1829. "value": kv.Value,
  1830. }
  1831. }
  1832. }
  1833. }
  1834. }
  1835. tmp[k] = squalifies
  1836. continue
  1837. }
  1838. for _, v := range val { //取第一个非负数,项目名称除外
  1839. //存0是否有效
  1840. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 {
  1841. tmp[v.Field] = v.Value
  1842. break
  1843. }
  1844. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  1845. tmp[v.Field] = v.Value
  1846. break
  1847. }
  1848. }
  1849. }
  1850. tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",")
  1851. if len(j.PackageInfo) > 15 {
  1852. for k, v := range j.PackageInfo {
  1853. j.PackageInfo = map[string]map[string]interface{}{}
  1854. j.PackageInfo[k] = v
  1855. break
  1856. }
  1857. }
  1858. if len(j.PackageInfo) > 0 { //分包信息
  1859. tmp["package"] = j.PackageInfo
  1860. //包预算,中标金额合并大于抽取就覆盖
  1861. var tmpBidamount, tmpBudget float64
  1862. //s_winner逗号分隔拼接,分包中标人
  1863. var tmpstr, savewinner []string
  1864. //按包排序
  1865. for b, v := range j.PackageInfo {
  1866. if v["winner"] != nil && v["winner"] != "" {
  1867. tmpstr = append(tmpstr, b)
  1868. }
  1869. }
  1870. //包预算,中标金额合并大于抽取就覆盖
  1871. if len(j.PackageInfo) >= 1 {
  1872. //包数大于1累加
  1873. for _, v := range j.PackageInfo {
  1874. if v["budget"] != nil {
  1875. tmpBudget += qu.Float64All(v["budget"])
  1876. }
  1877. if v["bidamount"] != nil {
  1878. tmpBidamount += qu.Float64All(v["bidamount"])
  1879. }
  1880. }
  1881. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1882. tmp["budget"] = tmpBudget
  1883. }
  1884. if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
  1885. tmp["bidamount"] = tmpBidamount
  1886. } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1887. tmp["bidamount"] = tmpBidamount
  1888. }
  1889. } else {
  1890. //包数等于1,tmp没有值取包里的值
  1891. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1892. for _, v := range j.PackageInfo {
  1893. if v["budget"] != nil {
  1894. tmp["budget"] = v["budget"]
  1895. }
  1896. }
  1897. }
  1898. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1899. for _, v := range j.PackageInfo {
  1900. if v["bidamount"] != nil {
  1901. tmp["bidamount"] = v["bidamount"]
  1902. }
  1903. }
  1904. }
  1905. }
  1906. //s_winner逗号分隔拼接,分包中标人
  1907. sort.Strings(tmpstr)
  1908. for _, v := range tmpstr {
  1909. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1910. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1911. if savevvv == "" {
  1912. continue
  1913. }
  1914. savewinner = append(savewinner, savevvv)
  1915. }
  1916. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1917. tmp["s_winner"] = tmp["winner"]
  1918. } else if savewinner != nil {
  1919. savewinner = RemoveReplicaSliceString(savewinner)
  1920. tmp["s_winner"] = strings.Join(savewinner, ",")
  1921. }
  1922. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1923. //没有分包取winner
  1924. tmp["s_winner"] = tmp["winner"]
  1925. }
  1926. if len(j.Winnerorder) > 0 { //候选人信息
  1927. for i, v := range j.Winnerorder {
  1928. if v["price"] != nil {
  1929. tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  1930. if tmpPrice[len(tmpPrice)-1].(bool) {
  1931. j.Winnerorder[i]["price"] = tmpPrice[0]
  1932. } else {
  1933. delete(j.Winnerorder[i], "price")
  1934. }
  1935. }
  1936. }
  1937. tmp["winnerorder"] = j.Winnerorder
  1938. }
  1939. //处理附件
  1940. var resultf map[string][]*ju.ExtField
  1941. ffield := map[string]interface{}{}
  1942. if jf != nil {
  1943. _, resultf, _ = funcAnalysis(jf, e)
  1944. for _, val := range resultf {
  1945. for _, v := range val { //取第一个非负数
  1946. if v.Score > -1 {
  1947. ffield[v.Field] = v.Value
  1948. if tmp[v.Field] == nil {
  1949. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
  1950. tmp[v.Field] = v.Value
  1951. break
  1952. }
  1953. if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
  1954. tmp[v.Field] = v.Value
  1955. break
  1956. }
  1957. }
  1958. break
  1959. }
  1960. }
  1961. }
  1962. if len(jf.PackageInfo) > 0 { //分包信息
  1963. ffield["package"] = jf.PackageInfo
  1964. }
  1965. if len(jf.Winnerorder) > 0 { //候选人信息
  1966. ffield["winnerorder"] = jf.Winnerorder
  1967. }
  1968. }
  1969. for k, v := range *doc {
  1970. if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
  1971. (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
  1972. }
  1973. //去重冗余字段
  1974. if delFiled(k) {
  1975. continue
  1976. }
  1977. if tmp[k] == nil {
  1978. tmp[k] = v
  1979. }
  1980. }
  1981. //质量审核
  1982. if ju.QualityAudit {
  1983. e.QualityAudit(tmp)
  1984. }
  1985. //城市抽取
  1986. if e.IsExtractCity {
  1987. e.NewExtractCity(j, &tmp, _id)
  1988. }
  1989. //品牌抽取
  1990. if ju.IsBrandGoods {
  1991. tmp["checkhas"] = map[string]int{
  1992. "hastable": j.HasTable,
  1993. "hasgoods": j.HasGoods,
  1994. "hasbrand": j.HasBrand,
  1995. "haskey": j.HasKey,
  1996. }
  1997. if len(j.BrandData) > 0 {
  1998. tmp["tablebrand"] = j.BrandData
  1999. }
  2000. }
  2001. //prince和number抽取
  2002. if ju.IsPriceNumber {
  2003. priceNumberLen := len(j.PriceNumberData)
  2004. if priceNumberLen > 1 { //table数据去重
  2005. tmpPriceNumberData := []map[string]interface{}{}
  2006. tableStrs := map[string]bool{}
  2007. for _, tb := range j.PriceNumberData {
  2008. has := false
  2009. bytes, _ := json.Marshal(tb)
  2010. str := string(bytes)
  2011. if len(tableStrs) > 0 && tableStrs[str] {
  2012. has = true
  2013. } else {
  2014. tableStrs[str] = true
  2015. }
  2016. if !has {
  2017. for _, data := range tb {
  2018. tmpPriceNumberData = append(tmpPriceNumberData, data)
  2019. }
  2020. }
  2021. }
  2022. tmp["pricenumber"] = tmpPriceNumberData
  2023. } else if priceNumberLen == 1 {
  2024. tmp["pricenumber"] = j.PriceNumberData[0]
  2025. }
  2026. }
  2027. //所有kv组成的字符串
  2028. var kvtext bytes.Buffer
  2029. blocks := make([]ju.BlockAndTag, 0)
  2030. for _, v := range j.Block {
  2031. //分包和标签
  2032. if ju.SaveBlock {
  2033. xx, _ := json.Marshal(v)
  2034. tmpblock := new(ju.TmpBlock)
  2035. err := json.Unmarshal(xx, &tmpblock)
  2036. if err != nil {
  2037. if v.BPackage != nil {
  2038. bpb, _ := json.Marshal(v.BPackage)
  2039. tmpblock.BPackage = string(bpb)
  2040. }
  2041. tmpblock = rangeBlockToJson(v, *tmpblock)
  2042. }
  2043. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  2044. }
  2045. //把所有kv组装成一个字符串,存库
  2046. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  2047. if jv == nil {
  2048. continue
  2049. }
  2050. for jv_k, jv_v := range jv.KvTags {
  2051. for _, jv_vv := range jv_v {
  2052. kvtext.WriteString(jv_k)
  2053. kvtext.WriteString(":")
  2054. kvtext.WriteString(jv_vv.Value)
  2055. kvtext.WriteString("\n")
  2056. }
  2057. }
  2058. }
  2059. }
  2060. if kvtext.Len() > 0 {
  2061. tmp["kvtext"] = kvtext.String()
  2062. }
  2063. if len(blocks) > 0 {
  2064. if blocksBytes, err := json.Marshal(blocks); err == nil {
  2065. if utf8.RuneCount(blocksBytes) < 100000 {
  2066. tmp["blocks"] = string(blocksBytes)
  2067. }
  2068. }
  2069. }
  2070. tmp["dataging"] = j.Dataging
  2071. //检查字段
  2072. tmp = checkFields(tmp)
  2073. if tmp["projectname"] == nil || tmp["projectname"] == "" {
  2074. tmp["projectname"] = j.Title
  2075. }
  2076. tmp["repeat"] = 0
  2077. if ju.Ffield {
  2078. if len(ffield) > 0 {
  2079. tmp["ffield"] = ffield
  2080. }
  2081. }
  2082. if e.TaskInfo.TestColl == "" {
  2083. if len(tmp) > 0 { //保存抽取结果
  2084. tmparr := []map[string]interface{}{
  2085. map[string]interface{}{
  2086. "_id": qu.StringTOBsonId(_id),
  2087. },
  2088. map[string]interface{}{"$set": tmp},
  2089. }
  2090. e.RWMutex.Lock()
  2091. e.BidArr = append(e.BidArr, tmparr)
  2092. e.BidTotal++
  2093. e.RWMutex.Unlock()
  2094. }
  2095. if ju.SaveResult {
  2096. id := tmp["_id"]
  2097. tmp["result"] = result
  2098. tmp["resultf"] = resultf
  2099. delete(tmp, "_id")
  2100. tmparr := []map[string]interface{}{
  2101. map[string]interface{}{
  2102. "_id": id,
  2103. },
  2104. map[string]interface{}{"$set": tmp},
  2105. }
  2106. e.RWMutex.Lock()
  2107. e.ResultArr = append(e.ResultArr, tmparr)
  2108. e.RWMutex.Unlock()
  2109. }
  2110. } else { //测试结果
  2111. delete(tmp, "_id")
  2112. delete(tmp, "fieldall")
  2113. if len(j.BlockPackage) > 0 { //分包详情
  2114. if len(j.BlockPackage) > 10 {
  2115. tmp["epackage"] = "分包异常"
  2116. } else {
  2117. bs, _ := json.Marshal(j.BlockPackage)
  2118. tmp["epackage"] = string(bs)
  2119. }
  2120. }
  2121. tmp["result"] = result
  2122. //tmp["resultf"] = resultf
  2123. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  2124. if !b {
  2125. log.Debug(e.TaskInfo.TestColl, _id)
  2126. }
  2127. }
  2128. }, func(err interface{}) {
  2129. log.Debug("AnalysisSaveResult err", err)
  2130. })
  2131. }
  2132. //检查字段-
  2133. func checkFields(tmp map[string]interface{}) map[string]interface{} {
  2134. delete(tmp, "contenthtml")
  2135. delete(tmp, "detail")
  2136. //delete(tmp, "toptype")
  2137. //delete(tmp, "subtype")
  2138. if _, ok := tmp["bidamount"].(string); ok {
  2139. delete(tmp, "bidamount")
  2140. } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/5 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
  2141. delete(tmp, "bidamount")
  2142. }
  2143. if _, ok := tmp["budget"].(string); ok {
  2144. delete(tmp, "budget")
  2145. }
  2146. if _, ok := tmp["unitprice"].(string); ok {
  2147. delete(tmp, "unitprice")
  2148. }
  2149. if _, ok := tmp["bidopentime"].(string); ok {
  2150. delete(tmp, "bidopentime")
  2151. }
  2152. if _, ok := tmp["signaturedate"].(string); ok {
  2153. delete(tmp, "signaturedate")
  2154. }
  2155. if _, ok := tmp["supervisorrate"].(string); ok {
  2156. delete(tmp, "supervisorrate")
  2157. }
  2158. for k, v := range tmp {
  2159. if k == "qualifies" {
  2160. continue
  2161. }
  2162. if k == "contract_guarantee" || k == "bid_guarantee" {
  2163. if len(fmt.Sprint(v)) > 0 {
  2164. tmp[k] = true
  2165. } else {
  2166. delete(tmp, k)
  2167. }
  2168. }
  2169. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") {
  2170. delete(tmp, k)
  2171. }
  2172. }
  2173. //工期单位-清理
  2174. if tmp["project_timeunit"] == "年" && tmp["project_duration"] == nil {
  2175. delete(tmp, "project_timeunit")
  2176. }
  2177. tmp["repeat"] = 0
  2178. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  2179. strwin := qu.ObjToString(tmp["winner"])
  2180. strwin_s := qu.ObjToString(tmp["s_winner"])
  2181. if !strings.Contains(strwin_s, strwin) {
  2182. tmp["s_winner"] = strwin
  2183. }
  2184. }
  2185. //budget bidamount
  2186. if bg, ok := tmp["budget"].(float64); ok && bg >= 500000000000 {
  2187. tmp["big_budget_err"] = bg
  2188. delete(tmp, "budget")
  2189. }
  2190. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 500000000000 {
  2191. tmp["big_bidamount_err"] = bg
  2192. delete(tmp, "bidamount")
  2193. }
  2194. return tmp
  2195. }
  2196. //保存其他
  2197. //kv、表格、块上的标签凡是新的标签都入库
  2198. //val type times firstid createtime 判定field
  2199. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  2200. now := time.Now().Unix()
  2201. coll := e.TaskInfo.TestColl
  2202. if coll == "" {
  2203. coll = "extract_tag_result"
  2204. } else {
  2205. coll += "_tag"
  2206. }
  2207. datas := []map[string]interface{}{}
  2208. kv := map[string]int{}
  2209. for _, v := range j.Block {
  2210. //
  2211. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  2212. if vv == nil || vv.KvTags == nil {
  2213. continue
  2214. }
  2215. for kkk, vvv := range vv.KvTags {
  2216. for _, vvvv := range vvv {
  2217. if vvvv.IsInvalid {
  2218. kv[kkk] = kv[kkk] + 1
  2219. break
  2220. }
  2221. }
  2222. }
  2223. }
  2224. for _, vv := range v.NotClassifyTitles {
  2225. datas = append(datas, map[string]interface{}{
  2226. "val": vv,
  2227. "times": 0,
  2228. "type": "block",
  2229. "firstid": j.SourceMid,
  2230. "createtime": now,
  2231. })
  2232. if len(datas) == saveLimit {
  2233. db.Mgo.SaveBulk(coll, datas...)
  2234. datas = []map[string]interface{}{}
  2235. }
  2236. }
  2237. }
  2238. for k, v := range kv {
  2239. datas = append(datas, map[string]interface{}{
  2240. "val": k,
  2241. "times": v,
  2242. "type": "kv",
  2243. "firstid": j.SourceMid,
  2244. "createtime": now,
  2245. })
  2246. if len(datas) == saveLimit {
  2247. db.Mgo.SaveBulk(coll, datas...)
  2248. datas = []map[string]interface{}{}
  2249. }
  2250. }
  2251. if len(datas) > 0 {
  2252. db.Mgo.SaveBulk(coll, datas...)
  2253. }
  2254. }
  2255. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2256. if j == nil {
  2257. return nil
  2258. }
  2259. if len(j.Block) > 0 {
  2260. for i, v := range j.Block {
  2261. rangetmp := new(ju.TmpBlock)
  2262. vb, _ := json.Marshal(v)
  2263. json.Unmarshal(vb, &rangetmp)
  2264. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2265. }
  2266. }
  2267. if j.ColonKV != nil {
  2268. cb, _ := json.Marshal(j.ColonKV)
  2269. tmpblock.ColonKV = string(cb)
  2270. }
  2271. if j.SpaceKV != nil {
  2272. sb, _ := json.Marshal(j.SpaceKV)
  2273. tmpblock.SpaceKV = string(sb)
  2274. }
  2275. if j.TableKV != nil {
  2276. tb, _ := json.Marshal(j.TableKV)
  2277. tmpblock.TableKV = string(tb)
  2278. }
  2279. return &tmpblock
  2280. }
  2281. //去重冗余字段
  2282. func delFiled(k string) bool {
  2283. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2284. }
// funcAnalysis scores and sorts the extraction results of job j and
// reconciles them with the job's jsondata.
//
// It returns the job's source document (j.Data), the scored result map from
// ScoreFields, and the document id as a string.
//
// When the job has both results and jsondata, the jsondata is cleaned
// (clearJd) and, for every key listed in ju.JsonData whose field has at least
// five candidates, compared against the first five candidates. On a match a
// new candidate with Score 100 is appended to j.Result, the field is
// re-sorted and the key is removed from the jsondata. Keys that remain are
// handed to JsonDataMergeProcessing. Finally j.Jsondata is replaced by a
// snapshot taken right after cleaning, so the removals are not kept.
//
// NOTE(review): the returned map comes from ScoreFields, while the jsondata
// step changes j.Result (and may reassign it); whether the two are the same
// map is not visible here — confirm that callers see the jsondata candidates.
func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
	defer qu.Catch()
	doc := j.Data
	result := j.Result
	_id := qu.BsonIdToSId((*doc)["_id"])
	result = ScoreFields(j, e.Tag) // score with positive/negative words
	// Sort the candidates of every field.
	for _, val := range result {
		ju.Sort(val)
	}
	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
		// Clean the jsondata.
		clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
		// Snapshot of the cleaned jsondata (JSON round trip), restored at the end.
		marshalbt, _ := json.Marshal(j.Jsondata)
		tmpjddata := make(map[string]interface{})
		json.Unmarshal(marshalbt, &tmpjddata)
		for _, jdkey := range ju.JsonData {
			if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
				for tmpk, tmpv := range j.Result[jdkey][:5] {
					if jdkey == "budget" || jdkey == "bidamount" {
						// Amounts are compared after running the field's
						// clear functions over the jsondata value.
						lockclear.Lock()
						cfn := e.ClearFn[jdkey]
						lockclear.Unlock()
						if len(cfn) == 0 {
							continue
						}
						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney)
						if tmpv.Value == newNum[0] {
							// The last element of newNum is used as the validity flag.
							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
							j.Result[jdkey] = append(j.Result[jdkey], extField)
							ju.Sort(j.Result[jdkey])
							delete((*j.Jsondata), jdkey)
							break
						}
					} else {
						// Other fields are compared with the raw jsondata value.
						if (*j.Jsondata)[jdkey] == tmpv.Value {
							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
							j.Result[jdkey] = append(j.Result[jdkey], extField)
							ju.Sort(j.Result[jdkey])
							delete((*j.Jsondata), jdkey)
							break
						}
					}
				}
			}
		}
		// Keys not matched above are merged by JsonDataMergeProcessing.
		if len(*j.Jsondata) > 0 {
			j.Result = JsonDataMergeProcessing(j, e)
		}
		// Restore the snapshot so the deletions above do not persist.
		j.Jsondata = &tmpjddata
	}
	return doc, result, _id
}
  2338. //辅助信息,如果没有排序先排序
  2339. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2340. fieldalls := map[string][]map[string]interface{}{}
  2341. if j == nil {
  2342. return fieldalls
  2343. }
  2344. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2345. defer qykredis.Close()
  2346. db := 0
  2347. for field, val := range j.Result {
  2348. //ju.Sort(val)
  2349. if field == "buyer" {
  2350. db = ju.BuyerDB
  2351. } else if field == "winner" {
  2352. db = ju.WinnerDB
  2353. } else if field == "agency" {
  2354. db = ju.AgencyDB
  2355. }
  2356. sfields := []map[string]interface{}{}
  2357. for _, v := range val {
  2358. standardized := false
  2359. if _, err := qykredis.Do("SELECT", db); err != nil {
  2360. fmt.Println("redis select err", err)
  2361. } else {
  2362. rep, err := qykredis.Do("GET", v.Value)
  2363. if rep != nil && err == nil {
  2364. standardized = true
  2365. }
  2366. }
  2367. if field == "budget" || field == "bidamount" {
  2368. if !v.IsTrue {
  2369. continue
  2370. }
  2371. }
  2372. sfield := map[string]interface{}{
  2373. "val": v.Value,
  2374. "type": v.Type,
  2375. "score": v.Score,
  2376. "blocktag": v.BlockTag,
  2377. "sourceval": v.SourceValue,
  2378. "standardized": standardized,
  2379. }
  2380. sfields = append(sfields, sfield)
  2381. }
  2382. fieldalls[field] = sfields
  2383. }
  2384. return fieldalls
  2385. }
  2386. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2387. defer qu.Catch()
  2388. //获取审核字段
  2389. for _, field := range e.AuditFields {
  2390. //1.分包
  2391. if resulttmp["package"] != nil {
  2392. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2393. for _, val := range packagedata {
  2394. if val[field] != nil {
  2395. fv := qu.ObjToString(val[field])
  2396. if fv != "" {
  2397. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2398. e.RedisMatch(field, fv, val) //redis匹配
  2399. } else { //除了buyer和winner,其他字段走规则匹配
  2400. e.RuleMatch(field, fv, val)
  2401. }
  2402. }
  2403. }
  2404. }
  2405. }
  2406. //2.外围
  2407. if resulttmp[field] != nil {
  2408. fv := qu.ObjToString(resulttmp[field])
  2409. if fv != "" {
  2410. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2411. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2412. } else { //除了buyer和winner,其他字段走规则匹配
  2413. e.RuleMatch(field, fv, resulttmp)
  2414. }
  2415. }
  2416. }
  2417. }
  2418. }
  2419. //Redis匹配
  2420. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2421. defer qu.Catch()
  2422. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2423. if i == 0 { //reids未找到,执行规则匹配
  2424. val[field+"_isredis"] = false
  2425. e.RuleMatch(field, fv, val) //规则匹配
  2426. } else { //redis找到,打标识存库
  2427. val[field+"_isredis"] = true
  2428. }
  2429. }
  2430. //规则匹配
  2431. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2432. defer qu.Catch()
  2433. if fieldval != "" {
  2434. SMap := e.StartMatch(field, fieldval)
  2435. //SMap.AddKey(field+"_isaudit", false)
  2436. for _, k := range SMap.Keys {
  2437. tmpMap[k] = SMap.Map[k]
  2438. }
  2439. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2440. }
  2441. }
  2442. //开始规则匹配
  2443. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2444. defer qu.Catch()
  2445. SMap := pretreated.NewSortMap()
  2446. lock.Lock()
  2447. f := e.RecogFieldMap[field]
  2448. lock.Unlock()
  2449. if len(f) > 0 {
  2450. fid := qu.BsonIdToSId(f["_id"])
  2451. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2452. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2453. if textAfterRecogFieldPrerule != "" {
  2454. lock.Lock()
  2455. classMap := e.FidClassMap[fid]
  2456. lock.Unlock()
  2457. L:
  2458. for _, c := range classMap { //class
  2459. classid := qu.BsonIdToSId(c["_id"])
  2460. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2461. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2462. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2463. if textAfterClassPrerule != "" {
  2464. lock.Lock()
  2465. ruleMap := e.CidRuleMap[classid]
  2466. lock.Unlock()
  2467. for _, r := range ruleMap { //rule
  2468. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2469. s_name := qu.ObjToString(r["s_name"])
  2470. rule := r["rule"].([]interface{})
  2471. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2472. if textAfterRulePrerule != "" {
  2473. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2474. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2475. if savefield != "" { //保存字段不为空,存储代码信息
  2476. SMap.AddKey(field+"_"+savefield, s_name)
  2477. }
  2478. break L
  2479. }
  2480. }
  2481. }
  2482. }
  2483. }
  2484. }
  2485. }
  2486. return SMap
  2487. }
  2488. //中标候选人经过清理之后,重新取出赋值
  2489. func resetWinnerorder(j *ju.Job) {
  2490. if len(j.Winnerorder) == 0 {
  2491. return
  2492. }
  2493. maxlen := len(j.Winnerorder) - 1
  2494. //中标单位
  2495. //i := 0
  2496. winners := []*ju.ExtField{}
  2497. bidamounts := []*ju.ExtField{}
  2498. if maxlen > 0 {
  2499. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  2500. return
  2501. }
  2502. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2503. if j.Winnerorder[0]["price"] != nil {
  2504. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2505. if tmpPrice[len(tmpPrice)-1].(bool) {
  2506. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2507. }
  2508. }
  2509. }
  2510. if j.Result["winner"] == nil && len(winners) > 0 {
  2511. j.Result["winner"] = winners
  2512. } else if len(winners) > 0 {
  2513. j.Result["winner"] = append(j.Result["winner"], winners...)
  2514. }
  2515. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2516. j.Result["bidamount"] = bidamounts
  2517. } else if len(bidamounts) > 0 {
  2518. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2519. }
  2520. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  2521. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2522. j.Result["winner"] = winners
  2523. if j.Winnerorder[0]["price"] != nil {
  2524. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  2525. if tmpPrice[len(tmpPrice)-1].(bool) {
  2526. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  2527. }
  2528. j.Result["bidamount"] = bidamounts
  2529. }
  2530. }
  2531. }
  2532. func RemoveReplicaSliceString(slc []string) []string {
  2533. result := make([]string, 0)
  2534. tempMap := make(map[string]bool, len(slc))
  2535. for _, e := range slc {
  2536. if tempMap[e] == false {
  2537. tempMap[e] = true
  2538. result = append(result, e)
  2539. }
  2540. }
  2541. return result
  2542. }
// scoreIndex is the bookkeeping entry AnalysisSaveResult keeps per
// "qualifies" key while collecting the qualifies list.
type scoreIndex struct {
	Score float64 // score of the candidate recorded for the key
	Index int     // position of the key's entry in the qualifies list
}