extract.go 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "jy/clear"
  7. db "jy/mongodbutil"
  8. "jy/pretreated"
  9. ju "jy/util"
  10. qu "qfw/util"
  11. "qfw/util/redis"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "unicode/utf8"
  19. "github.com/PuerkitoBio/goquery"
  20. log "github.com/donnie4w/go-logger/logger"
  21. "gopkg.in/mgo.v2/bson"
  22. )
  23. var (
  24. lock, lockrule, lockclear, locktag, blocktag sync.RWMutex // package-level locks; in this file lockrule is held while rule maps are copied and lockclear while clear-function maps are read
  25. cut = ju.NewCut() // helper used to extract the body text and clean it (see cut.ClearHtml in PreInfo)
  26. ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, grouped per TaskInfo
  27. TaskList map[string]*ExtractTask // extraction tasks, looked up by task id
  28. ClearTaskList map[string]*ClearTask // clear tasks
  29. saveLimit = 100 // batch size for saving extraction logs
  30. PageSize = 5000 // page size used when querying source documents
  31. Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}` // field selector passed to FDB.Find when loading documents to extract
  32. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}` // second field selector; its consumer is not in this part of the file
  33. )
  34. //启动测试抽取
  35. func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
  36. defer qu.Catch()
  37. ext := &ExtractTask{}
  38. ext.Id = taskId
  39. ext.IsRun = true
  40. ext.InitTestTaskInfo(resultcoll, trackcoll)
  41. ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  42. ext.InitSite()
  43. ext.InitRulePres()
  44. ext.InitRuleBacks(false)
  45. ext.InitRuleBacks(true)
  46. ext.InitRuleCore(false)
  47. ext.InitRuleCore(true)
  48. ext.InitPkgCore()
  49. ext.InitBlockRule()
  50. ext.InfoTypeList()
  51. ext.InitTag(false)
  52. ext.InitTag(true)
  53. ext.InitClearFn(false)
  54. ext.InitClearFn(true)
  55. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  56. //初始化城市DFA信息
  57. ext.InitCityInfo()
  58. //ext.InitCityDFA()
  59. ext.InitAreaCode()
  60. ext.InitPostCode()
  61. }
  62. //质量审核
  63. ext.InitAuditFields()
  64. ext.InitAuditRule()
  65. ext.InitAuditClass()
  66. ext.InitAuditRecogField()
  67. //品牌抽取是否开启
  68. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  69. //价格个数抽取是否开启
  70. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  71. //附件抽取是否开启
  72. ext.InitFile()
  73. return RunExtractTestTask(ext, startId, num)
  74. }
  75. func IdTrans(startId string) bson.ObjectId {
  76. defer qu.Catch()
  77. return bson.ObjectIdHex(startId)
  78. }
  79. //开始测试任务抽取
  80. func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
  81. n, _ := strconv.Atoi(num)
  82. id := IdTrans(startId)
  83. if id.Valid() {
  84. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
  85. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
  86. for _, v := range *list {
  87. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  88. continue
  89. }
  90. var j, jf *ju.Job
  91. var isSite bool
  92. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  93. v["isextFile"] = true
  94. j, jf, isSite = ext.PreInfo(v)
  95. } else {
  96. j, _, isSite = ext.PreInfo(v)
  97. }
  98. go ext.ExtractProcess(j, jf, isSite)
  99. ext.TaskInfo.ProcessPool <- true
  100. }
  101. return true
  102. } else {
  103. return false
  104. }
  105. }
  106. //启动抽取
  107. func StartExtractTaskId(taskId string) bool {
  108. defer qu.Catch()
  109. isgo := false
  110. ext := TaskList[taskId]
  111. if ext == nil {
  112. ext = &ExtractTask{}
  113. ext.Id = taskId
  114. ext.InitTaskInfo()
  115. isgo = true
  116. } else {
  117. ext.Id = taskId
  118. ext.InitTaskInfo()
  119. }
  120. ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
  121. ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
  122. ext.InitSite()
  123. ext.InitRulePres()
  124. ext.InitRuleBacks(false)
  125. ext.InitRuleBacks(true)
  126. ext.InitRuleCore(false)
  127. ext.InitRuleCore(true)
  128. ext.InitPkgCore()
  129. ext.InitBlockRule()
  130. ext.InfoTypeList()
  131. ext.InitTag(false)
  132. ext.InitTag(true)
  133. ext.InitClearFn(false)
  134. ext.InitClearFn(true)
  135. if ext.IsExtractCity { //版本上控制是否开始城市抽取
  136. //初始化城市DFA信息
  137. //ext.InitCityDFA()
  138. ext.InitCityInfo()
  139. ext.InitAreaCode()
  140. ext.InitPostCode()
  141. }
  142. //质量审核
  143. ext.InitAuditFields()
  144. ext.InitAuditRule()
  145. ext.InitAuditClass()
  146. ext.InitAuditRecogField()
  147. //品牌抽取是否开启
  148. ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
  149. //价格个数抽取是否开启
  150. ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
  151. //附件抽取是否开启
  152. ext.InitFile()
  153. ext.IsRun = true
  154. go ext.ResultSave(true)
  155. go ext.BidSave(true)
  156. if isgo {
  157. go RunExtractTask(taskId)
  158. }
  159. TaskList[taskId] = ext
  160. return true
  161. }
  162. //停止抽取
  163. func StopExtractTaskId(taskId string) bool {
  164. defer qu.Catch()
  165. ext := TaskList[taskId]
  166. if ext != nil {
  167. ext.IsRun = false
  168. TaskList[taskId] = ext
  169. }
  170. //更新task.s_extlastid
  171. db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  172. return true
  173. }
  174. //开始抽取
  175. func RunExtractTask(taskId string) {
  176. defer qu.Catch()
  177. ext := TaskList[taskId]
  178. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  179. count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
  180. pageNum := (count + PageSize - 1) / PageSize
  181. limit := PageSize
  182. if count < PageSize {
  183. limit = count
  184. }
  185. fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
  186. for i := 0; i < pageNum; i++ {
  187. query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
  188. list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
  189. fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
  190. for _, v := range *list {
  191. if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
  192. continue
  193. }
  194. //根据标题判断是否抽取
  195. b := IsExtract("title", qu.ObjToString(v["title"]), "")
  196. if !b {
  197. continue
  198. }
  199. _id := qu.BsonIdToSId(v["_id"])
  200. //log.Debug(_id)
  201. if !ext.IsRun {
  202. break
  203. }
  204. var j, jf *ju.Job
  205. var isSite bool
  206. if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
  207. v["isextFile"] = true
  208. j, jf, isSite = ext.PreInfo(v)
  209. } else {
  210. j, _, isSite = ext.PreInfo(v)
  211. }
  212. go ext.ExtractProcess(j, jf, isSite)
  213. ext.TaskInfo.LastExtId = _id
  214. ext.TaskInfo.ProcessPool <- true
  215. }
  216. db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
  217. if !ext.IsRun {
  218. break
  219. }
  220. }
  221. //更新task.s_extlastid
  222. time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
  223. }
  224. //信息预处理-不和版本关联,取最新版本的配置项
  225. func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  226. return (&ExtractTask{}).PreInfo(doc)
  227. }
  228. //信息预处理-和版本关联
  229. func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
  230. defer qu.Catch()
  231. //判断是否有附件这个字段
  232. var isextFile bool
  233. if doc["isextFile"] != nil {
  234. isextFile = doc["isextFile"].(bool)
  235. }
  236. detail := ""
  237. d1, _ := doc["detail"].(string)
  238. d2, _ := doc["contenthtml"].(string)
  239. if len(d1) >= len(d2) || d2 == "" {
  240. detail = d1
  241. } else {
  242. detail = d2
  243. }
  244. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  245. d3, _ := doc["summary"].(string)
  246. //全文的需要修复表格
  247. detail = pretreated.RepairCon(detail)
  248. detail = ju.CutLableStr(d3 + "\n" + detail)
  249. detail = cut.ClearHtml(d3 + "\n" + detail)
  250. doc["detail"] = detail
  251. if isextFile {
  252. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  253. }
  254. if utf8.RuneCountInString(detail) < 2000 {
  255. if doc["detailfile"] == nil || doc["detailfile"] == "" {
  256. file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  257. }
  258. detail += qu.ObjToString(doc["detailfile"])
  259. doc["detail"] = detail
  260. } else {
  261. //正文小于200个字,有附件把附件内容加到正文
  262. tmpDeatil := detail
  263. tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
  264. if err == nil {
  265. conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
  266. if conlen < 2000 {
  267. if isextFile {
  268. detail += qu.ObjToString(doc["detailfile"])
  269. doc["detail"] = detail
  270. }
  271. } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
  272. //防止文本过长,造成抽取阻塞
  273. log.Debug("文本太长", doc["_id"], conlen)
  274. doc["detail"] = d3
  275. }
  276. }
  277. }
  278. toptype := qu.ObjToString(doc["toptype"])
  279. subtype := qu.ObjToString(doc["subtype"])
  280. if qu.ObjToString(doc["type"]) == "bid" {
  281. toptype = "结果"
  282. }
  283. if toptype == "" {
  284. toptype = "all"
  285. }
  286. if subtype == "" {
  287. subtype = "all"
  288. }
  289. if toptype == "其它" || subtype == "其它" || subtype == "其他" || subtype == "结果变更" {
  290. toptype = "all"
  291. subtype = "all"
  292. }
  293. toMap := qu.ObjToMap(doc["jsondata"])
  294. //log.Debug("toMap", toMap)
  295. if (*toMap) != nil {
  296. if (*toMap)["extweight"] == nil {
  297. (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
  298. }
  299. }
  300. j = &ju.Job{
  301. SourceMid: qu.BsonIdToSId(doc["_id"]),
  302. Category: toptype,
  303. CategorySecond: subtype,
  304. Content: qu.ObjToString(doc["detail"]),
  305. SpiderCode: qu.ObjToString(doc["spidercode"]),
  306. Site: qu.ObjToString(doc["site"]),
  307. //Domain: qu.ObjToString(doc["domain"]),
  308. //Href: qu.ObjToString(doc["href"]),
  309. Title: qu.ObjToString(doc["title"]),
  310. Data: &doc,
  311. City: qu.ObjToString(doc["city"]),
  312. Province: qu.ObjToString(doc["area"]),
  313. Jsondata: toMap,
  314. Result: map[string][]*ju.ExtField{},
  315. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  316. RuleBlock: e.RuleBlock,
  317. Dataging: qu.IntAll(doc["dataging"]),
  318. }
  319. if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
  320. delete((*j.Jsondata), "jsoncontent")
  321. }
  322. if isextFile {
  323. jf = &ju.Job{
  324. SourceMid: qu.BsonIdToSId(doc["_id"]),
  325. Category: toptype,
  326. CategorySecond: subtype,
  327. Content: qu.ObjToString(doc["detailfile"]),
  328. SpiderCode: qu.ObjToString(doc["spidercode"]),
  329. Site: qu.ObjToString(doc["site"]),
  330. Title: qu.ObjToString(doc["title"]),
  331. Data: &doc,
  332. City: qu.ObjToString(doc["city"]),
  333. Province: qu.ObjToString(doc["area"]),
  334. Jsondata: toMap,
  335. Result: map[string][]*ju.ExtField{},
  336. BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
  337. RuleBlock: e.RuleBlock,
  338. IsFile: isextFile,
  339. Dataging: qu.IntAll(doc["dataging"]),
  340. }
  341. if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
  342. delete((*jf.Jsondata), "jsoncontent")
  343. }
  344. }
  345. codeSite := j.SpiderCode
  346. //是否启用站点
  347. if value, ok := e.SiteMerge.Load(codeSite); ok {
  348. isSite = value.(bool)
  349. }
  350. if isSite {
  351. //是否配置站点
  352. exp, isSite := e.Luacodes.Load(codeSite)
  353. if isSite {
  354. if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
  355. e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
  356. }
  357. if exp.(map[string]interface{})["e.SiteTag"] != nil {
  358. e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
  359. }
  360. if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
  361. e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
  362. }
  363. if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
  364. e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
  365. }
  366. }
  367. }
  368. qu.Try(func() {
  369. pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
  370. if isextFile {
  371. pretreated.AnalyStart(jf, isSite, codeSite)
  372. }
  373. }, func(err interface{}) {
  374. log.Debug("pretreated.AnalyStart", err, j.SourceMid)
  375. })
  376. return j, jf, isSite
  377. }
  378. //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  379. func file2text(doc *map[string]interface{}) {
  380. tmpstr := ""
  381. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  382. for _, attachs := range attach_text {
  383. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  384. for _, fileinfo := range fileinfos {
  385. if ff, ok := fileinfo.(map[string]interface{}); ok {
  386. attach_url := qu.ObjToString(ff["attach_url"])
  387. bs := ju.OssGetObject(attach_url)
  388. if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
  389. tmpstr += bs + "\n"
  390. } else {
  391. break
  392. }
  393. }
  394. }
  395. }
  396. }
  397. }
  398. (*doc)["detailfile"] = tmpstr
  399. }
  400. //抽取
  401. func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
  402. e.ExtractDetail(j, isSite, j.SpiderCode)
  403. if jf != nil && jf.IsFile {
  404. e.ExtractFile(jf, isSite, j.SpiderCode)
  405. }
  406. if isSite {
  407. ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
  408. if ok && ismerge.(bool) {
  409. tmpj := &ju.Job{
  410. SourceMid: j.SourceMid,
  411. Category: j.Category,
  412. CategorySecond: j.CategorySecond,
  413. Content: j.Content,
  414. SpiderCode: j.SpiderCode,
  415. //Domain: qu.ObjToString(doc["domain"]),
  416. //Href: qu.ObjToString(doc["href"]),
  417. Title: j.Title,
  418. Data: j.Data,
  419. City: j.City,
  420. Province: j.Province,
  421. Jsondata: j.Jsondata,
  422. Result: map[string][]*ju.ExtField{},
  423. BuyerAddr: j.BuyerAddr,
  424. RuleBlock: e.RuleBlock,
  425. }
  426. qu.Try(func() {
  427. pretreated.AnalyStart(tmpj, false, "") //job.Block分块
  428. }, func(err interface{}) {
  429. log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
  430. })
  431. e.ExtractDetail(tmpj, false, "")
  432. //if jf != nil && jf.IsFile {
  433. // e.ExtractFile(jf, false, "")
  434. //}
  435. //合并数据
  436. j.Block = append(j.Block, tmpj.Block...)
  437. j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
  438. for tmpk, _ := range j.Result {
  439. if len(tmpj.Result[tmpk]) > 0 {
  440. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  441. }
  442. }
  443. for tmpk, _ := range tmpj.Result {
  444. if len(j.Result[tmpk]) == 0 {
  445. j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
  446. }
  447. }
  448. }
  449. }
  450. //分析抽取结果并保存
  451. AnalysisSaveResult(j, jf, e)
  452. <-e.TaskInfo.ProcessPool
  453. }
  454. func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
  455. qu.Try(func() {
  456. doc := *j.Data
  457. //全局前置规则,结果覆盖doc属性
  458. //for _, v := range e.RulePres {
  459. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  460. //}
  461. tmprules := map[string][]*RuleCore{}
  462. lockrule.Lock()
  463. if j.Category == "all" || j.CategorySecond == "all" {
  464. if isSite {
  465. for k, vc1 := range e.SiteRuleCores["all_all"] {
  466. tmprules[k] = vc1
  467. }
  468. } else {
  469. for k, vc1 := range e.RuleCores["all_all"] {
  470. tmprules[k] = vc1
  471. }
  472. }
  473. } else {
  474. if isSite {
  475. for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
  476. tmprules[k] = vc1
  477. }
  478. } else {
  479. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  480. tmprules[k] = vc1
  481. }
  482. }
  483. }
  484. if len(tmprules) < 1 { //分类未覆盖部分
  485. if isSite {
  486. for k, vc1 := range e.RuleCores["all_all"] {
  487. tmprules[k] = vc1
  488. }
  489. } else {
  490. for k, vc1 := range e.SiteRuleCores["all_all"] {
  491. tmprules[k] = vc1
  492. }
  493. }
  494. }
  495. lockrule.Unlock()
  496. //抽取规则
  497. for _, vc1 := range tmprules {
  498. for _, vc := range vc1 {
  499. tmp := ju.DeepCopy(doc).(map[string]interface{})
  500. //是否进入逻辑
  501. if !ju.Logic(vc.LuaLogic, tmp) {
  502. continue
  503. }
  504. ////抽取-前置规则
  505. //for _, v := range vc.RulePres {
  506. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  507. //}
  508. // log.Debug("抽取-前置规则", tmp)
  509. //抽取-规则
  510. ExtRuleCore(tmp, e, vc, j, isSite)
  511. // log.Debug("抽取-规则", tmp)
  512. //抽取-后置规则
  513. for _, v := range vc.RuleBacks {
  514. ExtRegBack(j, v, e.TaskInfo, vc)
  515. }
  516. //kv规则
  517. for _, v := range vc.KVRuleCores {
  518. ExtRuleKV(j, v, e.TaskInfo)
  519. }
  520. // log.Debug("抽取-后置规则", tmp)
  521. //项目名称未能抽取到,标题来凑
  522. if vc.Field == "projectname" {
  523. if vc.ExtFrom == "title" {
  524. isextitle := true
  525. for _, v := range j.Result[vc.Field] {
  526. if len([]rune(qu.ObjToString(v.Value))) > 5 {
  527. isextitle = false
  528. break
  529. }
  530. }
  531. if isextitle { //标题加入选举
  532. field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
  533. if isSite {
  534. field.Score = 1
  535. }
  536. j.Result[vc.Field] = append(j.Result[vc.Field], field)
  537. }
  538. }
  539. for i := 0; i < 3; i++ {
  540. for _, v := range vc.RuleBacks {
  541. ExtRegBack(j, v, e.TaskInfo, vc)
  542. }
  543. }
  544. }
  545. }
  546. }
  547. //全局后置规则
  548. if isSite {
  549. for _, v := range e.SiteRuleBacks {
  550. ExtRegBack(j, v, e.TaskInfo, nil)
  551. }
  552. } else {
  553. for _, v := range e.RuleBacks {
  554. ExtRegBack(j, v, e.TaskInfo, nil)
  555. }
  556. }
  557. //函数清理
  558. for key, val := range j.Result {
  559. for i, v := range val {
  560. // if v.ExtFrom == "title"&& v.Field == "buyer"{
  561. // qu.Debug("title---",v.Value)
  562. // }else if v.Field == "buyer"{
  563. // qu.Debug("text---",v.Value)
  564. // }
  565. lockclear.Lock()
  566. var cfn = []string{}
  567. if isSite {
  568. cfn = e.SiteClearFn[key]
  569. } else {
  570. cfn = e.ClearFn[key]
  571. }
  572. lockclear.Unlock()
  573. if len(cfn) == 0 {
  574. continue
  575. }
  576. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  577. if key == "budget" || key == "bidamount" {
  578. if istrue, ok := data[len(data)-1].(bool); istrue && ok {
  579. j.Result[key][i].IsTrue = true
  580. } else {
  581. j.Result[key][i].Value = data[0]
  582. continue
  583. }
  584. }
  585. before, _ := v.Value.(string)
  586. v.Value = data[0]
  587. BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
  588. //添加行数清理的日志
  589. //清理特殊符号
  590. lockclear.Lock()
  591. if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
  592. text := qu.ObjToString(v.Value)
  593. before = text
  594. v.Value = clear.OtherClean(key, text)
  595. BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
  596. }
  597. //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
  598. lockclear.Unlock()
  599. }
  600. }
  601. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  602. // bs, _ := json.Marshal(j.Result)
  603. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  604. }, func(err interface{}) {
  605. log.Debug("ExtractProcess err", err)
  606. })
  607. }
  608. func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
  609. qu.Try(func() {
  610. doc := *j.Data
  611. //全局前置规则,结果覆盖doc属性
  612. // for _, v := range e.RulePres {
  613. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  614. // doc = ExtRegPre(doc, j, v, e.TaskInfo)
  615. // }
  616. // }
  617. //抽取规则
  618. tmprules := map[string][]*RuleCore{}
  619. lockrule.Lock()
  620. if j.Category == "all" || j.CategorySecond == "all" {
  621. for k, vc1 := range e.RuleCores["all_all"] {
  622. tmprules[k] = vc1
  623. }
  624. } else {
  625. for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
  626. tmprules[k] = vc1
  627. }
  628. }
  629. lockrule.Unlock()
  630. for _, vc1 := range tmprules {
  631. for _, vc := range vc1 {
  632. tmp := ju.DeepCopy(doc).(map[string]interface{})
  633. //是否进入逻辑
  634. if !ju.Logic(vc.LuaLogic, tmp) {
  635. continue
  636. }
  637. //抽取-前置规则
  638. // for _, v := range vc.RulePres {
  639. // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  640. // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
  641. // }
  642. // }
  643. // log.Debug("抽取-前置规则", tmp)
  644. //抽取-规则
  645. if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
  646. ExtRuleCore(tmp, e, vc, j, isSite)
  647. }
  648. // log.Debug("抽取-规则", tmp)
  649. //抽取-后置规则
  650. for _, v := range vc.RuleBacks {
  651. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  652. ExtRegBack(j, v, e.TaskInfo, vc)
  653. }
  654. }
  655. // log.Debug("抽取-后置规则", tmp)
  656. }
  657. }
  658. //全局后置规则
  659. for _, v := range e.RuleBacks {
  660. if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
  661. ExtRegBack(j, v, e.TaskInfo, nil)
  662. }
  663. }
  664. //函数清理
  665. for key, val := range j.Result {
  666. for _, v := range val {
  667. lockclear.Lock()
  668. cfn := e.ClearFn[key]
  669. lockclear.Unlock()
  670. data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
  671. v.Value = data[0]
  672. //清理特殊符号
  673. lockclear.Lock()
  674. if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
  675. clear.MesField[key] != nil {
  676. text := qu.ObjToString(v.Value)
  677. text = clear.OtherClean(key, text)
  678. v.Value = text
  679. }
  680. lockclear.Unlock()
  681. }
  682. }
  683. PackageDetail(j, e, isSite, codeSite) //处理分包信息
  684. // bs, _ := json.Marshal(j.Result)
  685. // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
  686. }, func(err interface{}) {
  687. log.Debug("ExtractProcess err", err)
  688. })
  689. }
  690. //前置过滤
  691. func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
  692. defer qu.Catch()
  693. before := ju.DeepCopy(doc).(map[string]interface{})
  694. extinfo := map[string]interface{}{}
  695. if in.IsLua {
  696. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  697. if j != nil {
  698. lua.Block = j.Block
  699. }
  700. extinfo = lua.RunScript("pre")
  701. for k, v := range extinfo { //结果覆盖原doc
  702. doc[k] = v
  703. }
  704. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  705. } else {
  706. var key string
  707. if !j.IsFile {
  708. key = qu.If(in.Field == "", "detail", in.Field).(string)
  709. } else {
  710. key = qu.If(in.Field == "", "detailfile", in.Field).(string)
  711. }
  712. text := qu.ObjToString(doc[key])
  713. extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
  714. doc[key] = extinfo[key] //结果覆盖原doc
  715. AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
  716. }
  717. return doc
  718. }
  719. //抽取-规则
  720. func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
  721. //候选人加入
  722. var kvMap map[string][]map[string]interface{}
  723. extByReg := true
  724. if vc.ExtFrom != "title" {
  725. kvMap, extByReg = getKvByLuaFields(vc, j, e)
  726. }
  727. for _, v := range vc.RuleCores {
  728. if v.IsLua {
  729. ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
  730. } else if extByReg {
  731. ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
  732. }
  733. }
  734. //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
  735. if vc.Field == "budget" && len(kvMap) == 0 {
  736. if len(j.BlockPackage) == 1 {
  737. for _, bp := range j.BlockPackage {
  738. for fieldname, field := range vc.LFields {
  739. if field != vc.Field {
  740. continue
  741. }
  742. tp := ""
  743. for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
  744. if k == 0 {
  745. tp = "colon"
  746. } else if k == 1 {
  747. tp = "space"
  748. } else if k == 2 {
  749. tp = "table"
  750. }
  751. if v == nil || v.KvTags == nil {
  752. continue
  753. }
  754. for _, vv := range v.KvTags[fieldname] {
  755. text := ju.TrimLRSpace(vv.Value, "")
  756. if text != "" {
  757. tmp := &ju.ExtField{
  758. ExtFrom: "package",
  759. Field: vc.Field,
  760. Code: "CL_分包",
  761. Type: tp,
  762. MatchType: "package",
  763. RuleText: bp.Text,
  764. SourceValue: vv.Key,
  765. Value: text,
  766. }
  767. if isSite {
  768. tmp.Score = 1
  769. }
  770. j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
  771. }
  772. }
  773. }
  774. }
  775. break
  776. }
  777. }
  778. } else {
  779. for k, v := range kvMap {
  780. if j.Result[k] == nil {
  781. j.Result[k] = [](*ju.ExtField){}
  782. }
  783. for _, tmp := range v {
  784. field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),
  785. ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k,
  786. Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]),
  787. MatchType: qu.ObjToString(tmp["matchtype"]),
  788. RuleText: qu.ObjToString(tmp["ruletext"]),
  789. SourceValue: tmp["sourcevalue"],
  790. Value: tmp["value"]}
  791. if k == "bidamount" && field.ExtFrom == "第一候选人" {
  792. field.Score = 1
  793. }
  794. if isSite {
  795. field.Score = 1
  796. }
  797. if tmp["blocktag"] != nil {
  798. btag := make(map[string]string)
  799. for k := range tmp["blocktag"].(map[string]bool) {
  800. blocktag.Lock()
  801. if TagConfigDesc[k] != "" {
  802. btag[k] = TagConfigDesc[k]
  803. }
  804. blocktag.Unlock()
  805. }
  806. field.BlockTag = btag
  807. }
  808. j.Result[k] = append(j.Result[k], field)
  809. }
  810. }
  811. }
  812. }
  813. //抽取-规则-kv
  814. func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
  815. defer qu.Catch()
  816. if extfrom == "title" || !in.IsLua {
  817. return
  818. }
  819. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
  820. lua.KvMap = *kvMap
  821. lua.Block = j.Block
  822. extinfo := lua.RunScript("core")
  823. if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
  824. for _, v := range tmps {
  825. v["core"] = in.Code
  826. }
  827. (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
  828. }
  829. if len(extinfo) > 0 {
  830. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  831. }
  832. }
  833. //抽取-规则-正则
  834. func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
  835. defer qu.Catch()
  836. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  837. b := IsExtract(in.Field, j.Title, j.Content)
  838. if !b {
  839. return
  840. }
  841. //全文正则
  842. //text := qu.ObjToString(doc[extfrom])
  843. //if in.Field != "" {
  844. // extinfo := extRegCoreToResult(extfrom, text, j, in)
  845. // if len(extinfo) > 0 {
  846. // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  847. // }
  848. //}
  849. //块抽取
  850. if in.Field != "" {
  851. if extfrom == "title" {
  852. extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
  853. if len(extinfo) > 0 {
  854. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  855. }
  856. } else {
  857. for _, v := range j.Block {
  858. btag := make(map[string]string)
  859. for k := range v.Classify {
  860. blocktag.Lock()
  861. btag[k] = TagConfigDesc[k]
  862. blocktag.Unlock()
  863. }
  864. extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
  865. if len(extinfo) > 0 {
  866. AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
  867. }
  868. }
  869. }
  870. }
  871. }
  872. //pkg抽取-规则-正则
  873. func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
  874. defer qu.Catch()
  875. //根据field配置项目,是否抽取。例如:废标、流标等跳过,
  876. b := IsExtract(in.Field, j.Title, j.Content)
  877. if !b {
  878. return
  879. }
  880. //块抽取
  881. if in.Field != "" {
  882. for k, vbpkg := range j.BlockPackage {
  883. rep := map[string]string{}
  884. if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  885. if in.Field == "budget" && vbpkg.Budget > 0 {
  886. continue
  887. }
  888. if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
  889. continue
  890. }
  891. if in.Field == "winner" && vbpkg.Winner != "" {
  892. continue
  893. }
  894. if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
  895. continue
  896. }
  897. if in.Field == "projectname" && vbpkg.Name != "" {
  898. continue
  899. }
  900. if in.Field == "winner" && vbpkg.Winner != "" {
  901. continue
  902. }
  903. if in.Field == "winnerperson" {
  904. if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
  905. continue
  906. }
  907. if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
  908. continue
  909. }
  910. }
  911. if in.Field == "winnertel" {
  912. if vbpkg.WinnerPerson == "" {
  913. continue
  914. }
  915. }
  916. //处理正负数修正
  917. ptmp := strings.Split(in.RuleText, "#")
  918. sign := 0
  919. if len(ptmp) == 2 {
  920. if ptmp[1] == "正" {
  921. sign = 1
  922. } else if ptmp[1] == "负" {
  923. sign = -1
  924. }
  925. }
  926. tmp := strings.Split(ptmp[0], "__")
  927. if len(tmp) == 2 {
  928. epos := strings.Split(tmp[1], ",")
  929. posm := map[string]int{}
  930. for _, v := range epos {
  931. ks := strings.Split(v, ":")
  932. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  933. posm[ks[1]] = qu.IntAll(ks[0])
  934. } else {
  935. posm[in.Field] = qu.IntAll(ks[0])
  936. }
  937. }
  938. var pattern string
  939. if strings.Contains(tmp[0], "\\u") {
  940. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  941. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  942. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  943. } else {
  944. pattern = tmp[0]
  945. }
  946. //log.Debug("pattern", pattern)
  947. //fmt.Println(text)
  948. reg := regexp.MustCompile(pattern)
  949. apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
  950. for i, _ := range apos {
  951. pos := apos[i]
  952. for k, p := range posm {
  953. if len(pos) > p {
  954. if pos[p] == -1 || pos[p+1] == -1 {
  955. continue
  956. }
  957. val := vbpkg.Text[pos[p]:pos[p+1]]
  958. if string(val) == "" {
  959. continue
  960. }
  961. if sign == -1 {
  962. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  963. } else {
  964. rep[k+"_"+fmt.Sprint(i)] = val
  965. }
  966. }
  967. }
  968. }
  969. //fmt.Println(text)
  970. for i := 0; i < len(apos); i++ {
  971. if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
  972. if in.Field == "budget" && vbpkg.Budget <= 0 {
  973. lock.Lock()
  974. cfn := e.ClearFn[in.Field]
  975. lock.Unlock()
  976. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  977. if data[len(data)-1].(bool) {
  978. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  979. j.BlockPackage[k].IsTrueBudget = true
  980. }
  981. break
  982. } else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  983. lock.Lock()
  984. cfn := e.ClearFn[in.Field]
  985. lock.Unlock()
  986. data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
  987. if data[len(data)-1].(bool) {
  988. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  989. j.BlockPackage[k].IsTrueBidamount = true
  990. }
  991. break
  992. } else if in.Field == "winner" {
  993. if j.BlockPackage[k].Winner == "" {
  994. j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
  995. break
  996. }
  997. } else if in.Field == "winnertel" {
  998. if j.BlockPackage[k].WinnerTel == "" {
  999. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1000. break
  1001. }
  1002. } else if in.Field == "winnerperson" {
  1003. if j.BlockPackage[k].WinnerPerson == "" {
  1004. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1005. break
  1006. }
  1007. } else if in.Field == "bidstatus" {
  1008. if j.BlockPackage[k].BidStatus == "" {
  1009. j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
  1010. break
  1011. }
  1012. } else if in.Field == "projectname" {
  1013. if j.BlockPackage[k].Name == "" {
  1014. j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
  1015. break
  1016. }
  1017. } else if in.Field == "winnerperson" {
  1018. if j.BlockPackage[k].WinnerPerson == "" {
  1019. j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
  1020. break
  1021. }
  1022. } else if in.Field == "winnertel" {
  1023. if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
  1024. j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
  1025. break
  1026. }
  1027. }
  1028. }
  1029. }
  1030. }
  1031. } else {
  1032. pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
  1033. val := ""
  1034. if len(pos) == 2 {
  1035. //"text" = "text"[pos[1]:]
  1036. val = "text"[pos[1]:]
  1037. rs := regexp.MustCompile("[^\r\n\t]+")
  1038. tmp := rs.FindAllString("text", -1)
  1039. if len(tmp) > 0 {
  1040. val = tmp[0]
  1041. }
  1042. }
  1043. if val != "" {
  1044. if in.Field == "budget" && vbpkg.Budget <= 0 {
  1045. lock.Lock()
  1046. cfn := e.ClearFn[in.Field]
  1047. lock.Unlock()
  1048. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1049. if data[len(data)-1].(bool) {
  1050. j.BlockPackage[k].Budget = qu.Float64All(data[0])
  1051. j.BlockPackage[k].IsTrueBudget = true
  1052. }
  1053. break
  1054. }
  1055. if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
  1056. lock.Lock()
  1057. cfn := e.ClearFn[in.Field]
  1058. lock.Unlock()
  1059. data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
  1060. if data[len(data)-1].(bool) {
  1061. j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
  1062. j.BlockPackage[k].IsTrueBidamount = true
  1063. }
  1064. break
  1065. } else if in.Field == "bidstatus" {
  1066. if j.BlockPackage[k].BidStatus == "" {
  1067. j.BlockPackage[k].BidStatus = val
  1068. break
  1069. }
  1070. } else if in.Field == "projectname" {
  1071. if j.BlockPackage[k].Name == "" {
  1072. j.BlockPackage[k].Name = val
  1073. break
  1074. }
  1075. }
  1076. }
  1077. }
  1078. }
  1079. }
  1080. }
  1081. //lua脚本根据属性设置提取kv值
  1082. func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
  1083. kvmap := map[string][]map[string]interface{}{}
  1084. if len(j.Winnerorder) > 1 {
  1085. if vc.Field == "bidamount" {
  1086. for _, v := range j.Winnerorder {
  1087. if v["price"] == nil {
  1088. continue
  1089. }
  1090. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1091. "code": "winnerorder",
  1092. "field": vc.Field,
  1093. "ruletext": "中标候选人_" + v["sortstr"].(string),
  1094. "extfrom": v["sortstr"],
  1095. "sourcevalue": v["price"],
  1096. "value": v["price"],
  1097. "type": "winnerorder",
  1098. "matchtype": "winnerorder",
  1099. })
  1100. return kvmap, false
  1101. }
  1102. //候选人中标金额
  1103. if price := j.Winnerorder[0]["price"]; price != nil {
  1104. kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1105. "code": "CL_中标候选人",
  1106. "field": vc.Field,
  1107. "ruletext": "中标候选人",
  1108. "extfrom": j.Winnerorder[0]["sortstr"],
  1109. "sourcevalue": price,
  1110. "value": price,
  1111. "type": "winnerorder",
  1112. "matchtype": "winnerorder",
  1113. })
  1114. return kvmap, false
  1115. }
  1116. }
  1117. //else if vc.Field == "winner" {
  1118. // for _, v := range j.Winnerorder {
  1119. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1120. // "code": "winnerorder",
  1121. // "field": vc.Field,
  1122. // "ruletext": "中标候选人",
  1123. // "extfrom": vc.ExtFrom,
  1124. // "sourcevalue": "中标候选人",
  1125. // "value": v["entname"],
  1126. // "type": "winnerorder",
  1127. // "matchtype": "winnerorder",
  1128. // })
  1129. // }
  1130. // //候选人中标单位
  1131. // if entname := j.Winnerorder[0]["entname"]; entname != nil {
  1132. // kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
  1133. // "code": "CL_中标候选人",
  1134. // "field": vc.Field,
  1135. // "ruletext": "中标候选人",
  1136. // "extfrom": vc.ExtFrom,
  1137. // "sourcevalue": "中标候选人",
  1138. // "value": entname,
  1139. // "type": "winnerorder",
  1140. // "matchtype": "winnerorder",
  1141. // })
  1142. // return kvmap, false
  1143. // }
  1144. //}
  1145. }
  1146. for fieldname, field := range vc.LFields {
  1147. if field != vc.Field {
  1148. continue
  1149. }
  1150. extractFromKv(field, fieldname, j.Block, vc, kvmap)
  1151. }
  1152. AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志
  1153. return kvmap, true
  1154. }
  1155. func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
  1156. //qu.Debug("fieldname+++", fieldname)
  1157. for _, bl := range blocks {
  1158. tp := ""
  1159. for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
  1160. if k == 0 {
  1161. tp = "colon"
  1162. // for _, vv := range v.Kvs {
  1163. // qu.Debug("colon-kvs:", vv.Key, vv.Value)
  1164. // }
  1165. // for kkk, vv := range v.KvTags {
  1166. // for _, vvv := range vv {
  1167. // qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
  1168. // }
  1169. // }
  1170. } else if k == 1 {
  1171. tp = "space"
  1172. // for _, vv := range v.Kvs {
  1173. // qu.Debug("space-kvs:", vv.Key, vv.Value)
  1174. // }
  1175. // for kkk, vv := range v.KvTags {
  1176. // for _, vvv := range vv {
  1177. // qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
  1178. // }
  1179. // }
  1180. } else if k == 2 {
  1181. tp = "table"
  1182. // for _, vv := range v.Kvs {
  1183. // qu.Debug("table-kvs:", vv.Key, vv.Value)
  1184. // }
  1185. // for kkk, vv := range v.KvTags {
  1186. // for _, vvv := range vv {
  1187. // qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
  1188. // }
  1189. // }
  1190. }
  1191. if v == nil || v.KvTags == nil {
  1192. continue
  1193. }
  1194. for _, vv := range v.KvTags[fieldname] {
  1195. text := ju.TrimLRSpace(vv.Value, "")
  1196. if text != "" {
  1197. kvmap[field] = append(kvmap[field], map[string]interface{}{
  1198. "code": "CL_" + vv.Key,
  1199. "field": field,
  1200. "ruletext": vv.Key,
  1201. "extfrom": vc.ExtFrom,
  1202. "sourcevalue": text,
  1203. "value": text,
  1204. "type": tp,
  1205. "matchtype": "tag_string",
  1206. "blocktag": bl.Classify,
  1207. "weight": vv.Weight,
  1208. })
  1209. //if field != "winnertel" && field != "winnerperson" {
  1210. // //break //暂定取第一个
  1211. //}
  1212. }
  1213. }
  1214. }
  1215. if len(kvmap[field]) == 0 {
  1216. extractFromKv(field, fieldname, bl.Block, vc, kvmap)
  1217. }
  1218. }
  1219. }
  1220. //正则提取结果
  1221. func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
  1222. defer qu.Catch()
  1223. var score float64
  1224. score = vre.Score
  1225. if isSite {
  1226. score = score + 1.0
  1227. }
  1228. extinfo := map[string][]map[string]interface{}{}
  1229. rep := map[string]string{}
  1230. if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
  1231. //处理正负数修正
  1232. ptmp := strings.Split(vre.RuleText, "#")
  1233. sign := 0
  1234. if len(ptmp) == 2 {
  1235. if ptmp[1] == "正" {
  1236. sign = 1
  1237. } else if ptmp[1] == "负" {
  1238. sign = -1
  1239. }
  1240. }
  1241. tmp := strings.Split(ptmp[0], "__")
  1242. if len(tmp) == 2 {
  1243. epos := strings.Split(tmp[1], ",")
  1244. posm := map[string]int{}
  1245. for _, v := range epos {
  1246. ks := strings.Split(v, ":")
  1247. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  1248. posm[ks[1]] = qu.IntAll(ks[0])
  1249. } else {
  1250. posm[vre.Field] = qu.IntAll(ks[0])
  1251. }
  1252. }
  1253. var pattern string
  1254. if strings.Contains(tmp[0], "\\u") {
  1255. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  1256. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  1257. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  1258. } else {
  1259. pattern = tmp[0]
  1260. }
  1261. //log.Debug("pattern", pattern)
  1262. //fmt.Println(text)
  1263. reg := regexp.MustCompile(pattern)
  1264. apos := reg.FindAllStringSubmatchIndex(text, -1)
  1265. for i, _ := range apos {
  1266. pos := apos[i]
  1267. for k, p := range posm {
  1268. if len(pos) > p {
  1269. if pos[p] == -1 || pos[p+1] == -1 {
  1270. continue
  1271. }
  1272. val := text[pos[p]:pos[p+1]]
  1273. if string(val) == "" {
  1274. continue
  1275. }
  1276. if sign == -1 {
  1277. rep[k+"_"+fmt.Sprint(i)] = "-" + val
  1278. } else {
  1279. rep[k+"_"+fmt.Sprint(i)] = val
  1280. }
  1281. }
  1282. }
  1283. }
  1284. //fmt.Println(text)
  1285. tmps := []map[string]interface{}{}
  1286. for i := 0; i < len(apos); i++ {
  1287. if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
  1288. tmp := map[string]interface{}{
  1289. "field": vre.Field,
  1290. "code": vre.Code,
  1291. "ruletext": vre.RuleText,
  1292. "extfrom": text,
  1293. "value": rep[vre.Field+"_"+fmt.Sprint(i)],
  1294. "type": "regexp",
  1295. "matchtype": "regcontent",
  1296. "blocktag": *tag,
  1297. "score": score,
  1298. }
  1299. tmps = append(tmps, tmp)
  1300. exfield := ju.ExtField{
  1301. BlockTag: *tag,
  1302. Field: vre.Field,
  1303. Code: vre.Code,
  1304. RuleText: vre.RuleText,
  1305. Type: "regexp",
  1306. MatchType: "regcontent",
  1307. ExtFrom: extfrom,
  1308. SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
  1309. Value: rep[vre.Field+"_"+fmt.Sprint(i)],
  1310. Score: score}
  1311. if tmp["blocktag"] != nil {
  1312. exfield.BlockTag = tmp["blocktag"].(map[string]string)
  1313. }
  1314. j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
  1315. }
  1316. }
  1317. if len(tmps) > 0 {
  1318. //fmt.Println(tmps)
  1319. extinfo[vre.Field] = tmps
  1320. }
  1321. }
  1322. } else {
  1323. pos := vre.RegCore.Reg.FindStringIndex(text)
  1324. val := ""
  1325. if len(pos) == 2 {
  1326. text = text[pos[1]:]
  1327. rs := regexp.MustCompile("[^\r\n\t]+")
  1328. tmp := rs.FindAllString(text, -1)
  1329. if len(tmp) > 0 {
  1330. val = tmp[0]
  1331. }
  1332. }
  1333. if val != "" {
  1334. tmps := []map[string]interface{}{}
  1335. tmp := map[string]interface{}{
  1336. "field": vre.Field,
  1337. "code": vre.Code,
  1338. "ruletext": vre.RuleText,
  1339. "extfrom": text,
  1340. "value": val,
  1341. "type": "regexp",
  1342. "matchtype": "regcontent",
  1343. "blocktag": *tag,
  1344. "score": score,
  1345. }
  1346. tmps = append(tmps, tmp)
  1347. extinfo[vre.Field] = tmps
  1348. if j.Result[vre.Field] == nil {
  1349. j.Result[vre.Field] = [](*ju.ExtField){}
  1350. }
  1351. field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
  1352. Value: val,
  1353. Score: score}
  1354. if tmp["blocktag"] != nil {
  1355. field.BlockTag = tmp["blocktag"].(map[string]string)
  1356. }
  1357. j.Result[vre.Field] = append(j.Result[vre.Field], field)
  1358. }
  1359. }
  1360. return extinfo
  1361. }
  1362. //后置过滤
  1363. func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
  1364. defer qu.Catch()
  1365. if in.IsLua {
  1366. result := GetResultMapForLua(j)
  1367. lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
  1368. if j != nil {
  1369. lua.Block = j.Block
  1370. }
  1371. extinfo := lua.RunScript("back")
  1372. for k, v := range extinfo {
  1373. if tmps, ok := v.([]map[string]interface{}); ok {
  1374. j.Result[k] = [](*ju.ExtField){}
  1375. for _, tmp := range tmps {
  1376. field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
  1377. ExtFrom: qu.ObjToString(tmp["extfrom"]),
  1378. Value: tmp["value"]}
  1379. if tmp["blocktag"] != nil {
  1380. field.BlockTag = tmp["blocktag"].(map[string]string)
  1381. }
  1382. j.Result[k] = append(j.Result[k], field)
  1383. //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
  1384. }
  1385. }
  1386. }
  1387. if len(extinfo) > 0 {
  1388. AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
  1389. }
  1390. } else {
  1391. extinfo := map[string]interface{}{}
  1392. if in.Field != "" {
  1393. clearByTitle := false
  1394. if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理
  1395. clearByTitle = true
  1396. }
  1397. if j.Result[in.Field] != nil {
  1398. tmp := j.Result[in.Field]
  1399. exts := []interface{}{}
  1400. for k, v := range tmp {
  1401. if clearByTitle && v.ExtFrom != "title" {
  1402. continue
  1403. }
  1404. //table抽取到的数据不清理
  1405. // if v.Type == "table" && v.Field != "projectname" {
  1406. // continue
  1407. // }
  1408. text := qu.ObjToString(v.Value)
  1409. if text != "" {
  1410. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1411. }
  1412. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1413. continue
  1414. }
  1415. j.Result[in.Field][k].Value = text
  1416. exts = append(exts, map[string]interface{}{
  1417. "field": v.Field,
  1418. "code": v.Code,
  1419. "ruletext": v.RuleText,
  1420. "type": v.Type,
  1421. "matchtype": v.MatchType,
  1422. "extfrom": v.ExtFrom,
  1423. "value": text,
  1424. })
  1425. }
  1426. if len(exts) > 0 {
  1427. extinfo[in.Field] = exts
  1428. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1429. }
  1430. }
  1431. } else {
  1432. for key, tmp := range j.Result {
  1433. exts := []interface{}{}
  1434. for k, v := range tmp {
  1435. if v.Type == "table" { //table抽取到的数据不清理
  1436. continue
  1437. }
  1438. text := qu.ObjToString(v.Value)
  1439. if text != "" {
  1440. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1441. }
  1442. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1443. continue
  1444. }
  1445. j.Result[key][k].Value = text
  1446. exts = append(exts, map[string]interface{}{
  1447. "field": v.Field,
  1448. "code": v.Code,
  1449. "ruletext": v.RuleText,
  1450. "type": v.Type,
  1451. "matchtype": v.MatchType,
  1452. "extfrom": v.ExtFrom,
  1453. "value": text,
  1454. })
  1455. }
  1456. if len(exts) > 0 {
  1457. extinfo[key] = exts
  1458. AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
  1459. }
  1460. }
  1461. }
  1462. }
  1463. }
  1464. //后置过滤
  1465. func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
  1466. defer qu.Catch()
  1467. for k, v := range j.BlockPackage {
  1468. if in.Field == "winner" {
  1469. j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace)
  1470. } else if in.Field == "bidstatus" {
  1471. j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace)
  1472. } else if in.Field == "" {
  1473. j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace)
  1474. } else if in.Field == "projectname" {
  1475. j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace)
  1476. } else if in.Field == "winnerperson" {
  1477. j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace)
  1478. } else if in.Field == "winnertel" {
  1479. j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace)
  1480. }
  1481. }
  1482. }
  1483. //KV过滤
  1484. func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
  1485. defer qu.Catch()
  1486. extinfo := map[string]interface{}{}
  1487. if in.Field != "" {
  1488. if j.Result[in.Field] != nil {
  1489. tmp := j.Result[in.Field]
  1490. exts := []interface{}{}
  1491. for k, v := range tmp {
  1492. if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") {
  1493. continue
  1494. }
  1495. text := qu.ObjToString(v.Value)
  1496. if text != "" {
  1497. text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
  1498. }
  1499. if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志
  1500. continue
  1501. }
  1502. j.Result[in.Field][k].Value = text
  1503. exts = append(exts, map[string]interface{}{
  1504. "field": v.Field,
  1505. "code": v.Code,
  1506. "ruletext": v.RuleText,
  1507. "type": v.Type,
  1508. "matchtype": v.MatchType,
  1509. "extfrom": v.ExtFrom,
  1510. "value": text,
  1511. })
  1512. }
  1513. if len(exts) > 0 {
  1514. extinfo[in.Field] = exts
  1515. AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
  1516. }
  1517. }
  1518. }
  1519. }
  1520. //获取抽取结果map[string][]interface{},lua脚本使用
  1521. func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
  1522. defer qu.Catch()
  1523. result := map[string][]map[string]interface{}{}
  1524. for key, val := range j.Result {
  1525. if result[key] == nil {
  1526. result[key] = []map[string]interface{}{}
  1527. }
  1528. for _, v := range val {
  1529. tmp := map[string]interface{}{
  1530. "field": v.Field,
  1531. "code": v.Code,
  1532. "ruletext": v.RuleText,
  1533. "value": v.Value,
  1534. "type": v.Type,
  1535. "matchtype": v.MatchType,
  1536. "extfrom": v.ExtFrom,
  1537. }
  1538. result[key] = append(result[key], tmp)
  1539. }
  1540. }
  1541. return result
  1542. }
  1543. //抽取日志
  1544. func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
  1545. defer qu.Catch()
  1546. if !t.IsEtxLog {
  1547. return
  1548. }
  1549. logdata := map[string]interface{}{
  1550. "code": qu.If(v.Code == "", "kv", v.Code),
  1551. "name": v.Name,
  1552. "type": ftype,
  1553. "ruletext": v.RuleText,
  1554. "islua": v.IsLua,
  1555. "field": v.Field,
  1556. "version": t.Version,
  1557. "taskname": t.Name,
  1558. "before": before,
  1559. "extinfo": extinfo,
  1560. "sid": sid,
  1561. "comeintime": time.Now().Unix(),
  1562. }
  1563. lock.Lock()
  1564. ExtLogs[t] = append(ExtLogs[t], logdata)
  1565. lock.Unlock()
  1566. }
  1567. func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
  1568. exts := []map[string]interface{}{}
  1569. exts = append(exts, map[string]interface{}{
  1570. "field": ext.Field,
  1571. "code": ext.Code,
  1572. "type": ftype,
  1573. "matchtype": matchtype,
  1574. "extfrom": ext.ExtFrom,
  1575. "value": ext.Value,
  1576. })
  1577. extinfo := map[string]interface{}{
  1578. ext.Field: exts,
  1579. }
  1580. AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
  1581. }
  1582. func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
  1583. defer qu.Catch()
  1584. if !t.IsEtxLog {
  1585. return
  1586. }
  1587. logdata := map[string]interface{}{
  1588. "code": code,
  1589. "name": name,
  1590. "type": ftype,
  1591. "ruletext": "",
  1592. "islua": false,
  1593. "field": field,
  1594. "version": t.Version,
  1595. "taskname": t.Name,
  1596. "before": before,
  1597. "extinfo": extinfo,
  1598. "sid": sid,
  1599. "comeintime": time.Now().Unix(),
  1600. }
  1601. lock.Lock()
  1602. ExtLogs[t] = append(ExtLogs[t], logdata)
  1603. lock.Unlock()
  1604. }
  1605. //保存抽取日志
  1606. func SaveExtLog() {
  1607. defer qu.Catch()
  1608. tmpLogs := map[*TaskInfo][]map[string]interface{}{}
  1609. lock.Lock()
  1610. tmpLogs = ExtLogs
  1611. ExtLogs = map[*TaskInfo][]map[string]interface{}{}
  1612. lock.Unlock()
  1613. for k, v := range tmpLogs {
  1614. if len(v) < saveLimit {
  1615. db.Mgo.SaveBulk(k.TrackColl, v...)
  1616. } else {
  1617. for {
  1618. if len(v) > saveLimit {
  1619. tmp := v[:saveLimit]
  1620. db.Mgo.SaveBulk(k.TrackColl, tmp...)
  1621. v = v[saveLimit:]
  1622. } else {
  1623. db.Mgo.SaveBulk(k.TrackColl, v...)
  1624. break
  1625. }
  1626. }
  1627. }
  1628. }
  1629. time.AfterFunc(10*time.Second, SaveExtLog)
  1630. }
// FieldValue pairs a value with an integer counter.
// NOTE(review): no use of this type is visible in this part of the file;
// presumably Count tallies occurrences of Value — confirm against its users.
type FieldValue struct {
	Value interface{}
	Count int
}
// clearWinnerReg matches the label fragments and colons that
// AnalysisSaveResult deletes from a package's winner text before the winners
// are joined into "s_winner".
// NOTE(review): the last two alternatives are identical as written; one of
// them was presumably the full-width colon — confirm against the repository.
var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
  1636. //分析抽取结果并保存
  1637. func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
  1638. qu.Try(func() {
  1639. //重新取出清理过后的中标候选人
  1640. resetWinnerorder(j)
  1641. doc, result, _id := funcAnalysis(j, e)
  1642. if ju.IsSaveTag {
  1643. go otherNeedSave(j, result, e)
  1644. }
  1645. auxinfo := auxInfo(j)
  1646. //从排序结果中取值
  1647. tmp := map[string]interface{}{} //抽取值
  1648. tmp["spidercode"] = j.SpiderCode
  1649. tmp["site"] = j.Site
  1650. tmp["jsondata"] = j.Jsondata
  1651. tmp["fieldall"] = auxinfo
  1652. for _, val := range result {
  1653. for _, v := range val { //取第一个非负数,项目名称除外
  1654. //存0是否有效
  1655. if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
  1656. tmp[v.Field] = v.Value
  1657. break
  1658. }
  1659. if v.Score > -1 {
  1660. tmp[v.Field] = v.Value
  1661. break
  1662. } else if v.Field == "projectname" {
  1663. tmp[v.Field] = v.Value
  1664. break
  1665. }
  1666. }
  1667. }
  1668. if len(j.PackageInfo) > 15 {
  1669. for k, v := range j.PackageInfo {
  1670. j.PackageInfo = map[string]map[string]interface{}{}
  1671. j.PackageInfo[k] = v
  1672. break
  1673. }
  1674. }
  1675. if len(j.PackageInfo) > 0 { //分包信息
  1676. tmp["package"] = j.PackageInfo
  1677. //包预算,中标金额合并大于抽取就覆盖
  1678. var tmpBidamount, tmpBudget float64
  1679. //s_winner逗号分隔拼接,分包中标人
  1680. var tmpstr, savewinner []string
  1681. //按包排序
  1682. for b, v := range j.PackageInfo {
  1683. if v["winner"] != nil && v["winner"] != "" {
  1684. tmpstr = append(tmpstr, b)
  1685. }
  1686. }
  1687. //包预算,中标金额合并大于抽取就覆盖
  1688. if len(j.PackageInfo) >= 1 {
  1689. //包数大于1累加
  1690. for _, v := range j.PackageInfo {
  1691. if v["budget"] != nil {
  1692. tmpBudget += qu.Float64All(v["budget"])
  1693. }
  1694. if v["bidamount"] != nil {
  1695. tmpBidamount += qu.Float64All(v["bidamount"])
  1696. }
  1697. }
  1698. if qu.Float64All(tmp["budget"]) < tmpBudget {
  1699. tmp["budget"] = tmpBudget
  1700. }
  1701. if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
  1702. tmp["bidamount"] = tmpBidamount
  1703. }
  1704. } else {
  1705. //包数等于1,tmp没有值取包里的值
  1706. if tmp["budget"] == nil || tmp["budget"] == 0 {
  1707. for _, v := range j.PackageInfo {
  1708. if v["budget"] != nil {
  1709. tmp["budget"] = v["budget"]
  1710. }
  1711. }
  1712. }
  1713. if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
  1714. for _, v := range j.PackageInfo {
  1715. if v["bidamount"] != nil {
  1716. tmp["bidamount"] = v["bidamount"]
  1717. }
  1718. }
  1719. }
  1720. }
  1721. //s_winner逗号分隔拼接,分包中标人
  1722. sort.Strings(tmpstr)
  1723. for _, v := range tmpstr {
  1724. svvvv := qu.ObjToString(j.PackageInfo[v]["winner"])
  1725. savevvv := clearWinnerReg.ReplaceAllString(svvvv, "")
  1726. if savevvv == "" {
  1727. continue
  1728. }
  1729. savewinner = append(savewinner, savevvv)
  1730. }
  1731. if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
  1732. tmp["s_winner"] = tmp["winner"]
  1733. } else if savewinner != nil {
  1734. savewinner = RemoveReplicaSliceString(savewinner)
  1735. tmp["s_winner"] = strings.Join(savewinner, ",")
  1736. }
  1737. } else if tmp["winner"] != nil && tmp["winner"] != "" {
  1738. //没有分包取winner
  1739. tmp["s_winner"] = tmp["winner"]
  1740. }
  1741. if len(j.Winnerorder) > 0 { //候选人信息
  1742. for i, v := range j.Winnerorder {
  1743. if v["price"] != nil {
  1744. j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
  1745. }
  1746. }
  1747. tmp["winnerorder"] = j.Winnerorder
  1748. }
  1749. //处理附件
  1750. var resultf map[string][]*ju.ExtField
  1751. if jf != nil {
  1752. _, resultf, _ = funcAnalysis(jf, e)
  1753. auxinfof := auxInfo(jf)
  1754. tmp["fieldallf"] = auxinfof
  1755. ffield := map[string]interface{}{}
  1756. for _, val := range resultf {
  1757. for _, v := range val { //取第一个非负数
  1758. if v.Score > -1 {
  1759. ffield[v.Field] = v.Value
  1760. break
  1761. }
  1762. }
  1763. }
  1764. if len(jf.PackageInfo) > 0 { //分包信息
  1765. ffield["package"] = jf.PackageInfo
  1766. }
  1767. if len(jf.Winnerorder) > 0 { //候选人信息
  1768. ffield["winnerorder"] = jf.Winnerorder
  1769. }
  1770. tmp["ffield"] = ffield
  1771. }
  1772. for k, v := range *doc {
  1773. //去重冗余字段
  1774. if delFiled(k) {
  1775. continue
  1776. }
  1777. if tmp[k] == nil {
  1778. tmp[k] = v
  1779. }
  1780. }
  1781. //质量审核
  1782. if ju.QualityAudit {
  1783. e.QualityAudit(tmp)
  1784. }
  1785. if e.IsExtractCity { //城市抽取
  1786. //e.ExtractCity(j, tmp, _id)
  1787. e.NewExtractCity(j, tmp, _id)
  1788. // b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
  1789. // // log.Debug("省份---", p, "城市---", c, "区---", d)
  1790. // tmp["district"] = d
  1791. // if b {
  1792. // tmp["city"] = c
  1793. // tmp["area"] = p
  1794. // }
  1795. }
  1796. //品牌抽取
  1797. if ju.IsBrandGoods {
  1798. tmp["checkhas"] = map[string]int{
  1799. "hastable": j.HasTable,
  1800. "hasgoods": j.HasGoods,
  1801. "hasbrand": j.HasBrand,
  1802. "haskey": j.HasKey,
  1803. }
  1804. if len(j.BrandData) > 0 {
  1805. tmp["tablebrand"] = j.BrandData
  1806. }
  1807. // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
  1808. }
  1809. //prince和number抽取
  1810. if ju.IsPriceNumber {
  1811. priceNumberLen := len(j.PriceNumberData)
  1812. if priceNumberLen > 1 { //table数据去重
  1813. tmpPriceNumberData := []map[string]interface{}{}
  1814. tableStrs := map[string]bool{}
  1815. for _, tb := range j.PriceNumberData {
  1816. has := false
  1817. bytes, _ := json.Marshal(tb)
  1818. str := string(bytes)
  1819. if len(tableStrs) > 0 && tableStrs[str] {
  1820. has = true
  1821. } else {
  1822. tableStrs[str] = true
  1823. }
  1824. if !has {
  1825. for _, data := range tb {
  1826. tmpPriceNumberData = append(tmpPriceNumberData, data)
  1827. }
  1828. }
  1829. }
  1830. tmp["pricenumber"] = tmpPriceNumberData
  1831. } else if priceNumberLen == 1 {
  1832. tmp["pricenumber"] = j.PriceNumberData[0]
  1833. }
  1834. }
  1835. //所有kv组成的字符串
  1836. var kvtext bytes.Buffer
  1837. blocks := make([]ju.BlockAndTag, 0)
  1838. for _, v := range j.Block {
  1839. //分包和标签
  1840. if ju.SaveBlock {
  1841. xx, _ := json.Marshal(v)
  1842. tmpblock := new(ju.TmpBlock)
  1843. err := json.Unmarshal(xx, &tmpblock)
  1844. if err != nil {
  1845. if v.BPackage != nil {
  1846. bpb, _ := json.Marshal(v.BPackage)
  1847. tmpblock.BPackage = string(bpb)
  1848. }
  1849. tmpblock = rangeBlockToJson(v, *tmpblock)
  1850. }
  1851. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  1852. }
  1853. //把所有kv组装成一个字符串,存库
  1854. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  1855. if jv == nil {
  1856. continue
  1857. }
  1858. for jv_k, jv_v := range jv.KvTags {
  1859. for _, jv_vv := range jv_v {
  1860. kvtext.WriteString(jv_k)
  1861. kvtext.WriteString(":")
  1862. kvtext.WriteString(jv_vv.Value)
  1863. kvtext.WriteString(" ")
  1864. }
  1865. }
  1866. }
  1867. }
  1868. if kvtext.Len() > 0 {
  1869. tmp["kvtext"] = kvtext.String()
  1870. }
  1871. if len(blocks) > 0 {
  1872. if blocksBytes, err := json.Marshal(blocks); err == nil {
  1873. if utf8.RuneCount(blocksBytes) < 100000 {
  1874. tmp["blocks"] = string(blocksBytes)
  1875. }
  1876. }
  1877. }
  1878. // fmt.Println("=============抽取结果================")
  1879. // for k, v := range tmp {
  1880. // qu.Debug(k, "---", v)
  1881. // }
  1882. //tmp["extract_content"] = j.Content
  1883. tmp["dataging"] = j.Dataging
  1884. if e.TaskInfo.TestColl == "" {
  1885. if len(tmp) > 0 { //保存抽取结果
  1886. /* if len(e.SiteFields) <= 0 {
  1887. //for field, _ := range e.Fields {
  1888. // if tmp[field] == nil && {
  1889. // tmp[field] = "" //覆盖之前版本数据
  1890. // }
  1891. //}
  1892. } else {
  1893. //for field, _ := range e.SiteFields {
  1894. // if tmp[field] == nil &&{
  1895. // tmp[field] = "" //覆盖之前版本数据
  1896. // }
  1897. //}
  1898. }*/
  1899. tmp["repeat"] = 0
  1900. tmparr := []map[string]interface{}{
  1901. map[string]interface{}{
  1902. "_id": qu.StringTOBsonId(_id),
  1903. },
  1904. map[string]interface{}{"$set": tmp},
  1905. }
  1906. e.RWMutex.Lock()
  1907. e.BidArr = append(e.BidArr, tmparr)
  1908. e.BidTotal++
  1909. e.RWMutex.Unlock()
  1910. }
  1911. if ju.SaveResult {
  1912. id := tmp["_id"]
  1913. tmp["result"] = result
  1914. tmp["resultf"] = resultf
  1915. delete(tmp, "_id")
  1916. tmparr := []map[string]interface{}{
  1917. map[string]interface{}{
  1918. "_id": id,
  1919. },
  1920. map[string]interface{}{"$set": tmp},
  1921. }
  1922. e.RWMutex.Lock()
  1923. e.ResultArr = append(e.ResultArr, tmparr)
  1924. e.RWMutex.Unlock()
  1925. }
  1926. } else { //测试结果
  1927. delete(tmp, "_id")
  1928. delete(tmp, "fieldall")
  1929. if len(j.BlockPackage) > 0 { //分包详情
  1930. if len(j.BlockPackage) > 10 {
  1931. tmp["epackage"] = "分包异常"
  1932. } else {
  1933. bs, _ := json.Marshal(j.BlockPackage)
  1934. tmp["epackage"] = string(bs)
  1935. }
  1936. }
  1937. tmp["result"] = result
  1938. tmp["resultf"] = resultf
  1939. b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
  1940. if !b {
  1941. log.Debug(e.TaskInfo.TestColl, _id)
  1942. }
  1943. }
  1944. }, func(err interface{}) {
  1945. log.Debug("AnalysisSaveResult err", err)
  1946. })
  1947. }
  1948. //保存其他
  1949. //kv、表格、块上的标签凡是新的标签都入库
  1950. //val type times firstid createtime 判定field
  1951. func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
  1952. now := time.Now().Unix()
  1953. coll := e.TaskInfo.TestColl
  1954. if coll == "" {
  1955. coll = "extract_tag_result"
  1956. } else {
  1957. coll += "_tag"
  1958. }
  1959. datas := []map[string]interface{}{}
  1960. kv := map[string]int{}
  1961. for _, v := range j.Block {
  1962. //
  1963. for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
  1964. if vv == nil || vv.KvTags == nil {
  1965. continue
  1966. }
  1967. for kkk, vvv := range vv.KvTags {
  1968. for _, vvvv := range vvv {
  1969. if vvvv.IsInvalid {
  1970. kv[kkk] = kv[kkk] + 1
  1971. break
  1972. }
  1973. }
  1974. }
  1975. }
  1976. for _, vv := range v.NotClassifyTitles {
  1977. datas = append(datas, map[string]interface{}{
  1978. "val": vv,
  1979. "times": 0,
  1980. "type": "block",
  1981. "firstid": j.SourceMid,
  1982. "createtime": now,
  1983. })
  1984. if len(datas) == saveLimit {
  1985. db.Mgo.SaveBulk(coll, datas...)
  1986. datas = []map[string]interface{}{}
  1987. }
  1988. }
  1989. }
  1990. for k, v := range kv {
  1991. datas = append(datas, map[string]interface{}{
  1992. "val": k,
  1993. "times": v,
  1994. "type": "kv",
  1995. "firstid": j.SourceMid,
  1996. "createtime": now,
  1997. })
  1998. if len(datas) == saveLimit {
  1999. db.Mgo.SaveBulk(coll, datas...)
  2000. datas = []map[string]interface{}{}
  2001. }
  2002. }
  2003. if len(datas) > 0 {
  2004. db.Mgo.SaveBulk(coll, datas...)
  2005. }
  2006. }
  2007. func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
  2008. if j == nil {
  2009. return nil
  2010. }
  2011. if len(j.Block) > 0 {
  2012. for i, v := range j.Block {
  2013. rangetmp := new(ju.TmpBlock)
  2014. vb, _ := json.Marshal(v)
  2015. json.Unmarshal(vb, &rangetmp)
  2016. tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
  2017. }
  2018. }
  2019. if j.ColonKV != nil {
  2020. cb, _ := json.Marshal(j.ColonKV)
  2021. tmpblock.ColonKV = string(cb)
  2022. }
  2023. if j.SpaceKV != nil {
  2024. sb, _ := json.Marshal(j.SpaceKV)
  2025. tmpblock.SpaceKV = string(sb)
  2026. }
  2027. if j.TableKV != nil {
  2028. tb, _ := json.Marshal(j.TableKV)
  2029. tmpblock.TableKV = string(tb)
  2030. }
  2031. return &tmpblock
  2032. }
  2033. //去重冗余字段
  2034. func delFiled(k string) bool {
  2035. return k=="detailfile"||k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  2036. }
  2037. func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
  2038. defer qu.Catch()
  2039. doc := j.Data
  2040. result := j.Result
  2041. _id := qu.BsonIdToSId((*doc)["_id"])
  2042. result = ScoreFields(j, e.Tag) //正负面词打分
  2043. //结果排序
  2044. for _, val := range result {
  2045. ju.Sort(val)
  2046. }
  2047. if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
  2048. //jsondata清理
  2049. clearJd(j.Jsondata, e)
  2050. marshalbt, _ := json.Marshal(j.Jsondata)
  2051. tmpjddata := make(map[string]interface{})
  2052. json.Unmarshal(marshalbt, &tmpjddata)
  2053. for _, jdkey := range ju.JsonData {
  2054. if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
  2055. for tmpk, tmpv := range j.Result[jdkey][:5] {
  2056. if jdkey == "budget" || jdkey == "bidamount" {
  2057. lockclear.Lock()
  2058. cfn := e.ClearFn[jdkey]
  2059. lockclear.Unlock()
  2060. if len(cfn) == 0 {
  2061. continue
  2062. }
  2063. newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
  2064. if tmpv.Value == newNum[0] {
  2065. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
  2066. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2067. ju.Sort(j.Result[jdkey])
  2068. delete((*j.Jsondata), jdkey)
  2069. break
  2070. }
  2071. } else {
  2072. if (*j.Jsondata)[jdkey] == tmpv.Value {
  2073. extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
  2074. j.Result[jdkey] = append(j.Result[jdkey], extField)
  2075. ju.Sort(j.Result[jdkey])
  2076. delete((*j.Jsondata), jdkey)
  2077. break
  2078. }
  2079. }
  2080. }
  2081. }
  2082. }
  2083. if len(*j.Jsondata) > 0 {
  2084. j.Result = JsonDataMergeProcessing(j, e)
  2085. }
  2086. j.Jsondata = &tmpjddata
  2087. }
  2088. return doc, result, _id
  2089. }
  2090. //辅助信息,如果没有排序先排序
  2091. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  2092. fieldalls := map[string][]map[string]interface{}{}
  2093. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  2094. defer qykredis.Close()
  2095. db := 0
  2096. for field, val := range j.Result {
  2097. //ju.Sort(val)
  2098. if field == "buyer" {
  2099. db = ju.BuyerDB
  2100. } else if field == "winner" {
  2101. db = ju.WinnerDB
  2102. } else if field == "agency" {
  2103. db = ju.AgencyDB
  2104. }
  2105. sfields := []map[string]interface{}{}
  2106. for _, v := range val {
  2107. standardized := false
  2108. if _, err := qykredis.Do("SELECT", db); err != nil {
  2109. fmt.Println("redis select err", err)
  2110. } else {
  2111. rep, err := qykredis.Do("GET", v.Value)
  2112. if rep != nil && err == nil {
  2113. standardized = true
  2114. }
  2115. }
  2116. if field == "budget" || field == "bidamount" {
  2117. if !v.IsTrue {
  2118. continue
  2119. }
  2120. }
  2121. sfield := map[string]interface{}{
  2122. "val": v.Value,
  2123. "type": v.Type,
  2124. "score": v.Score,
  2125. "blocktag": v.BlockTag,
  2126. "sourceval": v.SourceValue,
  2127. "standardized": standardized,
  2128. }
  2129. sfields = append(sfields, sfield)
  2130. }
  2131. fieldalls[field] = sfields
  2132. }
  2133. return fieldalls
  2134. }
  2135. func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
  2136. defer qu.Catch()
  2137. //获取审核字段
  2138. for _, field := range e.AuditFields {
  2139. //1.分包
  2140. if resulttmp["package"] != nil {
  2141. packagedata := resulttmp["package"].(map[string]map[string]interface{})
  2142. for _, val := range packagedata {
  2143. if val[field] != nil {
  2144. fv := qu.ObjToString(val[field])
  2145. if fv != "" {
  2146. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2147. e.RedisMatch(field, fv, val) //redis匹配
  2148. } else { //除了buyer和winner,其他字段走规则匹配
  2149. e.RuleMatch(field, fv, val)
  2150. }
  2151. }
  2152. }
  2153. }
  2154. }
  2155. //2.外围
  2156. if resulttmp[field] != nil {
  2157. fv := qu.ObjToString(resulttmp[field])
  2158. if fv != "" {
  2159. if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
  2160. e.RedisMatch(field, fv, resulttmp) //redis匹配
  2161. } else { //除了buyer和winner,其他字段走规则匹配
  2162. e.RuleMatch(field, fv, resulttmp)
  2163. }
  2164. }
  2165. }
  2166. }
  2167. }
  2168. //Redis匹配
  2169. func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
  2170. defer qu.Catch()
  2171. i := redis.GetInt(field, field+"_"+fv) //查找redis
  2172. if i == 0 { //reids未找到,执行规则匹配
  2173. val[field+"_isredis"] = false
  2174. e.RuleMatch(field, fv, val) //规则匹配
  2175. } else { //redis找到,打标识存库
  2176. val[field+"_isredis"] = true
  2177. }
  2178. }
  2179. //规则匹配
  2180. func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
  2181. defer qu.Catch()
  2182. if fieldval != "" {
  2183. SMap := e.StartMatch(field, fieldval)
  2184. //SMap.AddKey(field+"_isaudit", false)
  2185. for _, k := range SMap.Keys {
  2186. tmpMap[k] = SMap.Map[k]
  2187. }
  2188. tmpMap[field+"_isaudit"] = false //添加字段未审核信息
  2189. }
  2190. }
  2191. //开始规则匹配
  2192. func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
  2193. defer qu.Catch()
  2194. SMap := pretreated.NewSortMap()
  2195. lock.Lock()
  2196. f := e.RecogFieldMap[field]
  2197. lock.Unlock()
  2198. if len(f) > 0 {
  2199. fid := qu.BsonIdToSId(f["_id"])
  2200. recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
  2201. textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
  2202. if textAfterRecogFieldPrerule != "" {
  2203. lock.Lock()
  2204. classMap := e.FidClassMap[fid]
  2205. lock.Unlock()
  2206. L:
  2207. for _, c := range classMap { //class
  2208. classid := qu.BsonIdToSId(c["_id"])
  2209. classPrerule := qu.ObjToString(c["s_class_prerule"])
  2210. savefield := qu.ObjToString(c["s_savefield"]) //保存字段
  2211. textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
  2212. if textAfterClassPrerule != "" {
  2213. lock.Lock()
  2214. ruleMap := e.CidRuleMap[classid]
  2215. lock.Unlock()
  2216. for _, r := range ruleMap { //rule
  2217. rulePrerule := qu.ObjToString(r["s_rule_prerule"])
  2218. s_name := qu.ObjToString(r["s_name"])
  2219. rule := r["rule"].([]interface{})
  2220. textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
  2221. if textAfterRulePrerule != "" {
  2222. b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
  2223. if b { //匹配到一个分类下某个规则时,不再继续匹配
  2224. if savefield != "" { //保存字段不为空,存储代码信息
  2225. SMap.AddKey(field+"_"+savefield, s_name)
  2226. }
  2227. break L
  2228. }
  2229. }
  2230. }
  2231. }
  2232. }
  2233. }
  2234. }
  2235. return SMap
  2236. }
  2237. //中标候选人经过清理之后,重新取出赋值
  2238. func resetWinnerorder(j *ju.Job) {
  2239. if len(j.Winnerorder) == 0 {
  2240. return
  2241. }
  2242. maxlen := len(j.Winnerorder) - 1
  2243. //中标单位
  2244. //i := 0
  2245. winners := []*ju.ExtField{}
  2246. bidamounts := []*ju.ExtField{}
  2247. //for _, v := range j.Result["winner"] {
  2248. // if v.Code == "winnerorder" {
  2249. // if maxlen < i {
  2250. // continue
  2251. // }
  2252. // j.Winnerorder[i]["entname"] = v.Value
  2253. // i++
  2254. // } else {
  2255. // winners = append(winners, v)
  2256. // }
  2257. //}
  2258. if maxlen > 0 {
  2259. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  2260. if j.Winnerorder[0]["price"] != nil {
  2261. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
  2262. if tmpPrice[len(tmpPrice)-1].(bool) {
  2263. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
  2264. }
  2265. }
  2266. }
  2267. if j.Result["winner"] == nil && len(winners) > 0 {
  2268. j.Result["winner"] = winners
  2269. } else if len(winners) > 0 {
  2270. j.Result["winner"] = append(j.Result["winner"], winners...)
  2271. }
  2272. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  2273. j.Result["bidamount"] = bidamounts
  2274. } else if len(bidamounts) > 0 {
  2275. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  2276. }
  2277. //j.Result["winner"] = winners
  2278. //中标金额
  2279. //i = 0
  2280. //bidamounts := []*ju.ExtField{}
  2281. //for _, v := range j.Result["bidamount"] {
  2282. // if v.Code == "winnerorder" {
  2283. // if maxlen < i {
  2284. // continue
  2285. // }
  2286. // j.Winnerorder[i]["price"] = v.Value
  2287. // i++
  2288. // } else {
  2289. // bidamounts = append(bidamounts, v)
  2290. // }
  2291. //}
  2292. //j.Result["bidamount"] = bidamounts
  2293. }
  2294. func RemoveReplicaSliceString(slc []string) []string {
  2295. result := make([]string, 0)
  2296. tempMap := make(map[string]bool, len(slc))
  2297. for _, e := range slc {
  2298. if tempMap[e] == false {
  2299. tempMap[e] = true
  2300. result = append(result, e)
  2301. }
  2302. }
  2303. return result
  2304. }