12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983 |
- package extract
- import (
- "bytes"
- "encoding/json"
- "fmt"
- "github.com/shopspring/decimal"
- "go.mongodb.org/mongo-driver/bson/primitive"
- "jy/clear"
- db "jy/mongodbutil"
- "jy/pretreated"
- ju "jy/util"
- qu "qfw/util"
- "qfw/util/redis"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "sync"
- "time"
- "unicode/utf8"
- log "github.com/donnie4w/go-logger/logger"
- "gopkg.in/mgo.v2/bson"
- )
- var (
- lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
- JYUrl = "https://www.jianyu360.com/article/content/%s.html"
- cut = ju.NewCut() //获取正文并清理
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
- TaskList map[string]*ExtractTask //任务列表
- ClearTaskList map[string]*ClearTask //清理任务列表
- saveLimit = 100 //抽取日志批量保存
- PageSize = 5000 //查询分页
- Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
- //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
- Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
- /*f = map[string]bool{
- "T": true,
- "_d": true,
- "area": true,
- "channel": true,
- "comeintime": true,
- "competehref": true,
- "href": true,
- "l_np_publishtime": true,
- "publishtime": true,
- "sendflag": true,
- "site": true,
- "spidercode": true,
- "title": true,
- "projectname": true,
- }*/
- /*f = map[string]bool{
- "contentid": true,
- "progName": true,
- "updateTime": true,
- "url": true,
- "areaId": true,
- "areaName": true,
- "popTitle": true,
- "showTitle": true,
- "progId": true,
- "catid": true,
- "isConcern": true,
- "followCount": true,
- "followSuggestion": true,
- "isBoutique": true,
- "canTj": true,
- "tenderAmountNumber": true,
- "tenderAmountUnit": true,
- "bidderAmountNumber": true,
- "bidderAmountUnit": true,
- "registrationBeginTime": true,
- "registrationEndTime": true,
- "starNum": true,
- "title": true,
- "proInvested": true,
- "projectname": true,
- }*/
- spidercode = map[string]bool{
- "gd_zhsggzyjyzx_jsgc_fjczbgg": true,
- "js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
- "zj_tzsyhggzyjyzx_jsgc_kbqk": true,
- "hb_tmsggzyjyxxw_jsgc_kbqk": true,
- "zj_nbsyyggzyjyw_jsgc_kbqk": true,
- "zj_zjsggzyjyzx_jyxx_kbjg": true,
- "zj_zjzdgcjyw_ztbjglxx_kbjg": true,
- "zj_lssggzyjyw_jsgc_kbsk": true,
- "zj_qzslyxggzyjyzx_gggs_xkbjl": true,
- "sc_mssggzydzjypt_jsgc_kbjl": true,
- "sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
- "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
- "a_hbszbtbggfwpt_kbjl": true,
- "a_szsjsgcjyfwzxbafzx_kbqkgs": true,
- "a_szldzbyxgs_kbxx": true,
- "zj_zssssxggzyjyw_gcjs_kbjggs": true,
- "gd_szszfhjsj_kbqkgs": true,
- "a_gjggzyjypt_gcjs_kbjl": true,
- "a_gjggzyjypt_gcjs_kbjl_new": true,
- "zj_tzsyhggzyjyzx_kbjggg": true,
- "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
- "ah_czsggzyjyw_jsgc_kbjl": true,
- "ah_czsggzyjyw_zfcg_kbxx": true,
- "ah_whsggzyjyfww_kbxx_cgxm": true,
- "ah_whsggzyjyfww_kbxx_gcxm": true,
- }
- )
- //启动测试抽取-、、、、结果追踪
- func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
- defer qu.Catch()
- ext := TaskList[taskId]
- if ext == nil {
- ext = &ExtractTask{}
- ext.Id = taskId
- ext.InitTestTaskInfo(resultcoll, trackcoll)
- ext.IsRun = true
- ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- }
- ext.InitSite()
- ext.InitRulePres()
- ext.InitRuleBacks(false)
- ext.InitRuleBacks(true)
- ext.InitRuleCore(false)
- ext.InitRuleCore(true)
- ext.InitPkgCore()
- ext.InitBlockRule()
- ext.InfoTypeList()
- ext.InitTag(false)
- ext.InitTag(true)
- ext.InitClearFn(false)
- ext.InitClearFn(true)
- ext.Lock()
- //ext.IsExtractCity = false
- if ext.IsExtractCity { //版本上控制是否开始城市抽取
- //初始化城市DFA信息
- ext.InitCityInfo()
- //ext.InitCityDFA()
- ext.InitAreaCode()
- ext.InitPostCode()
- }
- ext.Unlock()
- //质量审核
- ext.InitAuditFields()
- ext.InitAuditRule()
- ext.InitAuditClass()
- ext.InitAuditRecogField()
- //品牌抽取是否开启
- ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
- //价格个数抽取是否开启
- ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
- //附件抽取是否开启
- ext.InitFile()
- ext.TaskInfo.TestColl = resultcoll
- TaskList[taskId] = ext
- return RunExtractTestTask(ext, startId, num)
- }
- func IdTrans(startId string) bson.ObjectId {
- defer qu.Catch()
- return bson.ObjectIdHex(startId)
- }
- func StringTOBsonId(id string) primitive.ObjectID {
- objectId, _ := primitive.ObjectIDFromHex(id)
- return objectId
- }
- func BsonTOStringId(id interface{}) string {
- return id.(primitive.ObjectID).Hex()
- }
- //开始测试任务抽取
- func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
- n, _ := strconv.Atoi(num)
- id := IdTrans(startId)
- if id.Valid() {
- //query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
- list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
- for _, v := range *list {
- //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
- // continue
- //}
- if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
- continue
- }
- var j, jf *ju.Job
- var isSite bool
- if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
- v["isextFile"] = true
- j, jf, isSite = ext.PreInfo(v)
- } else {//无附件
- j, _, isSite = ext.PreInfo(v)
- }
- go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
- ext.TaskInfo.ProcessPool <- true
- }
- return true
- } else {
- return false
- }
- }
- //启动抽取
- func StartExtractTaskId(taskId string) bool {
- defer qu.Catch()
- isgo := false
- ext := TaskList[taskId]
- if ext == nil {
- ext = &ExtractTask{}
- ext.Id = taskId
- ext.InitTaskInfo()
- isgo = true
- } else {
- ext.Id = taskId
- ext.InitTaskInfo()
- }
- ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
- ext.InitSite()
- ext.InitRulePres()
- ext.InitRuleBacks(false)
- ext.InitRuleBacks(true)
- ext.InitRuleCore(false)
- ext.InitRuleCore(true)
- ext.InitPkgCore()
- ext.InitBlockRule()
- ext.InfoTypeList()
- ext.InitTag(false)
- ext.InitTag(true)
- ext.InitClearFn(false)
- ext.InitClearFn(true)
- ext.Lock()
- if ext.IsExtractCity { //版本上控制是否开始城市抽取
- //初始化城市DFA信息
- //ext.InitCityDFA()
- ext.InitCityInfo()
- ext.InitAreaCode()
- ext.InitPostCode()
- }
- ext.Unlock()
- //质量审核
- ext.InitAuditFields()
- ext.InitAuditRule()
- ext.InitAuditClass()
- ext.InitAuditRecogField()
- //品牌抽取是否开启
- ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
- //价格个数抽取是否开启
- ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
- //附件抽取是否开启
- ext.InitFile()
- ext.IsRun = true
- go ext.ResultSave(true)
- go ext.BidSave(true)
- if isgo {
- go RunExtractTask(taskId)
- }
- TaskList[taskId] = ext
- return true
- }
- //停止抽取
- func StopExtractTaskId(taskId string) bool {
- defer qu.Catch()
- ext := TaskList[taskId]
- if ext != nil {
- ext.IsRun = false
- TaskList[taskId] = ext
- }
- //更新task.s_extlastid
- db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- return true
- }
- //开始抽取
- func RunExtractTask(taskId string) {
- defer qu.Catch()
- ext := TaskList[taskId]
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
- count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
- pageNum := (count + PageSize - 1) / PageSize
- limit := PageSize
- if count < PageSize {
- limit = count
- }
- fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
- for i := 0; i < pageNum; i++ {
- query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
- list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
- fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
- for _, v := range *list {
- //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
- // continue
- //}
- //根据标题判断是否抽取
- b := IsExtract("title", qu.ObjToString(v["title"]), "")
- if !b {
- continue
- }
- _id := qu.BsonIdToSId(v["_id"])
- //log.Debug(_id)
- if !ext.IsRun {
- break
- }
- var j, jf *ju.Job
- var isSite bool
- if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
- v["isextFile"] = true
- j, jf, isSite = ext.PreInfo(v)
- } else {
- j, _, isSite = ext.PreInfo(v)
- }
- go ext.ExtractProcess(j, jf, isSite)
- ext.TaskInfo.LastExtId = _id
- ext.TaskInfo.ProcessPool <- true
- }
- db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- if !ext.IsRun {
- break
- }
- }
- //更新task.s_extlastid
- time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
- }
- //信息预处理-不和版本关联,取最新版本的配置项
- func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
- return (&ExtractTask{}).PreInfo(doc)
- }
// clearMoneyReg matches project keywords (PPP projects, land consolidation,
// shanty-town renovation, expressways, ...). PreInfo tests the detail text and
// the title against it to derive the job's IsClearnMoney flag.
var clearMoneyReg = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
- //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
- func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
- defer qu.Catch()
- //判断是否有附件这个字段
- var isextFile bool
- if doc["isextFile"] != nil {
- isextFile = doc["isextFile"].(bool)
- }
- detail := ""
- d1, _ := doc["detail"].(string)
- d2, _ := doc["contenthtml"].(string)
- if len(d1) >= len(d2) || d2 == "" {
- detail = d1
- } else {
- detail = d2
- }
- detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
- d3, _ := doc["summary"].(string)
- //全文的需要修复表格
- detail = pretreated.RepairCon(detail)
- detail = ju.CutLableStr(d3 + "\n" + detail)
- detail = cut.ClearHtml(d3 + "\n" + detail)
- doc["detail"] = detail
- isClearnMoney := !clearMoneyReg.MatchString(detail)
- if isClearnMoney {
- isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
- }
- isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
- if isextFile {
- file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
- }
- //正文小于200个字,有附件把附件内容加到正文
- //tmpDeatil := detail
- //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
- //if err == nil {
- // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
- // if conlen < 2000 {
- // if isextFile {
- // detail += qu.ObjToString(doc["detailfile"])
- // doc["detail"] = detail
- // }
- // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
- // //防止文本过长,造成抽取阻塞
- // log.Debug("文本太长", doc["_id"], conlen)
- // doc["detail"] = d3
- // }
- //}
- toptype := qu.ObjToString(doc["toptype"])
- subtype := qu.ObjToString(doc["subtype"])
- if qu.ObjToString(doc["type"]) == "bid" {
- toptype = "结果"
- }
- if toptype == "" {
- toptype = "all"
- }
- if subtype == "" {
- subtype = "all"
- }
- if subtype == "其他" {
- subtype = "其它"
- }
- toMap := qu.ObjToMap(doc["jsondata"])
- //log.Debug("toMap", toMap)
- if (*toMap) != nil {
- if (*toMap)["extweight"] == nil {
- (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
- }
- if (*toMap)["jsoncontent"] != nil {
- delete(*toMap, "jsoncontent")
- }
- for k, v := range *toMap {
- if _, ok := v.(float64); ok {
- continue
- } else if _, ok := v.(int64); ok {
- continue
- } else if _, ok2 := v.(string); ok2 {
- continue
- } else {
- delete(*toMap, k)
- }
- }
- }
- j = &ju.Job{
- SourceMid: qu.BsonIdToSId(doc["_id"]),
- Category: toptype,
- CategorySecond: subtype,
- Content: qu.ObjToString(doc["detail"]),
- SpiderCode: qu.ObjToString(doc["spidercode"]),
- Site: qu.ObjToString(doc["site"]),
- //Domain: qu.ObjToString(doc["domain"]),
- //Href: qu.ObjToString(doc["href"]),
- Title: qu.ObjToString(doc["title"]),
- Data: &doc,
- City: qu.ObjToString(doc["city"]),
- Province: qu.ObjToString(doc["area"]),
- Jsondata: toMap,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
- RuleBlock: e.RuleBlock,
- Dataging: qu.IntAll(doc["dataging"]),
- IsClearnMoney: isClearnMoneystr,
- }
- if isextFile {
- jf = &ju.Job{
- SourceMid: qu.BsonIdToSId(doc["_id"]),
- Category: toptype,
- CategorySecond: subtype,
- Content: qu.ObjToString(doc["detailfile"]),
- SpiderCode: qu.ObjToString(doc["spidercode"]),
- Site: qu.ObjToString(doc["site"]),
- Title: qu.ObjToString(doc["title"]),
- Data: &doc,
- City: qu.ObjToString(doc["city"]),
- Province: qu.ObjToString(doc["area"]),
- Jsondata: toMap,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
- RuleBlock: e.RuleBlock,
- IsFile: isextFile,
- Dataging: qu.IntAll(doc["dataging"]),
- IsClearnMoney: isClearnMoneystr,
- }
- }
- codeSite := j.SpiderCode
- //是否启用站点
- if value, ok := e.SiteMerge.Load(codeSite); ok {
- isSite = value.(bool)
- }
- if isSite {
- //是否配置站点
- exp, isSite := e.Luacodes.Load(codeSite)
- if isSite {
- if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
- e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
- }
- if exp.(map[string]interface{})["e.SiteTag"] != nil {
- e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
- }
- if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
- e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
- }
- if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
- e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
- }
- }
- }
- qu.Try(func() {
- pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
- if isextFile && strings.TrimSpace(jf.Content) != "" {
- pretreated.AnalyStart(jf, isSite, codeSite)
- }
- }, func(err interface{}) {
- log.Debug("pretreated.AnalyStart", err, j.SourceMid)
- })
- return j, jf, isSite
- }
// Attachment file-name filters used by file2text.
var (
	// sortStrReg marks attachment names that are fetched first
	// (tender, procurement, quotation, award notices, ...).
	sortStrReg = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
	// clearStrReg marks attachment names that are ignored
	// (drawings, performance records).
	clearStrReg = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
)
- //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
- func file2text(doc *map[string]interface{}) {
- mnameone := map[string]bool{}
- mname := map[string]bool{}
- murl := map[string]string{}
- //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
- if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
- for _, attachs := range attach_text {
- if fileinfos, ok := attachs.(map[string]interface{}); ok {
- for _, fileinfo := range fileinfos {
- if ff, ok := fileinfo.(map[string]interface{}); ok {
- attach_url := qu.ObjToString(ff["attach_url"])
- ffname := qu.ObjToString(ff["file_name"])
- if clearStrReg.MatchString(ffname) {
- continue
- }
- mname[ffname] = true
- murl[ffname] = attach_url
- if sortStrReg.MatchString(ffname) {
- mnameone[ffname] = true
- }
- }
- }
- }
- }
- }
- tmpstr := ""
- for k := range mnameone {
- if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
- (*doc)["detailfile"] = tmpstr
- return
- }
- bs := ju.OssGetObject(murl[k])
- if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
- tmpstr += bs + "\n"
- }
- }
- for k := range mname {
- if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
- (*doc)["detailfile"] = tmpstr
- return
- }
- bs := ju.OssGetObject(murl[k])
- if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
- tmpstr += bs + "\n"
- }
- }
- (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
- }
- //抽取-正文
- func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
- e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性
- if jf != nil && jf.IsFile { //附件jf → j 合并
- e.ExtractDetail(jf, isSite, j.SpiderCode)
- for tmpk, xs := range jf.Result {
- if len(j.Result[tmpk]) == 0 {
- if tmpk == "budget" || tmpk == "bidamount" {
- for _, v := range xs {
- if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
- j.Result[tmpk] = append(j.Result[tmpk], v)
- }
- }
- } else {
- j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
- }
- }else {
- if tmpk=="winner" && len(j.Result[tmpk]) == 1 {
- if j.Result[tmpk][0].Value == "" {
- j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
- }
- }
- //if tmpk=="buyer" { //附件数据-没有正文靠谱
- // j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
- //}
- }
- }
- if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
- j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
- }
- if len(j.PackageInfo) == 0 && jf.PackageInfo != nil && len(jf.PackageInfo) > 0 {
- j.PackageInfo = jf.PackageInfo
- }
- }
- if isSite {
- ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
- if ok && ismerge.(bool) {
- tmpj := &ju.Job{
- SourceMid: j.SourceMid,
- Category: j.Category,
- CategorySecond: j.CategorySecond,
- Content: j.Content,
- SpiderCode: j.SpiderCode,
- //Domain: qu.ObjToString(doc["domain"]),
- //Href: qu.ObjToString(doc["href"]),
- Title: j.Title,
- Data: j.Data,
- City: j.City,
- Province: j.Province,
- Jsondata: j.Jsondata,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: j.BuyerAddr,
- RuleBlock: e.RuleBlock,
- }
- qu.Try(func() {
- pretreated.AnalyStart(tmpj, false, "") //job.Block分块
- }, func(err interface{}) {
- log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
- })
- e.ExtractDetail(tmpj, false, "")
- //if jf != nil && jf.IsFile {
- // e.ExtractFile(jf, false, "")
- //}
- //合并数据
- j.Block = append(j.Block, tmpj.Block...)
- j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
- for tmpk, _ := range j.Result {
- if len(tmpj.Result[tmpk]) > 0 {
- j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
- }
- }
- for tmpk, _ := range tmpj.Result {
- if len(j.Result[tmpk]) == 0 {
- j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
- }
- }
- }
- }
- //分析抽取结果并保存
- AnalysisSaveResult(j, jf, e)
- <-e.TaskInfo.ProcessPool
- }
- //抽取-正文-规则等 detail
- func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
- qu.Try(func() {
- doc := *j.Data
- //全局前置规则,结果覆盖doc属性
- //for _, v := range e.RulePres {
- // doc = ExtRegPre(doc, j, v, e.TaskInfo)
- //}
- tmprules := map[string][]*RuleCore{}
- lockrule.Lock()
- //加载分类抽取配置
- if j.Category == "all" || j.CategorySecond == "all" {
- if isSite {
- for k, vc1 := range e.SiteRuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- } else {
- if isSite {
- for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- }
- }
- if len(tmprules) < 1 { //分类未覆盖部分
- if isSite {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.SiteRuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- }
- lockrule.Unlock()
- //抽取规则
- for _, vc1 := range tmprules {
- for _, vc := range vc1 {
- tmp := ju.DeepCopy(doc).(map[string]interface{})
- //是否进入逻辑
- if !ju.Logic(vc.LuaLogic, tmp) {
- continue
- }
- if vc.Field =="buyer" {
- //log.Debug("调试抽取字段")
- }
- ////抽取-前置规则
- //for _, v := range vc.RulePres {
- // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
- //}
- // log.Debug("抽取-前置规则", tmp)
- //抽取-规则
- ExtRuleCore(tmp, e, vc, j, isSite)
- // log.Debug("抽取-规则", tmp)
- //抽取-后置规则
- for _, v := range vc.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- //kv规则
- for _, v := range vc.KVRuleCores {
- ExtRuleKV(j, v, e.TaskInfo)
- }
- // log.Debug("抽取-后置规则", tmp)
- //项目名称未能抽取到,标题来凑
- if vc.Field == "projectname" {
- if vc.ExtFrom == "title" {
- isextitle := true
- for _, v := range j.Result[vc.Field] {
- if len([]rune(qu.ObjToString(v.Value))) > 5 {
- isextitle = false
- break
- }
- }
- if isextitle { //标题加入选举
- field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
- if isSite {
- field.Score = 1
- }
- j.Result[vc.Field] = append(j.Result[vc.Field], field)
- }
- }
- for i := 0; i < 3; i++ {
- for _, v := range vc.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- }
- }
- }
- }
- //全局后置规则
- if isSite {
- for _, v := range e.SiteRuleBacks {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- } else {
- for _, v := range e.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- }
- //函数清理
- for key, val := range j.Result {
- for i, v := range val {
- if v.Field == "projectname" && v.Type == "table" {
- break
- }
- if key == "budget" || key == "bidamount" {
- if _, ok := v.Value.(float64); ok && !v.IsTrue {
- continue
- }
- }
- lockclear.Lock()
- var cfn = []string{}
- if isSite {
- cfn = e.SiteClearFn[key]
- if len(cfn) == 0 {
- cfn = e.ClearFn[key]
- }
- } else {
- cfn = e.ClearFn[key]
- }
- lockclear.Unlock()
- if len(cfn) == 0 {
- continue
- }
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
- if key == "budget" || key == "bidamount" {
- if istrue, ok := data[len(data)-1].(bool); istrue && ok {
- j.Result[key][i].IsTrue = true
- } else {
- j.Result[key][i].Value = data[0]
- continue
- }
- }
- before, _ := v.Value.(string)
- v.Value = data[0]
- BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
- //添加行数清理的日志
- //清理特殊符号
- lockclear.Lock()
- if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
- text := qu.ObjToString(v.Value)
- before = text
- //指定清理--新增-函数清理-其他清理
- if key=="winner"||key=="agency"||key=="buyer" {
- text = strings.ReplaceAll(text,"【","")
- text = strings.ReplaceAll(text,"】","")
- }
- v.Value = clear.OtherClean(key, text)
- BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
- }
- //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
- lockclear.Unlock()
- }
- }
- PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
- // bs, _ := json.Marshal(j.Result)
- // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
- }, func(err interface{}) {
- log.Debug("ExtractProcess err", err, j.SourceMid)
- })
- }
- func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
- qu.Try(func() {
- doc := *j.Data
- //全局前置规则,结果覆盖doc属性
- // for _, v := range e.RulePres {
- // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- // doc = ExtRegPre(doc, j, v, e.TaskInfo)
- // }
- // }
- //抽取规则
- tmprules := map[string][]*RuleCore{}
- lockrule.Lock()
- if j.Category == "all" || j.CategorySecond == "all" {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- }
- lockrule.Unlock()
- for _, vc1 := range tmprules {
- for _, vc := range vc1 {
- tmp := ju.DeepCopy(doc).(map[string]interface{})
- //是否进入逻辑
- if !ju.Logic(vc.LuaLogic, tmp) {
- continue
- }
- //抽取-前置规则
- // for _, v := range vc.RulePres {
- // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
- // }
- // }
- // log.Debug("抽取-前置规则", tmp)
- //抽取-规则
- if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRuleCore(tmp, e, vc, j, isSite)
- }
- // log.Debug("抽取-规则", tmp)
- //抽取-后置规则
- for _, v := range vc.RuleBacks {
- if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- }
- // log.Debug("抽取-后置规则", tmp)
- }
- }
- //全局后置规则
- for _, v := range e.RuleBacks {
- if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- }
- //函数清理
- for key, val := range j.Result {
- for _, v := range val {
- lockclear.Lock()
- var cfn = []string{}
- if isSite {
- cfn = e.SiteClearFn[key]
- if len(cfn) == 0 {
- cfn = e.ClearFn[key]
- }
- } else {
- cfn = e.ClearFn[key]
- }
- lockclear.Unlock()
- if len(cfn) == 0 {
- continue
- }
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
- v.Value = data[0]
- //清理特殊符号
- lockclear.Lock()
- if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
- clear.MesField[key] != nil {
- text := qu.ObjToString(v.Value)
- text = clear.OtherClean(key, text)
- v.Value = text
- }
- lockclear.Unlock()
- }
- }
- PackageDetail(j, e, isSite, codeSite) //处理分包信息
- // bs, _ := json.Marshal(j.Result)
- // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
- }, func(err interface{}) {
- log.Debug("ExtractProcess err", err)
- })
- }
- //前置过滤
- func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
- defer qu.Catch()
- before := ju.DeepCopy(doc).(map[string]interface{})
- extinfo := map[string]interface{}{}
- if in.IsLua {
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
- if j != nil {
- lua.Block = j.Block
- }
- extinfo = lua.RunScript("pre")
- for k, v := range extinfo { //结果覆盖原doc
- doc[k] = v
- }
- AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
- } else {
- var key string
- if !j.IsFile {
- key = qu.If(in.Field == "", "detail", in.Field).(string)
- } else {
- key = qu.If(in.Field == "", "detailfile", in.Field).(string)
- }
- text := qu.ObjToString(doc[key])
- extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
- doc[key] = extinfo[key] //结果覆盖原doc
- AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
- }
- return doc
- }
- // ExtRuleCore runs the core rules of vc for one job and appends the hits to j.Result.
- //
- // Unless the rule reads from the title, tagged key/value hits are collected first via
- // getKvByLuaFields; its second result says whether the regular-expression rules should
- // still run. Lua rules (ExtRuleCoreByKv) may add to the collected hits, while
- // regular-expression rules (ExtRuleCoreByReg) write to j.Result themselves. Finally the
- // collected hits are converted to ju.ExtField values.
- //
- // Special case: when the field is budget, nothing was collected and the job has exactly
- // one package, the budget values tagged inside that package are used instead.
- // isSite marks site-specific extraction; every field created here then gets Score 1.
- func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
- 	var kvMap map[string][]map[string]interface{}
- 	extByReg := true
- 	if vc.ExtFrom != "title" {
- 		kvMap, extByReg = getKvByLuaFields(vc, j, e)
- 	}
- 	for _, v := range vc.RuleCores {
- 		if v.IsLua {
- 			ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
- 		} else if extByReg {
- 			ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
- 		}
- 	}
- 	// Kind of key/value table, in the order the tables are listed below.
- 	kvTypes := [...]string{"colon", "space", "table"}
- 	// Single package and no budget found: lift the budget out of the package.
- 	if vc.Field == "budget" && len(kvMap) == 0 {
- 		if len(j.BlockPackage) != 1 {
- 			return
- 		}
- 		for _, bp := range j.BlockPackage { // exactly one element
- 			for fieldname, field := range vc.LFields {
- 				if field != vc.Field {
- 					continue
- 				}
- 				for idx, kv := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
- 					if kv == nil || kv.KvTags == nil {
- 						continue
- 					}
- 					for _, vv := range kv.KvTags[fieldname] {
- 						text := ju.TrimLRSpace(vv.Value, "")
- 						if text == "" {
- 							continue
- 						}
- 						tmp := &ju.ExtField{
- 							ExtFrom:     "package",
- 							Field:       vc.Field,
- 							Code:        "CL_分包",
- 							Type:        kvTypes[idx],
- 							MatchType:   "package",
- 							RuleText:    bp.Text,
- 							SourceValue: vv.Key,
- 							Value:       text,
- 						}
- 						if isSite {
- 							tmp.Score = 1
- 						}
- 						j.Result[vc.Field] = append(j.Result[vc.Field], tmp)
- 					}
- 				}
- 			}
- 		}
- 		return
- 	}
- 	for k, hits := range kvMap {
- 		if j.Result[k] == nil {
- 			j.Result[k] = [](*ju.ExtField){}
- 		}
- 		for _, tmp := range hits {
- 			field := &ju.ExtField{
- 				Weight:      qu.IntAll(tmp["weight"]),
- 				ExtFrom:     qu.ObjToString(tmp["extfrom"]),
- 				Field:       k,
- 				Code:        qu.ObjToString(tmp["code"]),
- 				Type:        qu.ObjToString(tmp["type"]),
- 				MatchType:   qu.ObjToString(tmp["matchtype"]),
- 				RuleText:    qu.ObjToString(tmp["ruletext"]),
- 				SourceValue: tmp["sourcevalue"],
- 				Value:       tmp["value"],
- 			}
- 			// The first-ranked candidate's bid amount and all site results are scored up front.
- 			if isSite || (k == "bidamount" && field.ExtFrom == "第一候选人") {
- 				field.Score = 1
- 			}
- 			// Money found in a table is converted to a number right away.
- 			if (k == "bidamount" || k == "budget") && field.Type == "table" {
- 				moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney)
- 				if len(moneys) > 0 {
- 					// Checked assertion: a non-bool trailer counts as "not trustworthy"
- 					// instead of panicking and aborting the whole extraction.
- 					isTrue, _ := moneys[len(moneys)-1].(bool)
- 					switch amount := moneys[0].(type) {
- 					case float64:
- 						field.Value = amount
- 						field.IsTrue = isTrue
- 					case int:
- 						field.Value = float64(amount)
- 						field.IsTrue = isTrue
- 					}
- 				}
- 			}
- 			// Translate block tags to their descriptions; tags of an unexpected type are ignored.
- 			if tags, ok := tmp["blocktag"].(map[string]bool); ok {
- 				btag := make(map[string]string, len(tags))
- 				blocktag.Lock()
- 				for name := range tags {
- 					if desc := TagConfigDesc[name]; desc != "" {
- 						btag[name] = desc
- 					}
- 				}
- 				blocktag.Unlock()
- 				field.BlockTag = btag
- 			}
- 			j.Result[k] = append(j.Result[k], field)
- 		}
- 	}
- }
- // ExtRuleCoreByKv runs a Lua core rule against the collected key/value hits.
- // The script is given doc, the current *kvMap and the job's blocks. If it returns a list
- // for in.Field, every entry is stamped with the rule code under "core" and appended to
- // (*kvMap)[in.Field]. Any non-empty script result is written to the extraction log.
- // Title rules and non-Lua rules are ignored.
- func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
- 	defer qu.Catch()
- 	if !in.IsLua || extfrom == "title" {
- 		return
- 	}
- 	script := ju.LuaScript{
- 		Code:   in.Code,
- 		Name:   in.Name,
- 		Doc:    doc,
- 		Script: in.RuleText,
- 		KvMap:  *kvMap,
- 		Block:  j.Block,
- 	}
- 	extinfo := script.RunScript("core")
- 	hits, isList := extinfo[in.Field].([]map[string]interface{})
- 	if isList {
- 		for i := range hits {
- 			hits[i]["core"] = in.Code
- 		}
- 		(*kvMap)[in.Field] = append((*kvMap)[in.Field], hits...)
- 	}
- 	if len(extinfo) != 0 {
- 		AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) // extraction log
- 	}
- }
- // ExtRuleCoreByReg runs a regular-expression core rule and logs whatever it extracts.
- // The text searched depends on the rule:
- //   - extfrom "title": the title field of doc;
- //   - field "qualifies": doc[extfrom] converted from HTML to plain text;
- //   - anything else: the text of every block of the job, each with its own tag map.
- // Nothing happens when IsExtract rejects the field for this job (per-field
- // configuration, e.g. failed or cancelled tenders) or when in.Field is empty.
- func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
- 	defer qu.Catch()
- 	if !IsExtract(in.Field, j.Title, j.Content) || in.Field == "" {
- 		return
- 	}
- 	// extract applies the rule to one text and logs the hits, if any.
- 	extract := func(text string, tags *map[string]string) {
- 		if extinfo := extRegCoreToResult(extfrom, text, tags, j, in, isSite); len(extinfo) > 0 {
- 			AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) // extraction log
- 		}
- 	}
- 	switch {
- 	case extfrom == "title":
- 		extract(qu.ObjToString(doc[extfrom]), &map[string]string{})
- 	case in.Field == "qualifies":
- 		extract(pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{})
- 	default:
- 		// Block-wise extraction.
- 		for _, block := range j.Block {
- 			tags := make(map[string]string)
- 			for name := range block.Classify {
- 				blocktag.Lock()
- 				tags[name] = TagConfigDesc[name]
- 				blocktag.Unlock()
- 			}
- 			extract(block.Text, &tags)
- 		}
- 	}
- }
- //pkg抽取-规则-正则
- func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
- defer qu.Catch()
- //根据field配置项目,是否抽取。例如:废标、流标等跳过,
- b := IsExtract(in.Field, j.Title, j.Content)
- if !b {
- return
- }
- //块抽取
- if in.Field != "" {
- for k, vbpkg := range j.BlockPackage {
- rep := map[string]string{}
- if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
- if in.Field == "budget" && vbpkg.Budget > 0 {
- continue
- }
- if in.Field == "agencyfee" && vbpkg.Agencyfee > 0 {
- continue
- }
- if in.Field == "bidamount" && vbpkg.Bidamount > 0 {
- continue
- }
- if in.Field == "winner" && vbpkg.Winner != "" {
- continue
- }
- if in.Field == "bidstatus" && vbpkg.BidStatus != "" {
- continue
- }
- if in.Field == "projectname" && vbpkg.Name != "" {
- continue
- }
- if in.Field == "winner" && vbpkg.Winner != "" {
- continue
- }
- if in.Field == "winnerperson" {
- if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
- continue
- }
- if !strings.Contains(vbpkg.Text, vbpkg.Winner) {
- continue
- }
- }
- if in.Field == "winnertel" {
- if vbpkg.WinnerPerson == "" {
- continue
- }
- }
- //处理正负数修正
- ptmp := strings.Split(in.RuleText, "#")
- sign := 0
- if len(ptmp) == 2 {
- if ptmp[1] == "正" {
- sign = 1
- } else if ptmp[1] == "负" {
- sign = -1
- }
- }
- tmp := strings.Split(ptmp[0], "__")
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else {
- posm[in.Field] = qu.IntAll(ks[0])
- }
- }
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- //log.Debug("pattern", pattern)
- //fmt.Println(text)
- reg := regexp.MustCompile(pattern)
- apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1)
- for i, _ := range apos {
- pos := apos[i]
- for k, p := range posm {
- if len(pos) > p {
- if pos[p] == -1 || pos[p+1] == -1 {
- continue
- }
- val := vbpkg.Text[pos[p]:pos[p+1]]
- if string(val) == "" {
- continue
- }
- if sign == -1 {
- rep[k+"_"+fmt.Sprint(i)] = "-" + val
- } else {
- rep[k+"_"+fmt.Sprint(i)] = val
- }
- }
- }
- }
- //fmt.Println(text)
- for i := 0; i < len(apos); i++ {
- if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" {
- if in.Field == "budget" && vbpkg.Budget <= 0 {
- lock.Lock()
- cfn := e.ClearFn[in.Field]
- lock.Unlock()
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
- if data[len(data)-1].(bool) {
- j.BlockPackage[k].Budget = qu.Float64All(data[0])
- j.BlockPackage[k].IsTrueBudget = true
- }
- break
- } else if in.Field == "agencyfee" && vbpkg.Agencyfee <= 0 {
- lock.Lock()
- cfn := e.ClearFn[in.Field]
- lock.Unlock()
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
- if data[len(data)-1].(bool) {
- j.BlockPackage[k].Agencyfee = qu.Float64All(data[0])
- j.BlockPackage[k].IsTrueAgencyfee = true
- }
- break
- }else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
- lock.Lock()
- cfn := e.ClearFn[in.Field]
- lock.Unlock()
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney)
- if data[len(data)-1].(bool) {
- j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
- j.BlockPackage[k].IsTrueBidamount = true
- }
- break
- } else if in.Field == "winner" {
- if j.BlockPackage[k].Winner == "" {
- j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "winnertel" {
- if j.BlockPackage[k].WinnerTel == "" {
- j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "winnerperson" {
- if j.BlockPackage[k].WinnerPerson == "" {
- j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "bidstatus" {
- if j.BlockPackage[k].BidStatus == "" {
- j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "projectname" {
- if j.BlockPackage[k].Name == "" {
- j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "winnerperson" {
- if j.BlockPackage[k].WinnerPerson == "" {
- j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- } else if in.Field == "winnertel" {
- if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" {
- j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)]
- break
- }
- }
- }
- }
- }
- } else {
- pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text)
- val := ""
- if len(pos) == 2 {
- //"text" = "text"[pos[1]:]
- val = "text"[pos[1]:]
- rs := regexp.MustCompile("[^\r\n\t]+")
- tmp := rs.FindAllString("text", -1)
- if len(tmp) > 0 {
- val = tmp[0]
- }
- }
- if val != "" {
- if in.Field == "budget" && vbpkg.Budget <= 0 {
- lock.Lock()
- cfn := e.ClearFn[in.Field]
- lock.Unlock()
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
- if data[len(data)-1].(bool) {
- j.BlockPackage[k].Budget = qu.Float64All(data[0])
- j.BlockPackage[k].IsTrueBudget = true
- }
- break
- }
- if in.Field == "bidamount" && vbpkg.Bidamount <= 0 {
- lock.Lock()
- cfn := e.ClearFn[in.Field]
- lock.Unlock()
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney)
- if data[len(data)-1].(bool) {
- j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
- j.BlockPackage[k].IsTrueBidamount = true
- }
- break
- } else if in.Field == "bidstatus" {
- if j.BlockPackage[k].BidStatus == "" {
- j.BlockPackage[k].BidStatus = val
- break
- }
- } else if in.Field == "projectname" {
- if j.BlockPackage[k].Name == "" {
- j.BlockPackage[k].Name = val
- break
- }
- }
- }
- }
- }
- }
- }
- // getKvByLuaFields collects candidate values for vc.Field from tagged key/value data.
- //
- // Shortcut for bidamount: when the job has more than one ranked winner candidate and
- // the first one has sort 1, the price of the first candidate that carries a price is
- // returned on its own, with false as second result (callers use that flag to skip the
- // regular-expression rules). Otherwise every tag name that vc.LFields maps to vc.Field
- // is looked up in the job's blocks via extractFromKv, the result is logged, and the
- // second result is true.
- func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
- 	kvmap := map[string][]map[string]interface{}{}
- 	ranked := len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1
- 	if ranked && vc.Field == "bidamount" {
- 		for _, candidate := range j.Winnerorder {
- 			price := candidate["price"]
- 			if price == nil {
- 				continue
- 			}
- 			kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
- 				"code":        "winnerorder",
- 				"field":       vc.Field,
- 				"ruletext":    "中标候选人_" + fmt.Sprint(candidate["sortstr"]),
- 				"extfrom":     candidate["sortstr"],
- 				"sourcevalue": price,
- 				"value":       price,
- 				"type":        "winnerorder",
- 				"matchtype":   "winnerorder",
- 			})
- 			return kvmap, false
- 		}
- 		// No candidate carries a price. (A former fallback on the first candidate's price
- 		// sat here; it could never fire because the loop above returns on that same price.)
- 	}
- 	for fieldname, field := range vc.LFields {
- 		if field == vc.Field {
- 			extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category)
- 		}
- 	}
- 	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) // extraction log
- 	return kvmap, true
- }
- // extractFromKv walks blocks and appends to kvmap[field] every non-blank value tagged
- // fieldname in a block's colon, space or table key/value data.
- //
- // Deposit fields (bid_bond, contract_bond) are special: the first block whose title
- // contains 保证金 ends the whole walk; its trimmed text, if any, is recorded as one hit
- // for the categories 招标/拟建/预告 (bid deposit) or 结果 (performance deposit).
- // Sub-blocks are searched recursively as long as kvmap[field] is still empty.
- func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) {
- 	// Kind of key/value table, in the order the tables are listed in the loop below.
- 	kvTypes := [...]string{"colon", "space", "table"}
- 	isBond := field == "bid_bond" || field == "contract_bond"
- 	// bondHit builds the hit recorded for a deposit block.
- 	bondHit := func(code, label string, bl *ju.Block, text string) map[string]interface{} {
- 		return map[string]interface{}{
- 			"code":        code,
- 			"field":       field,
- 			"ruletext":    label,
- 			"extfrom":     label + "_块内容",
- 			"sourcevalue": bl.Text,
- 			"value":       text,
- 			"type":        label + "_块内容",
- 			"matchtype":   "tag_string",
- 			"blocktag":    bl.Classify,
- 			"weight":      0,
- 		}
- 	}
- 	for _, bl := range blocks {
- 		if isBond && strings.Contains(bl.Title, "保证金") {
- 			if text := ju.TrimLRSpace(bl.Text, ""); text != "" {
- 				switch Category {
- 				case "招标", "拟建", "预告":
- 					kvmap[field] = append(kvmap[field], bondHit("CL_块内容", "投标保证金", bl, text))
- 				case "结果":
- 					kvmap[field] = append(kvmap[field], bondHit("CL_", "履约保证金", bl, text))
- 				}
- 			}
- 			return
- 		}
- 		for idx, kv := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
- 			if kv == nil || kv.KvTags == nil {
- 				continue
- 			}
- 			for _, tag := range kv.KvTags[fieldname] {
- 				text := ju.TrimLRSpace(tag.Value, "")
- 				if text == "" {
- 					continue
- 				}
- 				kvmap[field] = append(kvmap[field], map[string]interface{}{
- 					"code":        "CL_" + tag.Key,
- 					"field":       field,
- 					"ruletext":    tag.Key,
- 					"extfrom":     vc.ExtFrom,
- 					"sourcevalue": text,
- 					"value":       text,
- 					"type":        kvTypes[idx],
- 					"matchtype":   "tag_string",
- 					"blocktag":    bl.Classify,
- 					"weight":      tag.Weight,
- 				})
- 			}
- 		}
- 		// Nothing found so far: descend into the sub-blocks.
- 		if len(kvmap[field]) == 0 {
- 			extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category)
- 		}
- 	}
- }
- // extRegCoreToResult runs the regular-expression rule vre over text, appends every hit to j.Result[vre.Field] as a ju.ExtField (ExtFrom = extfrom, BlockTag = *tag) and returns the same hits as plain maps keyed by vre.Field; the result is empty when nothing matched. The score of each hit is vre.Score, plus 1.0 when isSite is true.
- func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} {
- defer qu.Catch()
- var score float64
- score = vre.Score
- if isSite {
- score = score + 1.0
- }
- extinfo := map[string][]map[string]interface{}{}
- rep := map[string]string{}
- if vre.RegCore.Bextract { // two-part rule text (pattern, double underscore, positions): values are cut straight out of the match
- // optional suffix after #: 正 (positive) or 负 (negative); only the negative one changes the value, by prefixing a minus sign
- ptmp := strings.Split(vre.RuleText, "#")
- sign := 0
- if len(ptmp) == 2 {
- if ptmp[1] == "正" {
- sign = 1
- } else if ptmp[1] == "负" {
- sign = -1
- }
- }
- tmp := strings.Split(ptmp[0], "__")
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { // index:field pair, e.g. (.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else {
- posm[vre.Field] = qu.IntAll(ks[0]) // a bare index belongs to the rule's own field
- }
- }
- var pattern string
- if strings.Contains(tmp[0], "\\u") { // pattern contains backslash-u escapes: decode them, keeping every other backslash literal
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- //log.Debug("pattern", pattern)
- //fmt.Println(text)
- reg := regexp.MustCompile(pattern) // panics on an invalid pattern; the deferred qu.Catch is the only guard
- apos := reg.FindAllStringSubmatchIndex(text, -1)
- for i, _ := range apos {
- pos := apos[i]
- for k, p := range posm {
- if len(pos) > p { // p indexes the flat submatch-index slice: start at pos[p], end at pos[p+1]
- if pos[p] == -1 || pos[p+1] == -1 {
- continue
- }
- val := text[pos[p]:pos[p+1]]
- if string(val) == "" {
- continue
- }
- if sign == -1 {
- rep[k+"_"+fmt.Sprint(i)] = "-" + val
- } else {
- rep[k+"_"+fmt.Sprint(i)] = val
- }
- }
- }
- }
- tmps := []map[string]interface{}{}
- for i := 0; i < len(apos); i++ { // one hit per match that produced a non-blank value for vre.Field
- if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
- tmp := map[string]interface{}{
- "field": vre.Field,
- "code": vre.Code,
- "ruletext": vre.RuleText,
- "extfrom": text,
- "value": rep[vre.Field+"_"+fmt.Sprint(i)],
- "type": "regexp",
- "matchtype": "regcontent",
- "blocktag": *tag,
- "score": score,
- }
- exfield := ju.ExtField{
- BlockTag: *tag,
- Field: vre.Field,
- Code: vre.Code,
- RuleText: vre.RuleText,
- Type: "regexp",
- MatchType: "regcontent",
- ExtFrom: extfrom,
- SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)],
- Value: rep[vre.Field+"_"+fmt.Sprint(i)],
- Score: score,
- }
- if vre.Field == "qualifies" { // for qualifies the rule text is replaced by the value captured under qualifies_key
- if len(rep) >= 2 {
- tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)]
- exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)]
- }
- }
- tmps = append(tmps, tmp)
- if tmp["blocktag"] != nil {
- exfield.BlockTag = tmp["blocktag"].(map[string]string)
- }
- j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
- }
- }
- if len(tmps) > 0 {
- //fmt.Println(tmps)
- extinfo[vre.Field] = tmps
- }
- }
- } else { // plain rule: the compiled expression locates a label and the value is what follows it
- pos := vre.RegCore.Reg.FindStringIndex(text)
- val := ""
- if len(pos) == 2 {
- text = text[pos[1]:] // keep only what follows the match; the extfrom entry and SourceValue below therefore hold this remainder
- rs := regexp.MustCompile("[^\r\n\t]+")
- tmp := rs.FindAllString(text, -1)
- if len(tmp) > 0 {
- val = tmp[0] // first run of characters without CR, LF or TAB
- }
- }
- if val != "" {
- tmps := []map[string]interface{}{}
- tmp := map[string]interface{}{
- "field": vre.Field,
- "code": vre.Code,
- "ruletext": vre.RuleText,
- "extfrom": text,
- "value": val,
- "type": "regexp",
- "matchtype": "regcontent",
- "blocktag": *tag,
- "score": score,
- }
- tmps = append(tmps, tmp)
- extinfo[vre.Field] = tmps
- if j.Result[vre.Field] == nil {
- j.Result[vre.Field] = [](*ju.ExtField){}
- }
- field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text,
- Value: val,
- Score: score}
- if tmp["blocktag"] != nil {
- field.BlockTag = tmp["blocktag"].(map[string]string)
- }
- j.Result[vre.Field] = append(j.Result[vre.Field], field)
- }
- }
- return extinfo
- }
- // ExtRegBack applies one back (post-extraction) rule to the values in j.Result. A Lua rule receives the current results and each list it returns replaces j.Result for that field. A regular-expression rule rewrites values with in.RegPreBac (Reg and Replace), for the field in.Field only or, when in.Field is empty, for every field. Changed values are written to the extraction log. vc is the rule that owns this back rule, or nil for task-wide rules.
- func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
- defer qu.Catch()
- if in.IsLua {
- result := GetResultMapForLua(j)
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
- if j != nil { // NOTE(review): j has already been dereferenced by GetResultMapForLua above, so this check cannot protect against a nil job
- lua.Block = j.Block
- }
- extinfo := lua.RunScript("back")
- for k, v := range extinfo {
- if tmps, ok := v.([]map[string]interface{}); ok {
- j.Result[k] = [](*ju.ExtField){} // the list returned by the script replaces the previous results of field k
- for _, tmp := range tmps {
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]),
- ExtFrom: qu.ObjToString(tmp["extfrom"]),
- Value: tmp["value"]}
- if tmp["blocktag"] != nil {
- field.BlockTag = tmp["blocktag"].(map[string]string)
- }
- j.Result[k] = append(j.Result[k], field)
- //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
- }
- }
- }
- if len(extinfo) > 0 {
- AddExtLog("clear", j.SourceMid, result, extinfo, in, t) // extraction log
- }
- } else {
- extinfo := map[string]interface{}{}
- if in.Field != "" {
- clearByTitle := false
- if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { // back rule of a title rule for buyer: only values that came from the title are cleaned
- clearByTitle = true
- }
- if j.Result[in.Field] != nil {
- tmp := j.Result[in.Field]
- exts := []interface{}{}
- for k, v := range tmp {
- if clearByTitle && v.ExtFrom != "title" {
- continue
- }
- // projectname values extracted from a table are not cleaned; note that this leaves the whole function, so later values and the log call below are skipped too
- if v.Type == "table" && v.Field == "projectname" {
- return
- }
- text := qu.ObjToString(v.Value)
- if v.Field == "bidamount" || v.Field == "budget" {
- if (strings.Contains(qu.ObjToString(v.SourceValue), "费率")||
- strings.Contains(qu.ObjToString(v.SourceValue), "税率") ||
- strings.Contains(qu.ObjToString(v.SourceValue), "(%)") ) &&
- !strings.Contains(qu.ObjToString(v.SourceValue), "工程设计费"){
- j.Result[in.Field][k].IsTrue = false // source mentions a fee rate, tax rate or percentage (and not a design fee): mark the amount as not trustworthy and leave it uncleaned
- continue
- }
- }
- if text != "" {
- text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
- }
- if text == qu.ObjToString(v.Value) { // value unchanged: nothing to store or log
- continue
- }
- j.Result[in.Field][k].Value = text
- exts = append(exts, map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "type": v.Type,
- "matchtype": v.MatchType,
- "extfrom": v.ExtFrom,
- "value": text,
- })
- }
- if len(exts) > 0 {
- extinfo[in.Field] = exts
- AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) // extraction log
- }
- }
- } else {
- for key, tmp := range j.Result {
- exts := []interface{}{}
- for k, v := range tmp {
- // projectname values extracted from a table are not cleaned; this also leaves the whole function, so the remaining fields are not processed
- if v.Type == "table" && v.Field == "projectname" {
- return
- }
- text := qu.ObjToString(v.Value)
- if text != "" {
- text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
- }
- if text == qu.ObjToString(v.Value) { // value unchanged: nothing to store or log
- continue
- }
- j.Result[key][k].Value = text
- exts = append(exts, map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "type": v.Type,
- "matchtype": v.MatchType,
- "extfrom": v.ExtFrom,
- "value": text,
- })
- }
- if len(exts) > 0 {
- extinfo[key] = exts // extinfo is shared across fields, so each log entry also carries the changes of the fields handled before
- AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) // extraction log
- }
- }
- }
- }
- }
- // ExtRegBackPkg applies the back rule in to one attribute of every package of the job.
- // in.Field picks the attribute (winner, bidstatus, projectname, winnerperson,
- // winnertel); an empty field means the package text itself. Any other field is ignored.
- func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) {
- 	defer qu.Catch()
- 	// scrub rewrites s with the rule's expression and replacement.
- 	scrub := func(s string) string {
- 		return in.RegPreBac.Reg.ReplaceAllString(s, in.RegPreBac.Replace)
- 	}
- 	for k, pkg := range j.BlockPackage {
- 		switch in.Field {
- 		case "winner":
- 			j.BlockPackage[k].Winner = scrub(pkg.Winner)
- 		case "bidstatus":
- 			j.BlockPackage[k].BidStatus = scrub(pkg.BidStatus)
- 		case "":
- 			j.BlockPackage[k].Text = scrub(pkg.Text)
- 		case "projectname":
- 			j.BlockPackage[k].Name = scrub(pkg.Name)
- 		case "winnerperson":
- 			j.BlockPackage[k].WinnerPerson = scrub(pkg.WinnerPerson)
- 		case "winnertel":
- 			j.BlockPackage[k].WinnerTel = scrub(pkg.WinnerTel)
- 		}
- 	}
- }
- // ExtRuleKV applies the replace rule in to the values of j.Result[in.Field] that came
- // from key/value data, i.e. whose Type is "table" or contains "colon" or "space".
- // Values the rule actually changes are stored back and written to the extraction log.
- // Rules without a field, and fields without results, are ignored.
- func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
- 	defer qu.Catch()
- 	if in.Field == "" {
- 		return
- 	}
- 	fields := j.Result[in.Field]
- 	if fields == nil {
- 		return
- 	}
- 	changed := []interface{}{}
- 	for idx, item := range fields {
- 		fromKV := item.Type == "table" || strings.Contains(item.Type, "colon") || strings.Contains(item.Type, "space")
- 		if !fromKV {
- 			continue
- 		}
- 		original := qu.ObjToString(item.Value)
- 		cleaned := original
- 		if original != "" {
- 			cleaned = in.RegPreBac.Reg.ReplaceAllString(original, in.RegPreBac.Replace)
- 		}
- 		if cleaned == original { // unchanged: nothing to store or log
- 			continue
- 		}
- 		j.Result[in.Field][idx].Value = cleaned
- 		changed = append(changed, map[string]interface{}{
- 			"field":     item.Field,
- 			"code":      item.Code,
- 			"ruletext":  item.RuleText,
- 			"type":      item.Type,
- 			"matchtype": item.MatchType,
- 			"extfrom":   item.ExtFrom,
- 			"value":     cleaned,
- 		})
- 	}
- 	if len(changed) > 0 {
- 		extinfo := map[string]interface{}{in.Field: changed}
- 		AddExtLog("clear", j.SourceMid, fields, extinfo, in, t) // extraction log
- 	}
- }
- // GetResultMapForLua converts j.Result into plain maps for use by Lua scripts: one
- // list per field, one map per extracted value with the keys field, code, ruletext,
- // value, type, matchtype and extfrom. Fields without values get an empty list.
- func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
- 	defer qu.Catch()
- 	out := map[string][]map[string]interface{}{}
- 	for field, hits := range j.Result {
- 		rows := make([]map[string]interface{}, 0, len(hits))
- 		for _, hit := range hits {
- 			rows = append(rows, map[string]interface{}{
- 				"field":     hit.Field,
- 				"code":      hit.Code,
- 				"ruletext":  hit.RuleText,
- 				"value":     hit.Value,
- 				"type":      hit.Type,
- 				"matchtype": hit.MatchType,
- 				"extfrom":   hit.ExtFrom,
- 			})
- 		}
- 		out[field] = rows
- 	}
- 	return out
- }
- // AddExtLog queues one extraction-log entry for task t in ExtLogs (flushed by
- // SaveExtLog). ftype names the processing stage, sid the document, before/extinfo the
- // state before and the extracted data, v the rule that produced it. Rules without a
- // code are logged with the code "kv". Nothing is queued unless t.IsEtxLog is set.
- func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
- 	defer qu.Catch()
- 	if t.IsEtxLog {
- 		entry := map[string]interface{}{
- 			"code":       qu.If(v.Code == "", "kv", v.Code),
- 			"name":       v.Name,
- 			"type":       ftype,
- 			"ruletext":   v.RuleText,
- 			"islua":      v.IsLua,
- 			"field":      v.Field,
- 			"version":    t.Version,
- 			"taskname":   t.Name,
- 			"before":     before,
- 			"extinfo":    extinfo,
- 			"sid":        sid,
- 			"comeintime": time.Now().Unix(),
- 		}
- 		lock.Lock()
- 		ExtLogs[t] = append(ExtLogs[t], entry)
- 		lock.Unlock()
- 	}
- }
- // BeforeAddClearFnLog wraps the current state of ext into the log layout (a one-element
- // list keyed by the field name) and passes it to AddClearFnLog. ftype is used both as
- // the log type and as the "type" of the wrapped entry; before is the value prior to
- // cleaning.
- func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) {
- 	snapshot := map[string]interface{}{
- 		"field":     ext.Field,
- 		"code":      ext.Code,
- 		"type":      ftype,
- 		"matchtype": matchtype,
- 		"extfrom":   ext.ExtFrom,
- 		"value":     ext.Value,
- 	}
- 	extinfo := map[string]interface{}{
- 		ext.Field: []map[string]interface{}{snapshot},
- 	}
- 	AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo)
- }
- // AddClearFnLog queues one clear-function log entry for task t in ExtLogs (flushed by
- // SaveExtLog). The entry has the same layout as those written by AddExtLog, with an
- // empty rule text and islua false. Nothing is queued unless t.IsEtxLog is set.
- func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) {
- 	defer qu.Catch()
- 	if t.IsEtxLog {
- 		entry := map[string]interface{}{
- 			"code":       code,
- 			"name":       name,
- 			"type":       ftype,
- 			"ruletext":   "",
- 			"islua":      false,
- 			"field":      field,
- 			"version":    t.Version,
- 			"taskname":   t.Name,
- 			"before":     before,
- 			"extinfo":    extinfo,
- 			"sid":        sid,
- 			"comeintime": time.Now().Unix(),
- 		}
- 		lock.Lock()
- 		ExtLogs[t] = append(ExtLogs[t], entry)
- 		lock.Unlock()
- 	}
- }
- // SaveExtLog writes the queued extraction logs to each task's tracking collection, in
- // batches of at most saveLimit entries, and schedules itself to run again 10 seconds
- // later. The queue is swapped for an empty one under the lock, so logging can continue
- // while the batch is being saved.
- func SaveExtLog() {
- 	defer qu.Catch()
- 	// Re-arm through a defer so that a panic while saving cannot stop the periodic flush:
- 	// deferred calls also run while panicking, and this one runs before qu.Catch.
- 	defer time.AfterFunc(10*time.Second, SaveExtLog)
- 	lock.Lock()
- 	tmpLogs := ExtLogs
- 	ExtLogs = map[*TaskInfo][]map[string]interface{}{}
- 	lock.Unlock()
- 	for k, v := range tmpLogs {
- 		if len(v) == 0 {
- 			continue
- 		}
- 		// Full batches first; the saveLimit > 0 guard keeps a zero or negative limit
- 		// from looping forever or slicing out of range.
- 		for saveLimit > 0 && len(v) > saveLimit {
- 			db.Mgo.SaveBulk(k.TrackColl, v[:saveLimit]...)
- 			v = v[saveLimit:]
- 		}
- 		db.Mgo.SaveBulk(k.TrackColl, v...)
- 	}
- }
- type FieldValue struct { // FieldValue pairs a value of any type with an integer counter; it is not used within this part of the file
- Value interface{} // the value itself
- Count int // NOTE(review): presumably the number of times Value was seen; confirm against the code that fills it
- }
- // Regular expressions for winner values (their users are outside this part of the file).
- var (
- 	// clearWinnerReg matches label words and colons: 名称, 施工, 拟定供应商名称 and ":".
- 	clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:")
- 	// unPackageWinnerReg matches the phrase 重新招标 (re-tender), captured as group 1.
- 	unPackageWinnerReg = regexp.MustCompile("(重新招标)")
- )
- //分析抽取结果并保存
- func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
- qu.Try(func() {
- if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) {
- if j.CategorySecond != "单一" {
- delete(j.Result, "winner")
- delete(j.Result, "bidamount")
- for _, v := range j.BlockPackage {
- v.Bidamount = 0
- v.IsTrueBidamount = false
- if v.Winner != "" {
- v.Winner = ""
- if v.SpaceKV != nil {
- delete(v.SpaceKV.KvTags, "中标单位")
- }
- if v.TableKV != nil {
- delete(v.TableKV.KvTags, "中标单位")
- }
- if v.ColonKV != nil {
- delete(v.ColonKV.KvTags, "中标单位")
- }
- }
- }
- for _, v := range j.PackageInfo {
- delete(v, "winner")
- delete(v, "bidamount")
- }
- j.Winnerorder = nil
- if jf != nil && jf.Winnerorder != nil {
- jf.Winnerorder = nil
- }
- }
- }
- //重新取出清理过后的中标候选人
- resetWinnerorder(j)
- //打分
- doc, result, _id := funcAnalysis(j, e)
- //_, result, _id := funcAnalysis(j, e)
- if ju.IsSaveTag {
- go otherNeedSave(j, result, e)
- }
- //从排序结果中取值
- tmp := map[string]interface{}{} //抽取值
- tmp["spidercode"] = j.SpiderCode
- tmp["site"] = j.Site
- if len(*j.Jsondata) > 0 {
- tmp["jsondata"] = j.Jsondata
- }
- //字段-抽取来源
- fieldSource := make(map[string]interface{},0)
- //字段-抽取来源
- for k, val := range result {
- if k == "qualifies" {
- squalifies := make([]interface{}, 0)
- squalifiesMap := make(map[string]*scoreIndex, 0)
- for _, kv := range val {
- skey := kv.RuleText
- if kv.Score > 0 {
- if squalifiesMap[skey] == nil {
- squalifiesMap = map[string]*scoreIndex{
- skey: &scoreIndex{
- Score: kv.Score,
- Index: len(squalifies),
- },
- }
- squalifies = append(squalifies, map[string]interface{}{
- "key": skey,
- "value": kv.Value,
- })
- } else {
- if squalifiesMap[skey].Score < kv.Score {
- squalifies[squalifiesMap[skey].Index] = map[string]interface{}{
- "key": skey,
- "value": kv.Value,
- }
- }
- }
- }
- }
- tmp[k] = squalifies
- continue
- }
- for _, v := range val { //取第一个非负数,项目名称除外
- //存0是否有效
- if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 {
- tmp[v.Field] = v.Value
- fieldSource[v.Field] = map[string]interface{}{
- "ext_type":v.Type,
- "ext_from":v.ExtFrom,
- }
- break
- }
- if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
- tmp[v.Field] = v.Value
- fieldSource[v.Field] = map[string]interface{}{
- "ext_type":v.Type,
- "ext_from":v.ExtFrom,
- }
- break
- }
- }
- }
- tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",")
- if len(j.PackageInfo) > 15 {
- for k, v := range j.PackageInfo {
- j.PackageInfo = map[string]map[string]interface{}{}
- j.PackageInfo[k] = v
- break
- }
- }
- if len(j.PackageInfo) > 0 { //分包信息
- tmp["package"] = j.PackageInfo
- //包预算,中标金额合并大于抽取就覆盖
- tmpBidamount, tmpBudget,tmpAgencyfee:=qu.Float64All(0),qu.Float64All(0),qu.Float64All(0)
- //s_winner逗号分隔拼接,分包中标人
- var tmpstr, savewinner []string
- //按包排序
- for b, v := range j.PackageInfo {
- if v["winner"] != nil && v["winner"] != "" {
- tmpstr = append(tmpstr, b)
- }
- }
- //包预算,中标金额合并大于抽取就覆盖
- if len(j.PackageInfo) >= 1 {
- //包数大于1累加
- for _, v := range j.PackageInfo {
- if v["budget"] != nil {
- tmpBudget = precisionAddFloat(tmpBudget,qu.Float64All(v["budget"]))
- }
- if v["bidamount"] != nil {
- tmpBidamount = precisionAddFloat(tmpBidamount,qu.Float64All(v["bidamount"]))
- }
- if v["agencyfee"] != nil {
- tmpAgencyfee = precisionAddFloat(tmpAgencyfee,qu.Float64All(v["agencyfee"]))
- }
- }
- if qu.Float64All(tmp["budget"]) < tmpBudget {
- fieldSource["budget"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["budget"] = tmpBudget
- }
- if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee {
- fieldSource["agencyfee"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["agencyfee"] = tmpAgencyfee
- }
- if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
- fieldSource["bidamount"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["bidamount"] = tmpBidamount
- } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
- fieldSource["bidamount"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["bidamount"] = tmpBidamount
- }
- } else {
- //包数等于1,tmp没有值取包里的值
- if tmp["budget"] == nil || tmp["budget"] == 0 {
- for _, v := range j.PackageInfo {
- if v["budget"] != nil {
- fieldSource["budget"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["budget"] = v["budget"]
- }
- }
- }
- if tmp["agencyfee"] == nil || tmp["agencyfee"] == 0 {
- for _, v := range j.PackageInfo {
- if v["agencyfee"] != nil {
- fieldSource["agencyfee"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["agencyfee"] = v["agencyfee"]
- }
- }
- }
- if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
- for _, v := range j.PackageInfo {
- if v["bidamount"] != nil {
- fieldSource["bidamount"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- tmp["bidamount"] = v["bidamount"]
- }
- }
- }
- }
- //s_winner逗号分隔拼接,分包中标人
- sort.Strings(tmpstr)
- for _, v := range tmpstr {
- winner := qu.ObjToString(j.PackageInfo[v]["winner"])
- new_winner := clearWinnerReg.ReplaceAllString(winner, "")
- if new_winner == "" {
- continue
- }
- //名称黑名单
- if unPackageWinnerReg.MatchString(new_winner) {
- continue
- }
- savewinner = append(savewinner, new_winner)
- }
- if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
- tmp["s_winner"] = tmp["winner"]
- fieldSource["s_winner"] = fieldSource["winner"]
- } else if savewinner != nil {
- savewinner = RemoveReplicaSliceString(savewinner)
- tmp["s_winner"] = strings.Join(savewinner, ",")
- if len(savewinner)==1 {
- fieldSource["s_winner"] = fieldSource["winner"]
- }else if len(savewinner)>1{
- fieldSource["s_winner"] = map[string]interface{}{
- "ext_type":"",
- "ext_from":"package",
- }
- }
- }
- } else if tmp["winner"] != nil && tmp["winner"] != "" {
- //没有分包取winner
- tmp["s_winner"] = tmp["winner"]
- fieldSource["s_winner"] = fieldSource["winner"]
- }
- if len(j.Winnerorder) > 0 { //候选人信息
- for i, v := range j.Winnerorder {
- if v["price"] != nil {
- tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
- if tmpPrice[len(tmpPrice)-1].(bool) {
- j.Winnerorder[i]["price"] = tmpPrice[0]
- } else {
- delete(j.Winnerorder[i], "price")
- }
- }
- }
- tmp["winnerorder"] = j.Winnerorder
- }
- //处理附件
- var resultf map[string][]*ju.ExtField
- ffield := map[string]interface{}{}
- if jf != nil {
- _, resultf, _ = funcAnalysis(jf, e)
- for _, val := range resultf {
- for _, v := range val { //取第一个非负数
- if v.Score > -1 {
- ffield[v.Field] = v.Value
- if tmp[v.Field] == nil {
- if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
- tmp[v.Field] = v.Value
- fieldSource[v.Field] = map[string]interface{}{
- "ext_type":v.Type,
- "ext_from":"ff",
- }
- break
- }
- if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 {
- tmp[v.Field] = v.Value
- fieldSource[v.Field] = map[string]interface{}{
- "ext_type":v.Type,
- "ext_from":"ff",
- }
- break
- }
- }
- break
- }
- }
- }
- if len(jf.PackageInfo) > 0 { //分包信息
- ffield["package"] = jf.PackageInfo
- }
- if len(jf.Winnerorder) > 0 { //候选人信息
- ffield["winnerorder"] = jf.Winnerorder
- }
- }
- //添加字段来源
- tmp["field_source"] = fieldSource
- //添加字段来源
- for k, v := range *doc {
- if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
- (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
- }
- //去重冗余字段
- if delFiled(k) {
- continue
- }
- if tmp[k] == nil {
- tmp[k] = v
- }
- }
- //质量审核
- if ju.QualityAudit {
- e.QualityAudit(tmp)
- }
- //城市抽取
- if e.IsExtractCity {
- e.NewExtractCity(j, &tmp, _id)
- }
- //品牌抽取
- if ju.IsBrandGoods {
- tmp["checkhas"] = map[string]int{
- "hastable": j.HasTable,
- "hasgoods": j.HasGoods,
- "hasbrand": j.HasBrand,
- "haskey": j.HasKey,
- }
- if len(j.BrandData) > 0 {
- tmp["tablebrand"] = j.BrandData
- }
- }
- //prince和number抽取
- if ju.IsPriceNumber {
- priceNumberLen := len(j.PriceNumberData)
- if priceNumberLen > 1 { //table数据去重
- tmpPriceNumberData := []map[string]interface{}{}
- tableStrs := map[string]bool{}
- for _, tb := range j.PriceNumberData {
- has := false
- bytes, _ := json.Marshal(tb)
- str := string(bytes)
- if len(tableStrs) > 0 && tableStrs[str] {
- has = true
- } else {
- tableStrs[str] = true
- }
- if !has {
- for _, data := range tb {
- tmpPriceNumberData = append(tmpPriceNumberData, data)
- }
- }
- }
- tmp["pricenumber"] = tmpPriceNumberData
- } else if priceNumberLen == 1 {
- tmp["pricenumber"] = j.PriceNumberData[0]
- }
- }
- //所有kv组成的字符串
- var kvtext bytes.Buffer
- blocks := make([]ju.BlockAndTag, 0)
- for _, v := range j.Block {
- //分包和标签
- if ju.SaveBlock {
- xx, _ := json.Marshal(v)
- tmpblock := new(ju.TmpBlock)
- err := json.Unmarshal(xx, &tmpblock)
- if err != nil {
- if v.BPackage != nil {
- bpb, _ := json.Marshal(v.BPackage)
- tmpblock.BPackage = string(bpb)
- }
- tmpblock = rangeBlockToJson(v, *tmpblock)
- }
- blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
- }
- //把所有kv组装成一个字符串,存库
- for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
- if jv == nil {
- continue
- }
- for jv_k, jv_v := range jv.KvTags {
- for _, jv_vv := range jv_v {
- kvtext.WriteString(jv_k)
- kvtext.WriteString(":")
- kvtext.WriteString(jv_vv.Value)
- kvtext.WriteString("\n")
- }
- }
- }
- }
- if kvtext.Len() > 0 {
- tmp["kvtext"] = kvtext.String()
- }
- if len(blocks) > 0 {
- if blocksBytes, err := json.Marshal(blocks); err == nil {
- if utf8.RuneCount(blocksBytes) < 100000 {
- tmp["blocks"] = string(blocksBytes)
- }
- }
- }
- tmp["dataging"] = j.Dataging
- /*for k, v := range *j.Data {
- if f[k] {
- tmp[k] = v
- }
- }
- for k := range tmp {
- if !f[k]{
- delete(tmp,k)
- }
- }*/
- //检查字段
- tmp = checkFields(tmp)
- if tmp["projectname"] == nil || tmp["projectname"] == "" {
- tmp["projectname"] = j.Title
- }
- tmp["repeat"] = 0
- if ju.Ffield {
- if len(ffield) > 0 {
- tmp["ffield"] = ffield
- }
- }
- if e.TaskInfo.TestColl == "" {
- if len(tmp) > 0 { //保存抽取结果
- delete(tmp, "_id")
- tmparr := []map[string]interface{}{
- map[string]interface{}{
- "_id": qu.StringTOBsonId(_id),
- },
- map[string]interface{}{"$set": tmp},
- }
- e.RWMutex.Lock()
- e.BidArr = append(e.BidArr, tmparr)
- e.BidTotal++
- e.RWMutex.Unlock()
- }
- if ju.SaveResult {
- id := tmp["_id"]
- tmp["result"] = result
- tmp["resultf"] = resultf
- delete(tmp, "_id")
- tmparr := []map[string]interface{}{
- map[string]interface{}{
- "_id": id,
- },
- map[string]interface{}{"$set": tmp},
- }
- e.RWMutex.Lock()
- e.ResultArr = append(e.ResultArr, tmparr)
- e.RWMutex.Unlock()
- }
- } else { //测试结果
- delete(tmp, "_id")
- delete(tmp, "fieldall")
- if len(j.BlockPackage) > 0 { //分包详情
- if len(j.BlockPackage) > 10 {
- tmp["epackage"] = "分包异常"
- } else {
- bs, _ := json.Marshal(j.BlockPackage)
- tmp["epackage"] = string(bs)
- }
- }
- tmp["result"] = result
- //tmp["resultf"] = resultf
- //_,err :=db.Mgo.Get().DB("zhengkun").C("result_data").Upsert(`{"_id":"`+_id+`"}`,map[string]interface{}{"$set": tmp})
- //log.Debug("save:",err)
- b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
- if !b {
- log.Debug(e.TaskInfo.TestColl, _id)
- }
- }
- }, func(err interface{}) {
- log.Debug("AnalysisSaveResult err", err)
- })
- }
- //检查字段-
- func checkFields(tmp map[string]interface{}) map[string]interface{} {
- delete(tmp, "contenthtml")
- delete(tmp, "detail")
- tmp["repeat"] = 0
- //指定爬虫-金额处理-预算-中标金额异常
- if qu.ObjToString(tmp["spidercode"])=="xz_xzzzqjzscjgycxxxpt_zbtzs" {
- if budget, ok := tmp["budget"].(float64); ok && budget>0 && budget < 1000000{
- tmp["budget"] = budget*10000.0
- }
- if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{
- tmp["bidamount"] = bidamount/10000.0
- }
- }
- if qu.ObjToString(tmp["spidercode"])=="js_jsszbtbw_zbhxrgs" {
- if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{
- tmp["bidamount"] = bidamount/10000.0
- }
- }
- if _, ok := tmp["bidamount"].(string); ok {
- delete(tmp, "bidamount")
- } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/5 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
- delete(tmp, "bidamount")
- }
- if _, ok := tmp["budget"].(string); ok {
- delete(tmp, "budget")
- }
- if _, ok := tmp["unitprice"].(string); ok {
- delete(tmp, "unitprice")
- }
- if _, ok := tmp["bidopentime"].(string); ok {
- delete(tmp, "bidopentime")
- }
- if _, ok := tmp["signaturedate"].(string); ok {
- delete(tmp, "signaturedate")
- }
- if _, ok := tmp["supervisorrate"].(string); ok {
- delete(tmp, "supervisorrate")
- }
- for k, v := range tmp {
- if k == "qualifies" {
- continue
- }
- if k == "contract_guarantee" || k == "bid_guarantee" {
- if len(fmt.Sprint(v)) > 0 {
- tmp[k] = true
- } else {
- delete(tmp, k)
- }
- }
- if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") {
- delete(tmp, k)
- }
- }
- //项目周期-有效值
- projectperiod := qu.ObjToString(tmp["projectperiod"])
- if projectperiod !="" {
- //项目周期包含日期,数字及日期单位可保留,其余可清洗
- isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
- if !isNeedValueReg.MatchString(projectperiod) {
- delete(tmp, "projectperiod")
- }
- }
- //工期单位是否有效-清理
- if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
- dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
- if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit)>4 {
- delete(tmp, "project_timeunit")
- }
- //年-0 >5 删除
- if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"])==0 || qu.Int64All(tmp["project_duration"])>5 ){
- delete(tmp, "project_timeunit")
- }
- }
- if tmp["winner"] != nil && tmp["s_winner"] != nil {
- strwin := qu.ObjToString(tmp["winner"])
- strwin_s := qu.ObjToString(tmp["s_winner"])
- if !strings.Contains(strwin_s, strwin) {
- tmp["s_winner"] = strwin
- }
- }
- //budget bidamount
- if bg, ok := tmp["budget"].(float64); ok {
- if bg >= 50000000000 {
- tmp["budget_max_err"] = bg
- delete(tmp, "budget")
- }
- }
- if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
- code := qu.ObjToString(tmp["spidercode"])
- if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" &&
- code != "js_jsszbtbw_zbhxrgs"{
- tmp["bidamount_max_err"] = bg
- delete(tmp, "bidamount")
- }
- }
- //投标方式-
- bidway := qu.IntAll(tmp["bidway"])
- if bidway == 1 {
- tmp["bidway"] = "纸质投标"
- }else if bidway == 2 {
- tmp["bidway"] = "电子投标"
- }else {
- delete(tmp, "bidway")
- }
- //折扣系数
- discount := dealWithDiscountBid(tmp)
- if discount >0.0 {
- tmp["biddiscount"] = discount
- }else {
- delete(tmp, "biddiscount")
- }
- delete(tmp, "biddiscount_up")
- delete(tmp, "biddiscount_down")
- //临时
- //bidopentime := qu.Int64All(tmp["bidopentime"])
- //bidendtime := qu.Int64All(tmp["bidendtime"])
- //timeLayout := "2006-01-02 15:04:05"
- //
- //if bidopentime>0 {
- // bidopentime_str := time.Unix(bidopentime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
- // tmp["bidopentime"] = bidopentime_str
- //}
- //if bidendtime>0 {
- // bidendtime_str := time.Unix(bidendtime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
- // tmp["bidendtime"] = bidendtime_str
- //}
- jyhref:= fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
- tmp["jytest_href"] = jyhref
- return tmp
- }
- //处理折扣系数
- func dealWithDiscountBid(tmp map[string]interface{}) float64 {
- biddiscount := qu.Float64All(tmp["biddiscount"])
- biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
- biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
- baseCount := float64(1)
- if biddiscount_down >0.0 {
- num1:=decimal.NewFromFloat(baseCount)
- num2:=decimal.NewFromFloat(biddiscount_down)
- decimalValue := num1.Sub(num2)
- res,_ := decimalValue.Float64()
- //log.Debug("下浮后折扣系数:",res)
- return res
- }
- if biddiscount_up >0.0 {
- num1:=decimal.NewFromFloat(baseCount)
- num2:=decimal.NewFromFloat(biddiscount_up)
- decimalValue := num1.Add(num2)
- res,_ := decimalValue.Float64()
- //log.Debug("上浮后折扣系数:",res)
- return res
- }
- if biddiscount>0.0 {
- if biddiscount > 1.0 && biddiscount<=10.0 {
- num1:=decimal.NewFromFloat(10.0)
- num2:=decimal.NewFromFloat(biddiscount_up)
- decimalValue := num2.Div(num1)
- res,_ := decimalValue.Float64()
- //log.Debug("标准-①折扣系数:",res)
- return res
- }else if biddiscount>10.0 {
- num1:=decimal.NewFromFloat(100.0)
- num2:=decimal.NewFromFloat(biddiscount_up)
- decimalValue := num2.Div(num1)
- res,_ := decimalValue.Float64()
- //log.Debug("标准-⑩折扣系数:",res)
- return res
- }else {
- //log.Debug("标准折扣系数:",biddiscount)
- return biddiscount
- }
- }
- return 0.0
- }
- //精度丢失-相加
- func precisionAddFloat(tmp1,tmp2 float64)float64{
- num1:=decimal.NewFromFloat(tmp1)
- num2:=decimal.NewFromFloat(tmp2)
- decimalValue := num2.Add(num1)
- res,_ := decimalValue.Float64()
- return res
- }
- //保存其他
- //kv、表格、块上的标签凡是新的标签都入库
- //val type times firstid createtime 判定field
- func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
- now := time.Now().Unix()
- coll := e.TaskInfo.TestColl
- if coll == "" {
- coll = "extract_tag_result"
- } else {
- coll += "_tag"
- }
- datas := []map[string]interface{}{}
- kv := map[string]int{}
- for _, v := range j.Block {
- //
- for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
- if vv == nil || vv.KvTags == nil {
- continue
- }
- for kkk, vvv := range vv.KvTags {
- for _, vvvv := range vvv {
- if vvvv.IsInvalid {
- kv[kkk] = kv[kkk] + 1
- break
- }
- }
- }
- }
- for _, vv := range v.NotClassifyTitles {
- datas = append(datas, map[string]interface{}{
- "val": vv,
- "times": 0,
- "type": "block",
- "firstid": j.SourceMid,
- "createtime": now,
- })
- if len(datas) == saveLimit {
- db.Mgo.SaveBulk(coll, datas...)
- datas = []map[string]interface{}{}
- }
- }
- }
- for k, v := range kv {
- datas = append(datas, map[string]interface{}{
- "val": k,
- "times": v,
- "type": "kv",
- "firstid": j.SourceMid,
- "createtime": now,
- })
- if len(datas) == saveLimit {
- db.Mgo.SaveBulk(coll, datas...)
- datas = []map[string]interface{}{}
- }
- }
- if len(datas) > 0 {
- db.Mgo.SaveBulk(coll, datas...)
- }
- }
- func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
- if j == nil {
- return nil
- }
- if len(j.Block) > 0 {
- for i, v := range j.Block {
- rangetmp := new(ju.TmpBlock)
- vb, _ := json.Marshal(v)
- json.Unmarshal(vb, &rangetmp)
- tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
- }
- }
- if j.ColonKV != nil {
- cb, _ := json.Marshal(j.ColonKV)
- tmpblock.ColonKV = string(cb)
- }
- if j.SpaceKV != nil {
- sb, _ := json.Marshal(j.SpaceKV)
- tmpblock.SpaceKV = string(sb)
- }
- if j.TableKV != nil {
- tb, _ := json.Marshal(j.TableKV)
- tmpblock.TableKV = string(tb)
- }
- return &tmpblock
- }
// delFiled reports whether k is one of the redundant source fields that are
// not copied into the saved record.
func delFiled(k string) bool {
	switch k {
	case "detailfile", "summary", "detail", "contenthtml",
		"site", "spidercode", "projectinfo", "jsondata":
		return true
	}
	return false
}
- // funcAnalysis scores and sorts the extracted candidates of job j, reconciles them with the job's JSON data and returns j.Data, the scored result map and the document id as a string. NOTE(review): presumably qu.Catch recovers panics, in which case the zero values (nil doc) are returned; confirm.
- func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
- defer qu.Catch()
- doc := j.Data
- result := j.Result // overwritten by ScoreFields two lines below
- _id := qu.BsonIdToSId((*doc)["_id"])
- result = ScoreFields(j, e.Tag) // scoring with positive/negative words
- // sort the candidates of every field
- for _, val := range result {
- ju.Sort(val)
- }
- if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) { // only when there are results and non-empty JSON data
- clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
- marshalbt, _ := json.Marshal(j.Jsondata) // copy of the JSON data via a JSON round trip; it replaces j.Jsondata at the end of this branch
- tmpjddata := make(map[string]interface{})
- json.Unmarshal(marshalbt, &tmpjddata)
- for _, jdkey := range ju.JsonData {
- if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 { // only fields with at least five candidates are compared
- for tmpk, tmpv := range j.Result[jdkey][:5] { // compare the JSON value with the first five candidates
- if jdkey == "budget" || jdkey == "bidamount" {
- lockclear.Lock()
- cfn := e.ClearFn[jdkey]
- lockclear.Unlock()
- if len(cfn) == 0 {
- continue
- }
- newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney) // amounts are compared after cleaning the JSON value
- if tmpv.Value == newNum[0] {
- extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
- j.Result[jdkey] = append(j.Result[jdkey], extField) // the confirmed value is added as a new candidate with score 100
- ju.Sort(j.Result[jdkey])
- delete((*j.Jsondata), jdkey) // the key is consumed, so it is left out of the merge below
- break
- }
- } else {
- if (*j.Jsondata)[jdkey] == tmpv.Value {
- extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
- j.Result[jdkey] = append(j.Result[jdkey], extField)
- ju.Sort(j.Result[jdkey])
- delete((*j.Jsondata), jdkey)
- break
- }
- }
- }
- }
- }
- if len(*j.Jsondata) > 0 { // keys that were not consumed above go through the merge step
- j.Result = JsonDataMergeProcessing(j, e)
- }
- j.Jsondata = &tmpjddata // put back the copy taken before keys were deleted
- }
- return doc, result, _id // NOTE(review): result is the ScoreFields return value; if JsonDataMergeProcessing returns a different map it is not reflected here, confirm
- }
- //辅助信息,如果没有排序先排序
- func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
- fieldalls := map[string][]map[string]interface{}{}
- if j == nil {
- return fieldalls
- }
- qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
- defer qykredis.Close()
- db := 0
- for field, val := range j.Result {
- //ju.Sort(val)
- if field == "buyer" {
- db = ju.BuyerDB
- } else if field == "winner" {
- db = ju.WinnerDB
- } else if field == "agency" {
- db = ju.AgencyDB
- }
- sfields := []map[string]interface{}{}
- for _, v := range val {
- standardized := false
- if _, err := qykredis.Do("SELECT", db); err != nil {
- fmt.Println("redis select err", err)
- } else {
- rep, err := qykredis.Do("GET", v.Value)
- if rep != nil && err == nil {
- standardized = true
- }
- }
- if field == "budget" || field == "bidamount" {
- if !v.IsTrue {
- continue
- }
- }
- sfield := map[string]interface{}{
- "val": v.Value,
- "type": v.Type,
- "score": v.Score,
- "blocktag": v.BlockTag,
- "sourceval": v.SourceValue,
- "standardized": standardized,
- }
- sfields = append(sfields, sfield)
- }
- fieldalls[field] = sfields
- }
- return fieldalls
- }
- func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
- defer qu.Catch()
- //获取审核字段
- for _, field := range e.AuditFields {
- //1.分包
- if resulttmp["package"] != nil {
- packagedata := resulttmp["package"].(map[string]map[string]interface{})
- for _, val := range packagedata {
- if val[field] != nil {
- fv := qu.ObjToString(val[field])
- if fv != "" {
- if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
- e.RedisMatch(field, fv, val) //redis匹配
- } else { //除了buyer和winner,其他字段走规则匹配
- e.RuleMatch(field, fv, val)
- }
- }
- }
- }
- }
- //2.外围
- if resulttmp[field] != nil {
- fv := qu.ObjToString(resulttmp[field])
- if fv != "" {
- if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
- e.RedisMatch(field, fv, resulttmp) //redis匹配
- } else { //除了buyer和winner,其他字段走规则匹配
- e.RuleMatch(field, fv, resulttmp)
- }
- }
- }
- }
- }
- //Redis匹配
- func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
- defer qu.Catch()
- i := redis.GetInt(field, field+"_"+fv) //查找redis
- if i == 0 { //reids未找到,执行规则匹配
- val[field+"_isredis"] = false
- e.RuleMatch(field, fv, val) //规则匹配
- } else { //redis找到,打标识存库
- val[field+"_isredis"] = true
- }
- }
- //规则匹配
- func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
- defer qu.Catch()
- if fieldval != "" {
- SMap := e.StartMatch(field, fieldval)
- //SMap.AddKey(field+"_isaudit", false)
- for _, k := range SMap.Keys {
- tmpMap[k] = SMap.Map[k]
- }
- tmpMap[field+"_isaudit"] = false //添加字段未审核信息
- }
- }
- //开始规则匹配
- func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
- defer qu.Catch()
- SMap := pretreated.NewSortMap()
- lock.Lock()
- f := e.RecogFieldMap[field]
- lock.Unlock()
- if len(f) > 0 {
- fid := qu.BsonIdToSId(f["_id"])
- recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
- textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
- if textAfterRecogFieldPrerule != "" {
- lock.Lock()
- classMap := e.FidClassMap[fid]
- lock.Unlock()
- L:
- for _, c := range classMap { //class
- classid := qu.BsonIdToSId(c["_id"])
- classPrerule := qu.ObjToString(c["s_class_prerule"])
- savefield := qu.ObjToString(c["s_savefield"]) //保存字段
- textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
- if textAfterClassPrerule != "" {
- lock.Lock()
- ruleMap := e.CidRuleMap[classid]
- lock.Unlock()
- for _, r := range ruleMap { //rule
- rulePrerule := qu.ObjToString(r["s_rule_prerule"])
- s_name := qu.ObjToString(r["s_name"])
- rule := r["rule"].([]interface{})
- textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
- if textAfterRulePrerule != "" {
- b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
- if b { //匹配到一个分类下某个规则时,不再继续匹配
- if savefield != "" { //保存字段不为空,存储代码信息
- SMap.AddKey(field+"_"+savefield, s_name)
- }
- break L
- }
- }
- }
- }
- }
- }
- }
- return SMap
- }
- //筛选重复候选人-相关
- func filterRepeatWinArr(j *ju.Job) {
- if j.SpiderCode=="sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
- sort_WinOrder_Arr := make([][]map[string]interface{},0)
- sort_arr := make([]map[string]interface{},0)
- for _,v := range j.Winnerorder{
- sort := qu.IntAll(v["sort"])
- if sort==1 { //为一组
- if len(sort_arr)>0 {
- sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr)
- }
- sort_arr = make([]map[string]interface{},0)
- }
- sort_arr = append(sort_arr,v)
- }
- if len(sort_arr)>0 {
- sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr)
- }
- if len(sort_WinOrder_Arr)>0 { //有重复排序组-开始筛选清理
- isIndex :=0
- for index,winArr := range sort_WinOrder_Arr {
- if len(winArr)>0 {
- if qu.ObjToString(winArr[0]["price"])!=""&&
- qu.ObjToString(winArr[0]["entname"])!="" {
- isIndex = index
- break
- }
- }
- }
- j.Winnerorder = sort_WinOrder_Arr[isIndex]
- }
- }
- }
- //中标候选人经过清理之后,重新取出赋值
- func resetWinnerorder(j *ju.Job) {
- if len(j.Winnerorder) == 0 {
- return
- }
- maxlen := len(j.Winnerorder) - 1
- //中标单位
- //i := 0
- winners := []*ju.ExtField{}
- bidamounts := []*ju.ExtField{}
- if maxlen > 0 {
- //新增-指定爬虫中标候选人过滤
- filterRepeatWinArr(j)
- if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
- return
- }
- winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
- if j.Winnerorder[0]["price"] != nil {
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
- if tmpPrice[len(tmpPrice)-1].(bool) {
- bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
- }
- }
- }
- if j.Result["winner"] == nil && len(winners) > 0 {
- j.Result["winner"] = winners
- } else if len(winners) > 0 {
- j.Result["winner"] = append(j.Result["winner"], winners...)
- }
- if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
- j.Result["bidamount"] = bidamounts
- } else if len(bidamounts) > 0 {
- j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
- }
- if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
- winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
- j.Result["winner"] = winners
- if j.Winnerorder[0]["price"] != nil {
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
- if tmpPrice[len(tmpPrice)-1].(bool) {
- bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
- }
- j.Result["bidamount"] = bidamounts
- }
- }
- }
// RemoveReplicaSliceString returns the distinct strings of slc in order of
// first appearance. The result is always a new, non-nil slice.
func RemoveReplicaSliceString(slc []string) []string {
	unique := make([]string, 0)
	seen := make(map[string]bool, len(slc))
	for i := 0; i < len(slc); i++ {
		item := slc[i]
		if seen[item] {
			continue
		}
		seen[item] = true
		unique = append(unique, item)
	}
	return unique
}
- type scoreIndex struct { // scoreIndex is the per-key bookkeeping entry used while the "qualifies" list is built in AnalysisSaveResult.
- Score float64 // score recorded for the key
- Index int // position of the key's entry in the output slice
- }
|