12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847 |
- package extract
- import (
- "bytes"
- "encoding/json"
- "fmt"
- "github.com/shopspring/decimal"
- "gopkg.in/mgo.v2/bson"
- "io"
- "jy/clear"
- "jy/pretreated"
- ju "jy/util"
- "net/http"
- qu "qfw/util"
- "qfw/util/redis"
- "regexp"
- "strings"
- "sync"
- "time"
- "unicode/utf8"
- )
// scoreIndex couples a floating-point score with an integer index.
type scoreIndex struct {
	Score float64 // score value
	Index int     // index the score is associated with
}
- var (
- lock, lockrule sync.RWMutex
- lockclear, locktag sync.RWMutex
- blocktag sync.RWMutex
- JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
- cut = ju.NewCut() //获取正文并清理
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
- TaskList map[string]*ExtractTask //任务列表
- ClearTaskList map[string]*ClearTask //清理任务列表
- saveLimit = 100 //抽取日志批量保存
- PageSize = 5000 //查询分页
- Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
- BiddingFields = map[string]interface{}{
- "_id": 1,
- "title": 1,
- "site": 1,
- "spidercode": 1,
- "toptype": 1,
- "subtype": 1,
- "comeintime": 1,
- "publishtime": 1,
- "href": 1,
- "dataging": 1,
- }
- Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
- NiJianField = []string{
- "string#approvecode",
- "string#total_investment",
- "string#funds",
- "string#owner",
- "string#projectaddr",
- "string#projectperiod",
- "string#project_scale",
- "string#project_person",
- "string#project_phone",
- "string#approvenumber",
- "string#projecttype",
- "string#approvestatus",
- "time#project_startdate",
- "time#project_completedate",
- "map#construction_area",
- "map#floor_area",
- }
- spidercode = map[string]bool{
- "gd_zhsggzyjyzx_jsgc_fjczbgg": true,
- "js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
- "zj_tzsyhggzyjyzx_jsgc_kbqk": true,
- "hb_tmsggzyjyxxw_jsgc_kbqk": true,
- "zj_nbsyyggzyjyw_jsgc_kbqk": true,
- "zj_zjsggzyjyzx_jyxx_kbjg": true,
- "zj_zjzdgcjyw_ztbjglxx_kbjg": true,
- "zj_lssggzyjyw_jsgc_kbsk": true,
- "zj_qzslyxggzyjyzx_gggs_xkbjl": true,
- "sc_mssggzydzjypt_jsgc_kbjl": true,
- "sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
- "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
- "a_hbszbtbggfwpt_kbjl": true,
- "a_szsjsgcjyfwzxbafzx_kbqkgs": true,
- "a_szldzbyxgs_kbxx": true,
- "zj_zssssxggzyjyw_gcjs_kbjggs": true,
- "gd_szszfhjsj_kbqkgs": true,
- "a_gjggzyjypt_gcjs_kbjl": true,
- "a_gjggzyjypt_gcjs_kbjl_new": true,
- "zj_tzsyhggzyjyzx_kbjggg": true,
- "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
- "ah_czsggzyjyw_jsgc_kbjl": true,
- "ah_czsggzyjyw_zfcg_kbxx": true,
- "ah_whsggzyjyfww_kbxx_cgxm": true,
- "ah_whsggzyjyfww_kbxx_gcxm": true,
- }
- clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
- sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
- clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
- clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
- textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
- winorderLock sync.Mutex
- jfwinorderLock sync.Mutex
- )
var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
var unPackageWinnerReg = regexp.MustCompile("(重新招标)")

// letter_entity matches an organisation name that contains Latin letters
// between two runs of Chinese characters.
var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")

// Signature ("inscription") extraction.
// inscribe_entity_1: an organisation on its own line followed, on a later
// line, by a date. Group 2 is the organisation, group 5 the date.
var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s ]*[\n]+([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")

// inscribe_entity_2: organisation, date, newline, then a second organisation.
// Group 2 is the first organisation, group 5 the date, group 6 the second
// organisation. The trailing character class used to be written "[会|体]",
// which also accepted a literal '|'; it is now "[会体]" like its siblings.
var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s ]*([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))")

// Special entity: an organisation introduced by a tendering-department label.
// Group 2 is the organisation.
var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府))")

// effectivefirm matches a valid enterprise name made only of Chinese characters.
var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|办公室|车务段|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")

// inscribe_publishtime_1 recognises a publish date.
var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")

// exclude_entity lists wording that disqualifies a candidate entity.
var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")

//var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")

// entdfa_entity is the general pattern a recognised entity must satisfy.
// As in inscribe_entity_2, "[会|体]" was corrected to "[会体]" so that a
// literal '|' is no longer accepted as a name suffix.
var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")

// entdfa_clean collapses runs of whitespace/newlines.
var entdfa_clean = regexp.MustCompile("([\\s \n]+)")

// entdfa_filtration matches titles for which body/attachment recognition is skipped.
var entdfa_filtration = regexp.MustCompile("(开标记录)")

// isNeedValueReg tells whether a period value looks meaningful.
var isNeedValueReg = regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
- // 清洗正文
- func CleanDetailText(detail string, summary string) string {
- detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
- detail = pretreated.RepairCon(detail)
- detail = ju.CutLableStr(summary + "\n" + detail)
- detail = cut.ClearHtml(summary + "\n" + detail)
- return detail
- }
- // 综合选取detail与contenthtml情况 true采用正文
- func SelectDetailSourceText(detail string, contenthtml string) bool {
- if len(detail) < 1000 {
- return false
- }
- if textSelectReg.MatchString(detail) && !textSelectReg.MatchString(contenthtml) {
- return true
- }
- return false
- }
// SelectSourceStructText weighs detail against contenthtml and reports true
// when the HTML source should be used: contenthtml is longer than 500 bytes
// and multi-line, detail is a single line, and detail is less than 500 bytes
// longer than contenthtml.
//
// The line checks use strings.Contains rather than splitting both documents,
// which is equivalent (one line == no "\n") without allocating line slices.
func SelectSourceStructText(detail string, contenthtml string) bool {
	// NOTE(review): the length test is one-sided (a much shorter detail also
	// passes); unchanged from the original - confirm that is intended.
	if len(detail)-len(contenthtml) >= 500 || len(contenthtml) <= 500 {
		return false
	}
	detailSingleLine := !strings.Contains(detail, "\n")
	htmlMultiLine := strings.Contains(contenthtml, "\n")
	return detailSingleLine && htmlMultiLine
}
- // 遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
- func file2text(doc *map[string]interface{}) {
- mnameone := map[string]bool{}
- mname := map[string]bool{}
- murl := map[string]string{}
- //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
- if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
- for _, attachs := range attach_text {
- if fileinfos, ok := attachs.(map[string]interface{}); ok {
- for _, fileinfo := range fileinfos {
- if ff, ok := fileinfo.(map[string]interface{}); ok {
- attach_url := qu.ObjToString(ff["attach_url"])
- ffname := qu.ObjToString(ff["file_name"])
- if clearStrReg.MatchString(ffname) {
- continue
- }
- mname[ffname] = true
- murl[ffname] = attach_url
- if sortStrReg.MatchString(ffname) {
- mnameone[ffname] = true
- }
- }
- }
- }
- }
- }
- tmpstr := ""
- for k := range mnameone {
- if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
- (*doc)["detailfile"] = tmpstr
- return
- }
- bs := ju.OssGetObject(murl[k])
- if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
- tmpstr += bs + "\n"
- } else {
- tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
- }
- }
- for k := range mname {
- if mnameone[k] {
- continue
- }
- if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
- (*doc)["detailfile"] = tmpstr
- return
- }
- bs := ju.OssGetObject(murl[k])
- if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
- tmpstr += bs + "\n"
- } else {
- tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
- }
- }
- (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
- }
- // 判断-附件分包是否无效判定(不通用)
- func isUsedPackageJF(jf_package map[string]map[string]interface{}) bool {
- if jf_package == nil || len(jf_package) == 0 {
- return false
- }
- for _, pack := range jf_package {
- budget := qu.Float64All(pack["budget"])
- bidamount := qu.Float64All(pack["bidamount"])
- if budget > 0.0 && budget <= 1.0 {
- return false
- }
- if bidamount > 0.0 && bidamount <= 1.0 {
- return false
- }
- }
- return true
- }
- // 是否有效分包
- func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
- if pkg == nil || len(pkg) == 0 {
- return false
- }
- for _, v := range pkg {
- p_winner := qu.ObjToString(v["winner"])
- p_budget := qu.Float64All(v["budget"])
- p_bidamout := qu.Float64All(v["bidamount"])
- if (p_winner != "" && effectivefirm.MatchString(p_winner)) || p_budget > float64(0) || p_bidamout > float64(0) {
- return true
- }
- }
- return false
- }
- // 判断-附件分包是否无效判定(不通用)
- func isExistsPackage(pkg map[string]map[string]interface{}) bool {
- if pkg == nil || len(pkg) == 0 {
- return false
- }
- if len(pkg) == 1 {
- for _, v := range pkg {
- winner := qu.ObjToString(v["winner"])
- budget := qu.Float64All(v["budget"])
- bidamout := qu.Float64All(v["bidamount"])
- if winner != "" || budget > float64(0) || bidamout > float64(0) {
- return true
- }
- }
- return false
- }
- return true
- }
- // getQualifications 添加所有资质新字段
- func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
- /**
- qualifications 资质要求
- */
- detail := qu.ObjToString(j_data["detail"])
- new_detail := pretreated.HtmlToText(detail)
- qualifications := ju.GetQualifications(new_detail)
- if qualifications != "" {
- (*tmp)["qualifications"] = qualifications
- }
- }
- // 落款识别~采购单位
- func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) {
- //落款实体
- if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
- !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
- if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" {
- (*tmp)["buyer"] = new_buyer
- (*tmp)["inscribe_buyer"] = "落款结构实体"
- }
- }
- //落款特殊实体
- if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" &&
- !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
- if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
- (*tmp)["buyer"] = new_buyer
- (*tmp)["inscribe_buyer"] = "落款特殊实体"
- }
- }
- //实体服务识别
- if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
- !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
- if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
- (*tmp)["buyer"] = new_buyer
- (*tmp)["inscribe_buyer"] = "实体识别服务"
- }
- }
- //拟建不能存buyer
- if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
- qu.ObjToString((*tmp)["subtype"]) == "拟建" {
- delete((*tmp), "buyer")
- }
- //识别发布时间
- if qu.IntAll(j_data["publishtime"]) == -1 {
- if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
- if ext_publishtime := InscribePublishtime(j_data); ext_publishtime > int64(0) {
- (*tmp)["ext_publishtime"] = ext_publishtime
- }
- }
- } else {
- delete((*tmp), "ext_publishtime")
- }
- }
- // 识别实体
- func InscribeEntity(detail string, tmp map[string]interface{}) string {
- new_str := ""
- new_detail := pretreated.TextAfterRemoveTable(detail)
- if len(new_detail) > 200 {
- new_detail = detail[len(new_detail)-200:]
- }
- new_str = inscribe_entity_1.FindString(new_detail)
- if new_str == "" {
- new_str = inscribe_entity_2.FindString(new_detail)
- if new_str != "" {
- str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}")
- str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}")
- if str1 == str2 && str1 != "" {
- new_str = str1
- }
- }
- } else {
- new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
- }
- winner := qu.ObjToString(tmp["winner"])
- agency := qu.ObjToString(tmp["agency"])
- //与其它单位发生了重叠
- if new_str != "" && (new_str == winner || new_str == agency) {
- new_str = ""
- }
- if new_str != "" && exclude_entity.MatchString(new_str) {
- new_str = ""
- }
- return new_str
- }
- // 识别实体
- func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string {
- new_str := ""
- projectname := qu.ObjToString(tmp["projectname"])
- title := qu.ObjToString(tmp["title"])
- winner := qu.ObjToString(tmp["winner"])
- agency := qu.ObjToString(tmp["agency"])
- toptype := qu.ObjToString(tmp["toptype"])
- //采用-标题项目名称
- if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
- return new_str
- }
- if !entdfa_filtration.MatchString(title) {
- //采用-排除表格的文本识别
- new_detail := pretreated.TextAfterRemoveTable(detail)
- new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
- if len(new_detail) > 500 {
- new_detail = new_detail[len(new_detail)-500:]
- }
- if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
- return new_str
- }
- if toptype != "结果" {
- //采用-去除标签的纯文本(含表格)
- new_detail = pretreated.HtmlToText(detail)
- new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
- if len(new_detail) > 500 {
- new_detail = new_detail[len(new_detail)-500:]
- }
- if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
- return new_str
- }
- }
- }
- //采用-附件识别
- if !entdfa_filtration.MatchString(title) {
- if len(jf_detail) > 500 {
- jf_detail = jf_detail[len(jf_detail)-500:]
- }
- if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" {
- return new_str
- }
- }
- return new_str
- }
- // 实体识别方法
- func EmployEntDfaText(text string, winner string, agency string) string {
- new_str := ""
- if text == "" {
- return new_str
- }
- dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0
- if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 {
- for _, v := range res {
- if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) {
- if !(v == winner || v == agency) {
- l = cl
- new_str = v
- }
- }
- }
- }
- return new_str
- }
- // 识别发布时间
- func InscribePublishtime(j_data map[string]interface{}) int64 {
- //落款文本识别
- detail := pretreated.TextAfterRemoveTable(qu.ObjToString(j_data["detail"]))
- if len(detail) > 200 {
- detail = detail[len(detail)-200:]
- }
- new_str := inscribe_entity_1.FindString(detail)
- if new_str == "" {
- new_str = inscribe_entity_2.FindString(detail)
- if new_str != "" {
- new_str = inscribe_entity_2.ReplaceAllString(new_str, "${5}")
- }
- } else {
- new_str = inscribe_entity_1.ReplaceAllString(new_str, "${5}")
- }
- if data := clear.ObjToTimestamp([]interface{}{new_str}, ""); len(data) > 0 {
- if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
- return ext_publishtime
- }
- }
- //附件名称识别
- projectinfo := *qu.ObjToMap(j_data["projectinfo"])
- attachments := *qu.ObjToMap(projectinfo["attachments"])
- for _, v := range attachments {
- info := *qu.ObjToMap(v)
- filename := qu.ObjToString(info["filename"])
- if pt_str := inscribe_publishtime_1.FindString(filename); pt_str != "" {
- if data := clear.ObjToTimestamp([]interface{}{pt_str}, ""); len(data) > 0 {
- if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
- return ext_publishtime
- }
- }
- }
- }
- return int64(0)
- }
- // 识别特殊采购单位
- func InscribeSpecEntity(detail string) string {
- new_str := ""
- new_detail := pretreated.TextAfterRemoveTable(detail)
- if len(new_detail) > 200 {
- new_detail = detail[len(new_detail)-200:]
- }
- find_str := inscribe_entity_3.FindString(new_detail)
- if find_str != "" {
- new_str = inscribe_entity_3.ReplaceAllString(find_str, "${2}")
- }
- return new_str
- }
// entDfaHTTPClient is shared by all entity-service calls so that connections
// can be reused; each request is bounded by a 2 second timeout.
var entDfaHTTPClient = &http.Client{Timeout: 2 * time.Second}

// EmployPostEntDfa posts data as JSON to the entity recognition service and
// returns the decoded JSON object. Any failure (encoding, transport, read or
// decode) is swallowed on purpose: the caller treats a missing "result" as
// "nothing recognised", so an empty (or partially decoded) map is returned.
//
// Compared with the previous version the response body is now always closed
// (it used to leak a connection per call), the HTTP client is reused instead
// of being rebuilt on every call, and a JSON encoding error stops the request
// instead of posting an empty body.
func EmployPostEntDfa(data map[string]interface{}) map[string]interface{} {
	info := map[string]interface{}{}
	payload, err := json.Marshal(data)
	if err != nil {
		return info
	}
	//172.17.4.238:9996,extcity.spdata.jianyu360.com
	resp, err := entDfaHTTPClient.Post("http://172.17.4.238:9996/service/entity/", "application/json", bytes.NewReader(payload))
	if err != nil {
		return info
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return info
	}
	// A decode error is deliberately ignored: whatever was decoded so far is
	// returned, exactly as before.
	_ = json.Unmarshal(body, &info)
	return info
}
- // 处理折扣系数-
- func dealWithDiscountBid(tmp map[string]interface{}) float64 {
- biddiscount := qu.Float64All(tmp["biddiscount"])
- biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
- biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
- baseCount := float64(1)
- if biddiscount_down > 0.0 {
- num1 := decimal.NewFromFloat(baseCount)
- num2 := decimal.NewFromFloat(biddiscount_down)
- decimalValue := num1.Sub(num2)
- res, _ := decimalValue.Float64()
- return res
- }
- if biddiscount_up > 0.0 {
- num1 := decimal.NewFromFloat(baseCount)
- num2 := decimal.NewFromFloat(biddiscount_up)
- decimalValue := num1.Add(num2)
- res, _ := decimalValue.Float64()
- return res
- }
- if biddiscount > 0.0 {
- if biddiscount > 1.0 && biddiscount <= 10.0 {
- num1 := decimal.NewFromFloat(10.0)
- num2 := decimal.NewFromFloat(biddiscount)
- decimalValue := num2.Div(num1)
- res, _ := decimalValue.Float64()
- return res
- } else if biddiscount > 10.0 {
- num1 := decimal.NewFromFloat(100.0)
- num2 := decimal.NewFromFloat(biddiscount)
- decimalValue := num2.Div(num1)
- res, _ := decimalValue.Float64()
- return res
- } else {
- return biddiscount
- }
- }
- return 0.0
- }
- // 精度丢失-相加
- func precisionAddFloat(tmp1, tmp2 float64) float64 {
- num1 := decimal.NewFromFloat(tmp1)
- num2 := decimal.NewFromFloat(tmp2)
- decimalValue := num2.Add(num1)
- res, _ := decimalValue.Float64()
- return res
- }
- // 特殊金额-处理判断-倍率关系
- func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
- //金额结果只有两种 - 倍率关系10000 - 过10E
- moneyIndex := []int{}
- moneyArr := []float64{}
- first_money := float64(0)
- difValue := map[string]interface{}{}
- for k, v := range val { //取第一个非负数,项目名称除外
- if v.IsTrue && v.Score > -1 {
- moneyArr = append(moneyArr, qu.Float64All(v.Value))
- moneyIndex = append(moneyIndex, k)
- key := ""
- if m, ok := v.Value.(float64); ok {
- key = fmt.Sprintf("%f", m)
- } else {
- key = qu.ObjToString(v.Value)
- }
- if difValue[key] == nil {
- difValue[key] = 1
- }
- //if len(difValue) > 2 {
- // return false, 0
- //}
- }
- }
- //计算金额数组
- if len(difValue) == 2 {
- money_1, money_2 := float64(0), float64(0)
- for k, v := range moneyArr {
- if k == 0 {
- money_1 = v
- } else {
- if v != money_1 {
- money_2 = v
- break
- }
- }
- }
- isRatio, new_money := false, float64(0) //判断金额是否为倍率关系
- if money_1 != float64(0) && money_2 != float64(0) {
- if money_1 == money_2*float64(10000) && money_1 >= 100000000 {
- isRatio = true
- new_money = money_2
- }
- if money_2 == money_1*float64(10000) && money_2 >= 100000000 {
- isRatio = true
- new_money = money_1
- }
- if isRatio { //采用新值
- for k, v := range moneyArr {
- if v == new_money {
- return true, moneyIndex[k]
- }
- }
- }
- }
- } else if len(difValue) > 2 { //多组金额
- is_exists := false
- for _, v := range moneyArr {
- if v >= 1000000000 {
- is_exists = true
- first_money = v
- }
- }
- if is_exists {
- for k, v := range moneyArr {
- if v*10000 == first_money {
- return true, moneyIndex[k]
- }
- }
- }
- } else {
- }
- return false, 0
- }
- // 筛选重复候选人-相关
- func filterRepeatWinArr(j *ju.Job) {
- if j.SpiderCode == "sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
- sort_WinOrder_Arr := make([][]map[string]interface{}, 0)
- sort_arr := make([]map[string]interface{}, 0)
- for _, v := range j.Winnerorder {
- sort := qu.IntAll(v["sort"])
- if sort == 1 { //为一组
- if len(sort_arr) > 0 {
- sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
- }
- sort_arr = make([]map[string]interface{}, 0)
- }
- sort_arr = append(sort_arr, v)
- }
- if len(sort_arr) > 0 {
- sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
- }
- if len(sort_WinOrder_Arr) > 0 { //有重复排序组-开始筛选清理
- isIndex := 0
- for index, winArr := range sort_WinOrder_Arr {
- if len(winArr) > 0 {
- if qu.ObjToString(winArr[0]["price"]) != "" &&
- qu.ObjToString(winArr[0]["entname"]) != "" {
- isIndex = index
- break
- }
- }
- }
- j.Winnerorder = sort_WinOrder_Arr[isIndex]
- }
- }
- }
- // 中标候选人经过清理之后,重新取出赋值
- func (e *ExtractTask) ResetWinnerorder(j *ju.Job) {
- if len(j.Winnerorder) == 0 {
- return
- }
- maxlen := len(j.Winnerorder) - 1
- //中标单位
- //i := 0
- winners := []*ju.ExtField{}
- bidamounts := []*ju.ExtField{}
- //对候选人单位名称进行清洗
- winorderLock.Lock()
- ruleArr := []*RuleCore{}
- ruleArr = e.RuleCores["all_all"]["winner"]
- for _, v := range j.Winnerorder {
- new_winner := qu.ObjToString(v["entname"])
- if new_winner != "" {
- for _, v1 := range ruleArr {
- for _, v2 := range v1.KVRuleCores {
- if new_winner == "" {
- break
- }
- new_winner = v2.RegPreBac.Reg.ReplaceAllString(new_winner, v2.RegPreBac.Replace)
- }
- }
- }
- v["entname"] = new_winner
- }
- winorderLock.Unlock()
- if maxlen > 0 {
- //新增-指定爬虫中标候选人过滤
- filterRepeatWinArr(j)
- if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
- return
- }
- winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
- if j.Winnerorder[0]["price"] != nil {
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
- if tmpPrice[len(tmpPrice)-1].(bool) {
- bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
- }
- }
- }
- if j.Result["winner"] == nil && len(winners) > 0 {
- j.Result["winner"] = winners
- } else if len(winners) > 0 {
- j.Result["winner"] = append(j.Result["winner"], winners...)
- }
- if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
- j.Result["bidamount"] = bidamounts
- } else if len(bidamounts) > 0 {
- j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
- }
- if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
- winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
- j.Result["winner"] = winners
- if j.Winnerorder[0]["price"] != nil {
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
- if tmpPrice[len(tmpPrice)-1].(bool) {
- bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
- }
- j.Result["bidamount"] = bidamounts
- }
- }
- }
// RemoveReplicaSliceString returns the elements of slc with duplicates
// removed, keeping the first occurrence of each value in its original order.
// The result is always non-nil, even for nil or empty input.
func RemoveReplicaSliceString(slc []string) []string {
	result := make([]string, 0, len(slc))
	seen := make(map[string]struct{}, len(slc))
	for _, s := range slc {
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		result = append(result, s)
	}
	return result
}
// isValidPkgWinner reports whether a package winner name is plausible, i.e.
// at least 4 characters (runes, not bytes) long.
func isValidPkgWinner(winner string) bool {
	return utf8.RuneCountInString(winner) >= 4
}
- // 组装kv
- func assembleKVText(j *ju.Job, tmp *map[string]interface{}) {
- var kvtext bytes.Buffer
- blocks := make([]ju.BlockAndTag, 0)
- for _, v := range j.Block {
- //分包和标签
- if ju.SaveBlock {
- xx, _ := json.Marshal(v)
- tmpblock := new(ju.TmpBlock)
- err := json.Unmarshal(xx, &tmpblock)
- if err != nil {
- if v.BPackage != nil {
- bpb, _ := json.Marshal(v.BPackage)
- tmpblock.BPackage = string(bpb)
- }
- tmpblock = rangeBlockToJson(v, *tmpblock)
- }
- blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
- }
- //把所有kv组装成一个字符串,存库
- for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
- if jv == nil {
- continue
- }
- for jv_k, jv_v := range jv.KvTags {
- for _, jv_vv := range jv_v {
- kvtext.WriteString(jv_k)
- kvtext.WriteString(":")
- kvtext.WriteString(jv_vv.Value)
- kvtext.WriteString("\n")
- }
- }
- }
- }
- if kvtext.Len() > 0 {
- (*tmp)["kvtext"] = kvtext.String()
- }
- if len(blocks) > 0 {
- if blocksBytes, err := json.Marshal(blocks); err == nil {
- if utf8.RuneCount(blocksBytes) < 100000 {
- (*tmp)["blocks"] = string(blocksBytes)
- }
- }
- }
- }
- // 辅助信息,如果没有排序先排序
- func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
- fieldalls := map[string][]map[string]interface{}{}
- if j == nil {
- return fieldalls
- }
- qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
- defer qykredis.Close()
- db := 0
- for field, val := range j.Result {
- //ju.Sort(val)
- if field == "buyer" {
- db = ju.BuyerDB
- } else if field == "winner" {
- db = ju.WinnerDB
- } else if field == "agency" {
- db = ju.AgencyDB
- }
- sfields := []map[string]interface{}{}
- for _, v := range val {
- standardized := false
- if _, err := qykredis.Do("SELECT", db); err != nil {
- fmt.Println("redis select err", err)
- } else {
- rep, err := qykredis.Do("GET", v.Value)
- if rep != nil && err == nil {
- standardized = true
- }
- }
- if field == "budget" || field == "bidamount" {
- if !v.IsTrue {
- continue
- }
- }
- sfield := map[string]interface{}{
- "val": v.Value,
- "type": v.Type,
- "score": v.Score,
- "blocktag": v.BlockTag,
- "sourceval": v.SourceValue,
- "standardized": standardized,
- }
- sfields = append(sfields, sfield)
- }
- fieldalls[field] = sfields
- }
- return fieldalls
- }
|