extraxtmethod.go 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847
  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "github.com/shopspring/decimal"
  7. "gopkg.in/mgo.v2/bson"
  8. "io"
  9. "jy/clear"
  10. "jy/pretreated"
  11. ju "jy/util"
  12. "net/http"
  13. qu "qfw/util"
  14. "qfw/util/redis"
  15. "regexp"
  16. "strings"
  17. "sync"
  18. "time"
  19. "unicode/utf8"
  20. )
// scoreIndex pairs a score with an integer index.
// NOTE(review): the type is not used in the visible part of the file; the
// index presumably points into a slice of scored candidates — confirm at the
// call sites.
type scoreIndex struct {
	Score float64
	Index int
}
  25. var (
  26. lock, lockrule sync.RWMutex
  27. lockclear, locktag sync.RWMutex
  28. blocktag sync.RWMutex
  29. JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
  30. cut = ju.NewCut() //获取正文并清理
  31. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  32. TaskList map[string]*ExtractTask //任务列表
  33. ClearTaskList map[string]*ClearTask //清理任务列表
  34. saveLimit = 100 //抽取日志批量保存
  35. PageSize = 5000 //查询分页
  36. Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
  37. BiddingFields = map[string]interface{}{
  38. "_id": 1,
  39. "title": 1,
  40. "site": 1,
  41. "spidercode": 1,
  42. "toptype": 1,
  43. "subtype": 1,
  44. "comeintime": 1,
  45. "publishtime": 1,
  46. "href": 1,
  47. "dataging": 1,
  48. }
  49. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  50. NiJianField = []string{
  51. "string#approvecode",
  52. "string#total_investment",
  53. "string#funds",
  54. "string#owner",
  55. "string#projectaddr",
  56. "string#projectperiod",
  57. "string#project_scale",
  58. "string#project_person",
  59. "string#project_phone",
  60. "string#approvenumber",
  61. "string#projecttype",
  62. "string#approvestatus",
  63. "time#project_startdate",
  64. "time#project_completedate",
  65. "map#construction_area",
  66. "map#floor_area",
  67. }
  68. spidercode = map[string]bool{
  69. "gd_zhsggzyjyzx_jsgc_fjczbgg": true,
  70. "js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
  71. "zj_tzsyhggzyjyzx_jsgc_kbqk": true,
  72. "hb_tmsggzyjyxxw_jsgc_kbqk": true,
  73. "zj_nbsyyggzyjyw_jsgc_kbqk": true,
  74. "zj_zjsggzyjyzx_jyxx_kbjg": true,
  75. "zj_zjzdgcjyw_ztbjglxx_kbjg": true,
  76. "zj_lssggzyjyw_jsgc_kbsk": true,
  77. "zj_qzslyxggzyjyzx_gggs_xkbjl": true,
  78. "sc_mssggzydzjypt_jsgc_kbjl": true,
  79. "sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
  80. "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
  81. "a_hbszbtbggfwpt_kbjl": true,
  82. "a_szsjsgcjyfwzxbafzx_kbqkgs": true,
  83. "a_szldzbyxgs_kbxx": true,
  84. "zj_zssssxggzyjyw_gcjs_kbjggs": true,
  85. "gd_szszfhjsj_kbqkgs": true,
  86. "a_gjggzyjypt_gcjs_kbjl": true,
  87. "a_gjggzyjypt_gcjs_kbjl_new": true,
  88. "zj_tzsyhggzyjyzx_kbjggg": true,
  89. "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
  90. "ah_czsggzyjyw_jsgc_kbjl": true,
  91. "ah_czsggzyjyw_zfcg_kbxx": true,
  92. "ah_whsggzyjyfww_kbxx_cgxm": true,
  93. "ah_whsggzyjyfww_kbxx_gcxm": true,
  94. }
  95. clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  96. sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  97. clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  98. clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
  99. textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
  100. winorderLock sync.Mutex
  101. jfwinorderLock sync.Mutex
  102. )
  103. var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
  104. var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
  105. // 包含字母的实体单位
  106. var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
  107. // 落款单位抽取
  108. var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*[\n]+([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
  109. var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
  110. // 特殊实体
  111. var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府))")
  112. // 有效企业
  113. var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|办公室|车务段|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
  114. // 发布时间识别
  115. var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
  116. var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
  117. //var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
  118. // 实体通用企业
  119. var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会|体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")
  120. var entdfa_clean = regexp.MustCompile("([\\s \n]+)")
  121. var entdfa_filtration = regexp.MustCompile("(开标记录)")
  122. // 周期有效
  123. var isNeedValueReg = regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
  124. // 清洗正文
  125. func CleanDetailText(detail string, summary string) string {
  126. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  127. detail = pretreated.RepairCon(detail)
  128. detail = ju.CutLableStr(summary + "\n" + detail)
  129. detail = cut.ClearHtml(summary + "\n" + detail)
  130. return detail
  131. }
  132. // 综合选取detail与contenthtml情况 true采用正文
  133. func SelectDetailSourceText(detail string, contenthtml string) bool {
  134. if len(detail) < 1000 {
  135. return false
  136. }
  137. if textSelectReg.MatchString(detail) && !textSelectReg.MatchString(contenthtml) {
  138. return true
  139. }
  140. return false
  141. }
  142. // 综合选取detail与contenthtml情况 true采用源码
  143. func SelectSourceStructText(detail string, contenthtml string) bool {
  144. arr1 := strings.Split(detail, "\n")
  145. arr2 := strings.Split(contenthtml, "\n")
  146. //正文长度相差不大且源码有效
  147. if len(detail)-len(contenthtml) < 500 && len(contenthtml) > 500 && len(arr1) == 1 && len(arr2) > len(arr1) {
  148. return true
  149. }
  150. return false
  151. }
  152. // 遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  153. func file2text(doc *map[string]interface{}) {
  154. mnameone := map[string]bool{}
  155. mname := map[string]bool{}
  156. murl := map[string]string{}
  157. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  158. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  159. for _, attachs := range attach_text {
  160. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  161. for _, fileinfo := range fileinfos {
  162. if ff, ok := fileinfo.(map[string]interface{}); ok {
  163. attach_url := qu.ObjToString(ff["attach_url"])
  164. ffname := qu.ObjToString(ff["file_name"])
  165. if clearStrReg.MatchString(ffname) {
  166. continue
  167. }
  168. mname[ffname] = true
  169. murl[ffname] = attach_url
  170. if sortStrReg.MatchString(ffname) {
  171. mnameone[ffname] = true
  172. }
  173. }
  174. }
  175. }
  176. }
  177. }
  178. tmpstr := ""
  179. for k := range mnameone {
  180. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  181. (*doc)["detailfile"] = tmpstr
  182. return
  183. }
  184. bs := ju.OssGetObject(murl[k])
  185. if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
  186. tmpstr += bs + "\n"
  187. } else {
  188. tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
  189. }
  190. }
  191. for k := range mname {
  192. if mnameone[k] {
  193. continue
  194. }
  195. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  196. (*doc)["detailfile"] = tmpstr
  197. return
  198. }
  199. bs := ju.OssGetObject(murl[k])
  200. if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
  201. tmpstr += bs + "\n"
  202. } else {
  203. tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
  204. }
  205. }
  206. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  207. }
  208. // 判断-附件分包是否无效判定(不通用)
  209. func isUsedPackageJF(jf_package map[string]map[string]interface{}) bool {
  210. if jf_package == nil || len(jf_package) == 0 {
  211. return false
  212. }
  213. for _, pack := range jf_package {
  214. budget := qu.Float64All(pack["budget"])
  215. bidamount := qu.Float64All(pack["bidamount"])
  216. if budget > 0.0 && budget <= 1.0 {
  217. return false
  218. }
  219. if bidamount > 0.0 && bidamount <= 1.0 {
  220. return false
  221. }
  222. }
  223. return true
  224. }
  225. // 是否有效分包
  226. func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
  227. if pkg == nil || len(pkg) == 0 {
  228. return false
  229. }
  230. for _, v := range pkg {
  231. p_winner := qu.ObjToString(v["winner"])
  232. p_budget := qu.Float64All(v["budget"])
  233. p_bidamout := qu.Float64All(v["bidamount"])
  234. if (p_winner != "" && effectivefirm.MatchString(p_winner)) || p_budget > float64(0) || p_bidamout > float64(0) {
  235. return true
  236. }
  237. }
  238. return false
  239. }
  240. // 判断-附件分包是否无效判定(不通用)
  241. func isExistsPackage(pkg map[string]map[string]interface{}) bool {
  242. if pkg == nil || len(pkg) == 0 {
  243. return false
  244. }
  245. if len(pkg) == 1 {
  246. for _, v := range pkg {
  247. winner := qu.ObjToString(v["winner"])
  248. budget := qu.Float64All(v["budget"])
  249. bidamout := qu.Float64All(v["bidamount"])
  250. if winner != "" || budget > float64(0) || bidamout > float64(0) {
  251. return true
  252. }
  253. }
  254. return false
  255. }
  256. return true
  257. }
  258. // getQualifications 添加所有资质新字段
  259. func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
  260. /**
  261. qualifications 资质要求
  262. */
  263. detail := qu.ObjToString(j_data["detail"])
  264. new_detail := pretreated.HtmlToText(detail)
  265. qualifications := ju.GetQualifications(new_detail)
  266. if qualifications != "" {
  267. (*tmp)["qualifications"] = qualifications
  268. }
  269. }
  270. // 落款识别~采购单位
  271. func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) {
  272. //落款实体
  273. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
  274. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  275. if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" {
  276. (*tmp)["buyer"] = new_buyer
  277. (*tmp)["inscribe_buyer"] = "落款结构实体"
  278. }
  279. }
  280. //落款特殊实体
  281. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" &&
  282. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  283. if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
  284. (*tmp)["buyer"] = new_buyer
  285. (*tmp)["inscribe_buyer"] = "落款特殊实体"
  286. }
  287. }
  288. //实体服务识别
  289. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
  290. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  291. if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
  292. (*tmp)["buyer"] = new_buyer
  293. (*tmp)["inscribe_buyer"] = "实体识别服务"
  294. }
  295. }
  296. //拟建不能存buyer
  297. if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
  298. qu.ObjToString((*tmp)["subtype"]) == "拟建" {
  299. delete((*tmp), "buyer")
  300. }
  301. //识别发布时间
  302. if qu.IntAll(j_data["publishtime"]) == -1 {
  303. if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
  304. if ext_publishtime := InscribePublishtime(j_data); ext_publishtime > int64(0) {
  305. (*tmp)["ext_publishtime"] = ext_publishtime
  306. }
  307. }
  308. } else {
  309. delete((*tmp), "ext_publishtime")
  310. }
  311. }
  312. // 识别实体
  313. func InscribeEntity(detail string, tmp map[string]interface{}) string {
  314. new_str := ""
  315. new_detail := pretreated.TextAfterRemoveTable(detail)
  316. if len(new_detail) > 200 {
  317. new_detail = detail[len(new_detail)-200:]
  318. }
  319. new_str = inscribe_entity_1.FindString(new_detail)
  320. if new_str == "" {
  321. new_str = inscribe_entity_2.FindString(new_detail)
  322. if new_str != "" {
  323. str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}")
  324. str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}")
  325. if str1 == str2 && str1 != "" {
  326. new_str = str1
  327. }
  328. }
  329. } else {
  330. new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
  331. }
  332. winner := qu.ObjToString(tmp["winner"])
  333. agency := qu.ObjToString(tmp["agency"])
  334. //与其它单位发生了重叠
  335. if new_str != "" && (new_str == winner || new_str == agency) {
  336. new_str = ""
  337. }
  338. if new_str != "" && exclude_entity.MatchString(new_str) {
  339. new_str = ""
  340. }
  341. return new_str
  342. }
  343. // 识别实体
  344. func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string {
  345. new_str := ""
  346. projectname := qu.ObjToString(tmp["projectname"])
  347. title := qu.ObjToString(tmp["title"])
  348. winner := qu.ObjToString(tmp["winner"])
  349. agency := qu.ObjToString(tmp["agency"])
  350. toptype := qu.ObjToString(tmp["toptype"])
  351. //采用-标题项目名称
  352. if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
  353. return new_str
  354. }
  355. if !entdfa_filtration.MatchString(title) {
  356. //采用-排除表格的文本识别
  357. new_detail := pretreated.TextAfterRemoveTable(detail)
  358. new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
  359. if len(new_detail) > 500 {
  360. new_detail = new_detail[len(new_detail)-500:]
  361. }
  362. if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
  363. return new_str
  364. }
  365. if toptype != "结果" {
  366. //采用-去除标签的纯文本(含表格)
  367. new_detail = pretreated.HtmlToText(detail)
  368. new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
  369. if len(new_detail) > 500 {
  370. new_detail = new_detail[len(new_detail)-500:]
  371. }
  372. if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
  373. return new_str
  374. }
  375. }
  376. }
  377. //采用-附件识别
  378. if !entdfa_filtration.MatchString(title) {
  379. if len(jf_detail) > 500 {
  380. jf_detail = jf_detail[len(jf_detail)-500:]
  381. }
  382. if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" {
  383. return new_str
  384. }
  385. }
  386. return new_str
  387. }
  388. // 实体识别方法
  389. func EmployEntDfaText(text string, winner string, agency string) string {
  390. new_str := ""
  391. if text == "" {
  392. return new_str
  393. }
  394. dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0
  395. if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 {
  396. for _, v := range res {
  397. if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) {
  398. if !(v == winner || v == agency) {
  399. l = cl
  400. new_str = v
  401. }
  402. }
  403. }
  404. }
  405. return new_str
  406. }
  407. // 识别发布时间
  408. func InscribePublishtime(j_data map[string]interface{}) int64 {
  409. //落款文本识别
  410. detail := pretreated.TextAfterRemoveTable(qu.ObjToString(j_data["detail"]))
  411. if len(detail) > 200 {
  412. detail = detail[len(detail)-200:]
  413. }
  414. new_str := inscribe_entity_1.FindString(detail)
  415. if new_str == "" {
  416. new_str = inscribe_entity_2.FindString(detail)
  417. if new_str != "" {
  418. new_str = inscribe_entity_2.ReplaceAllString(new_str, "${5}")
  419. }
  420. } else {
  421. new_str = inscribe_entity_1.ReplaceAllString(new_str, "${5}")
  422. }
  423. if data := clear.ObjToTimestamp([]interface{}{new_str}, ""); len(data) > 0 {
  424. if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
  425. return ext_publishtime
  426. }
  427. }
  428. //附件名称识别
  429. projectinfo := *qu.ObjToMap(j_data["projectinfo"])
  430. attachments := *qu.ObjToMap(projectinfo["attachments"])
  431. for _, v := range attachments {
  432. info := *qu.ObjToMap(v)
  433. filename := qu.ObjToString(info["filename"])
  434. if pt_str := inscribe_publishtime_1.FindString(filename); pt_str != "" {
  435. if data := clear.ObjToTimestamp([]interface{}{pt_str}, ""); len(data) > 0 {
  436. if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
  437. return ext_publishtime
  438. }
  439. }
  440. }
  441. }
  442. return int64(0)
  443. }
  444. // 识别特殊采购单位
  445. func InscribeSpecEntity(detail string) string {
  446. new_str := ""
  447. new_detail := pretreated.TextAfterRemoveTable(detail)
  448. if len(new_detail) > 200 {
  449. new_detail = detail[len(new_detail)-200:]
  450. }
  451. find_str := inscribe_entity_3.FindString(new_detail)
  452. if find_str != "" {
  453. new_str = inscribe_entity_3.ReplaceAllString(find_str, "${2}")
  454. }
  455. return new_str
  456. }
  457. func EmployPostEntDfa(data map[string]interface{}) map[string]interface{} {
  458. info := map[string]interface{}{}
  459. client := &http.Client{Timeout: 2 * time.Second}
  460. jsonStr, _ := json.Marshal(data)
  461. //172.17.4.238:9996,extcity.spdata.jianyu360.com
  462. resp, err := client.Post("http://172.17.4.238:9996/service/entity/", "application/json", bytes.NewBuffer(jsonStr))
  463. if err != nil {
  464. return info
  465. }
  466. res, err := io.ReadAll(resp.Body)
  467. if err != nil {
  468. return info
  469. }
  470. err = json.Unmarshal(res, &info)
  471. if err != nil {
  472. return info
  473. }
  474. return info
  475. }
  476. // 处理折扣系数-
  477. func dealWithDiscountBid(tmp map[string]interface{}) float64 {
  478. biddiscount := qu.Float64All(tmp["biddiscount"])
  479. biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
  480. biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
  481. baseCount := float64(1)
  482. if biddiscount_down > 0.0 {
  483. num1 := decimal.NewFromFloat(baseCount)
  484. num2 := decimal.NewFromFloat(biddiscount_down)
  485. decimalValue := num1.Sub(num2)
  486. res, _ := decimalValue.Float64()
  487. return res
  488. }
  489. if biddiscount_up > 0.0 {
  490. num1 := decimal.NewFromFloat(baseCount)
  491. num2 := decimal.NewFromFloat(biddiscount_up)
  492. decimalValue := num1.Add(num2)
  493. res, _ := decimalValue.Float64()
  494. return res
  495. }
  496. if biddiscount > 0.0 {
  497. if biddiscount > 1.0 && biddiscount <= 10.0 {
  498. num1 := decimal.NewFromFloat(10.0)
  499. num2 := decimal.NewFromFloat(biddiscount)
  500. decimalValue := num2.Div(num1)
  501. res, _ := decimalValue.Float64()
  502. return res
  503. } else if biddiscount > 10.0 {
  504. num1 := decimal.NewFromFloat(100.0)
  505. num2 := decimal.NewFromFloat(biddiscount)
  506. decimalValue := num2.Div(num1)
  507. res, _ := decimalValue.Float64()
  508. return res
  509. } else {
  510. return biddiscount
  511. }
  512. }
  513. return 0.0
  514. }
  515. // 精度丢失-相加
  516. func precisionAddFloat(tmp1, tmp2 float64) float64 {
  517. num1 := decimal.NewFromFloat(tmp1)
  518. num2 := decimal.NewFromFloat(tmp2)
  519. decimalValue := num2.Add(num1)
  520. res, _ := decimalValue.Float64()
  521. return res
  522. }
  523. // 特殊金额-处理判断-倍率关系
  524. func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
  525. //金额结果只有两种 - 倍率关系10000 - 过10E
  526. moneyIndex := []int{}
  527. moneyArr := []float64{}
  528. first_money := float64(0)
  529. difValue := map[string]interface{}{}
  530. for k, v := range val { //取第一个非负数,项目名称除外
  531. if v.IsTrue && v.Score > -1 {
  532. moneyArr = append(moneyArr, qu.Float64All(v.Value))
  533. moneyIndex = append(moneyIndex, k)
  534. key := ""
  535. if m, ok := v.Value.(float64); ok {
  536. key = fmt.Sprintf("%f", m)
  537. } else {
  538. key = qu.ObjToString(v.Value)
  539. }
  540. if difValue[key] == nil {
  541. difValue[key] = 1
  542. }
  543. //if len(difValue) > 2 {
  544. // return false, 0
  545. //}
  546. }
  547. }
  548. //计算金额数组
  549. if len(difValue) == 2 {
  550. money_1, money_2 := float64(0), float64(0)
  551. for k, v := range moneyArr {
  552. if k == 0 {
  553. money_1 = v
  554. } else {
  555. if v != money_1 {
  556. money_2 = v
  557. break
  558. }
  559. }
  560. }
  561. isRatio, new_money := false, float64(0) //判断金额是否为倍率关系
  562. if money_1 != float64(0) && money_2 != float64(0) {
  563. if money_1 == money_2*float64(10000) && money_1 >= 100000000 {
  564. isRatio = true
  565. new_money = money_2
  566. }
  567. if money_2 == money_1*float64(10000) && money_2 >= 100000000 {
  568. isRatio = true
  569. new_money = money_1
  570. }
  571. if isRatio { //采用新值
  572. for k, v := range moneyArr {
  573. if v == new_money {
  574. return true, moneyIndex[k]
  575. }
  576. }
  577. }
  578. }
  579. } else if len(difValue) > 2 { //多组金额
  580. is_exists := false
  581. for _, v := range moneyArr {
  582. if v >= 1000000000 {
  583. is_exists = true
  584. first_money = v
  585. }
  586. }
  587. if is_exists {
  588. for k, v := range moneyArr {
  589. if v*10000 == first_money {
  590. return true, moneyIndex[k]
  591. }
  592. }
  593. }
  594. } else {
  595. }
  596. return false, 0
  597. }
  598. // 筛选重复候选人-相关
  599. func filterRepeatWinArr(j *ju.Job) {
  600. if j.SpiderCode == "sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
  601. sort_WinOrder_Arr := make([][]map[string]interface{}, 0)
  602. sort_arr := make([]map[string]interface{}, 0)
  603. for _, v := range j.Winnerorder {
  604. sort := qu.IntAll(v["sort"])
  605. if sort == 1 { //为一组
  606. if len(sort_arr) > 0 {
  607. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  608. }
  609. sort_arr = make([]map[string]interface{}, 0)
  610. }
  611. sort_arr = append(sort_arr, v)
  612. }
  613. if len(sort_arr) > 0 {
  614. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  615. }
  616. if len(sort_WinOrder_Arr) > 0 { //有重复排序组-开始筛选清理
  617. isIndex := 0
  618. for index, winArr := range sort_WinOrder_Arr {
  619. if len(winArr) > 0 {
  620. if qu.ObjToString(winArr[0]["price"]) != "" &&
  621. qu.ObjToString(winArr[0]["entname"]) != "" {
  622. isIndex = index
  623. break
  624. }
  625. }
  626. }
  627. j.Winnerorder = sort_WinOrder_Arr[isIndex]
  628. }
  629. }
  630. }
  631. // 中标候选人经过清理之后,重新取出赋值
  632. func (e *ExtractTask) ResetWinnerorder(j *ju.Job) {
  633. if len(j.Winnerorder) == 0 {
  634. return
  635. }
  636. maxlen := len(j.Winnerorder) - 1
  637. //中标单位
  638. //i := 0
  639. winners := []*ju.ExtField{}
  640. bidamounts := []*ju.ExtField{}
  641. //对候选人单位名称进行清洗
  642. winorderLock.Lock()
  643. ruleArr := []*RuleCore{}
  644. ruleArr = e.RuleCores["all_all"]["winner"]
  645. for _, v := range j.Winnerorder {
  646. new_winner := qu.ObjToString(v["entname"])
  647. if new_winner != "" {
  648. for _, v1 := range ruleArr {
  649. for _, v2 := range v1.KVRuleCores {
  650. if new_winner == "" {
  651. break
  652. }
  653. new_winner = v2.RegPreBac.Reg.ReplaceAllString(new_winner, v2.RegPreBac.Replace)
  654. }
  655. }
  656. }
  657. v["entname"] = new_winner
  658. }
  659. winorderLock.Unlock()
  660. if maxlen > 0 {
  661. //新增-指定爬虫中标候选人过滤
  662. filterRepeatWinArr(j)
  663. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  664. return
  665. }
  666. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  667. if j.Winnerorder[0]["price"] != nil {
  668. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  669. if tmpPrice[len(tmpPrice)-1].(bool) {
  670. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  671. }
  672. }
  673. }
  674. if j.Result["winner"] == nil && len(winners) > 0 {
  675. j.Result["winner"] = winners
  676. } else if len(winners) > 0 {
  677. j.Result["winner"] = append(j.Result["winner"], winners...)
  678. }
  679. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  680. j.Result["bidamount"] = bidamounts
  681. } else if len(bidamounts) > 0 {
  682. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  683. }
  684. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  685. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  686. j.Result["winner"] = winners
  687. if j.Winnerorder[0]["price"] != nil {
  688. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  689. if tmpPrice[len(tmpPrice)-1].(bool) {
  690. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  691. }
  692. j.Result["bidamount"] = bidamounts
  693. }
  694. }
  695. }
  696. func RemoveReplicaSliceString(slc []string) []string {
  697. result := make([]string, 0)
  698. tempMap := make(map[string]bool, len(slc))
  699. for _, e := range slc {
  700. if tempMap[e] == false {
  701. tempMap[e] = true
  702. result = append(result, e)
  703. }
  704. }
  705. return result
  706. }
  707. // 分包中标单位是否-合理
  708. func isValidPkgWinner(winner string) bool {
  709. if utf8.RuneCountInString(winner) < 4 {
  710. return false
  711. }
  712. return true
  713. }
  714. // 组装kv
  715. func assembleKVText(j *ju.Job, tmp *map[string]interface{}) {
  716. var kvtext bytes.Buffer
  717. blocks := make([]ju.BlockAndTag, 0)
  718. for _, v := range j.Block {
  719. //分包和标签
  720. if ju.SaveBlock {
  721. xx, _ := json.Marshal(v)
  722. tmpblock := new(ju.TmpBlock)
  723. err := json.Unmarshal(xx, &tmpblock)
  724. if err != nil {
  725. if v.BPackage != nil {
  726. bpb, _ := json.Marshal(v.BPackage)
  727. tmpblock.BPackage = string(bpb)
  728. }
  729. tmpblock = rangeBlockToJson(v, *tmpblock)
  730. }
  731. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  732. }
  733. //把所有kv组装成一个字符串,存库
  734. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  735. if jv == nil {
  736. continue
  737. }
  738. for jv_k, jv_v := range jv.KvTags {
  739. for _, jv_vv := range jv_v {
  740. kvtext.WriteString(jv_k)
  741. kvtext.WriteString(":")
  742. kvtext.WriteString(jv_vv.Value)
  743. kvtext.WriteString("\n")
  744. }
  745. }
  746. }
  747. }
  748. if kvtext.Len() > 0 {
  749. (*tmp)["kvtext"] = kvtext.String()
  750. }
  751. if len(blocks) > 0 {
  752. if blocksBytes, err := json.Marshal(blocks); err == nil {
  753. if utf8.RuneCount(blocksBytes) < 100000 {
  754. (*tmp)["blocks"] = string(blocksBytes)
  755. }
  756. }
  757. }
  758. }
  759. // 辅助信息,如果没有排序先排序
  760. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  761. fieldalls := map[string][]map[string]interface{}{}
  762. if j == nil {
  763. return fieldalls
  764. }
  765. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  766. defer qykredis.Close()
  767. db := 0
  768. for field, val := range j.Result {
  769. //ju.Sort(val)
  770. if field == "buyer" {
  771. db = ju.BuyerDB
  772. } else if field == "winner" {
  773. db = ju.WinnerDB
  774. } else if field == "agency" {
  775. db = ju.AgencyDB
  776. }
  777. sfields := []map[string]interface{}{}
  778. for _, v := range val {
  779. standardized := false
  780. if _, err := qykredis.Do("SELECT", db); err != nil {
  781. fmt.Println("redis select err", err)
  782. } else {
  783. rep, err := qykredis.Do("GET", v.Value)
  784. if rep != nil && err == nil {
  785. standardized = true
  786. }
  787. }
  788. if field == "budget" || field == "bidamount" {
  789. if !v.IsTrue {
  790. continue
  791. }
  792. }
  793. sfield := map[string]interface{}{
  794. "val": v.Value,
  795. "type": v.Type,
  796. "score": v.Score,
  797. "blocktag": v.BlockTag,
  798. "sourceval": v.SourceValue,
  799. "standardized": standardized,
  800. }
  801. sfields = append(sfields, sfield)
  802. }
  803. fieldalls[field] = sfields
  804. }
  805. return fieldalls
  806. }