// extraxtmethod.go (28 KB)

  1. package extract
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "fmt"
  6. "github.com/shopspring/decimal"
  7. "gopkg.in/mgo.v2/bson"
  8. "io"
  9. "jy/clear"
  10. "jy/pretreated"
  11. ju "jy/util"
  12. "net/http"
  13. qu "qfw/util"
  14. "qfw/util/redis"
  15. "regexp"
  16. "strings"
  17. "sync"
  18. "time"
  19. "unicode/utf8"
  20. )
// scoreIndex pairs a floating-point score with an integer index.
// NOTE(review): no use of this type is visible in this part of the file;
// presumably Index is a position in some candidate slice — confirm.
type scoreIndex struct {
	Score float64
	Index int
}
  25. var (
  26. lock, lockrule sync.RWMutex
  27. lockclear, locktag sync.RWMutex
  28. blocktag sync.RWMutex
  29. JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
  30. cut = ju.NewCut() //获取正文并清理
  31. ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
  32. TaskList map[string]*ExtractTask //任务列表
  33. ClearTaskList map[string]*ClearTask //清理任务列表
  34. saveLimit = 100 //抽取日志批量保存
  35. PageSize = 5000 //查询分页
  36. Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
  37. BiddingFields = map[string]interface{}{
  38. "_id": 1,
  39. "title": 1,
  40. "toptype": 1,
  41. "subtype": 1,
  42. "comeintime": 1,
  43. "publishtime": 1,
  44. "href": 1,
  45. "detail": 1,
  46. }
  47. Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
  48. NiJianField = []string{
  49. "string#approvecode",
  50. "string#total_investment",
  51. "string#funds",
  52. "string#owner",
  53. "string#projectaddr",
  54. "string#projectperiod",
  55. "string#project_scale",
  56. "string#project_person",
  57. "string#project_phone",
  58. "string#approvenumber",
  59. "string#projecttype",
  60. "string#approvestatus",
  61. "time#project_startdate",
  62. "time#project_completedate",
  63. "map#construction_area",
  64. "map#floor_area",
  65. }
  66. spidercode = map[string]bool{
  67. "gd_zhsggzyjyzx_jsgc_fjczbgg": true,
  68. "js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
  69. "zj_tzsyhggzyjyzx_jsgc_kbqk": true,
  70. "hb_tmsggzyjyxxw_jsgc_kbqk": true,
  71. "zj_nbsyyggzyjyw_jsgc_kbqk": true,
  72. "zj_zjsggzyjyzx_jyxx_kbjg": true,
  73. "zj_zjzdgcjyw_ztbjglxx_kbjg": true,
  74. "zj_lssggzyjyw_jsgc_kbsk": true,
  75. "zj_qzslyxggzyjyzx_gggs_xkbjl": true,
  76. "sc_mssggzydzjypt_jsgc_kbjl": true,
  77. "sc_pzhsggzyjyfwzx_jsgc_kbylb": true,
  78. "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true,
  79. "a_hbszbtbggfwpt_kbjl": true,
  80. "a_szsjsgcjyfwzxbafzx_kbqkgs": true,
  81. "a_szldzbyxgs_kbxx": true,
  82. "zj_zssssxggzyjyw_gcjs_kbjggs": true,
  83. "gd_szszfhjsj_kbqkgs": true,
  84. "a_gjggzyjypt_gcjs_kbjl": true,
  85. "a_gjggzyjypt_gcjs_kbjl_new": true,
  86. "zj_tzsyhggzyjyzx_kbjggg": true,
  87. "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
  88. "ah_czsggzyjyw_jsgc_kbjl": true,
  89. "ah_czsggzyjyw_zfcg_kbxx": true,
  90. "ah_whsggzyjyfww_kbxx_cgxm": true,
  91. "ah_whsggzyjyfww_kbxx_gcxm": true,
  92. }
  93. clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
  94. sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
  95. clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
  96. clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
  97. textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
  98. winorderLock sync.Mutex
  99. jfwinorderLock sync.Mutex
  100. )
  101. var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
  102. var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
  103. // 包含字母的实体单位
  104. var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
  105. // 落款单位抽取
  106. var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*[\n]+([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
  107. var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
  108. // 特殊实体
  109. var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府))")
  110. // 有效企业
  111. var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|办公室|车务段|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
  112. // 发布时间识别
  113. var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
  114. var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
  115. //var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
  116. // 实体通用企业
  117. var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会|体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")
  118. var entdfa_clean = regexp.MustCompile("([\\s \n]+)")
  119. var entdfa_filtration = regexp.MustCompile("(开标记录)")
  120. // 清洗正文
  121. func CleanDetailText(detail string, summary string) string {
  122. detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
  123. detail = pretreated.RepairCon(detail)
  124. detail = ju.CutLableStr(summary + "\n" + detail)
  125. detail = cut.ClearHtml(summary + "\n" + detail)
  126. return detail
  127. }
  128. // 综合选取detail与contenthtml情况 true采用正文
  129. func SelectDetailSourceText(detail string, contenthtml string) bool {
  130. if len(detail) < 1000 {
  131. return false
  132. }
  133. if textSelectReg.MatchString(detail) && !textSelectReg.MatchString(contenthtml) {
  134. return true
  135. }
  136. return false
  137. }
  138. // 综合选取detail与contenthtml情况 true采用源码
  139. func SelectSourceStructText(detail string, contenthtml string) bool {
  140. arr1 := strings.Split(detail, "\n")
  141. arr2 := strings.Split(contenthtml, "\n")
  142. //正文长度相差不大且源码有效
  143. if len(detail)-len(contenthtml) < 500 && len(contenthtml) > 500 && len(arr1) == 1 && len(arr2) > len(arr1) {
  144. return true
  145. }
  146. return false
  147. }
  148. // 遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
  149. func file2text(doc *map[string]interface{}) {
  150. mnameone := map[string]bool{}
  151. mname := map[string]bool{}
  152. murl := map[string]string{}
  153. //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
  154. if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
  155. for _, attachs := range attach_text {
  156. if fileinfos, ok := attachs.(map[string]interface{}); ok {
  157. for _, fileinfo := range fileinfos {
  158. if ff, ok := fileinfo.(map[string]interface{}); ok {
  159. attach_url := qu.ObjToString(ff["attach_url"])
  160. ffname := qu.ObjToString(ff["file_name"])
  161. if clearStrReg.MatchString(ffname) {
  162. continue
  163. }
  164. mname[ffname] = true
  165. murl[ffname] = attach_url
  166. if sortStrReg.MatchString(ffname) {
  167. mnameone[ffname] = true
  168. }
  169. }
  170. }
  171. }
  172. }
  173. }
  174. tmpstr := ""
  175. for k := range mnameone {
  176. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  177. (*doc)["detailfile"] = tmpstr
  178. return
  179. }
  180. bs := ju.OssGetObject(murl[k])
  181. if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
  182. tmpstr += bs + "\n"
  183. } else {
  184. tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
  185. }
  186. }
  187. for k := range mname {
  188. if mnameone[k] {
  189. continue
  190. }
  191. if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
  192. (*doc)["detailfile"] = tmpstr
  193. return
  194. }
  195. bs := ju.OssGetObject(murl[k])
  196. if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) {
  197. tmpstr += bs + "\n"
  198. } else {
  199. tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n"
  200. }
  201. }
  202. (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "")
  203. }
  204. // 判断-附件分包是否无效判定(不通用)
  205. func isUsedPackageJF(jf_package map[string]map[string]interface{}) bool {
  206. if jf_package == nil || len(jf_package) == 0 {
  207. return false
  208. }
  209. for _, pack := range jf_package {
  210. budget := qu.Float64All(pack["budget"])
  211. bidamount := qu.Float64All(pack["bidamount"])
  212. if budget > 0.0 && budget <= 1.0 {
  213. return false
  214. }
  215. if bidamount > 0.0 && bidamount <= 1.0 {
  216. return false
  217. }
  218. }
  219. return true
  220. }
  221. // 是否有效分包
  222. func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
  223. if pkg == nil || len(pkg) == 0 {
  224. return false
  225. }
  226. for _, v := range pkg {
  227. p_winner := qu.ObjToString(v["winner"])
  228. p_budget := qu.Float64All(v["budget"])
  229. p_bidamout := qu.Float64All(v["bidamount"])
  230. if (p_winner != "" && effectivefirm.MatchString(p_winner)) || p_budget > float64(0) || p_bidamout > float64(0) {
  231. return true
  232. }
  233. }
  234. return false
  235. }
  236. // 判断-附件分包是否无效判定(不通用)
  237. func isExistsPackage(pkg map[string]map[string]interface{}) bool {
  238. if pkg == nil || len(pkg) == 0 {
  239. return false
  240. }
  241. if len(pkg) == 1 {
  242. for _, v := range pkg {
  243. winner := qu.ObjToString(v["winner"])
  244. budget := qu.Float64All(v["budget"])
  245. bidamout := qu.Float64All(v["bidamount"])
  246. if winner != "" || budget > float64(0) || bidamout > float64(0) {
  247. return true
  248. }
  249. }
  250. return false
  251. }
  252. return true
  253. }
  254. // getQualifications 添加所有资质新字段
  255. func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
  256. /**
  257. qualifications 资质要求
  258. */
  259. detail := qu.ObjToString(j_data["detail"])
  260. new_detail := pretreated.HtmlToText(detail)
  261. qualifications := ju.GetQualifications(new_detail)
  262. if qualifications != "" {
  263. (*tmp)["qualifications"] = qualifications
  264. }
  265. }
  266. // 落款识别~采购单位
  267. func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) {
  268. //落款实体
  269. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
  270. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  271. if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" {
  272. (*tmp)["buyer"] = new_buyer
  273. (*tmp)["inscribe_buyer"] = "落款结构实体"
  274. }
  275. }
  276. //落款特殊实体
  277. if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" &&
  278. !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  279. if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
  280. (*tmp)["buyer"] = new_buyer
  281. (*tmp)["inscribe_buyer"] = "落款特殊实体"
  282. }
  283. }
  284. //实体服务识别
  285. //if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
  286. // !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
  287. // if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
  288. // (*tmp)["buyer"] = new_buyer
  289. // (*tmp)["inscribe_buyer"] = "实体识别服务"
  290. // }
  291. //}
  292. //拟建不能存buyer
  293. if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
  294. qu.ObjToString((*tmp)["subtype"]) == "拟建" {
  295. delete((*tmp), "buyer")
  296. }
  297. //识别发布时间
  298. if qu.IntAll(j_data["publishtime"]) == -1 {
  299. if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
  300. if ext_publishtime := InscribePublishtime(j_data); ext_publishtime > int64(0) {
  301. (*tmp)["ext_publishtime"] = ext_publishtime
  302. }
  303. }
  304. } else {
  305. delete((*tmp), "ext_publishtime")
  306. }
  307. }
  308. // 识别实体
  309. func InscribeEntity(detail string, tmp map[string]interface{}) string {
  310. new_str := ""
  311. new_detail := pretreated.TextAfterRemoveTable(detail)
  312. if len(new_detail) > 200 {
  313. new_detail = detail[len(new_detail)-200:]
  314. }
  315. new_str = inscribe_entity_1.FindString(new_detail)
  316. if new_str == "" {
  317. new_str = inscribe_entity_2.FindString(new_detail)
  318. if new_str != "" {
  319. str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}")
  320. str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}")
  321. if str1 == str2 && str1 != "" {
  322. new_str = str1
  323. }
  324. }
  325. } else {
  326. new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
  327. }
  328. winner := qu.ObjToString(tmp["winner"])
  329. agency := qu.ObjToString(tmp["agency"])
  330. //与其它单位发生了重叠
  331. if new_str != "" && (new_str == winner || new_str == agency) {
  332. new_str = ""
  333. }
  334. if new_str != "" && exclude_entity.MatchString(new_str) {
  335. new_str = ""
  336. }
  337. return new_str
  338. }
  339. // 识别实体
  340. func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string {
  341. new_str := ""
  342. projectname := qu.ObjToString(tmp["projectname"])
  343. title := qu.ObjToString(tmp["title"])
  344. winner := qu.ObjToString(tmp["winner"])
  345. agency := qu.ObjToString(tmp["agency"])
  346. toptype := qu.ObjToString(tmp["toptype"])
  347. //采用-标题项目名称
  348. if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
  349. return new_str
  350. }
  351. if !entdfa_filtration.MatchString(title) {
  352. //采用-排除表格的文本识别
  353. new_detail := pretreated.TextAfterRemoveTable(detail)
  354. new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
  355. if len(new_detail) > 500 {
  356. new_detail = new_detail[len(new_detail)-500:]
  357. }
  358. if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
  359. return new_str
  360. }
  361. if toptype != "结果" {
  362. //采用-去除标签的纯文本(含表格)
  363. new_detail = pretreated.HtmlToText(detail)
  364. new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
  365. if len(new_detail) > 500 {
  366. new_detail = new_detail[len(new_detail)-500:]
  367. }
  368. if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
  369. return new_str
  370. }
  371. }
  372. }
  373. //采用-附件识别
  374. if !entdfa_filtration.MatchString(title) {
  375. if len(jf_detail) > 500 {
  376. jf_detail = jf_detail[len(jf_detail)-500:]
  377. }
  378. if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" {
  379. return new_str
  380. }
  381. }
  382. return new_str
  383. }
  384. // 实体识别方法
  385. func EmployEntDfaText(text string, winner string, agency string) string {
  386. new_str := ""
  387. if text == "" {
  388. return new_str
  389. }
  390. dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0
  391. if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 {
  392. for _, v := range res {
  393. if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) {
  394. if !(v == winner || v == agency) {
  395. l = cl
  396. new_str = v
  397. }
  398. }
  399. }
  400. }
  401. return new_str
  402. }
  403. // 识别发布时间
  404. func InscribePublishtime(j_data map[string]interface{}) int64 {
  405. //落款文本识别
  406. detail := pretreated.TextAfterRemoveTable(qu.ObjToString(j_data["detail"]))
  407. if len(detail) > 200 {
  408. detail = detail[len(detail)-200:]
  409. }
  410. new_str := inscribe_entity_1.FindString(detail)
  411. if new_str == "" {
  412. new_str = inscribe_entity_2.FindString(detail)
  413. if new_str != "" {
  414. new_str = inscribe_entity_2.ReplaceAllString(new_str, "${5}")
  415. }
  416. } else {
  417. new_str = inscribe_entity_1.ReplaceAllString(new_str, "${5}")
  418. }
  419. if data := clear.ObjToTimestamp([]interface{}{new_str}, ""); len(data) > 0 {
  420. if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
  421. return ext_publishtime
  422. }
  423. }
  424. //附件名称识别
  425. projectinfo := *qu.ObjToMap(j_data["projectinfo"])
  426. attachments := *qu.ObjToMap(projectinfo["attachments"])
  427. for _, v := range attachments {
  428. info := *qu.ObjToMap(v)
  429. filename := qu.ObjToString(info["filename"])
  430. if pt_str := inscribe_publishtime_1.FindString(filename); pt_str != "" {
  431. if data := clear.ObjToTimestamp([]interface{}{pt_str}, ""); len(data) > 0 {
  432. if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) {
  433. return ext_publishtime
  434. }
  435. }
  436. }
  437. }
  438. return int64(0)
  439. }
  440. // 识别特殊采购单位
  441. func InscribeSpecEntity(detail string) string {
  442. new_str := ""
  443. new_detail := pretreated.TextAfterRemoveTable(detail)
  444. if len(new_detail) > 200 {
  445. new_detail = detail[len(new_detail)-200:]
  446. }
  447. find_str := inscribe_entity_3.FindString(new_detail)
  448. if find_str != "" {
  449. new_str = inscribe_entity_3.ReplaceAllString(find_str, "${2}")
  450. }
  451. return new_str
  452. }
  453. func EmployPostEntDfa(data map[string]interface{}) map[string]interface{} {
  454. info := map[string]interface{}{}
  455. client := &http.Client{Timeout: 2 * time.Second}
  456. jsonStr, _ := json.Marshal(data)
  457. resp, err := client.Post("http://extcity.spdata.jianyu360.com/service/entity/", "application/json", bytes.NewBuffer(jsonStr))
  458. if err != nil {
  459. return info
  460. }
  461. res, err := io.ReadAll(resp.Body)
  462. if err != nil {
  463. return info
  464. }
  465. err = json.Unmarshal(res, &info)
  466. if err != nil {
  467. return info
  468. }
  469. return info
  470. }
  471. // 处理折扣系数-
  472. func dealWithDiscountBid(tmp map[string]interface{}) float64 {
  473. biddiscount := qu.Float64All(tmp["biddiscount"])
  474. biddiscount_up := qu.Float64All(tmp["biddiscount_up"])
  475. biddiscount_down := qu.Float64All(tmp["biddiscount_down"])
  476. baseCount := float64(1)
  477. if biddiscount_down > 0.0 {
  478. num1 := decimal.NewFromFloat(baseCount)
  479. num2 := decimal.NewFromFloat(biddiscount_down)
  480. decimalValue := num1.Sub(num2)
  481. res, _ := decimalValue.Float64()
  482. return res
  483. }
  484. if biddiscount_up > 0.0 {
  485. num1 := decimal.NewFromFloat(baseCount)
  486. num2 := decimal.NewFromFloat(biddiscount_up)
  487. decimalValue := num1.Add(num2)
  488. res, _ := decimalValue.Float64()
  489. return res
  490. }
  491. if biddiscount > 0.0 {
  492. if biddiscount > 1.0 && biddiscount <= 10.0 {
  493. num1 := decimal.NewFromFloat(10.0)
  494. num2 := decimal.NewFromFloat(biddiscount)
  495. decimalValue := num2.Div(num1)
  496. res, _ := decimalValue.Float64()
  497. return res
  498. } else if biddiscount > 10.0 {
  499. num1 := decimal.NewFromFloat(100.0)
  500. num2 := decimal.NewFromFloat(biddiscount)
  501. decimalValue := num2.Div(num1)
  502. res, _ := decimalValue.Float64()
  503. return res
  504. } else {
  505. return biddiscount
  506. }
  507. }
  508. return 0.0
  509. }
  510. // 精度丢失-相加
  511. func precisionAddFloat(tmp1, tmp2 float64) float64 {
  512. num1 := decimal.NewFromFloat(tmp1)
  513. num2 := decimal.NewFromFloat(tmp2)
  514. decimalValue := num2.Add(num1)
  515. res, _ := decimalValue.Float64()
  516. return res
  517. }
  518. // 特殊金额-处理判断-倍率关系
  519. func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
  520. //金额结果只有两种 - 倍率关系10000 - 过10E
  521. moneyIndex := []int{}
  522. moneyArr := []float64{}
  523. first_money := float64(0)
  524. difValue := map[string]interface{}{}
  525. for k, v := range val { //取第一个非负数,项目名称除外
  526. if v.IsTrue && v.Score > -1 {
  527. moneyArr = append(moneyArr, qu.Float64All(v.Value))
  528. moneyIndex = append(moneyIndex, k)
  529. key := ""
  530. if m, ok := v.Value.(float64); ok {
  531. key = fmt.Sprintf("%f", m)
  532. } else {
  533. key = qu.ObjToString(v.Value)
  534. }
  535. if difValue[key] == nil {
  536. difValue[key] = 1
  537. }
  538. //if len(difValue) > 2 {
  539. // return false, 0
  540. //}
  541. }
  542. }
  543. //计算金额数组
  544. if len(difValue) == 2 {
  545. money_1, money_2 := float64(0), float64(0)
  546. for k, v := range moneyArr {
  547. if k == 0 {
  548. money_1 = v
  549. } else {
  550. if v != money_1 {
  551. money_2 = v
  552. break
  553. }
  554. }
  555. }
  556. isRatio, new_money := false, float64(0) //判断金额是否为倍率关系
  557. if money_1 != float64(0) && money_2 != float64(0) {
  558. if money_1 == money_2*float64(10000) && money_1 >= 100000000 {
  559. isRatio = true
  560. new_money = money_2
  561. }
  562. if money_2 == money_1*float64(10000) && money_2 >= 100000000 {
  563. isRatio = true
  564. new_money = money_1
  565. }
  566. if isRatio { //采用新值
  567. for k, v := range moneyArr {
  568. if v == new_money {
  569. return true, moneyIndex[k]
  570. }
  571. }
  572. }
  573. }
  574. } else if len(difValue) > 2 { //多组金额
  575. is_exists := false
  576. for _, v := range moneyArr {
  577. if v >= 1000000000 {
  578. is_exists = true
  579. first_money = v
  580. }
  581. }
  582. if is_exists {
  583. for k, v := range moneyArr {
  584. if v*10000 == first_money {
  585. return true, moneyIndex[k]
  586. }
  587. }
  588. }
  589. } else {
  590. }
  591. return false, 0
  592. }
  593. // 筛选重复候选人-相关
  594. func filterRepeatWinArr(j *ju.Job) {
  595. if j.SpiderCode == "sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" {
  596. sort_WinOrder_Arr := make([][]map[string]interface{}, 0)
  597. sort_arr := make([]map[string]interface{}, 0)
  598. for _, v := range j.Winnerorder {
  599. sort := qu.IntAll(v["sort"])
  600. if sort == 1 { //为一组
  601. if len(sort_arr) > 0 {
  602. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  603. }
  604. sort_arr = make([]map[string]interface{}, 0)
  605. }
  606. sort_arr = append(sort_arr, v)
  607. }
  608. if len(sort_arr) > 0 {
  609. sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr)
  610. }
  611. if len(sort_WinOrder_Arr) > 0 { //有重复排序组-开始筛选清理
  612. isIndex := 0
  613. for index, winArr := range sort_WinOrder_Arr {
  614. if len(winArr) > 0 {
  615. if qu.ObjToString(winArr[0]["price"]) != "" &&
  616. qu.ObjToString(winArr[0]["entname"]) != "" {
  617. isIndex = index
  618. break
  619. }
  620. }
  621. }
  622. j.Winnerorder = sort_WinOrder_Arr[isIndex]
  623. }
  624. }
  625. }
  626. // 中标候选人经过清理之后,重新取出赋值
  627. func (e *ExtractTask) ResetWinnerorder(j *ju.Job) {
  628. if len(j.Winnerorder) == 0 {
  629. return
  630. }
  631. maxlen := len(j.Winnerorder) - 1
  632. //中标单位
  633. //i := 0
  634. winners := []*ju.ExtField{}
  635. bidamounts := []*ju.ExtField{}
  636. //对候选人单位名称进行清洗
  637. winorderLock.Lock()
  638. ruleArr := []*RuleCore{}
  639. ruleArr = e.RuleCores["all_all"]["winner"]
  640. for _, v := range j.Winnerorder {
  641. new_winner := qu.ObjToString(v["entname"])
  642. if new_winner != "" {
  643. for _, v1 := range ruleArr {
  644. for _, v2 := range v1.KVRuleCores {
  645. if new_winner == "" {
  646. break
  647. }
  648. new_winner = v2.RegPreBac.Reg.ReplaceAllString(new_winner, v2.RegPreBac.Replace)
  649. }
  650. }
  651. }
  652. v["entname"] = new_winner
  653. }
  654. winorderLock.Unlock()
  655. if maxlen > 0 {
  656. //新增-指定爬虫中标候选人过滤
  657. filterRepeatWinArr(j)
  658. if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 {
  659. return
  660. }
  661. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  662. if j.Winnerorder[0]["price"] != nil {
  663. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  664. if tmpPrice[len(tmpPrice)-1].(bool) {
  665. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  666. }
  667. }
  668. }
  669. if j.Result["winner"] == nil && len(winners) > 0 {
  670. j.Result["winner"] = winners
  671. } else if len(winners) > 0 {
  672. j.Result["winner"] = append(j.Result["winner"], winners...)
  673. }
  674. if j.Result["bidamount"] == nil && len(bidamounts) > 0 {
  675. j.Result["bidamount"] = bidamounts
  676. } else if len(bidamounts) > 0 {
  677. j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
  678. }
  679. if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
  680. winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
  681. j.Result["winner"] = winners
  682. if j.Winnerorder[0]["price"] != nil {
  683. tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
  684. if tmpPrice[len(tmpPrice)-1].(bool) {
  685. bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
  686. }
  687. j.Result["bidamount"] = bidamounts
  688. }
  689. }
  690. }
  691. func RemoveReplicaSliceString(slc []string) []string {
  692. result := make([]string, 0)
  693. tempMap := make(map[string]bool, len(slc))
  694. for _, e := range slc {
  695. if tempMap[e] == false {
  696. tempMap[e] = true
  697. result = append(result, e)
  698. }
  699. }
  700. return result
  701. }
  702. // 分包中标单位是否-合理
  703. func isValidPkgWinner(winner string) bool {
  704. if utf8.RuneCountInString(winner) < 4 {
  705. return false
  706. }
  707. return true
  708. }
  709. // 组装kv
  710. func assembleKVText(j *ju.Job, tmp *map[string]interface{}) {
  711. var kvtext bytes.Buffer
  712. blocks := make([]ju.BlockAndTag, 0)
  713. for _, v := range j.Block {
  714. //分包和标签
  715. if ju.SaveBlock {
  716. xx, _ := json.Marshal(v)
  717. tmpblock := new(ju.TmpBlock)
  718. err := json.Unmarshal(xx, &tmpblock)
  719. if err != nil {
  720. if v.BPackage != nil {
  721. bpb, _ := json.Marshal(v.BPackage)
  722. tmpblock.BPackage = string(bpb)
  723. }
  724. tmpblock = rangeBlockToJson(v, *tmpblock)
  725. }
  726. blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
  727. }
  728. //把所有kv组装成一个字符串,存库
  729. for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} {
  730. if jv == nil {
  731. continue
  732. }
  733. for jv_k, jv_v := range jv.KvTags {
  734. for _, jv_vv := range jv_v {
  735. kvtext.WriteString(jv_k)
  736. kvtext.WriteString(":")
  737. kvtext.WriteString(jv_vv.Value)
  738. kvtext.WriteString("\n")
  739. }
  740. }
  741. }
  742. }
  743. if kvtext.Len() > 0 {
  744. (*tmp)["kvtext"] = kvtext.String()
  745. }
  746. if len(blocks) > 0 {
  747. if blocksBytes, err := json.Marshal(blocks); err == nil {
  748. if utf8.RuneCount(blocksBytes) < 100000 {
  749. (*tmp)["blocks"] = string(blocksBytes)
  750. }
  751. }
  752. }
  753. }
  754. // 辅助信息,如果没有排序先排序
  755. func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
  756. fieldalls := map[string][]map[string]interface{}{}
  757. if j == nil {
  758. return fieldalls
  759. }
  760. qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
  761. defer qykredis.Close()
  762. db := 0
  763. for field, val := range j.Result {
  764. //ju.Sort(val)
  765. if field == "buyer" {
  766. db = ju.BuyerDB
  767. } else if field == "winner" {
  768. db = ju.WinnerDB
  769. } else if field == "agency" {
  770. db = ju.AgencyDB
  771. }
  772. sfields := []map[string]interface{}{}
  773. for _, v := range val {
  774. standardized := false
  775. if _, err := qykredis.Do("SELECT", db); err != nil {
  776. fmt.Println("redis select err", err)
  777. } else {
  778. rep, err := qykredis.Do("GET", v.Value)
  779. if rep != nil && err == nil {
  780. standardized = true
  781. }
  782. }
  783. if field == "budget" || field == "bidamount" {
  784. if !v.IsTrue {
  785. continue
  786. }
  787. }
  788. sfield := map[string]interface{}{
  789. "val": v.Value,
  790. "type": v.Type,
  791. "score": v.Score,
  792. "blocktag": v.BlockTag,
  793. "sourceval": v.SourceValue,
  794. "standardized": standardized,
  795. }
  796. sfields = append(sfields, sfield)
  797. }
  798. fieldalls[field] = sfields
  799. }
  800. return fieldalls
  801. }