extractcheck.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. package extract
  2. import (
  3. "fmt"
  4. "jy/clear"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "strings"
  9. "unicode/utf8"
  10. )
  11. // 去重冗余字段
  12. func delFiled(k string) bool {
  13. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  14. }
  15. // 检查字段-
  16. func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
  17. delete(tmp, "contenthtml")
  18. delete(tmp, "detail")
  19. //剑鱼链接方便查阅
  20. jyhref := fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
  21. tmp["jytest_href"] = jyhref
  22. //对于招标类信息~若winner没有值~过滤掉中标相关信息
  23. if qu.ObjToString(tmp["toptype"]) == "招标" &&
  24. qu.ObjToString(tmp["subtype"]) != "单一" {
  25. delete(tmp, "winner")
  26. delete(tmp, "s_winner")
  27. delete(tmp, "bidamount")
  28. delete(tmp, "winnerorder")
  29. }
  30. tmp["repeat"] = 0
  31. //指定爬虫-金额处理-预算-中标金额异常
  32. if qu.ObjToString(tmp["spidercode"]) == "xz_xzzzqjzscjgycxxxpt_zbtzs" {
  33. if budget, ok := tmp["budget"].(float64); ok && budget > 0 && budget < 1000000 {
  34. tmp["budget"] = budget * 10000.0
  35. }
  36. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  37. tmp["bidamount"] = bidamount / 10000.0
  38. }
  39. }
  40. if qu.ObjToString(tmp["spidercode"]) == "js_jsszbtbw_zbhxrgs" {
  41. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  42. tmp["bidamount"] = bidamount / 10000.0
  43. }
  44. }
  45. //异常金额类型清洗-
  46. if _, ok := tmp["bidamount"].(string); ok {
  47. delete(tmp, "bidamount")
  48. }
  49. if _, ok := tmp["budget"].(string); ok {
  50. delete(tmp, "budget")
  51. }
  52. //budget bidamount 阈值限定
  53. if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
  54. tmp["budget_threshold"] = bg
  55. delete(tmp, "budget")
  56. }
  57. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
  58. tmp["bidamount_threshold"] = bg
  59. delete(tmp, "bidamount")
  60. }
  61. //对分包存储校验···package
  62. if tmp["package"] != nil {
  63. if isExistsPackage(tmp["package"].(map[string]map[string]interface{})) {
  64. tmp["is_exist_package"] = true
  65. } else {
  66. tmp["package_c"] = tmp["package"]
  67. delete(tmp, "package")
  68. }
  69. }
  70. //对于单位,金额与候选信息进行相互校验与选取
  71. if winner := qu.ObjToString(tmp["winner"]); winner != "" {
  72. if winnerorder := ju.IsMarkInterfaceMap(tmp["winnerorder"]); len(winnerorder) > 0 {
  73. isWin := false
  74. if tmp["package"] == nil {
  75. isWin = true
  76. } else {
  77. if !isUsedMultiPackage(tmp["package"].(map[string]map[string]interface{})) || winner == qu.ObjToString(tmp["s_winner"]) {
  78. isWin = true
  79. }
  80. }
  81. if isWin {
  82. isExists := false
  83. for k, v := range winnerorder {
  84. if k >= 2 { //仅对比前两名
  85. break
  86. }
  87. if winner == qu.ObjToString(v["entname"]) && qu.Float64All(v["price"]) > float64(0) {
  88. tmp["bidamount"] = qu.Float64All(v["price"])
  89. isExists = true
  90. break
  91. }
  92. }
  93. //单位不在候选人里面--金额一致
  94. if !isExists && len(winnerorder) > 1 && len(winnerorder) < 4 { //单位未在候选人里面找到-
  95. if entname := qu.ObjToString(winnerorder[0]["entname"]); entname != "" && qu.IntAll(winnerorder[0]["sort"]) == 1 {
  96. if price := qu.Float64All(winnerorder[0]["price"]); price > 0.0 && qu.Float64All(tmp["bidamount"]) == price {
  97. if !(strings.Contains(entname, winner) || strings.Contains(winner, entname)) {
  98. if effectivefirm.MatchString(entname) {
  99. tmp["winner"] = entname
  100. tmp["s_winner"] = entname
  101. }
  102. }
  103. }
  104. }
  105. }
  106. }
  107. }
  108. }
  109. //快速过滤一遍特殊字段
  110. for k, v := range tmp {
  111. if k == "qualifies" {
  112. continue
  113. }
  114. if k == "contract_guarantee" || k == "bid_guarantee" ||
  115. k == "is_acquire_tender" {
  116. if len(fmt.Sprint(v)) > 0 {
  117. tmp[k] = true
  118. } else {
  119. delete(tmp, k)
  120. }
  121. }
  122. if k == "is_joint_bidding" || k == "is_payment_deposit" {
  123. if fmt.Sprint(v) == "true" {
  124. tmp[k] = true
  125. } else {
  126. delete(tmp, k)
  127. }
  128. }
  129. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
  130. delete(tmp, k)
  131. }
  132. }
  133. //特殊字段~根据其他字段处理
  134. bid_bond := qu.ObjToString(tmp["bid_bond"])
  135. if bid_bond != "" && tmp["is_payment_deposit"] == nil {
  136. if strings.Contains(bid_bond, "保证金") && !clearbondReg.MatchString(bid_bond) {
  137. tmp["is_payment_deposit"] = true
  138. }
  139. }
  140. //特殊字段~根据其他字段处理
  141. bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
  142. if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
  143. if utf8.RuneCountInString(bidopenaddress) > 5 {
  144. tmp["bidopen_shape"] = "线下开标"
  145. }
  146. }
  147. //项目周期-有效值
  148. projectperiod := qu.ObjToString(tmp["projectperiod"])
  149. if projectperiod != "" {
  150. //项目周期包含日期,数字及日期单位可保留,其余可清洗
  151. isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
  152. if !isNeedValueReg.MatchString(projectperiod) {
  153. delete(tmp, "projectperiod")
  154. }
  155. }
  156. //工期单位是否有效-清理
  157. if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
  158. dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
  159. if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit) > 4 {
  160. delete(tmp, "project_timeunit")
  161. }
  162. //年-0 >5 删除
  163. if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"]) == 0 || qu.Int64All(tmp["project_duration"]) > 5) {
  164. delete(tmp, "project_timeunit")
  165. }
  166. }
  167. //中标单位统一
  168. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  169. strwin := qu.ObjToString(tmp["winner"])
  170. strwin_s := qu.ObjToString(tmp["s_winner"])
  171. if !strings.Contains(strwin_s, strwin) {
  172. tmp["s_winner"] = strwin
  173. }
  174. } else if qu.ObjToString(tmp["s_winner"]) == "" && qu.ObjToString(tmp["winner"]) != "" {
  175. tmp["s_winner"] = tmp["winner"]
  176. }
  177. //投标方式-
  178. bidway := qu.IntAll(tmp["bidway"])
  179. if bidway == 1 {
  180. tmp["bidway"] = "纸质投标"
  181. } else if bidway == 2 {
  182. tmp["bidway"] = "电子投标"
  183. } else {
  184. delete(tmp, "bidway")
  185. }
  186. //折扣系数
  187. discount := dealWithDiscountBid(tmp)
  188. if discount > 0.0 {
  189. tmp["biddiscount"] = discount
  190. } else {
  191. delete(tmp, "biddiscount")
  192. }
  193. delete(tmp, "biddiscount_up")
  194. delete(tmp, "biddiscount_down")
  195. //budget bidamount 阈值限定再次
  196. if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
  197. tmp["budget_threshold"] = bg
  198. delete(tmp, "budget")
  199. }
  200. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
  201. tmp["bidamount_threshold"] = bg
  202. delete(tmp, "bidamount")
  203. }
  204. //检查剑鱼发布-爬虫
  205. jyfb_data := *qu.ObjToMap(j_data["jyfb_data"])
  206. if jyfb_data != nil {
  207. for k, v := range jyfb_data {
  208. if k == "area" {
  209. delete(tmp, "district")
  210. }
  211. tmp[k] = v
  212. }
  213. }
  214. //针对拟建单位~需要验证~各种字段优先级
  215. if qu.ObjToString(tmp["toptype"]) == "拟建" &&
  216. qu.ObjToString(tmp["subtype"]) == "拟建" {
  217. nj_record := map[string]interface{}{}
  218. for _, v := range NiJianField {
  219. arr := strings.Split(v, "#")
  220. k_type, k_field := "", ""
  221. if len(arr) == 2 {
  222. k_type, k_field = arr[0], arr[1]
  223. } else {
  224. continue
  225. }
  226. tmpValue := tmp[k_field]
  227. is_use := false
  228. if k_type == "string" {
  229. if qu.ObjToString(j_data[k_field]) != "" {
  230. is_use = true
  231. tmp[k_field] = qu.ObjToString(j_data[k_field])
  232. }
  233. } else if k_type == "time" {
  234. if j_data[k_field] != nil {
  235. tmp["s_"+k_field] = j_data[k_field]
  236. }
  237. //开竣工日期,采集为字符串
  238. if qu.ObjToString(j_data[k_field]) != "" {
  239. new_data := clear.ObjToTimestamp([]interface{}{j_data[k_field]}, "")
  240. if len(new_data) > 0 {
  241. if qu.Int64All(new_data[0]) > 0 {
  242. is_use = true
  243. tmp[k_field] = qu.Int64All(new_data[0])
  244. }
  245. }
  246. } else {
  247. if qu.Int64All(j_data[k_field]) > int64(0) {
  248. is_use = true
  249. tmp[k_field] = qu.Int64All(j_data[k_field])
  250. }
  251. }
  252. } else if k_type == "map" {
  253. p_info := *qu.ObjToMap(j_data["project_scale_info"])
  254. if qu.ObjToString(p_info[k_field]) != "" {
  255. is_use = true
  256. tmp[k_field] = qu.ObjToString(p_info[k_field])
  257. }
  258. } else {
  259. }
  260. if tmpValue != nil {
  261. nj_record[k_field] = map[string]interface{}{
  262. k_field: tmpValue,
  263. "is_use": is_use,
  264. }
  265. }
  266. }
  267. if len(nj_record) > 0 {
  268. tmp["nj_record"] = nj_record
  269. }
  270. }
  271. //投标截止日期与开始日期-核对
  272. publishtime := qu.Int64All(tmp["publishtime"])
  273. bidopentime := qu.Int64All(tmp["bidopentime"])
  274. bidendtime := qu.Int64All(tmp["bidendtime"])
  275. if qu.ObjToString(tmp["toptype"]) == "招标" && qu.Int64All(tmp["dataging"]) == 0 {
  276. if publishtime-bidopentime > 7*86400 && publishtime > 0 && bidopentime > 0 {
  277. delete(tmp, "bidopentime")
  278. }
  279. if publishtime-bidendtime > 7*86400 && publishtime > 0 && bidopentime > 0 {
  280. delete(tmp, "bidopentime")
  281. }
  282. }
  283. //企业资质检验,不含有资质时删除
  284. if enterprise_qualification, ok := tmp["enterprise_qualification"]; ok {
  285. special := `(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|专业承包资质|贰 级|叁 级|二类|一类|三类|综合资质|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|承装(修、试)|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|房屋建筑监理|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|建筑智能化|水利水电工程|城乡规划资质|水利工程|环境工程|市政工程|公路行业|交通工程|建筑行业|电子与智能化工程|工程监理|建筑工程|土地规划|地基基础工程)`
  286. reg := regexp.MustCompile(special)
  287. var res = make([]string, 0)
  288. datas := strings.Split(qu.ObjToString(enterprise_qualification), "\n")
  289. for _, data := range datas {
  290. results := reg.FindAllString(data, -1)
  291. if len(results) > 0 {
  292. res = append(res, data)
  293. }
  294. }
  295. if len(res) == 0 {
  296. delete(tmp, "enterprise_qualification")
  297. } else {
  298. tmp["enterprise_qualification"] = strings.Join(res, "\n")
  299. }
  300. }
  301. return tmp
  302. }