// extractcheck.go
  1. package extract
  2. import (
  3. "fmt"
  4. "jy/clear"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "strings"
  9. "unicode/utf8"
  10. )
  11. // 去重冗余字段
  12. func delFiled(k string) bool {
  13. return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
  14. }
  15. // 检查字段-
  16. func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
  17. delete(tmp, "contenthtml")
  18. delete(tmp, "detail")
  19. //剑鱼链接方便查阅
  20. jyhref := fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
  21. tmp["jytest_href"] = jyhref
  22. //对于招标类信息~若winner没有值~过滤掉中标相关信息
  23. if qu.ObjToString(tmp["toptype"]) == "招标" &&
  24. qu.ObjToString(tmp["subtype"]) != "单一" {
  25. delete(tmp, "winner")
  26. delete(tmp, "s_winner")
  27. delete(tmp, "bidamount")
  28. delete(tmp, "winnerorder")
  29. }
  30. tmp["repeat"] = 0
  31. //指定爬虫-金额处理-预算-中标金额异常
  32. if qu.ObjToString(tmp["spidercode"]) == "xz_xzzzqjzscjgycxxxpt_zbtzs" {
  33. if budget, ok := tmp["budget"].(float64); ok && budget > 0 && budget < 1000000 {
  34. tmp["budget"] = budget * 10000.0
  35. }
  36. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  37. tmp["bidamount"] = bidamount / 10000.0
  38. }
  39. }
  40. if qu.ObjToString(tmp["spidercode"]) == "js_jsszbtbw_zbhxrgs" {
  41. if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
  42. tmp["bidamount"] = bidamount / 10000.0
  43. }
  44. }
  45. //异常金额类型清洗-
  46. if _, ok := tmp["bidamount"].(string); ok {
  47. delete(tmp, "bidamount")
  48. }
  49. if _, ok := tmp["budget"].(string); ok {
  50. delete(tmp, "budget")
  51. }
  52. //budget bidamount 阈值限定
  53. if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
  54. tmp["budget_threshold"] = bg
  55. delete(tmp, "budget")
  56. }
  57. if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
  58. tmp["bidamount_threshold"] = bg
  59. delete(tmp, "bidamount")
  60. }
  61. //对于单位,金额与候选信息进行相互校验与选取
  62. if winner := qu.ObjToString(tmp["winner"]); winner != "" {
  63. if winnerorder := ju.IsMarkInterfaceMap(tmp["winnerorder"]); len(winnerorder) > 0 {
  64. isWin := false
  65. if tmp["package"] == nil {
  66. isWin = true
  67. } else {
  68. if !isUsedMultiPackage(tmp["package"].(map[string]map[string]interface{})) || winner == qu.ObjToString(tmp["s_winner"]) {
  69. isWin = true
  70. }
  71. }
  72. if isWin {
  73. isExists := false
  74. for k, v := range winnerorder {
  75. if k >= 2 { //仅对比前两名
  76. break
  77. }
  78. if winner == qu.ObjToString(v["entname"]) && qu.Float64All(v["price"]) > float64(0) {
  79. tmp["bidamount"] = qu.Float64All(v["price"])
  80. isExists = true
  81. break
  82. }
  83. }
  84. //单位不在候选人里面--金额一致
  85. if !isExists && len(winnerorder) > 1 && len(winnerorder) < 4 { //单位未在候选人里面找到-
  86. if entname := qu.ObjToString(winnerorder[0]["entname"]); entname != "" && qu.IntAll(winnerorder[0]["sort"]) == 1 {
  87. if price := qu.Float64All(winnerorder[0]["price"]); price > 0.0 && qu.Float64All(tmp["bidamount"]) == price {
  88. if !(strings.Contains(entname, winner) || strings.Contains(winner, entname)) {
  89. if effectivefirm.MatchString(entname) {
  90. tmp["winner"] = entname
  91. tmp["s_winner"] = entname
  92. }
  93. }
  94. }
  95. }
  96. }
  97. }
  98. }
  99. }
  100. //快速过滤一遍特殊字段
  101. for k, v := range tmp {
  102. if k == "qualifies" {
  103. continue
  104. }
  105. if k == "contract_guarantee" || k == "bid_guarantee" ||
  106. k == "is_acquire_tender" {
  107. if len(fmt.Sprint(v)) > 0 {
  108. tmp[k] = true
  109. } else {
  110. delete(tmp, k)
  111. }
  112. }
  113. if k == "is_joint_bidding" || k == "is_payment_deposit" {
  114. if fmt.Sprint(v) == "true" {
  115. tmp[k] = true
  116. } else {
  117. delete(tmp, k)
  118. }
  119. }
  120. if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
  121. delete(tmp, k)
  122. }
  123. }
  124. //特殊字段~根据其他字段处理
  125. bid_bond := qu.ObjToString(tmp["bid_bond"])
  126. if bid_bond != "" && tmp["is_payment_deposit"] == nil {
  127. if strings.Contains(bid_bond, "保证金") && !clearbondReg.MatchString(bid_bond) {
  128. tmp["is_payment_deposit"] = true
  129. }
  130. }
  131. //特殊字段~根据其他字段处理
  132. bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
  133. if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
  134. if utf8.RuneCountInString(bidopenaddress) > 5 {
  135. tmp["bidopen_shape"] = "线下开标"
  136. }
  137. }
  138. //项目周期-有效值
  139. projectperiod := qu.ObjToString(tmp["projectperiod"])
  140. if projectperiod != "" {
  141. //项目周期包含日期,数字及日期单位可保留,其余可清洗
  142. isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
  143. if !isNeedValueReg.MatchString(projectperiod) {
  144. delete(tmp, "projectperiod")
  145. }
  146. }
  147. //工期单位是否有效-清理
  148. if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
  149. dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
  150. if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit) > 4 {
  151. delete(tmp, "project_timeunit")
  152. }
  153. //年-0 >5 删除
  154. if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"]) == 0 || qu.Int64All(tmp["project_duration"]) > 5) {
  155. delete(tmp, "project_timeunit")
  156. }
  157. }
  158. //中标单位统一
  159. if tmp["winner"] != nil && tmp["s_winner"] != nil {
  160. strwin := qu.ObjToString(tmp["winner"])
  161. strwin_s := qu.ObjToString(tmp["s_winner"])
  162. if !strings.Contains(strwin_s, strwin) {
  163. tmp["s_winner"] = strwin
  164. }
  165. } else if qu.ObjToString(tmp["s_winner"]) == "" && qu.ObjToString(tmp["winner"]) != "" {
  166. tmp["s_winner"] = tmp["winner"]
  167. }
  168. //投标方式-
  169. bidway := qu.IntAll(tmp["bidway"])
  170. if bidway == 1 {
  171. tmp["bidway"] = "纸质投标"
  172. } else if bidway == 2 {
  173. tmp["bidway"] = "电子投标"
  174. } else {
  175. delete(tmp, "bidway")
  176. }
  177. //折扣系数
  178. discount := dealWithDiscountBid(tmp)
  179. if discount > 0.0 {
  180. tmp["biddiscount"] = discount
  181. } else {
  182. delete(tmp, "biddiscount")
  183. }
  184. delete(tmp, "biddiscount_up")
  185. delete(tmp, "biddiscount_down")
  186. //检查剑鱼发布-爬虫
  187. jyfb_data := *qu.ObjToMap(j_data["jyfb_data"])
  188. if jyfb_data != nil {
  189. for k, v := range jyfb_data {
  190. if k == "area" {
  191. delete(tmp, "district")
  192. }
  193. tmp[k] = v
  194. }
  195. }
  196. //针对拟建单位~需要验证~各种字段优先级
  197. if qu.ObjToString(tmp["toptype"]) == "拟建" &&
  198. qu.ObjToString(tmp["subtype"]) == "拟建" {
  199. nj_record := map[string]interface{}{}
  200. for _, v := range NiJianField {
  201. arr := strings.Split(v, "#")
  202. k_type, k_field := "", ""
  203. if len(arr) == 2 {
  204. k_type, k_field = arr[0], arr[1]
  205. } else {
  206. continue
  207. }
  208. tmpValue := tmp[k_field]
  209. is_use := false
  210. if k_type == "string" {
  211. if qu.ObjToString(j_data[k_field]) != "" {
  212. is_use = true
  213. tmp[k_field] = qu.ObjToString(j_data[k_field])
  214. }
  215. } else if k_type == "time" {
  216. if j_data[k_field] != nil {
  217. tmp["s_"+k_field] = j_data[k_field]
  218. }
  219. //开竣工日期,采集为字符串
  220. if qu.ObjToString(j_data[k_field]) != "" {
  221. new_data := clear.ObjToTimestamp([]interface{}{j_data[k_field]}, "")
  222. if len(new_data) > 0 {
  223. if qu.Int64All(new_data[0]) > 0 {
  224. is_use = true
  225. tmp[k_field] = qu.Int64All(new_data[0])
  226. }
  227. }
  228. } else {
  229. if qu.Int64All(j_data[k_field]) > int64(0) {
  230. is_use = true
  231. tmp[k_field] = qu.Int64All(j_data[k_field])
  232. }
  233. }
  234. } else if k_type == "map" {
  235. p_info := *qu.ObjToMap(j_data["project_scale_info"])
  236. if qu.ObjToString(p_info[k_field]) != "" {
  237. is_use = true
  238. tmp[k_field] = qu.ObjToString(p_info[k_field])
  239. }
  240. } else {
  241. }
  242. if tmpValue != nil {
  243. nj_record[k_field] = map[string]interface{}{
  244. k_field: tmpValue,
  245. "is_use": is_use,
  246. }
  247. }
  248. }
  249. if len(nj_record) > 0 {
  250. tmp["nj_record"] = nj_record
  251. }
  252. }
  253. //投标截止日期与开始日期-核对
  254. publishtime := qu.Int64All(tmp["publishtime"])
  255. bidopentime := qu.Int64All(tmp["bidopentime"])
  256. bidendtime := qu.Int64All(tmp["bidendtime"])
  257. if qu.ObjToString(tmp["toptype"]) == "招标" && qu.Int64All(tmp["dataging"]) == 0 {
  258. if publishtime-bidopentime > 7*86400 && publishtime > 0 && bidopentime > 0 {
  259. delete(tmp, "bidopentime")
  260. }
  261. if publishtime-bidendtime > 7*86400 && publishtime > 0 && bidopentime > 0 {
  262. delete(tmp, "bidopentime")
  263. }
  264. }
  265. //企业资质检验,不含有资质时删除
  266. if enterprise_qualification, ok := tmp["enterprise_qualification"]; ok {
  267. special := `(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|专业承包资质|贰 级|叁 级|二类|一类|三类|综合资质|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|承装(修、试)|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|房屋建筑监理|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|建筑智能化|水利水电工程|城乡规划资质|水利工程|环境工程|市政工程|公路行业|交通工程|建筑行业|电子与智能化工程|工程监理|建筑工程|土地规划|地基基础工程)`
  268. reg := regexp.MustCompile(special)
  269. var res = make([]string, 0)
  270. datas := strings.Split(qu.ObjToString(enterprise_qualification), "\n")
  271. for _, data := range datas {
  272. results := reg.FindAllString(data, -1)
  273. if len(results) > 0 {
  274. res = append(res, data)
  275. }
  276. }
  277. if len(res) == 0 {
  278. delete(tmp, "enterprise_qualification")
  279. } else {
  280. tmp["enterprise_qualification"] = strings.Join(res, "\n")
  281. }
  282. }
  283. return tmp
  284. }