zhengkun 1 rok pred
rodič
commit
6226f41e0c

+ 1 - 1
src/jy/extract/extract.go

@@ -526,7 +526,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				if !ju.Logic(vc.LuaLogic, tmp) {
 					continue
 				}
-				if vc.Field == "bidamount" {
+				if vc.Field == "budget" {
 					//log.Debug("调试抽取字段")
 				}
 				//抽取-前置规则

+ 21 - 0
src/jy/extract/extractcity_new.go

@@ -20,6 +20,26 @@ func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
 			(*tmp)[k] = v
 		}
 	}
+
+	//对于补充的地域信息进行标准化校验...
+	s_area := qu.ObjToString((*tmp)["s_area"])
+	s_city := qu.ObjToString((*tmp)["s_city"])
+	s_district := qu.ObjToString((*tmp)["s_district"])
+	if s_area != "" && s_area != "全国" {
+		s_rdata := e.StandardCheckCity(s_area, s_city, s_district)
+		delete((*tmp), "s_area")
+		delete((*tmp), "s_city")
+		delete((*tmp), "s_district")
+		if s_rdata["s_area"] != "" && s_rdata["s_area"] != "全国" {
+			(*tmp)["s_area"] = s_rdata["s_area"]
+		}
+		if s_rdata["s_city"] != "" {
+			(*tmp)["s_city"] = s_rdata["s_city"]
+		}
+		if s_rdata["s_district"] != "" {
+			(*tmp)["s_district"] = s_rdata["s_district"]
+		}
+	}
 }
 
 // 抽取地域信息
@@ -242,6 +262,7 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, jf *ju.Job, tmp *map[string]i
 	(*tmp)["city"] = f_city
 	(*tmp)["district"] = f_district
 	(*tmp)["regions_log"] = logRecordInfo
+
 }
 
 // 对组进行分析处理

+ 226 - 0
src/jy/extract/extractcity_other.go

@@ -0,0 +1,226 @@
+package extract
+
+import ju "jy/util"
+
+// ExtractRegionOtherInfo supplements region fields when no standard region was extracted.
+//
+// It rebuilds the match scores on j, takes the top-scoring province, city and
+// district candidates (full-name scores when any province matched by full name,
+// short-name scores otherwise) and resolves them to a single triple. When the
+// resulting province is non-empty and not "全国", the triple is stored in tmp
+// under "s_area", "s_city" and "s_district", and every entry returned by
+// StandardCheckCity is then copied into tmp with an "s_" key prefix.
+func (e *ExtractTask) ExtractRegionOtherInfo(j *ju.Job, tmp *map[string]interface{}) {
+	e.GetMatchScores(j)
+
+	// Full-name scores win; short-name scores are only a fallback.
+	areaScores, cityScores, districtScores := j.FullAreaScore, j.FullCityScore, j.FullDistrictScore
+	if len(areaScores) == 0 {
+		areaScores, cityScores, districtScores = j.SimAreaScore, j.SimCityScore, j.SimDistrictScore
+	}
+
+	topAreas := GetHighestScoreArr(areaScores)
+	// Must run before the city/district picks: it prunes j's full-name score maps in place.
+	e.RemoveScoreRegion(topAreas, j)
+	topCities := GetHighestScoreArr(cityScores)
+	topDistricts := GetHighestScoreArr(districtScores)
+
+	s_area, s_city, s_district := e.GetFinallyScoreRegion(topAreas, topCities, topDistricts)
+	e.StandardizedegionInfo(&s_area, &s_city, &s_district)
+
+	// Nothing usable was found: leave tmp untouched.
+	if s_area == "" || s_area == "全国" {
+		return
+	}
+
+	fields := *tmp
+	fields["s_area"] = s_area
+	fields["s_city"] = s_city
+	fields["s_district"] = s_district
+	// NOTE(review): keys are written as "s_"+key here, while ExtractRegionClean reads
+	// "s_area"/"s_city"/"s_district" straight from StandardCheckCity's result — confirm
+	// which key naming StandardCheckCity actually returns.
+	for key, val := range e.StandardCheckCity(s_area, s_city, s_district) {
+		fields["s_"+key] = val
+	}
+}
+
+// GetMatchScores resets j's six Full*/Sim* score maps, then looks every token from e.Seg_SV.Cut(j.Content, true) up in the full-name and short-name region tries, adding 1.0 at most once per region name per map.
+func (e *ExtractTask) GetMatchScores(j *ju.Job) {
+	j.FullAreaScore, j.FullCityScore, j.FullDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{} // start from fresh, empty full-name score maps
+	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{} // start from fresh, empty short-name score maps
+	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{} // names already scored via full-name matches
+	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{} // names already scored via short-name matches
+	for _, text := range e.Seg_SV.Cut(j.Content, true) {
+		if text == "" {
+			continue
+		}
+		for pos_full, trie_full := range e.Trie_Fulls { // full-name matching; the trie index selects the region level
+			if trie_full.Get(text) {
+				if pos_full == 0 { // province full name
+					if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !rf_area[tmpPbrief] { // map to the province short name
+						j.FullAreaScore[tmpPbrief] += 1.0
+						rf_area[tmpPbrief] = true
+						break // token handled; skip the remaining full-name tries
+					}
+				} else if pos_full == 1 { // city full name
+					if cfMap := e.CityFullMap[text]; cfMap != nil {
+						if !rf_area[cfMap.P.Brief] { // credit the city's province as well
+							j.FullAreaScore[cfMap.P.Brief] += 1.0
+							rf_area[cfMap.P.Brief] = true
+						}
+						if !rf_city[cfMap.Name] {
+							j.FullCityScore[cfMap.Name] += 1.0
+							rf_city[cfMap.Name] = true
+						}
+						break // token handled; skip the remaining full-name tries
+					}
+				} else if pos_full == 2 { // district full name
+					citys := e.DistrictCityMap[text] // every city that has a district with this name
+					if len(citys) > 0 {
+						if !rf_district[text] {
+							j.FullDistrictScore[text] += 1.0
+							rf_district[text] = true
+						}
+						for _, c := range citys { // credit each candidate city and its province
+							if !rf_city[c.Name] {
+								j.FullCityScore[c.Name] += 1.0
+								rf_city[c.Name] = true
+							}
+							if !rf_area[c.P.Brief] {
+								j.FullAreaScore[c.P.Brief] += 1.0
+								rf_area[c.P.Brief] = true
+							}
+						}
+						break
+					}
+				}
+			}
+		}
+		// Short-name matching; runs for every token regardless of the full-name result.
+		for pos_sim, trie_sim := range e.Trie_Sims {
+			if trie_sim.Get(text) {
+				if pos_sim == 0 && !rs_area[text] { // province short name
+					j.SimAreaScore[text] += 1.0
+					rs_area[text] = true
+					break // token handled; skip the remaining short-name tries
+				} else if pos_sim == 1 { // city short name
+					if cbMap := e.CityBriefMap[text]; cbMap != nil {
+						if !rs_area[cbMap.P.Brief] { // credit the city's province as well
+							j.SimAreaScore[cbMap.P.Brief] += 1.0
+							rs_area[cbMap.P.Brief] = true
+						}
+						if !rs_city[cbMap.Name] {
+							j.SimCityScore[cbMap.Name] += 1.0
+							rs_city[cbMap.Name] = true
+						}
+						break
+					}
+				} else if pos_sim == 2 { // district short name
+					dfull_citys := e.DistrictSimAndAll[text]
+					if len(dfull_citys) == 1 { // only short names with exactly one entry are scored (presumably to skip ambiguous names — confirm)
+						for _, dfull_city := range dfull_citys {
+							for dfull, ctmp := range dfull_city { // dfull: the full name that the short name stands for
+								if ctmp == nil {
+									continue
+								}
+								if !rs_district[dfull] {
+									j.SimDistrictScore[dfull] += 1.0
+									rs_district[dfull] = true
+								}
+								if !rs_city[ctmp.Name] {
+									j.SimCityScore[ctmp.Name] += 1.0
+									rs_city[ctmp.Name] = true
+								}
+								if !rs_area[ctmp.P.Brief] {
+									j.SimAreaScore[ctmp.P.Brief] += 1.0
+									rs_area[ctmp.P.Brief] = true
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// 获取最高分地域
+func GetHighestScoreArr(m map[string]float64) []string {
+	result := make(map[float64][]string)
+	tmpscore := 0.0
+	for str, score := range m {
+		if str != "" && tmpscore <= score {
+			if result[tmpscore] != nil && tmpscore != score {
+				delete(result, tmpscore)
+			}
+			if r := result[score]; r != nil {
+				r = append(r, str)
+				result[score] = r
+			} else {
+				result[score] = []string{str}
+			}
+			tmpscore = score
+		}
+	}
+	return result[tmpscore]
+}
+
+// RemoveScoreRegion drops full-name city and district scores on j that belong to
+// a province which is not among the selected provinces in finishP.
+//
+// Both passes delete from j.FullCityScore / j.FullDistrictScore in place; the
+// short-name score maps are not touched.
+func (e *ExtractTask) RemoveScoreRegion(finishP []string, j *ju.Job) {
+	// selected reports whether a province short name is one of finishP.
+	selected := func(brief string) bool {
+		for _, p := range finishP {
+			if p == brief {
+				return true
+			}
+		}
+		return false
+	}
+
+	// Pass 1: for each scored district, look at every city that owns a district of
+	// that name. A city that is itself scored but lies in an unselected province is
+	// removed together with the district.
+	for district := range j.FullDistrictScore {
+		for _, city := range e.DistrictCityMap[district] {
+			if j.FullCityScore[city.Name] == 0 || selected(city.P.Brief) {
+				continue
+			}
+			delete(j.FullCityScore, city.Name)
+			delete(j.FullDistrictScore, district)
+		}
+	}
+
+	// Pass 2: remove any remaining scored city whose province is not selected.
+	// Cities that e.CityFullMap does not know are left as they are.
+	for cityName := range j.FullCityScore {
+		info := e.CityFullMap[cityName]
+		if info == nil {
+			continue
+		}
+		if !selected(info.P.Brief) {
+			delete(j.FullCityScore, cityName)
+		}
+	}
+}
+
+// 获取最后分数地域
+func (e *ExtractTask) GetFinallyScoreRegion(finishA, finishC, finishD []string) (string, string, string) {
+	s_area, s_city, s_district := "", "", ""
+	tmpcity := []string{}
+	if len(finishA) == 1 {
+		s_area = finishA[0]
+		s_city, tmpcity = NewGetCity(s_area, s_city, e, finishC, tmpcity)
+		s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
+	} else if len(finishA) > 1 {
+		if len(finishC) == 1 {
+			s_city = finishC[0]
+			if cfMap := e.CityFullMap[s_city]; cfMap != nil {
+				s_area = cfMap.P.Brief
+				tmpcity = append(tmpcity, s_city)
+				s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
+			}
+		} else {
+			s_area = finishA[0] //抽取结果直接赋值
+			s_city, tmpcity = NewGetCity(s_area, s_city, e, finishC, tmpcity)
+			s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
+		}
+	}
+	if s_city != "" && s_city == s_district {
+		s_district = ""
+	}
+	return s_area, s_city, s_district
+}

+ 5 - 1
src/jy/extract/extractsave.go

@@ -408,7 +408,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//城市抽取
 		if e.IsExtractCity {
 			e.ExtractRegionInfo(j, jf, &tmp, true)
-			e.ExtractRegionClean(&tmp)
+			e.ExtractRegionClean(&tmp) //正常标准清洗
+			if qu.ObjToString(tmp["area"]) == "" || qu.ObjToString(tmp["全国"]) == "" {
+				//需要调试...
+				e.ExtractRegionOtherInfo(j, &tmp)
+			}
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {

+ 4 - 1
src/jy/pretreated/analycore.go

@@ -31,8 +31,11 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 			v1 = sv[0]
 		}
 	}
+	if k == "本期概算(万元)" {
+		//log.Debug("21323")
+	}
 	//对值单位的处理   (预算|费|价|额|规模|投资)
-	if moneyReg.MatchString(k) {
+	if moneyReg.MatchString(k) || strings.Contains(k, "万") {
 		v1 += GetMoneyUnit(k, v1)
 	}
 	//先清理key

+ 4 - 1
src/jy/pretreated/analymethod.go

@@ -31,7 +31,7 @@ var (
 	//清理表格td中的符号
 	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*")
 	//判断key是金额,对万元的处理
-	moneyReg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
+	moneyReg = regexp.MustCompile("(预算|概算|费|价|额|规模|投资)")
 	//特殊文本-为表头
 	specHeadReg = regexp.MustCompile("(成交供应商|中选人)")
 	//key不需要清理-例如折扣 费率
@@ -334,6 +334,9 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 	con = formattext20.ReplaceAllString(con, "\n")
 	con = formattext21.ReplaceAllString(con, "")
 
+	//特殊结构转换
+	con = formattext30.ReplaceAllString(con, "${1}")
+
 	//指定爬虫-特殊结构-计算抽取
 	if codeSite == "a_zgzfcgw_zfcghtgg_new" {
 		str := formattext50.FindString(con)

+ 3 - 0
src/jy/pretreated/analystep.go

@@ -46,6 +46,9 @@ var formattext12 = regexp.MustCompile("((成交|中标)价格|本期预算)\n[(
 var formattext13 = regexp.MustCompile("中选单位名称[::]标的一[::](.{4,25}公司)[;;]标的二[::](.{4,25}公司)\n中选金额[::]标的一[::]([0-9\\.人民币万元]+)[;;]标的二[::]([0-9\\.人民币万元]+)")
 var formattext14 = regexp.MustCompile("包号\n项目名称\n中标单位名称\n中标金额[((]元[))]\n1\n.*\n(.{4,25}公司)\n([0-9\\.]+)\n2\n.*\n(.{4,25}公司)\n([0-9\\.]+)")
 
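+// formattext30 matches a run of whitespace directly before a "(万元)" unit marker (half- or full-width parentheses); AnalyStart replaces the match with the captured marker alone, removing the whitespace.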
+var formattext30 = regexp.MustCompile("[\\s  ]+([((]万元[))])")
+
 // 业绩相关~分割
 var formattext20 = regexp.MustCompile("(工程业绩|投标文件中填报的单位项目业绩名称)[::][\n]?1[、.].*\n2[、.].*\n(3[、.].*\n)?")
 var formattext21 = regexp.MustCompile("(完成过)([ \\s]+)?[一二三1-9]([ \\s]+)?项总投资([0-9〇零点壹贰叁肆伍陆柒捌玖拾佰仟万亿元圆角分整,,\\.]{3,}[万亿元圆角分整]+)")

+ 4 - 2
src/jy/pretreated/tablev2.go

@@ -346,8 +346,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite stri
 			if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
 				tagindex = strings.Index(td.Text, "(")
 			}
-			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
-			td.BH = true
+			if tagindex >= 0 {
+				td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
+				td.BH = true
+			}
 		}
 		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3, isSite, codeSite) //td冒号kv
 		for k, v := range resm {

+ 1 - 1
udpcontrol/src/method.go

@@ -97,8 +97,8 @@ func extractRunningMonitoring() {
 						"skey":  "heart_extract" + k,
 					})
 					sendSingleOtherNode(by, keyArr[0], keyArr[1])
-					heart_num := qu.IntAll(heartAction[k])
 					heartlock.Lock()
+					heart_num := qu.IntAll(heartAction[k])
 					heartAction[k] = heart_num + 1
 					heartlock.Unlock()
 				}