Sfoglia il codice sorgente

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 4 anni fa
parent
commit
dac837b4ab

+ 3 - 9
src/config.json

@@ -26,7 +26,7 @@
     "fieldsfind": false,
     "qualityaudit": false,
     "saveblock": false,
-    "filelength": 500000,
+    "filelength": 150000,
     "iscltlog": false,
     "brandgoods": false,
     "pricenumber":true,
@@ -35,14 +35,8 @@
     "nextNode": [
         {
             "addr": "127.0.0.1",
-            "port": 1480,
-            "memo": "生中标企业"
-        },
-        {
-            "addr": "127.0.0.1",
-            "port": 1486,
-            "stype": "hangye",
-            "memo": "行业分类"
+            "port": 1485,
+            "memo": "抽取城市"
         }
     ],
     "esconfig": {

+ 5 - 5
src/jy/admin/task/task.go

@@ -97,11 +97,11 @@ func init() {
 	})
 	//任务测试
 	Admin.POST("/task/test", func(c *gin.Context) {
-		startid, _ := c.GetPostForm("startid")
-		num, _ := c.GetPostForm("num")
-		taskid, _ := c.GetPostForm("taskid")
-		resultcoll, _ := c.GetPostForm("resultcoll")
-		trackcoll, _ := c.GetPostForm("trackcoll")
+		startid := strings.TrimSpace(c.PostForm("startid"))
+		num := strings.TrimSpace(c.PostForm("num"))
+		taskid := strings.TrimSpace(c.PostForm("taskid"))
+		resultcoll := strings.TrimSpace(c.PostForm("resultcoll"))
+		trackcoll := strings.TrimSpace(c.PostForm("trackcoll"))
 		version, _ := Mgo.FindById("task", taskid, `{"s_version":1}`)
 		b := extract.StartExtractTestTask(taskid, startid, num, resultcoll, trackcoll)
 		if b { //保存结果表和日志表

+ 1 - 1
src/jy/cluster/ssh.go

@@ -49,7 +49,7 @@ var sshstr = `
 cd /opt
 kill -9 $(pidof extract_v3)
 rm -rf extract_v3*
-wget http://172.17.4.189:9080/res/extract_v3.tgz
+wget http://172.17.4.196:9080/res/extract_v3.tgz
 tar -xzvf extract_v3.tgz
 cd /opt/extract_v3
 chmod 777 extract_v3

+ 97 - 80
src/jy/extract/extract.go

@@ -18,8 +18,6 @@ import (
 	"time"
 	"unicode/utf8"
 
-	"github.com/PuerkitoBio/goquery"
-
 	log "github.com/donnie4w/go-logger/logger"
 	"gopkg.in/mgo.v2/bson"
 )
@@ -33,8 +31,8 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 100                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
-	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
+	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
 	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -62,6 +60,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitTag(true)
 	ext.InitClearFn(false)
 	ext.InitClearFn(true)
+	ext.Lock()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		ext.InitCityInfo()
@@ -69,6 +68,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 		ext.InitAreaCode()
 		ext.InitPostCode()
 	}
+	ext.Unlock()
 	//质量审核
 	ext.InitAuditFields()
 	ext.InitAuditRule()
@@ -151,6 +151,7 @@ func StartExtractTaskId(taskId string) bool {
 	ext.InitTag(true)
 	ext.InitClearFn(false)
 	ext.InitClearFn(true)
+	ext.Lock()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		//ext.InitCityDFA()
@@ -158,6 +159,7 @@ func StartExtractTaskId(taskId string) bool {
 		ext.InitAreaCode()
 		ext.InitPostCode()
 	}
+	ext.Unlock()
 	//质量审核
 	ext.InitAuditFields()
 	ext.InitAuditRule()
@@ -216,7 +218,7 @@ func RunExtractTask(taskId string) {
 			//}
 			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时
 				continue
-			} //根据标题判断是否抽取
+			}
 			b := IsExtract("title", qu.ObjToString(v["title"]), "")
 			if !b {
 				continue
@@ -285,30 +287,22 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
-	if utf8.RuneCountInString(detail) < 2000 {
-		if doc["detailfile"] == nil || doc["detailfile"] == "" {
-			file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
-		}
-		detail += qu.ObjToString(doc["detailfile"])
-		doc["detail"] = detail
-	} else {
-		//正文小于200个字,有附件把附件内容加到正文
-		tmpDeatil := detail
-		tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
-		if err == nil {
-			conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
-			if conlen < 2000 {
-				if isextFile {
-					detail += qu.ObjToString(doc["detailfile"])
-					doc["detail"] = detail
-				}
-			} else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
-				//防止文本过长,造成抽取阻塞
-				log.Debug("文本太长", doc["_id"], conlen)
-				doc["detail"] = d3
-			}
-		}
-	}
+	//正文小于200个字,有附件把附件内容加到正文
+	//tmpDeatil := detail
+	//tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
+	//if err == nil {
+	//	conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
+	//	if conlen < 2000 {
+	//		if isextFile {
+	//			detail += qu.ObjToString(doc["detailfile"])
+	//			doc["detail"] = detail
+	//		}
+	//	} else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
+	//		//防止文本过长,造成抽取阻塞
+	//		log.Debug("文本太长", doc["_id"], conlen)
+	//		doc["detail"] = d3
+	//	}
+	//}
 
 	toptype := qu.ObjToString(doc["toptype"])
 	subtype := qu.ObjToString(doc["subtype"])
@@ -331,6 +325,9 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		if (*toMap)["extweight"] == nil {
 			(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
 		}
+		if (*toMap)["jsoncontent"] != nil {
+			delete(*toMap, "jsoncontent")
+		}
 	}
 	j = &ju.Job{
 		SourceMid:      qu.BsonIdToSId(doc["_id"]),
@@ -398,7 +395,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	}
 	qu.Try(func() {
 		pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
-		if isextFile {
+		if isextFile && strings.TrimSpace(jf.Content) != "" {
 			pretreated.AnalyStart(jf, isSite, codeSite)
 		}
 	}, func(err interface{}) {
@@ -407,9 +404,14 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	return j, jf, isSite
 }
 
+var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
+var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
+
 //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 func file2text(doc *map[string]interface{}) {
-	tmpstr := ""
+	mnameone := map[string]bool{}
+	mname := map[string]bool{}
+	murl := map[string]string{}
 	//if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
 	if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
 		for _, attachs := range attach_text {
@@ -417,22 +419,41 @@ func file2text(doc *map[string]interface{}) {
 				for _, fileinfo := range fileinfos {
 					if ff, ok := fileinfo.(map[string]interface{}); ok {
 						attach_url := qu.ObjToString(ff["attach_url"])
-						//if utf8.RuneCountInString(tmpstr+attach_url) < qu.IntAllDef(ju.Config["filelength"], 100000) {
-						//	tmpstr += attach_url + "\n"
-						//} else {
-						//	break
-						//}
-						bs := ju.OssGetObject(attach_url)
-						if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
-							tmpstr += bs + "\n"
-						} else {
-							break
+						ffname := qu.ObjToString(ff["file_name"])
+						if clearStrReg.MatchString(ffname) {
+							continue
+						}
+						mname[ffname] = true
+						murl[ffname] = attach_url
+						if sortStrReg.MatchString(ffname) {
+							mnameone[ffname] = true
 						}
 					}
 				}
 			}
 		}
 	}
+	tmpstr := ""
+	for k := range mnameone {
+		if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
+			(*doc)["detailfile"] = tmpstr
+			return
+		}
+		bs := ju.OssGetObject(murl[k])
+		if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
+			tmpstr += bs + "\n"
+		}
+	}
+	for k := range mname {
+		if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
+			(*doc)["detailfile"] = tmpstr
+			return
+		}
+		bs := ju.OssGetObject(murl[k])
+		if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
+			tmpstr += bs + "\n"
+		}
+	}
 	(*doc)["detailfile"] = tmpstr
 }
 
@@ -441,6 +462,14 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 	e.ExtractDetail(j, isSite, j.SpiderCode)
 	if jf != nil && jf.IsFile {
 		e.ExtractFile(jf, isSite, j.SpiderCode)
+		for tmpk, _ := range jf.Result {
+			if len(j.Result[tmpk]) == 0 {
+					j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
+			}
+		}
+		if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
+			j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
+		}
 	}
 	if isSite {
 		ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
@@ -605,6 +634,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				lockclear.Lock()
 				var cfn = []string{}
 				if isSite {
+					cfn = e.SiteClearFn[key]
 				} else {
 					cfn = e.ClearFn[key]
 				}
@@ -623,7 +653,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				}
 				before, _ := v.Value.(string)
 				v.Value = data[0]
-				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
+				BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
 				//添加行数清理的日志
 				//清理特殊符号
 				lockclear.Lock()
@@ -631,7 +661,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					text := qu.ObjToString(v.Value)
 					before = text
 					v.Value = clear.OtherClean(key, text)
-					BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
+					BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
 				}
 				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
 				lockclear.Unlock()
@@ -1198,34 +1228,10 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
 		for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
 			if k == 0 {
 				tp = "colon"
-				//				for _, vv := range v.Kvs {
-				//					qu.Debug("colon-kvs:", vv.Key, vv.Value)
-				//				}
-				//				for kkk, vv := range v.KvTags {
-				//					for _, vvv := range vv {
-				//						qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
-				//					}
-				//				}
 			} else if k == 1 {
 				tp = "space"
-				//				for _, vv := range v.Kvs {
-				//					qu.Debug("space-kvs:", vv.Key, vv.Value)
-				//				}
-				//				for kkk, vv := range v.KvTags {
-				//					for _, vvv := range vv {
-				//						qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
-				//					}
-				//				}
 			} else if k == 2 {
 				tp = "table"
-				//				for _, vv := range v.Kvs {
-				//					qu.Debug("table-kvs:", vv.Key, vv.Value)
-				//				}
-				//				for kkk, vv := range v.KvTags {
-				//					for _, vvv := range vv {
-				//						qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
-				//					}
-				//				}
 			}
 			if v == nil || v.KvTags == nil {
 				continue
@@ -1724,11 +1730,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		tmp := map[string]interface{}{} //抽取值
 		tmp["spidercode"] = j.SpiderCode
 		tmp["site"] = j.Site
-		tmp["jsondata"] = j.Jsondata
+		if len(*j.Jsondata) > 0 {
+			tmp["jsondata"] = j.Jsondata
+		}
 		for _, val := range result {
 			for _, v := range val { //取第一个非负数,项目名称除外
 				//存0是否有效
-				if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
+				if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue{
 					tmp[v.Field] = v.Value
 					break
 				}
@@ -1771,7 +1779,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				if qu.Float64All(tmp["budget"]) < tmpBudget {
 					tmp["budget"] = tmpBudget
 				}
-				if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/100 > qu.Float64All(tmp["budget"])) {
+				if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
 					tmp["bidamount"] = tmpBidamount
 				} else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
 					tmp["bidamount"] = tmpBidamount
@@ -1817,8 +1825,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(j.Winnerorder) > 0 { //候选人信息
 			for i, v := range j.Winnerorder {
 				if v["price"] != nil {
-					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0]
-				}
+					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0]				}
 			}
 			tmp["winnerorder"] = j.Winnerorder
 		}
@@ -1831,6 +1838,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				for _, v := range val { //取第一个非负数
 					if v.Score > -1 {
 						ffield[v.Field] = v.Value
+						if tmp[v.Field] == nil {
+							if v.Field == "budget" || v.Field == "bidamount" {
+								if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
+									tmp[v.Field] = v.Value
+								}
+							} else {
+								tmp[v.Field] = v.Value
+							}
+						}
 						break
 					}
 				}
@@ -1959,9 +1975,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["projectname"] = j.Title
 		}
 		tmp["repeat"] = 0
-		if ju.Ffield {
-			tmp["ffield"] = ffield
-		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -2013,9 +2026,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			//if len(auxinfof) > 0 {
 			//	tmp["fieldallf"] = auxinfof
 			//}
-			//if len(ffield) > 0 {
-			//	tmp["ffield"] = ffield
-			//}
+			if ju.Ffield {
+				if len(ffield) > 0 {
+					tmp["ffield"] = ffield
+				}
+			}
 			delete(tmp, "fieldall")
 			if len(j.BlockPackage) > 0 { //分包详情
 				if len(j.BlockPackage) > 10 {
@@ -2040,6 +2055,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 func checkFields(tmp map[string]interface{}) map[string]interface{} {
 	delete(tmp, "contenthtml")
 	delete(tmp, "detail")
+	//delete(tmp, "toptype")
+	//delete(tmp, "subtype")
 	if _, ok := tmp["bidamount"].(string); ok {
 		delete(tmp, "bidamount")
 	} else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]) {
@@ -2167,7 +2184,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	}
 	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
 		//jsondata清理
-		clearJd(j.Jsondata, e, j.SpiderCode)
+		clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
 		json.Unmarshal(marshalbt, &tmpjddata)
@@ -2393,7 +2410,7 @@ func resetWinnerorder(j *ju.Job) {
 	} else if len(bidamounts) > 0 {
 		j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
 	}
-
+	
 }
 func RemoveReplicaSliceString(slc []string) []string {
 	result := make([]string, 0)

+ 9 - 2
src/jy/extract/extractudp.go

@@ -111,6 +111,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		ext.InitTag(true)
 		ext.InitClearFn(false)
 		ext.InitClearFn(true)
+		ext.Lock()
 		if ext.IsExtractCity { //版本上控制是否开始城市抽取
 			//初始化城市DFA信息
 			//ext.InitCityDFA()
@@ -118,6 +119,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 			ext.InitAreaCode()
 			ext.InitPostCode()
 		}
+		ext.Unlock()
 		//质量审核
 		ext.InitAuditFields()
 		ext.InitAuditRule()
@@ -155,7 +157,12 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 			log.Debug("timestr", (*tsk)["timestr"], "count", count1+count2)
 			list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
 			for _, v := range *list {
-				if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+				//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+				//	log.Debug(index, qu.BsonIdToSId(v["_id"]), "//去除含敏感词数据")
+				//	continue
+				//}
+				if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时开标记录
+					log.Debug(index, qu.BsonIdToSId(v["_id"]), "//开标记录")
 					continue
 				}
 				var j, jf *ju.Job
@@ -217,7 +224,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		for i := 0; i < pageNum; i++ {
 			query = bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(sid)}}
 			fmt.Printf("page=%d,query=%v\n", i+1, query)
-			list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
+			list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, `{"_id":1}`, Fields, false, 0, limit)
 			for _, v := range *list {
 				//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				//	log.Debug(index, qu.BsonIdToSId(v["_id"]), "//去除含敏感词数据")

+ 18 - 14
src/res/fieldscore.json

@@ -11,10 +11,10 @@
         },
         "fields": {
             "projectname": {
-                "title": 1,
-                "table": 3,
-                "colon": 3,
-                "space": 3,
+                "title": 1.3,
+                "table": 2.6,
+                "colon": 1.8,
+                "space": 1.2,
                 "regexp": 1,
                 "kvweight": 1
             },
@@ -52,7 +52,7 @@
             "bidamount": {
                 "table": 3,
                 "colon": 2,
-                "space": 2,
+                "space": 0.5,
                 "regexp": 2,
                 "kvweight": 1
             },
@@ -80,12 +80,12 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(项目|工程|施工|服务|设备|采购|设计|系统)$",
+                "regstr": ".{2,100}(项目|工程|施工|服务|设备|设备采购|设计|系统)$",
                 "score": 2
             },
             {
                 "describe": "+3",
-                "regstr": "(供货|采购|监测|招标|询价)",
+                "regstr": "(供货|监测|询价采购)",
                 "score": 3
             }
         ],
@@ -97,17 +97,17 @@
             },
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(的|招标|名称|公示|公告|谈判|公开|通知|采购文件|交易中心|\\d#)$",
+                "regstr": ".{2,100}(的|招标|名称|公示|公告|谈判|公开|通知|采购文件|交易中心|\\d#|\\d\\*)$",
                 "score": -5
             },
             {
                 "describe": "包含词-10",
-                "regstr": "(万元|本项目|详见公告|test|^[0-9]{1}、\\W)",
+                "regstr": "(万元|本项目|详见公告|test|^[0-9]{1}、\\W|竞争性谈判)",
                 "score": -10
             },
             {
-                "describe": "包含词-3",
-                "regstr": "[,|,]",
+                "describe": "包含词-5",
+                "regstr": "[,|-|—|\\s| ]",
                 "score": -5
             },
             {
@@ -182,7 +182,7 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(总站|委员会|管委会|联合会|联合体|医院|卫计委|机关|社区|中心|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|集团|银行|[大中小]学|院|厂|店|段|场|社|室|部|厅|局|处|所|队|公司|监狱|监测站|血站|检查站|工作站|供应站|分行)$",
+                "regstr": ".{2,100}(总站|委员会|管委会|联合会|联合体|医院|卫计委|机关|社区|中心|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|集团|银行|[大中小]学|院|厂|店|段|场|社|室|部|厅|局|处|所|队|公司|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)$",
                 "score": 10
             }
         ],
@@ -209,7 +209,7 @@
             },
             {
                 "describe": "包含负分不再展示",
-                "regstr": "(详见|提出|面向|施工|获得|test|认定|一批|项目$|系统)",
+                "regstr": "(详见|提出|面向|[^实]施工[^程]|获得|test|认定|一批|项目$|详细请?见?正文)",
                 "score": -50
             },
             {
@@ -260,12 +260,16 @@
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(集团|公司|学校|中心|家具城|门诊|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$",
                 "score": 3
+            },{
+                "describe": "关键词",
+                "regstr": "(牵头方|联合体)",
+                "score": 3
             }
         ],
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(我公司|定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "regstr": "(我公司|定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交[^通]|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -20
             },
 			{

+ 0 - 14
src/res/specialsymbols.json

@@ -9,20 +9,6 @@
 			"agencyperson":true
         },
         "symbol": [
-			{
-				"symdelete":false,
-				"text":[
-                		"(",
-                		")"
-           		 	]				
-			},
-			{
-				"symdelete":false,
-				"text":[
-	                "(",
-	                ")"
-	            ]
-			},
 			{
 				"symdelete":true,
 				"text":[

+ 5 - 3
udp_city/src/config.json

@@ -6,13 +6,15 @@
   "mgodb_bidding": "192.168.3.207:27092",
   "mgodb_bidding_xs": "172.17.4.187:27083",
   "dbname_bidding": "qfw",
-  "findDb": "bidding",
+  "findDb": "baidu_xxbs_toid20200909_result",
   "udpport": "1485",
   "nextNode": [
     {
       "addr": "127.0.0.1",
-      "port": 1484,
-      "memo": "抽取回复"
+      "port": 1781,
+      "stype":"hangye",
+      "memo": "生kv招标分类"
     }
+
   ]
 }

+ 53 - 47
udp_city/src/main.go

@@ -2,6 +2,7 @@ package main
 
 import (
 	"encoding/json"
+	"fmt"
 	"gopkg.in/mgo.v2/bson"
 	"log"
 	mu "mfw/util"
@@ -93,7 +94,9 @@ func getCity(sid, eid, rep string) {
 	for i := 0; i < pageNum; i++ {
 		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		log.Printf("page=%d,query=%v,db=%v\n", i+1, query, table)
-		list, _ := mgo.Mgo_Bidding.Find(table, query, nil, biddingFields, false, 0, limit)
+		list, _ := mgo.Mgo_Bidding.Find(table, query, map[string]interface{}{
+			"_id": 1,
+		}, biddingFields, false, 0, limit)
 		for _, v := range *list {
 			if qu.ObjToString(v["district"]) != "" && qu.ObjToString(v["city"]) != "" && qu.ObjToString(v["area"]) != "" && qu.ObjToString(v["area"]) != "全国" {
 				index++
@@ -140,67 +143,70 @@ func getCity(sid, eid, rep string) {
 }
 
 func cityMarshal(data map[string]interface{}) map[string]string {
-	rdata := make(map[string]string)
 	buyer := qu.ObjToString(data["buyer"])
+	bidarea := qu.ObjToString(data["area"])
+	bidcity := qu.ObjToString(data["city"])
+	biddistrict := qu.ObjToString(data["district"])
+	rdata := make(map[string]string)
 	tmp, _ := mgo.Mgo.FindOneByField("qyxy", `{"company_name":"`+buyer+`"}`, qyxyFields)
 	if tmp == nil || (*tmp) == nil {
 		return rdata
 	}
-	company_code := qu.ObjToString((*tmp)["company_code"])
+	company_code := fmt.Sprint((*tmp)["company_code"])
 	if len(company_code) > 5 {
 		province_city_district, _ := mgo.Mgo.FindOne("address", `{"code":"`+company_code[:6]+`"}`)
-		if province_city_district != nil && (*province_city_district) != nil && (*province_city_district)["Remarks"] != "废除" {
-			if qu.ObjToString(data["area"]) == "" || qu.ObjToString(data["area"]) == "全国" {
-				if province := qu.ObjToString((*province_city_district)["province"]); province != "" {
-					rdata["area"] = province
-				}
-				if city := qu.ObjToString((*province_city_district)["city"]); city != "" && !strings.Contains(city, rdata["area"]) {
-					rdata["city"] = city
-				}
-				if district := qu.ObjToString((*province_city_district)["district"]); district != "" && !strings.Contains(district, rdata["area"]) && !strings.Contains(district, rdata["city"]) {
-					rdata["district"] = district
-				}
-			} else if qu.ObjToString(data["city"]) == "" && qu.ObjToString((*province_city_district)["province"]) != "" && qu.ObjToString((*province_city_district)["province"]) == qu.ObjToString(data["area"]) {
-				if city := qu.ObjToString((*province_city_district)["city"]); city != "" && !strings.Contains(city, rdata["area"]) {
-					rdata["city"] = city
-				}
-				if district := qu.ObjToString((*province_city_district)["district"]); district != "" && !strings.Contains(district, rdata["area"]) && !strings.Contains(district, rdata["city"]) {
-					rdata["district"] = district
+		remarks := fmt.Sprint((*province_city_district)["Remarks"])
+		if remarks == "" || remarks == "废除" || remarks == "已作废" {
+		} else if province_city_district != nil && (*province_city_district) != nil {
+			codeprovince := qu.ObjToString((*province_city_district)["province"])
+			codecity := qu.ObjToString((*province_city_district)["city"])
+			codedistrict := qu.ObjToString((*province_city_district)["district"])
+			if bidarea == "" || bidarea == "全国" {
+				if codeprovince != "" {
+					rdata["area"] = codeprovince
+					if codecity != "" && codecity != codeprovince {
+						rdata["city"] = codecity
+						if codedistrict != "" && codedistrict != codecity {
+							rdata["district"] = codedistrict
+						}
+					}
 				}
-			} else if qu.ObjToString(data["district"]) == "" && qu.ObjToString((*province_city_district)["city"]) != "" && qu.ObjToString((*province_city_district)["city"]) == qu.ObjToString(data["city"]) {
-				if district := qu.ObjToString((*province_city_district)["district"]); district != "" && !strings.Contains(district, rdata["area"]) && !strings.Contains(district, rdata["city"]) {
-					rdata["district"] = district
+			} else if bidcity == "" && codecity != "" && bidarea == codeprovince {
+				if codecity != bidarea {
+					rdata["city"] = codecity
+					if codedistrict != "" && codecity != codedistrict {
+						rdata["district"] = codedistrict
+					}
 				}
+			} else if biddistrict == "" && codedistrict != "" && bidarea == codeprovince && codecity == bidcity {
+				rdata["district"] = codedistrict
 			}
 			return rdata
 		}
 	}
-	var province string
-	if qu.ObjToString(data["area"]) == "" || qu.ObjToString(data["area"]) == "全国" {
-		if province = qu.ObjToString((*tmp)["province"]); province != "" {
-			province = strings.TrimRight(province, "省")
-			province = strings.TrimRight(province, "市")
-			rdata["area"] = province
-		}
-		if city := qu.ObjToString((*tmp)["city"]); city != "" && province != city {
-			rdata["city"] = city
-		}
-		if district := qu.ObjToString((*tmp)["district"]); district != "" && rdata["city"] != district {
-			rdata["district"] = district
-		}
-	} else if qu.ObjToString(data["city"]) == "" && province != "" && qu.ObjToString(data["area"]) == province {
-		if city := qu.ObjToString((*tmp)["city"]); city != "" && province != city {
-			rdata["city"] = city
-		}
-		if district := qu.ObjToString((*tmp)["district"]); district != "" && qu.ObjToString((*tmp)["city"]) != district {
-			rdata["district"] = district
-		}
-	} else if qu.ObjToString(data["district"]) == "" && qu.ObjToString((*tmp)["city"]) != "" && qu.ObjToString((*tmp)["city"]) == qu.ObjToString(data["city"]) {
-		if district := qu.ObjToString((*tmp)["district"]); district != "" && qu.ObjToString(data["city"]) != district {
-			if qu.ObjToString(data["district"]) != rdata["city"] {
-				rdata["district"] = district
+
+	entprovince := qu.ObjToString((*tmp)["province"])
+	entprovince = strings.TrimRight(entprovince, "省")
+	entprovince = strings.TrimRight(entprovince, "市")
+	entcity := qu.ObjToString((*tmp)["city"])
+	entdistrict := qu.ObjToString((*tmp)["district"])
+	if bidarea == "" || bidarea == "全国" {
+		if entprovince != "" {
+			rdata["area"] = entprovince
+			if entcity != "" && entcity != entprovince {
+				rdata["city"] = entcity
+				if entdistrict != "" && entdistrict != entcity {
+					rdata["district"] = entdistrict
+				}
 			}
 		}
+	} else if bidcity == "" && entcity != "" && entprovince == bidarea {
+		rdata["city"] = entcity
+		if entdistrict != "" && entcity != entdistrict {
+			rdata["district"] = entdistrict
+		}
+	} else if biddistrict == "" && entdistrict != "" && entprovince == bidarea && bidcity == entcity {
+		rdata["district"] = entdistrict
 	}
 
 	return rdata