Selaa lähdekoodia

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 vuotta sitten
vanhempi
commit
1315070473

+ 5 - 2
src/jy/extract/extract.go

@@ -33,7 +33,7 @@ var (
 	ClearTaskList map[string]*ClearTask   //清理任务列表
 	saveLimit     = 100                   //抽取日志批量保存
 	PageSize      = 5000                  //查询分页
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}` // presumably the query projection (confirm at call site); PreInfo reads doc["dataging"], so the key must be exactly "dataging"
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -331,6 +331,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		Result:    map[string][]*ju.ExtField{},
 		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
 		RuleBlock: e.RuleBlock,
+		Dataging: qu.IntAll(doc["dataging"]),
 	}
 	// Use && so that a nil Jsondata pointer is never dereferenced (the former || dereferenced it exactly when it was nil).
 	if j.Jsondata != nil && (*j.Jsondata) != nil && (*j.Jsondata)["jsoncontent"] != nil {
 		delete((*j.Jsondata), "jsoncontent")
@@ -352,6 +353,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 			BuyerAddr:      qu.ObjToString(doc["buyeraddr"]),
 			RuleBlock:      e.RuleBlock,
 			IsFile:         isextFile,
+			Dataging: qu.IntAll(doc["dataging"]),
 		}
 		// Use && so that a nil Jsondata pointer is never dereferenced (the former || dereferenced it exactly when it was nil).
 		if jf.Jsondata != nil && (*jf.Jsondata) != nil && (*jf.Jsondata)["jsoncontent"] != nil {
 			delete((*jf.Jsondata), "jsoncontent")
@@ -1927,6 +1929,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//			qu.Debug(k, "---", v)
 		//		}
 		//tmp["extract_content"] = j.Content
+		tmp["dataging"] = j.Dataging
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -2081,7 +2084,7 @@ func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
 
 // delFiled reports whether k is one of the redundant field names listed below (original note: strip redundant fields).
 func delFiled(k string) bool {
-	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
+	return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
 func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {

+ 1 - 0
src/jy/extract/extractudp.go

@@ -130,6 +130,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		ext.ResultSave(true)
 		ext.BidSave(true)
 		ext.IsRun = true
+		ext.InitFile()
 	} else {
 		ext.BidTotal = 0
 	}

+ 5 - 2
src/jy/pretreated/analytable.go

@@ -1905,6 +1905,9 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 			if val == nil || val == "" || key == "采购项目预算金额" {
 				return
 			}
+			if key == "单位名称" && len(near.TR.TDs) > 1 {
+				key = near.TR.TDs[0].Val
+			}
 			table.SortKV.AddKey(key, val)
 			//if table.SortKV.Map[key] != nil {
 			pos := table.SortKV.Index[key]
@@ -2552,7 +2555,7 @@ var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
 var clearpkg = regexp.MustCompile("(标示|标识)")
 
 func RepairCon(con string) string {
-	con = clearpkg.ReplaceAllString(con,"")
+	con = clearpkg.ReplaceAllString(con, "")
 	res := saveThead.FindAllStringSubmatch(con, 1)
 	th := ""
 	if len(res) == 1 && len(res[0]) == 2 {
@@ -3610,7 +3613,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		//qutil.Debug(key, "---------------------------", val)
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*

+ 1 - 0
src/jy/util/article.go

@@ -40,6 +40,7 @@ type Job struct {
 	SimAreaScore      map[string]float64                //简称province得分
 	SimCityScore      map[string]float64                //简称city得分
 	SimDistrictScore  map[string]float64                //简称district得分
+	Dataging int
 }
 
 type ExtField struct {

+ 1 - 0
src/main_blocktest.go

@@ -119,6 +119,7 @@ func com(doc map[string]interface{}) {
 		Result:    map[string][]*ju.ExtField{},
 		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
 		RuleBlock: e.RuleBlock,
+		Dataging: qu.IntAll(doc["dataging"]),
 	}
 	e.TaskInfo.ProcessPool <- true
 	pretreated.AnalyStart(j, false, "")

+ 21 - 9
standardata/src/standarwinner.go

@@ -292,12 +292,14 @@ func comHisMegerNewData(name, datatype string, ps []map[string]interface{}) map[
 	data := map[string]interface{}{
 		"history_name":    "",
 		"credit_no":       "",
+		"company_email":   "",
 		"area_code":       qu.ObjToString(tmp["area_code"]),
 		"province":        qu.ObjToString(tmp["province"]),
 		"city":            "",
 		"district":        "",
 		"company_type":    qu.ObjToString(tmp["company_type"]),
 		"legal_person":    qu.ObjToString(tmp["legal_person"]),
+		"company_phone":   "",
 		"company_address": qu.ObjToString(tmp["company_address"]),
 		"business_scope":  qu.ObjToString(tmp["business_scope"]),
 		"wechat_accounts": []interface{}{},
@@ -324,21 +326,31 @@ func comHisMegerNewData(name, datatype string, ps []map[string]interface{}) map[
 		}
 	}
 
-	//网址
+	// Take the website URLs, e-mail address and phone number from the annual reports
 	annual_reports := tmp["annual_reports"]
 	if annual_reports != nil {
 		report_websitesArr := []string{}
-		if anreports, ok := annual_reports.([]interface{}); ok {
-			for _, report_websites := range anreports {
-				if websites, ok := report_websites.([]interface{}); ok {
-					for _, website := range websites {
-						if rv, ok := website.(map[string]interface{}); ok {
-							web := qu.ObjToString(rv["website_url"])
-							if web != "" {
-								report_websitesArr = append(report_websitesArr, web)
+		if anreports, ok := annual_reports.(primitive.A); ok {
+			for _, anreportmp := range anreports {
+				if anreport, ok := anreportmp.(map[string]interface{}); ok {
+					if websites, ok := anreport["report_websites"].(primitive.A); ok {
+						for _, website := range websites {
+							if rv, ok := website.(map[string]interface{}); ok {
+								web := qu.ObjToString(rv["website_url"])
+								if web != "" {
+									report_websitesArr = append(report_websitesArr, web)
+								}
 							}
 						}
 					}
+					company_email := qu.ObjToString(anreport["company_email"])
+					if company_email != "" {
+						data["company_email"] = company_email
+					}
+					company_phone := qu.ObjToString(anreport["company_phone"])
+					if company_phone != "" {
+						data["company_phone"] = company_phone
+					}
 				}
 			}
 		}

+ 16 - 2
udpcreateindex/src/biddingall.go

@@ -66,6 +66,10 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 	var compare bson.M
 	bnil := false
 	for tmp := make(map[string]interface{}); query.Next(tmp); n++ {
+		// if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+		// 	tmp = make(map[string]interface{})
+		// 	continue
+		// }
 		update := map[string]interface{}{}
 		del := map[string]interface{}{} //记录extract没有值而bidding中有值的字段
 		//对比方法----------------
@@ -122,6 +126,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 		//下面可以多线程跑的--->
 		//处理分类
 		mpool <- true
+		_id := tmp["_id"]
 		go func(tmp, update, compare, del map[string]interface{}, bnil bool) {
 			defer func() {
 				<-mpool
@@ -187,7 +192,16 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 			if len(ps) > ESLEN {
 				tmp["projectscope"] = string(([]rune(ps))[:4000])
 			}
-
+			//对标的物为空处理
+			if filetext := getFileText(tmp); len(filetext) > 0 {
+				tmp["filetext"] = filetext
+			}
+			if purchasing, ok := tmp["purchasing"].(string); ok && purchasing == "" {
+				delete(tmp, "purchasing")
+			}
+			if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok && len(purchasinglist) == 0 {
+				delete(tmp, "purchasinglist")
+			}
 			//预算和中标金额
 			//			if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
 			//				tmp["budget"] = nil
@@ -272,7 +286,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 			UpdatesLock.Unlock()
 		}(tmp, update, compare, del, bnil)
 		if n%1000 == 0 {
-			log.Println("current:", n, tmp["_id"])
+			log.Println("current:", n, _id)
 		}
 		tmp = make(map[string]interface{})
 	}

+ 4 - 0
udpcreateindex/src/biddingindex.go

@@ -95,6 +95,10 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 	log.Println("开始迭代..")
 	for n, tmp := range infos {
 		n1++
+		if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+			tmp = make(map[string]interface{})
+			continue
+		}
 		update := map[string]interface{}{} //要更新的mongo数据
 		//对比方法----------------
 		tid := qutil.BsonIdToSId(tmp["_id"])

+ 4 - 4
udpcreateindex/src/config.json

@@ -30,15 +30,15 @@
     },
     "bidding": {
         "db": "mxs",
-        "collect": "test1",
+        "collect": "test",
         "index": "bidding_v2",
         "type": "bidding",
         "extractdb": "mxs",
-        "extractcollect": "test2",
+        "extractcollect": "extract",
         "indexfields":[ 
-        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo"
+        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo","filetext","purchasing","purchasinglist"
         ],
-        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,area,city,district,s_winner",
+        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,area,city,district,s_winner,toptype,subtype,subscopeclass,s_subscopeclass",
         "projectinfo": "approvecode,approvecontent,approvestatus,approvetime,industry",
         "multiIndex": ""
     },