Pārlūkot izejas kodu

多线程预处理,preinfo

zhengkun 1 gadu atpakaļ
vecāks
revīzija
4f5c91659f

+ 3 - 3
src/config.json

@@ -2,9 +2,9 @@
     "port": "9090",
     "mgodb": "127.0.0.1:27017",
     "dbsize": 3,
-    "dbname": "extract_local",
+    "dbname": "extract_pre",
     "site_addr": "127.0.0.1:27017",
-    "site_dbname": "extract_local",
+    "site_dbname": "extract_pre",
     "qyxy_addr": "127.0.0.1:27017",
     "qyxy_dbname": "extract_service",
     "qyxy_username": "",
@@ -22,7 +22,7 @@
     "pricenumber":true,
     "inscribe": true,
     "udpport": "6601",
-    "udptaskid": "60b493c2e138234cb4adb640",
+    "udptaskid": "655462aaff8a32f626742465",
     "nextNode": [],
     "esconfig": {},
     "istest": true,

+ 18 - 40
src/jy/extract/extract.go

@@ -74,7 +74,6 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 	n, _ := strconv.Atoi(num)
 	id := IdTrans(startId)
 	if id.Valid() {
-		//query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
 		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
 		for _, v := range *list {
@@ -83,12 +82,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 			}
 			var j, jf *ju.Job
 			var isSite bool
-			if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
-				v["isextFile"] = true
-				j, jf, isSite = ext.PreInfo(v)
-			} else { //无附件
-				j, _, isSite = ext.PreInfo(v)
-			}
+			j, _, isSite = ext.PreInfo(v)
 			go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
 			ext.TaskInfo.ProcessPool <- true
 		}
@@ -231,28 +225,14 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	if doc["isextFile"] != nil {
 		isextFile = doc["isextFile"].(bool)
 	}
+	isextFile = false
 	detail := ""
 	summary := qu.ObjToString(doc["summary"])
 	detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
-	//d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
-	//d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
-	////log.Debug("正文长度:", len(d1), "~", "源码长度:", len(d2))
-	//if len(d1) > len(d2) || d2 == "" {
-	//	detail = d1
-	//	if SelectSourceStructText(d1, d2) {
-	//		detail = d2
-	//	}
-	//} else {
-	//	detail = d2
-	//	if SelectDetailSourceText(d1, d2) {
-	//		detail = d1
-	//	}
-	//}
 	//调整采用detail抽取
-	if utf8.RuneCountInString(detail) >= 100000 {
-		detail = detail[:100000]
+	if utf8.RuneCountInString(detail) >= 50000 {
+		detail = detail[:50000]
 	}
-	
 	doc["detail"] = detail
 	isClearnMoney := !clearMoneyReg.MatchString(detail)
 	if isClearnMoney {
@@ -303,19 +283,17 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		Content:        qu.ObjToString(doc["detail"]),
 		SpiderCode:     qu.ObjToString(doc["spidercode"]),
 		Site:           qu.ObjToString(doc["site"]),
-		//Domain:     qu.ObjToString(doc["domain"]),
-		//Href:       qu.ObjToString(doc["href"]),
-		Title:         qu.ObjToString(doc["title"]),
-		Data:          &doc,
-		City:          qu.ObjToString(doc["city"]),
-		Province:      qu.ObjToString(doc["area"]),
-		Jsondata:      toMap,
-		Result:        map[string][]*ju.ExtField{},
-		BuyerAddr:     qu.ObjToString(doc["buyeraddr"]),
-		RuleBlock:     e.RuleBlock,
-		Dataging:      qu.IntAll(doc["dataging"]),
-		IsClearnMoney: isClearnMoneystr,
-		IsUnRulesTab:  false,
+		Title:          qu.ObjToString(doc["title"]),
+		Data:           &doc,
+		City:           qu.ObjToString(doc["city"]),
+		Province:       qu.ObjToString(doc["area"]),
+		Jsondata:       toMap,
+		Result:         map[string][]*ju.ExtField{},
+		BuyerAddr:      qu.ObjToString(doc["buyeraddr"]),
+		RuleBlock:      e.RuleBlock,
+		Dataging:       qu.IntAll(doc["dataging"]),
+		IsClearnMoney:  isClearnMoneystr,
+		IsUnRulesTab:   false,
 	}
 	if isextFile {
 		jf = &ju.Job{
@@ -362,10 +340,10 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 			}
 		}
 	}
-	qu.Try(func() {
-		pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
+	qu.Try(func() { //不解析表格
+		pretreated.AnalyStartNoTable(j, isSite, codeSite) //job.Block分块
 		if isextFile && strings.TrimSpace(jf.Content) != "" {
-			pretreated.AnalyStart(jf, isSite, codeSite)
+			pretreated.AnalyStartNoTable(jf, isSite, codeSite)
 		}
 	}, func(err interface{}) {
 		log.Debug("pretreated.AnalyStart", err, j.SourceMid)

+ 5 - 13
src/jy/extract/extractsave.go

@@ -11,7 +11,6 @@ import (
 	"sort"
 	"strings"
 	"time"
-	"unicode/utf8"
 )
 
 // 分析抽取结果并保存
@@ -369,20 +368,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 
 		//添加字段来源
-		tmp["field_source"] = fieldSource
+		//tmp["field_source"] = fieldSource
 		//是否为不规则表格字段
-		if j.IsUnRulesTab {
-			tmp["is_UnRules_Tab"] = j.IsUnRulesTab
-		}
+		//if j.IsUnRulesTab {
+		//	tmp["is_UnRules_Tab"] = j.IsUnRulesTab
+		//}
 		//补充源表数据的数据
 		for k, v := range *doc {
-			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
-				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]
-			}
-			//去重冗余字段
-			if delFiled(k) {
-				continue
-			}
 			if tmp[k] == nil && BiddingFields[k] != nil {
 				tmp[k] = v
 			}
@@ -407,7 +399,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		e.getQualifications(&tmp, *j.Data)
 		//城市抽取
 		if e.IsExtractCity {
-			e.ExtractRegionInfo(j, jf, &tmp, true)
+			e.ExtractRegionInfo(j, jf, &tmp, false)
 			e.ExtractRegionClean(&tmp)
 		}
 		//品牌抽取

+ 98 - 13
src/jy/extract/extractudp.go

@@ -81,18 +81,18 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 					log.Debug("err", "sid=", sid, ",eid=", eid)
 				} else {
 					//新版本控制抽取
-					udpinfo, _ := rep["stype"].(string)
-					if udpinfo == "" {
-						udpinfo = "udpok"
-					}
-					IsExtStop = false
-					ExtractByUdp(sid, eid, ra)
-					if !IsExtStop {
-						log.Debug("抽取完成udp通知抽取id段-控制台", udpinfo, sid, "~", eid)
-						Udpclient.WriteUdp([]byte(udpinfo), mu.OP_NOOP, ra)
-					} else {
-						log.Debug("抽取强制中断udp不通知-控制台", udpinfo, sid, "~", eid)
-					}
+					//udpinfo, _ := rep["stype"].(string)
+					//if udpinfo == "" {
+					//	udpinfo = "udpok"
+					//}
+					//IsExtStop = false
+					//ExtractByUdp(sid, eid, ra)
+					//if !IsExtStop {
+					//	log.Debug("抽取完成udp通知抽取id段-控制台", udpinfo, sid, "~", eid)
+					//	Udpclient.WriteUdp([]byte(udpinfo), mu.OP_NOOP, ra)
+					//} else {
+					//	log.Debug("抽取强制中断udp不通知-控制台", udpinfo, sid, "~", eid)
+					//}
 
 					//发布数据~重采数据~测试流程
 					//key := sid + "-" + eid + "-" + qu.ObjToString(rep["stype"])
@@ -114,6 +114,14 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 					//	}
 					//}
 					//log.Debug("udp通知抽取完成,eid=", eid)
+
+					//预处理模块
+					key := sid + "-" + eid + "-" + qu.ObjToString(rep["stype"])
+					go Udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
+					//保存段落合并段落···
+
+					log.Debug("udp通知抽取id段", sid, " ", eid)
+					ExtractByUdpPre(sid, eid, ra)
 				}
 			}
 		}
@@ -300,7 +308,6 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				wg.Add(1)
 				go func(wg *sync.WaitGroup, j, jf *ju.Job) {
 					defer wg.Done()
-					//log.Debug(index,j.SourceMid,)
 					ext.ExtractProcess(j, jf, isSite)
 				}(&wg, j, jf)
 				index++
@@ -319,5 +326,83 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 	}
 }
 
+func ExtractByUdpPre(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
+	defer qu.Catch()
+	if ext == nil {
+		ext = nil
+		ext = &ExtractTask{}
+		ext.Id = qu.ObjToString(ju.Config["udptaskid"])
+		ext.InitTaskInfo()
+		ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+		ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
+		ext.InitSite()
+		ext.InitRulePres()
+		ext.InitRuleBacks(false)
+		ext.InitRuleBacks(true)
+		ext.InitRuleCore(false)
+		ext.InitRuleCore(true)
+		ext.InitBlockRule()
+		ext.InitPkgCore()
+		ext.InitTag(false)
+		ext.InitTag(true)
+		ext.InitClearFn(false)
+		ext.InitClearFn(true)
+		ext.Lock()
+		if ext.IsExtractCity { //版本上控制是否开始城市抽取
+			ext.InitCityInfo()
+			ext.InitAreaCode()
+			ext.InitPostCode()
+		}
+		ext.Unlock()
+		//质量审核
+		ext.InitAuditFields()
+		ext.InitAuditRule()
+		ext.InitAuditClass()
+		ext.InitAuditRecogField()
+		//品牌抽取是否开启
+		ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+		ext.ResultSave(true)
+		ext.BidSave(true)
+		ext.InitFile()
+		ext.IsRun = true
+		ext.BidTotal = 0
+	} else {
+		ext.BidTotal = 0
+	}
+	query := bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
+	count1 := ext.TaskInfo.FDB.Count("bidding_nomal", query)
+	count2 := ext.TaskInfo.FDB.Count("bidding_file", query)
+	log.Debug("待抽取数量:", count1+count2)
+	list1, _ := ext.TaskInfo.FDB.Find("bidding_nomal", query, nil, Fields, false, -1, -1)
+	list2, _ := ext.TaskInfo.FDB.Find("bidding_file", query, nil, Fields, false, -1, -1)
+	new_list := append(*list1, *list2...)
+	now_time := time.Now().Unix()
+	total := 0
+	wg_mgo := &sync.WaitGroup{}
+	for _, v := range new_list {
+		if total%1000 == 0 {
+			log.Debug("cur index :", total, v["_id"])
+		}
+		total++
+		if spidercode[qu.ObjToString(v["spidercode"])] { //开标记录
+			continue
+		}
+		ext.TaskInfo.ProcessPool <- true
+		wg_mgo.Add(1)
+		go func(v map[string]interface{}) {
+			defer func() {
+				<-ext.TaskInfo.ProcessPool
+				wg_mgo.Done()
+			}()
+			var j, jf *ju.Job
+			var isSite bool
+			j, _, isSite = ext.PreInfo(v)
+			ext.ExtractProcess(j, jf, isSite)
+		}(v)
+	}
+	wg_mgo.Wait()
+	log.Debug("抽取完成:", total, ",耗时:", time.Now().Unix()-now_time)
+}
+
 // 中标预测信息抽取,ossid为附件识别后的id
 var exF *ExtractTask

+ 8 - 10
src/jy/extract/extraxtmethod.go

@@ -40,14 +40,12 @@ var (
 	BiddingFields                                             = map[string]interface{}{
 		"_id":         1,
 		"title":       1,
-		"site":        1,
-		"spidercode":  1,
 		"toptype":     1,
 		"subtype":     1,
 		"comeintime":  1,
 		"publishtime": 1,
 		"href":        1,
-		"dataging":    1,
+		"detail":      1,
 	}
 	Fields2     = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 	NiJianField = []string{
@@ -305,13 +303,13 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 		}
 	}
 	//实体服务识别
-	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
-		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
-		if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
-			(*tmp)["buyer"] = new_buyer
-			(*tmp)["inscribe_buyer"] = "实体识别服务"
-		}
-	}
+	//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
+	//	!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
+	//	if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
+	//		(*tmp)["buyer"] = new_buyer
+	//		(*tmp)["inscribe_buyer"] = "实体识别服务"
+	//	}
+	//}
 	//拟建不能存buyer
 	if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
 		qu.ObjToString((*tmp)["subtype"]) == "拟建" {

+ 126 - 0
src/jy/pretreated/analymethod.go

@@ -484,6 +484,132 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 	}
 }
 
+// AnalyStartNoTable rewrites job.Content through a fixed chain of regex replacements, splits it with DivideBlock, and fills job.ContentClean, job.Content, job.BlockPackage, job.Winnerorder and job.Block; no table-parsing call appears in this body.
+func AnalyStartNoTable(job *u.Job, isSite bool, codeSite string) {
+	con := job.Content
+	// the full text needs its tables repaired
+	con = RepairCon(con)
+	// format the body text
+	//con = preConReg1.ReplaceAllString(con, "${1}${2}")
+	hisReg1_str := hisReg1.FindString(con)
+	if hisReg1_str != "" && !strings.Contains(hisReg1_str, "中标候选人得分") { // leave the text alone when the first match contains "中标候选人得分"
+		con = hisReg1.ReplaceAllString(con, "${4}")
+	}
+	hisReg2_str := hisReg2.FindString(con)
+	if hisReg2_str != "" && !strings.Contains(hisReg2_str, "中标候选人得分") { // same guard as above, for hisReg2
+		con = hisReg2.ReplaceAllString(con, "${6}")
+	}
+	con = formattext.ReplaceAllString(con, "${1}:${2}")
+	con = formattext2.ReplaceAllString(con, "${1}")
+	con = formattext3.ReplaceAllString(con, "")
+	con = formattext4.ReplaceAllString(con, "\n${1}:${2}\n")
+	// special formats that interfere with package/candidate extraction and candidate recognition - rewrite them
+	con = formattext5.ReplaceAllString(con, "中标金额:${2}\n")
+	con = formattext6.ReplaceAllString(con, "$1$2")
+	con = formattext7.ReplaceAllString(con, "$1$2")
+	// rewrite special structures
+	con = formattext10.ReplaceAllString(con, "\n分包$3\n中标单位:$5 中标金额:$6\n")
+	con = formattext11.ReplaceAllString(con, "${1}\n${2}\n预算金额:${4}\n${5}\n预算金额:${7}\n${8}\n")
+	con = formattext12.ReplaceAllString(con, "\n${1}:${3}万元\n")
+	con = formattext13.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${3}\n"+"包二\n中标单位:${2}\n中标金额:${4}\n")
+	con = formattext14.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${2}\n"+"包二\n中标单位:${3}\n中标金额:${4}\n")
+	// multiple suppliers: rebuild the text structure
+	if m_b, m_c := dealWithMultiSuppliersText(con); m_b {
+		con = m_c
+	}
+	// project-performance descriptions interfere with extraction
+	con = formattext20.ReplaceAllString(con, "\n")
+	con = formattext21.ReplaceAllString(con, "")
+	// spider-specific special structure: compute and extract
+	if codeSite == "a_zgzfcgw_zfcghtgg_new" {
+		str := formattext50.FindString(con)
+		if str != "" {
+			new_str := dealWithSpecStructToSpiderCode(str)
+			if new_str != "" {
+				con = new_str + con // the converted text is prepended to the content
+			}
+		}
+	}
+	con = formatText(con, "all")
+	job.ContentClean = HtmlToText(job.Content) // built from the original Content, before it is overwritten on the next line
+	job.Content = con
+	job.BlockPackage = map[string]*u.BlockPackage{}
+	// split into blocks + process each block's key/values
+	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite)
+	if len(blockArrays) > 0 { // blocks were found
+		// find packages inside the blocks (text)
+		if !job.IsFile {
+			job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) // find packages inside the blocks
+		}
+		for _, bl := range blockArrays {
+			FindProjectCode(bl.Text, job) // match the project code
+			// filter performance-related content out of the block text
+			bl.Text = tableClearTextReg.ReplaceAllString(bl.Text, "")
+			// added: when no winning candidates were parsed from a table, parse them from the body text - one full-text match
+			if (job.Winnerorder == nil || len(job.Winnerorder) == 0) || len(job.Winnerorder) > 8 {
+				// when tables were not split out: plain-text matching
+				tmp_text := HtmlToText(bl.Text)
+				bl.Winnerorder = winnerOrderEntity.Find(tmp_text, true, 1, isSite, codeSite)
+				if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
+					job.Winnerorder = bl.Winnerorder
+				}
+			}
+			// no packages + attachment: handle as formatted text
+			if (job.BlockPackage == nil || len(job.BlockPackage) == 0) && job.IsFile {
+				tmp_text := HtmlToText(bl.Text)
+				job.BlockPackage = FindPackageFromText(job.Title, tmp_text, isSite, codeSite)
+			}
+			job.Block = append(job.Block, bl)
+		}
+	} else { // no blocks found: create a single block
+		bl := &u.Block{}
+		newCon := con
+		job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
+		bl.Text = HtmlToText(con)
+		FindProjectCode(bl.Text, job) // match the project code ~~ then clean out invalid text
+		if blTextReg.MatchString(bl.Text) && !unblTextReg.MatchString(bl.Text) {
+			if strings.Index(bl.Text, "业绩") > 1 {
+				// if there is buyer info, move it to the front
+				before_arr := []string{}
+				if beforeTextReg.MatchString(bl.Text) {
+					before_arr = beforeTextReg.FindAllString(bl.Text, -1)
+				}
+				bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")] // drop everything from the first "业绩" onward
+				if len(before_arr) > 0 {
+					bl.Text = strings.Join(before_arr, "\n") + bl.Text
+				}
+			}
+		}
+		// special-case handling: structure conversion via formattext100
+		if formattext100.MatchString(bl.Text) {
+			new_str := formattext100.FindString(bl.Text)
+			new_str = formattext100.ReplaceAllString(new_str, "$1")
+			bl.Text = fmt.Sprintf("中标金额:%s万元\n", new_str) + bl.Text
+		}
+		// call the kv parsing library - process detail
+		bl.Text = formatText(bl.Text, "all")
+		// handle colon-separated key/values
+		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
+		// handle space-separated key/values
+		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
+		// added: winning candidates were not parsed from a table, so parse them from the body text
+		if job.Winnerorder == nil || len(job.Winnerorder) == 0 || len(job.Winnerorder) > 8 {
+			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
+			if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
+				job.Winnerorder = bl.Winnerorder
+			}
+		} else { // a table yielded candidate company names but no amounts...
+			if onlyExistsWinEntName(job.Winnerorder) {
+				new_winorder := winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
+				if thanExistsNewWinOrder(job.Winnerorder, new_winorder) {
+					job.Winnerorder = new_winorder
+				}
+			}
+		}
+		job.Block = append(job.Block, bl)
+	}
+}
+
 // 是否有效分包
 func isUsefulPackage(pkg map[string]*u.BlockPackage) bool {
 	if pkg == nil || len(pkg) == 0 {