fengweiqiang vor 5 Jahren
Ursprung
Commit
e7a5dd0f40

+ 45 - 7
src/jy/extract/extract.go

@@ -27,12 +27,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask //任务列表
+	ClearTaskList map[string]*ClearTask   //清理任务列表
+	saveLimit     = 100                   //抽取日志批量保存
+	PageSize      = 5000                  //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -2087,7 +2087,45 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	for _, val := range result {
 		ju.Sort(val)
 	}
-	j.Result = JsonDataMergeProcessing(j, e)
+	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
+		marshalbt, _ := json.Marshal(j.Jsondata)
+		tmpjddata := make(map[string]interface{})
+		json.Unmarshal(marshalbt,&tmpjddata)
+		for _, jdkey := range ju.JsonData {
+			if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
+				for tmpk, tmpv := range j.Result[jdkey][:5] {
+					if jdkey == "budget" || jdkey == "bidamount" {
+						lockclear.Lock()
+						cfn := e.ClearFn[jdkey]
+						lockclear.Unlock()
+						if len(cfn) == 0 {
+							continue
+						}
+						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
+						if tmpv.Value == newNum[0] {
+							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
+							j.Result[jdkey] = append(j.Result[jdkey], extField)
+							ju.Sort(j.Result[jdkey])
+							delete((*j.Jsondata), jdkey)
+							break
+						}
+					}else {
+						if (*j.Jsondata)[jdkey] == tmpv.Value{
+							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
+							j.Result[jdkey] = append(j.Result[jdkey], extField)
+							ju.Sort(j.Result[jdkey])
+							delete((*j.Jsondata), jdkey)
+							break
+						}
+					}
+				}
+			}
+		}
+		if len(*j.Jsondata)>0{
+			j.Result = JsonDataMergeProcessing(j, e)
+		}
+		j.Jsondata = &tmpjddata
+	}
 	return doc, result, _id
 }
 

+ 2 - 2
src/jy/extract/score_jsondata.go

@@ -12,10 +12,10 @@ import (
 )
 
 func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.ExtField {
-	if len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0 {
+	jdextweight := util2.IntAll((*j.Jsondata)["extweight"])
+	if jdextweight==0{
 		return j.Result
 	}
-	jdextweight := util2.IntAll((*j.Jsondata)["extweight"])
 	tmps := make(map[string][]*util.ExtField)
 	for _, v := range util.JsonData {
 		tmp := make([]*util.ExtField, 0)

+ 5 - 2
src/jy/pretreated/analytable.go

@@ -51,8 +51,8 @@ var (
 	FindVal_1  = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)")
 	FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$")
 	//判断分包前排除
-	excludeKey  = regexp.MustCompile("(分包个数|标段代码|涉及包号|分包数量|包件号?|项目标号|规格|型号|招标范围|业绩|废标|标段选择要求)|(^编号$)|([^包段标]编号)") //编号|划分
-	excludeKey2 = regexp.MustCompile("包/[0-9]{0,4}[|||张]")
+	excludeKey  = regexp.MustCompile("(标识|数量|分包个数|标段代码|涉及包号|分包数量|包件号?|项目标号|规格|型号|招标范围|业绩|废标|标段选择要求)|(^编号$)|([^包段标]编号)") //编号|划分
+	excludeKey2 = regexp.MustCompile("包/[0-9]{0,4}[箱纸张]")
 	excludeKey3 = regexp.MustCompile("(分包个数|每包[0-9]*元|标线|国标|享受一包服务)")
 	//-------------
 
@@ -2549,7 +2549,10 @@ var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
 //需要保留thead
 var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
 
+var clearpkg = regexp.MustCompile("(标示|标识)")
+
 func RepairCon(con string) string {
+	con = clearpkg.ReplaceAllString(con,"")
 	res := saveThead.FindAllStringSubmatch(con, 1)
 	th := ""
 	if len(res) == 1 && len(res[0]) == 2 {

+ 2 - 2
src/jy/pretreated/division.go

@@ -56,9 +56,9 @@ var (
 	regEndWrap         = regexp.MustCompile("[\r\n]$")
 	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
 	regStrWrap         = regexp.MustCompile("分包名称[::]")
-	regBZJWarap        = regexp.MustCompile("(保证金.*|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]标室|[\\d一二三四五六七八九十]包密封|(一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
+	regBZJWarap        = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
 	regFJWarap         = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
-	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张/包|纸[\\d]*包)")
+	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张/包|纸[\\d]*包|/*[\\d]+包|相机包)")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
 	regFilter          = regexp.MustCompile("等$")