소스 검색

1、城市抽取修改
2、实体识别方法备份
3、减分逻辑部分调整
4、结果追踪调整方便测试

zhengkun 1 년 전
부모
커밋
e40c3495c7

+ 49 - 11
extcity/src/main.go

@@ -5,7 +5,8 @@ import (
 	"encoding/json"
 	"ext"
 	log "github.com/donnie4w/go-logger/logger"
-	"io/ioutil"
+	"gopkg.in/mgo.v2/bson"
+	"io"
 	"net/http"
 	qu "qfw/util"
 	"service"
@@ -26,8 +27,7 @@ func init() {
 	go http.ListenAndServe(ul.Port, nil)
 }
 func main() {
-	time.Sleep(2 * time.Second)
-	test()
+	testRegionInfo()
 	lock := make(chan bool)
 	<-lock
 	/*
@@ -36,13 +36,51 @@ func main() {
 	*/
 }
 
-func test() {
-	client := &http.Client{Timeout: 30 * time.Second}
-	data := map[string]interface{}{"detail": "我是正文金水区"}
-	jsonStr, _ := json.Marshal(data)
-	resp, _ := client.Post("http://127.0.0.1:9996/service/region", "application/json", bytes.NewBuffer(jsonStr))
-	res_info, _ := ioutil.ReadAll(resp.Body)
+func testRegionInfo() {
+	return
+	dataArr, _ := ul.TestMgo.Find("site", bson.M{"area": "全国"}, nil, map[string]interface{}{"site": 1})
+	isok := 0
+	for k, v := range dataArr {
+		if k%100 == 0 {
+			log.Debug("cur index ", k, "~", isok)
+		}
+		tmpid := ul.BsonTOStringId(v["_id"])
+		site := qu.ObjToString(v["site"])
+		info := test(site)
+		area := qu.ObjToString(info["area"])
+		if area != "" && area != "全国" {
+			isok++
+			//ul.SiteMgo.Save("123123", map[string]interface{}{
+			//	"site": site,
+			//	"area": area,
+			//})
+			log.Debug(tmpid, "~", area)
+			//ul.TestMgo.UpdateById("site", tmpid, map[string]interface{}{
+			//	"$set": map[string]interface{}{
+			//		"area": area,
+			//	},
+			//})
+		}
+	}
+	log.Debug("is over ", isok)
+}
+
+func test(detail string) map[string]interface{} {
 	info := map[string]interface{}{}
-	json.Unmarshal(res_info, &info)
-	log.Debug("测试~", info)
+	client := &http.Client{Timeout: 2 * time.Second}
+	data := map[string]interface{}{"detail": detail}
+	jsonStr, _ := json.Marshal(data)
+	resp, err := client.Post("http://127.0.0.1:9996/service/region", "application/json", bytes.NewBuffer(jsonStr))
+	if err != nil {
+		return info
+	}
+	res, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return info
+	}
+	err = json.Unmarshal(res, &info)
+	if err != nil {
+		return info
+	}
+	return info
 }

+ 11 - 0
extcity/src/util/init.go

@@ -11,6 +11,7 @@ var (
 	ExtMgo, SiteMgo, QyxyMgo *MongodbSim //抽取初始化-相关
 	SF                       map[string]interface{}
 	Port                     string
+	TestMgo                  *MongodbSim
 )
 
 func InitExt() {
@@ -55,6 +56,16 @@ func initMgo() {
 		PassWord:    qyxyconf["password"].(string),
 	}
 	QyxyMgo.InitPool()
+
+	//临时Mgo
+	TestMgo = &MongodbSim{
+		MongodbAddr: "127.0.0.1:12003",
+		DbName:      "editor",
+		Size:        10,
+		UserName:    "",
+		PassWord:    "",
+	}
+	TestMgo.InitPoolDirect()
 }
 
 func convertInterface(t interface{}) []string {

+ 90 - 16
extcity/src/util/mgo.go

@@ -142,22 +142,51 @@ func (m *MongodbSim) DestoryMongoConn(ms *MgoSess) {
 	ms = nil
 }
 
-func (m *MongodbSim) InitPool() {
+func (m *MongodbSim) InitPoolDirect() {
 	opts := options.Client()
 	opts.SetConnectTimeout(3 * time.Second)
 	opts.ApplyURI("mongodb://" + m.MongodbAddr)
 	opts.SetMaxPoolSize(uint64(m.Size))
+	opts.SetDirect(true)
 	m.pool = make(chan bool, m.Size)
 
-	if m.UserName !="" && m.PassWord !="" {
+	if m.UserName != "" && m.PassWord != "" {
 		cre := options.Credential{
-			Username:m.UserName,
-			Password:m.PassWord,
+			Username:   m.UserName,
+			Password:   m.PassWord,
+			AuthSource: "admin",
 		}
 		opts.SetAuth(cre)
 	}
 
+	opts.SetMaxConnIdleTime(2 * time.Hour)
+	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
+	m.ShortCtx, _ = context.WithTimeout(context.Background(), 1*time.Minute)
+	client, err := mongo.Connect(m.ShortCtx, opts)
+	if err != nil {
+		log.Println("mgo init error:", err.Error())
+	} else {
+		m.C = client
+		log.Println("init success")
+	}
+}
 
+func (m *MongodbSim) InitPool() {
+	opts := options.Client()
+	opts.SetConnectTimeout(3 * time.Second)
+	opts.ApplyURI("mongodb://" + m.MongodbAddr)
+	opts.SetMaxPoolSize(uint64(m.Size))
+	//opts.SetDirect(true)
+	m.pool = make(chan bool, m.Size)
+
+	if m.UserName != "" && m.PassWord != "" {
+		cre := options.Credential{
+			Username:   m.UserName,
+			Password:   m.PassWord,
+			AuthSource: "admin",
+		}
+		opts.SetAuth(cre)
+	}
 
 	opts.SetMaxConnIdleTime(2 * time.Hour)
 	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
@@ -178,7 +207,21 @@ func (m *MongodbSim) Close() {
 	<-m.pool
 }
 
-//批量插入
+// 新建表并生成索引
+func (m *MongodbSim) CreateIndex(c string, models []mongo.IndexModel) bool {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	names, err := coll.Indexes().CreateMany(m.Ctx, models)
+	if err == nil && len(names) > 0 {
+		return true
+	} else {
+		log.Println("CreateIndex Error:", err)
+		return false
+	}
+}
+
+// 批量插入
 func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[int64]interface{}, bool) {
 	m.Open()
 	defer m.Close()
@@ -205,7 +248,7 @@ func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[
 	return r.UpsertedIDs, true
 }
 
-//批量插入
+// 批量插入
 func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
 	m.Open()
 	defer m.Close()
@@ -224,7 +267,7 @@ func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
 	return true
 }
 
-//保存
+// 保存
 func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
 	m.Open()
 	defer m.Close()
@@ -236,7 +279,7 @@ func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
 	return r.InsertedID
 }
 
-//更新by Id
+// 更新by Id
 func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
 	m.Open()
 	defer m.Close()
@@ -248,8 +291,7 @@ func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
 	return true
 }
 
-//更新by Id
-func (m *MongodbSim) UpdateByStringId(c, id string, doc map[string]interface{}) bool {
+func (m *MongodbSim) UpdateStrId(c, id string, doc map[string]interface{}) bool {
 	m.Open()
 	defer m.Close()
 	coll := m.C.Database(m.DbName).Collection(c)
@@ -260,7 +302,18 @@ func (m *MongodbSim) UpdateByStringId(c, id string, doc map[string]interface{})
 	return true
 }
 
-//删除by id
+func (m *MongodbSim) UpdateQueryData(c string, query map[string]interface{}, doc map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	_, err := coll.UpdateOne(m.Ctx, query, doc)
+	if err != nil {
+		return false
+	}
+	return true
+}
+
+// 删除by id
 func (m *MongodbSim) DeleteById(c, id string) int64 {
 	m.Open()
 	defer m.Close()
@@ -272,7 +325,7 @@ func (m *MongodbSim) DeleteById(c, id string) int64 {
 	return r.DeletedCount
 }
 
-//通过条件删除
+// 通过条件删除
 func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
 	m.Open()
 	defer m.Close()
@@ -284,7 +337,7 @@ func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
 	return r.DeletedCount
 }
 
-//findbyid
+// findbyid
 func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
 	m.Open()
 	defer m.Close()
@@ -295,7 +348,7 @@ func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
 	return v
 }
 
-//findone
+// findone
 func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]interface{} {
 	m.Open()
 	defer m.Close()
@@ -306,7 +359,7 @@ func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]
 	return v
 }
 
-//find
+// find
 func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields interface{}) ([]map[string]interface{}, error) {
 	m.Open()
 	defer m.Close()
@@ -317,6 +370,27 @@ func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields i
 		log.Fatal(err)
 		return nil, err
 	}
+
+	var results []map[string]interface{}
+	if err = r.All(m.Ctx, &results); err != nil {
+		log.Fatal(err)
+		return nil, err
+	}
+	return results, nil
+}
+
+// find-限制返回条数(limit)
+func (m *MongodbSim) FindLimit(c string, query map[string]interface{}, sort, fields interface{}, limit int64) ([]map[string]interface{}, error) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	op := options.Find()
+	r, err := coll.Find(m.Ctx, query, op.SetSort(sort), op.SetProjection(fields), op.SetLimit(limit))
+	if err != nil {
+		log.Fatal(err)
+		return nil, err
+	}
+
 	var results []map[string]interface{}
 	if err = r.All(m.Ctx, &results); err != nil {
 		log.Fatal(err)
@@ -325,7 +399,7 @@ func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields i
 	return results, nil
 }
 
-//创建_id
+// 创建_id
 func NewObjectId() primitive.ObjectID {
 	return primitive.NewObjectID()
 }

+ 20 - 7
src/jy/extract/extractcity_clean.go

@@ -2,15 +2,19 @@ package extract
 
 import (
 	"fmt"
+	"gopkg.in/mgo.v2/bson"
 	ju "jy/util"
 	qu "qfw/util"
 	"regexp"
 	"strings"
 )
 
-var cityEndReg *regexp.Regexp = regexp.MustCompile("(区|县|市)$")
+var CityEndReg *regexp.Regexp = regexp.MustCompile("(区|县|市)$")
 var ErrBuyerReg *regexp.Regexp = regexp.MustCompile("^(成都东部新区)")
 
+// 拆分-截取采购单位名称中到"公司"为止的部分
+var moityBuyerReg *regexp.Regexp = regexp.MustCompile(".*(公司)")
+
 func (e *ExtractTask) GetCheckFinallyRegionInfo(tmp map[string]interface{}, update_check *map[string]interface{}) {
 
 	area := qu.ObjToString(tmp["area"])
@@ -64,7 +68,7 @@ func (e *ExtractTask) GetCheckFinallyRegionInfo(tmp map[string]interface{}, upda
 	}
 }
 
-//企业表校验
+// 企业表校验
 func cityMarshal(data map[string]interface{}) map[string]string {
 	buyer := qu.ObjToString(data["buyer"])
 	bidarea := qu.ObjToString(data["area"])
@@ -75,6 +79,15 @@ func cityMarshal(data map[string]interface{}) map[string]string {
 		"company_name": buyer,
 	}
 	tmp := ju.Qyxy_Mgo.FindOne("qyxy_std", query_name)
+	if tmp == nil || len(tmp) < 2 {
+		//查不到企业的时候通过截取采购单位提取部分名称进行匹配
+		if bidarea == "" || bidarea == "全国" {
+			moity_buyer := moityBuyerReg.FindString(buyer)
+			if moity_buyer != buyer && moity_buyer != "" {
+				tmp = ju.Qyxy_Mgo.FindOne("qyxy_std", bson.M{"company_name": moity_buyer})
+			}
+		}
+	}
 	if tmp == nil || len(tmp) < 2 {
 		return rdata
 	}
@@ -144,7 +157,7 @@ func cityMarshal(data map[string]interface{}) map[string]string {
 	return rdata
 }
 
-//标准校验
+// 标准校验
 func (e *ExtractTask) StandardCheckCity(area string, city string, district string) map[string]string {
 	rdata := make(map[string]string, 0)
 	if area == "香港" || area == "澳门" || area == "台湾" || (area == "全国" && (city == "" && district == "")) {
@@ -240,7 +253,7 @@ func (e *ExtractTask) StandardCheckCity(area string, city string, district strin
 	return rdata
 }
 
-//更新日志
+// 更新日志
 func updateLogging(tmp map[string]interface{}, rdata map[string]string, desc string) map[string]interface{} {
 	umap := make(map[string]interface{})
 	if tmp["modifycheck"] == nil {
@@ -261,11 +274,11 @@ func copyUpdateData(tmp map[string]interface{}, update_check *map[string]interfa
 	}
 }
 
-//拆分三级县
+// 拆分三级县
 func aliasDataDistrict(district string) []string {
 	arr := []string{}
-	if cityEndReg.MatchString(district) {
-		str := cityEndReg.FindString(district)
+	if CityEndReg.MatchString(district) {
+		str := CityEndReg.FindString(district)
 		strings.TrimRight(district, str)
 		if str == "县" {
 			arr = append(arr, fmt.Sprintf("%s区", strings.TrimRight(district, str)))

+ 111 - 18
src/jy/extract/extractcity_new.go

@@ -1,7 +1,7 @@
 package extract
 
 import (
-	. "jy/pretreated"
+	"jy/pretreated"
 	ju "jy/util"
 	qu "qfw/util"
 	"strings"
@@ -23,7 +23,7 @@ func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
 }
 
 // 抽取地域信息
-func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
+func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, jf *ju.Job, tmp *map[string]interface{}, isLog bool) {
 	defer qu.Catch()
 	//日志记录
 	logRecordInfo := []map[string]interface{}{}
@@ -89,6 +89,9 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 	CompleteRegionInfo(&f_area, &f_city, &f_district)
 	//用到的字段
 	projectname := qu.ObjToString((*tmp)["projectname"])
+	if projectname == "" {
+		projectname = qu.ObjToString((*tmp)["title"])
+	}
 	buyer := qu.ObjToString((*tmp)["buyer"])
 	site := qu.ObjToString((*tmp)["site"])
 	//新疆兵团补充地域~
@@ -125,19 +128,26 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 		})
 	}
 
-	//正文补充地域~
+	//文本正文-提取补充
 	if f_area == "全国" || f_area == "" || f_city == "" {
-		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district); b {
-			if isLog {
-				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
-					"sup_detail": f_area + "~" + f_city + "~" + f_district,
-				})
-			}
+		if b := e.NewVerifySensitiveInfo(j.Title+"\n"+j.Content, &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_detail": f_area + "~" + f_city + "~" + f_district,
+			})
 		}
 	}
-	//代理机构抽省市
-	if f_city == "" {
-		keyArr := []string{"agencyaddr"}
+	//文本附件-提取补充
+	if (f_area == "全国" || f_area == "" || f_city == "") && jf != nil {
+		if b := e.NewVerifySensitiveInfo(jf.Title+"\n"+jf.ContentClean, &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_jfdetail": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
+
+	//疑似地址-提取补充-采用简称
+	if f_area == "全国" || f_area == "" || f_city == "" {
+		keyArr := []string{"brief_buyeraddr", "brief_agencyaddr"}
 		isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
 		if isExists { //是否存在抽取有效值
 			AnalysisIsUniqueInfo(new_regions, &all_regions)
@@ -155,7 +165,7 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 		}
 	}
 
-	//pcd切词提取--区县
+	//PCD切词提取--区县
 	e.LinkSpecialRuleBriefStep2(projectname, &f_area, &f_city, &f_district)
 	e.LinkSpecialRuleBriefStep2(buyer, &f_area, &f_city, &f_district)
 	if isLog {
@@ -163,7 +173,48 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 			"sup_link2": f_area + "~" + f_city + "~" + f_district,
 		})
 	}
-
+	//采用源码方式-提取补充
+	if f_area == "全国" || f_area == "" || f_city == "" {
+		if b := e.NewVerifySensitiveInfo(j.Title+"\n"+qu.ObjToString((*j.Data)["contenthtml"]), &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_contenthtml": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
+	//清洗的标题-提取补充
+	if f_area == "全国" || f_area == "" || f_city == "" {
+		if new_title := CleanTitleReg1.ReplaceAllString(j.Title, ""); new_title != j.Title && new_title != "" {
+			if b := e.NewVerifySensitiveInfo(new_title, &f_area, &f_city, &f_district); b && isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_title": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
+		}
+	}
+	//采购单位地址-提取补充
+	if f_area == "全国" || f_area == "" {
+		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["buyeraddr"]), &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_buyeraddr": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
+	//中标单位地址-提取补充
+	if f_area == "全国" || f_area == "" {
+		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["winneraddr"]), &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_winneraddr": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
+	//中标单位-提取补充
+	if f_area == "全国" || f_area == "" {
+		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["winner"]), &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_winner": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
 	//最终站点补充
 	if f_area == "全国" || f_area == "" {
 		if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
@@ -175,6 +226,14 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 			}
 		}
 	}
+	//站点名称-提取补充
+	if f_area == "全国" || f_area == "" {
+		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["site"]), &f_area, &f_city, &f_district); b && isLog {
+			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+				"sup_sitename": f_area + "~" + f_city + "~" + f_district,
+			})
+		}
+	}
 
 	//最终在清洗一遍数据
 	CompleteRegionInfo(&f_area, &f_city, &f_district)
@@ -199,6 +258,9 @@ func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]inter
 			text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
 		} else if key == "projectname" {
 			text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
+		} else if key == "brief_buyeraddr" || key == "brief_agencyaddr" {
+			new_key := strings.ReplaceAll(key, "brief_", "")
+			text = qu.ObjToString(tmp[new_key])
 		} else {
 			text = qu.ObjToString(tmp[key])
 		}
@@ -221,9 +283,13 @@ func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]inter
 			valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
 		} else {
 			isAddress, isBrief := false, false
-			if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" || key == "agencyaddr" {
+			if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" {
 				isAddress = true
 			}
+			if key == "brief_buyeraddr" || key == "brief_agencyaddr" {
+				isAddress = true
+				isBrief = true
+			}
 			valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, isBrief, 2)
 		}
 		field_regions[key] = valuesArr
@@ -434,9 +500,25 @@ func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d st
 
 // 敏感词识别
 func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
+	if detail == "" {
+		return false
+	}
 	detail = SensitiveReg.ReplaceAllString(detail, "")
-	detail = TextAfterRemoveTable(detail)
 	detail = CleanDetailReg1.ReplaceAllString(detail, "")
+	detail = pretreated.HtmlToText(detail)
+	isChange := false
+	//全称省份
+	if *area == "" || *area == "全国" {
+		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
+		if len(fullProvinceArr) == 1 {
+			for _, v := range fullProvinceArr {
+				if sim_province := e.ProvinceMap[v]; sim_province != "" {
+					*area = sim_province
+					isChange = true
+				}
+			}
+		}
+	}
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
@@ -484,7 +566,18 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 			}
 		}
 	}
-
+	//简称省份
+	if *area == "" || *area == "全国" {
+		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
+		if len(simProvinceArr) == 1 {
+			for _, v := range simProvinceArr {
+				if v != "" {
+					*area = v
+					return true
+				}
+			}
+		}
+	}
 	//疑似固话提取~
 	if *area == "" || *area == "全国" {
 		fixedTelArr := FixedTelReg.FindAllString(detail, -1)
@@ -500,7 +593,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 			}
 		}
 	}
-	return false
+	return isChange
 }
 
 func resetFixedTelInfo(telArr []string) []string {

+ 23 - 31
src/jy/extract/extractcity_way.go

@@ -22,6 +22,8 @@ var SensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*")
 
 var OperatorReg = regexp.MustCompile("^中国(电信|联通|移动).*公司$")
 
+var CleanTitleReg1 = regexp.MustCompile("[((].*[))]")
+
 // 取特殊类数据
 func GetFilialeByBuyerInfo(buyer string) string {
 	if FilialeReg1.MatchString(buyer) {
@@ -532,28 +534,7 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep1(text string, area *string, city
 	if *city != "" {
 		return
 	}
-	regions := map[string]map[string]map[string]string{}
-	wordsArr := e.Seg_Full.Cut(text, true)
-	for _, word := range wordsArr {
-		for pos_sim, trie_sim := range e.Trie_Sims {
-			if pos_sim == 2 {
-				if trie_sim.Get(word) {
-					citysArr := e.DistrictSimAndAll[word]
-					for _, full_citys := range citysArr {
-						for d, c := range full_citys {
-							if c == nil || c.P == nil || c.Name == "" {
-								continue
-							}
-							if c.P.Brief != "" && c.Name != "" && d != "" {
-								v_area, v_city, v_district := c.P.Brief, c.Name, d
-								UpdateRegionsInfo(v_area, v_city, v_district, &regions)
-							}
-						}
-					}
-				}
-			}
-		}
-	}
+	regions := e.FetchBriefRules(text, 1)
 	if len(regions) > 0 {
 		if *area == "" || *area == "全国" { //新增原则
 			LinkAddedRules(regions, area, city, district)
@@ -563,13 +544,30 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep1(text string, area *string, city
 	}
 }
 
-// 链路补充~简称类
+// 链路补充~简称类 切词方式 1默认 2指定(本步骤使用2)
 func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city *string, district *string) {
 	if *city != "" {
 		return
 	}
+	regions := e.FetchBriefRules(text, 2)
+	if len(regions) > 0 {
+		if *area == "" || *area == "全国" { //新增原则
+			LinkAddedRules(regions, area, city, district)
+		} else { //补充原则
+			LinkSuppleRules(regions, area, city, district)
+		}
+	}
+}
+
+// 简称提取规则 1默认 2指定
+func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 	regions := map[string]map[string]map[string]string{}
-	wordsArr := e.Seg_SV.Cut(text, true)
+	wordsArr := []string{}
+	if cutype == 1 {
+		wordsArr = e.Seg_Full.Cut(text, true)
+	} else {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
 	for _, word := range wordsArr {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 			if pos_sim == 2 {
@@ -590,13 +588,7 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city
 			}
 		}
 	}
-	if len(regions) > 0 {
-		if *area == "" || *area == "全国" { //新增原则
-			LinkAddedRules(regions, area, city, district)
-		} else { //补充原则
-			LinkSuppleRules(regions, area, city, district)
-		}
-	}
+	return regions
 }
 
 // 新增原则

+ 7 - 0
src/jy/extract/extractinit.go

@@ -119,6 +119,8 @@ type ExtractTask struct {
 	S_CityDict     map[string][]S_City     //标准城市-map
 	S_DistrictDict map[string][]S_District //标准区县-map
 
+	SensitiveFullProvince *sensitive.Filter
+	SensitiveSimProvince  *sensitive.Filter
 	SensitiveFullCity     *sensitive.Filter
 	SensitiveSimCity      *sensitive.Filter
 	SensitiveFullDistrict *sensitive.Filter
@@ -1179,6 +1181,8 @@ func (e *ExtractTask) InitCityInfo() {
 		jc_province := qu.ObjToString(provinces["province"])      //省简称
 		//加载省信息
 		e.Trie_Full_Province.AddWords(all_province) //加入省全称Trie(k:浙江省)
+		e.SensitiveFullProvince.AddWord(all_province)
+		e.SensitiveSimProvince.AddWord(jc_province)
 		p := &Province{}
 		p.Name = all_province                     //省全称:浙江省
 		p.Brief = jc_province                     //省简称:浙江
@@ -1269,6 +1273,7 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 		for _, vdistrict_alias := range district_alias {
 			strvdistrict_alias := qu.ObjToString(vdistrict_alias)
 			e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
+			e.SensitiveFullDistrict.AddWord(strvdistrict_alias)
 			c_tmp := e.DistrictCityMap[strvdistrict_alias]
 			if len(c_tmp) == 0 {
 				tmpcarr := []*City{c}
@@ -1351,6 +1356,8 @@ func (e *ExtractTask) InitVar() {
 	e.S_DistrictDict = make(map[string][]S_District, 0)
 
 	//敏感词-筛选
+	e.SensitiveFullProvince = sensitive.New()
+	e.SensitiveSimProvince = sensitive.New()
 	e.SensitiveFullCity = sensitive.New()
 	e.SensitiveSimCity = sensitive.New()
 	e.SensitiveFullDistrict = sensitive.New()

+ 1 - 3
src/jy/extract/extractsave.go

@@ -397,14 +397,12 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		//落款识别
 		e.inscribeRecognize(&tmp, *j.Data)
-		//落款识别指定特殊采购单位
-		e.AimAtRecognizeBuyer(&tmp, *j.Data)
 		//根据正文获取资质要求
 		e.getQualifications(&tmp, *j.Data)
 		//城市抽取
 		if e.IsExtractCity {
 			//e.NewExtractCity(j, &tmp) //旧版
-			e.ExtractRegionInfo(j, &tmp, true)
+			e.ExtractRegionInfo(j, jf, &tmp, true)
 			e.ExtractRegionClean(&tmp)
 		}
 		//品牌抽取

+ 84 - 12
src/jy/extract/extraxtmethod.go

@@ -5,14 +5,18 @@ import (
 	"encoding/json"
 	"fmt"
 	"github.com/shopspring/decimal"
+	"gopkg.in/mgo.v2/bson"
+	"io"
 	"jy/clear"
 	"jy/pretreated"
 	ju "jy/util"
+	"net/http"
 	qu "qfw/util"
 	"qfw/util/redis"
 	"regexp"
 	"strings"
 	"sync"
+	"time"
 	"unicode/utf8"
 )
 
@@ -246,10 +250,25 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 	//落款实体
 	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
 		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
-		if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
+		if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" {
 			(*tmp)["buyer"] = new_buyer
 		}
 	}
+	//落款特殊实体
+	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" &&
+		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
+		if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
+			(*tmp)["buyer"] = new_buyer
+		}
+	}
+	//暂时关闭实体识别
+	//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
+	//	!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
+	//	if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]),*tmp); new_buyer != "" {
+	//		(*tmp)["buyer"] = new_buyer
+	//	}
+	//}
+
 	//拟建不能存buyer
 	if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
 		qu.ObjToString((*tmp)["subtype"]) == "拟建" {
@@ -269,7 +288,7 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 }
 
 // 识别实体
-func InscribeEntity(detail string) string {
+func InscribeEntity(detail string, tmp map[string]interface{}) string {
 	new_str := ""
 	new_detail := pretreated.TextAfterRemoveTable(detail)
 	if len(new_detail) > 200 {
@@ -288,12 +307,56 @@ func InscribeEntity(detail string) string {
 	} else {
 		new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
 	}
+	winner := qu.ObjToString(tmp["winner"])
+	agency := qu.ObjToString(tmp["agency"])
+	//与其它单位发生了重叠
+	if new_str != "" && (new_str == winner || new_str == agency) {
+		new_str = ""
+	}
 	if new_str != "" && exclude_entity.MatchString(new_str) {
 		new_str = ""
 	}
 	return new_str
 }
 
+// 识别实体-调用实体识别服务(EmployPostEntDfa)
+func InscribeEntityDfa(detail string, tmp map[string]interface{}) string {
+	new_str := ""
+	projectname := qu.ObjToString(tmp["projectname"])
+	title := qu.ObjToString(tmp["title"])
+	winner := qu.ObjToString(tmp["winner"])
+	agency := qu.ObjToString(tmp["agency"])
+	new_detail := pretreated.TextAfterRemoveTable(detail)
+	if len(new_detail) > 200 {
+		new_detail = detail[len(new_detail)-200:]
+	}
+	dfa_info1, l_1 := EmployPostEntDfa(bson.M{"detail": new_detail}), 0
+	if res_1 := ju.ConvertInterface(dfa_info1["result"]); len(res_1) > 0 {
+		for _, v := range res_1 {
+			if cl := utf8.RuneCountInString(v); cl > l_1 && cl > 3 && !exclude_entity.MatchString(v) {
+				l_1 = cl
+				new_str = v
+			}
+		}
+	}
+	if new_str != "" {
+		return new_str
+	}
+	dfa_info2, l_2 := EmployPostEntDfa(bson.M{"detail": title + "\n" + projectname}), 0
+	if res_2 := ju.ConvertInterface(dfa_info2["result"]); len(res_2) > 0 {
+		for _, v := range res_2 {
+			if cl := utf8.RuneCountInString(v); v != "" && cl > l_2 && cl > 3 && !exclude_entity.MatchString(v) {
+				if v != "" && (v == winner || v == agency) {
+					continue //识别异常
+				}
+				l_2 = cl
+				new_str = v
+			}
+		}
+	}
+	return new_str
+}
+
 // 识别发布时间
 func InscribePublishtime(j_data map[string]interface{}) int64 {
 	//落款文本识别
@@ -332,16 +395,6 @@ func InscribePublishtime(j_data map[string]interface{}) int64 {
 	return int64(0)
 }
 
-// 针对识别-采购单位
-func (e *ExtractTask) AimAtRecognizeBuyer(tmp *map[string]interface{}, j_data map[string]interface{}) {
-	if qu.ObjToString((*tmp)["buyer"]) == "" && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" &&
-		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
-		if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" {
-			(*tmp)["buyer"] = new_buyer
-		}
-	}
-}
-
 // 识别特殊采购单位
 func InscribeSpecEntity(detail string) string {
 	new_str := ""
@@ -356,6 +409,25 @@ func InscribeSpecEntity(detail string) string {
 	return new_str
 }
 
+func EmployPostEntDfa(data map[string]interface{}) map[string]interface{} {
+	info := map[string]interface{}{}
+	client := &http.Client{Timeout: 2 * time.Second}
+	jsonStr, _ := json.Marshal(data)
+	resp, err := client.Post("http://extcity.spdata.jianyu360.com/service/entity/", "application/json", bytes.NewBuffer(jsonStr))
+	if err != nil {
+		return info
+	}
+	res, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return info
+	}
+	err = json.Unmarshal(res, &info)
+	if err != nil {
+		return info
+	}
+	return info
+}
+
 // 处理折扣系数-
 func dealWithDiscountBid(tmp map[string]interface{}) float64 {
 	biddiscount := qu.Float64All(tmp["biddiscount"])

+ 5 - 5
src/jy/extract/score.go

@@ -27,7 +27,7 @@ var (
 	CommonScore     map[string]float64
 	FieldsScore     map[string]map[string]float64
 	lengthValidReg0 = regexp.MustCompile(`(金额|单价)`)
-	lengthValidReg1 = regexp.MustCompile(`^(.{2}([大|||学][学|院]|公司|某部|学社|大队|党校|某(部|中心)|(联通|移动|电信)))$`)
+	lengthValidReg1 = regexp.MustCompile(`^(.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$`)
 	lengthValidReg2 = regexp.MustCompile(`([,,、])`)
 	lengthValidReg3 = regexp.MustCompile(`(.{4,20}公司)`)
 )
@@ -114,7 +114,7 @@ func init() {
 
 var CNreg = regexp.MustCompile("[\u4e00-\u9fa5]")
 
-//分析-打分排序
+// 分析-打分排序
 func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
 	defer qu.Catch()
 	doc := j.Data
@@ -169,7 +169,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	return doc, result, _id
 }
 
-//结果打分
+// 结果打分
 func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	qu.Catch()
 	result := j.Result
@@ -466,7 +466,7 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	return result
 }
 
-//项目编号权重清理
+// 项目编号权重清理
 func projectWeightClear(tmps []*ju.ExtField) []*ju.ExtField {
 	newList := make([]*ju.ExtField, 0)
 	if len(tmps) < 1 {
@@ -501,7 +501,7 @@ func projectWeightClear(tmps []*ju.ExtField) []*ju.ExtField {
 	return newList
 }
 
-//多供应商,不减分评判标准
+// 多供应商,不减分评判标准
 func isMultiSupplier(str string) bool {
 	arr := lengthValidReg2.Split(str, -1)
 	for _, v := range arr {

+ 2 - 3
src/jy/pretreated/analymethod.go

@@ -450,10 +450,9 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 			new_str = formattext100.ReplaceAllString(new_str, "$1")
 			bl.Text = fmt.Sprintf("中标金额:%s万元\n", new_str) + bl.Text
 		}
-
 		//调用kv解析库-处理detail
 		bl.Text = formatText(bl.Text, "all")
-		//处理 
+		//处理 :
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
 		//处理空格
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
@@ -471,7 +470,6 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 				}
 			}
 		}
-
 		//如果表格查询分包-有分包-但是没有有效值的话 ,正文重新查找
 		if len(tabs) > 0 && job.BlockPackage != nil {
 			if !isUsefulPackage(job.BlockPackage) { //表格未识别出有效分包-且文本里面无有效字样
@@ -548,6 +546,7 @@ func isResetUnitAmountSortKV(table *Table) {
 		table.SortKV.Map["中标金额"] = qu.ObjToString(table.SortKV.Map["中标金额"]) + "万元"
 	}
 }
+
 func isResetUnitPriceSortKV(table *Table) {
 	keyArr := []string{"序号", "数量", "单价"}
 	isMatch := true

+ 2 - 2
src/main.go

@@ -40,7 +40,7 @@ func main() {
 
 // 验证规则
 func testMain() {
-	con := `1234567`
-	text := con[3:7]
+	con := `2134576`
+	text := con[1:2]
 	log.Debug(text)
 }

+ 19 - 19
src/web/templates/admin/result_list.html

@@ -181,8 +181,8 @@ $(function () {
 				testtask=[
 						{label:"任务名称",s_label:"s_taskname",type:"tpl_list_local",must:true,url:"/admin/task/gettaskname"},
 						{label:"起始id",s_label:"s_startid",must:true},
-						{label:"抽取数量",s_label:"s_datanum",placeholder:"1",must:true},
-						{label:"结果版本",s_label:"s_resulttrack",must:true,placeholder:"a"}
+						// {label:"抽取数量",s_label:"s_datanum",placeholder:"1",must:false},
+						// {label:"结果版本",s_label:"s_resulttrack",placeholder:"1",must:false}
 				]
 				//测试启动按钮
 				testtaskbtn=[
@@ -193,26 +193,26 @@ $(function () {
 							var num = $("#s_datanum").val();
 							var taskid = $("#s_taskname").val();
 							var resulttrack = $("#s_resulttrack").val();
-							//taskid ="5eda01b0c566ca08409370bb"
-							// num = 1
-							//resulttrack = "a"
 							var bcon = true;
-							if(id == "" || resulttrack == ""){
+							if(id == ""){
 								bcon = false;
-								alert("红色标签的表单不能为空!");
+								alert("红色表单不能为空!");
 								return
 							}
-							var resultcoll = "result_"+resulttrack; 
-							var trackcoll = "track_"+resulttrack; 
-							if(num != ""){//验证数字
-								if(!reg.test(num)){
-									bcon = false;
-									alert("抽取数量填写错误!");
-									return
-								}
-							}else{
-								num = "1";
-							}
+							// var resultcoll = "result_"+resulttrack;
+							// var trackcoll = "track_"+resulttrack;
+							// if(num != ""){//验证数字
+							// 	if(!reg.test(num)){
+							// 		bcon = false;
+							// 		alert("抽取数量填写错误!");
+							// 		return
+							// 	}
+							// }else{
+							// 	num = "1";
+							// }
+							var resultcoll = "result_log";
+							var trackcoll = "track_log";
+							num = "1"
 							if(bcon){
 								//抽取测试								
 								$.post("/admin/task/test",{"startid":id,"num":num,"taskid":taskid,"resultcoll":resultcoll,"trackcoll":trackcoll},function(data){
@@ -228,7 +228,7 @@ $(function () {
 								},'json')
 								
 							}else{
-								alert("红色标签的表单不能为空!")
+								alert("红色表单不能为空!")
 							}
 						}
 					}

+ 1 - 1
udpcontrol/src/method.go

@@ -236,7 +236,7 @@ func splitIdMethod(sid string, eid string) ([]map[string]interface{}, []int64) {
 
 //计算生命周期
 func calculateLiftime(count int64) int64 {
-	time_one := 1500.0 / 1000.0 //暂定~每千条用时1000秒
+	time_one := 1500.0 / 1000.0 //暂定~每千条用时1500秒
 	life_time := int64(time_one * float64(count) * 3.0)
 	if life_time < 2400 {
 		life_time = 2400