Prechádzať zdrojové kódy

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 rokov pred
rodič
commit
5729c3544c

+ 2 - 0
src/jy/extract/isextract.go

@@ -23,6 +23,8 @@ func init() {
 
 func IsExtract(filed, title, content string) bool {
 	defer qu.Catch()
+	//临时的,抽取所有
+	return true
 	b := true
 	if N_extract[filed] != nil {
 		nregs := N_extract[filed]

+ 6 - 6
src/jy/pretreated/division.go

@@ -58,13 +58,13 @@ var (
 	regStrWrap         = regexp.MustCompile("分包名称[::]")
 	regBZJWarap        = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
 	regFJWarap         = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
-	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张/包|纸[\\d]*包|/*[\\d]+包|相机包)")
+	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
 	regFilter          = regexp.MustCompile("等$")
 	pkgFilter          = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
-	indexTile          = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+") //小标题
-	indexTile2         = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
+	indexTile          = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\x{4e00}-\x{9fa5}]{2,8}[::]+`) // numbered sub-heading; \x{...} is the RE2 code-point escape (raw strings do not expand \u)
+	indexTile2         = regexp.MustCompile(`[\s\x{4e00}-\x{9fa5}]{2,8}[::]\n`)                   // sub-heading text (whitespace/CJK) ending in a colon and a newline
 	regReplAllSpace2   = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
 	confusion          = map[string]string{
 		"参与": "canyu",
@@ -735,6 +735,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	startEndMap := map[int]int{}
 	pkgIndexMap := map[string][]int{}
 	indexPkgMap := map[int]string{}
+	
 	//小标题
 	titleindexs := indexTile.FindAllStringIndex(con, -1)
 	if len(titleindexs) == 0 {
@@ -765,9 +766,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			headkey := con[key[4]:key[5]]
 			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
 			if !regDivision.MatchString(headkey) {
-				headkey += ""
+				headkey += ":"
 			}
-			headkey = moreColonReg.ReplaceAllString(headkey, "")
+			headkey = moreColonReg.ReplaceAllString(headkey, ":")
 			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
 			if len(colonIndexs) > 1 {
 				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
@@ -836,7 +837,6 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
 				headKey = indexKeyStringMap[iv]
-				text = indexKeyStringMap[iv] + "  " + text
 				//}
 				for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
 					delete(indexKeyStringMap, pkgIndexMap_v)

+ 1 - 1
src/jy/pretreated/multipackage.go

@@ -21,7 +21,7 @@ var (
 	//替换容易混淆的词
 	PreCon1 = regexp.MustCompile("(\\d+\\.?)+万?元")
 	//提取分包标识
-	MultiReg = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号|项)?)[     ]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?")
+	MultiReg = regexp.MustCompile("(([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-])+(包|标段))[::]?|(?:^|\\n)([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+(包|标段))|[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号|项|组)?)[     ]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?|(子项目[0-9]+)")
 	//匹配到的包格式分类统计
 	keyregs = []map[*regexp.Regexp]int{
 		map[*regexp.Regexp]int{

+ 1 - 1
src/jy/util/article.go

@@ -40,7 +40,7 @@ type Job struct {
 	SimAreaScore      map[string]float64                //简称province得分
 	SimCityScore      map[string]float64                //简称city得分
 	SimDistrictScore  map[string]float64                //简称district得分
-	Dataging int
+	Dataging          int
 }
 
 type ExtField struct {