Sfoglia il codice sorgente

抽取-修复方法-表格相关

zhengkun 3 anni fa
parent
commit
d08b895136

+ 2 - 3
data_monitoring/listen_data/src/main.go

@@ -81,7 +81,7 @@ func main()  {
 	//save_mgo = &MongodbSim{
 	//	MongodbAddr: "172.17.145.163:27083,172.17.4.187:27082",
 	//	DbName:      "qfw",
-	//	Size:        10,
+	//	Size:        20,
 	//	UserName: "zhengkun",
 	//	Password: "zk@123123",
 	//}
@@ -105,8 +105,7 @@ func main()  {
 	}
 	save_mgo.InitPool()
 
-	exportSpecifiedTimeData()
-
+	exportSensitiveTestData()
 	return
 
 

File diff suppressed because it is too large
+ 1803 - 3
data_monitoring/listen_data/src/zkmethod.go


+ 5 - 2
src/jy/clear/totimestamp.go

@@ -9,7 +9,7 @@ import (
 	"time"
 )
 
-var reg, regA, regB, regC, regD, regAfter *regexp.Regexp
+var reg, regA, regB, regC, regD, regAfter ,regAfterBool*regexp.Regexp
 
 const (
 	T = 365 * 86400
@@ -30,6 +30,8 @@ func init() {
 	regC, _ = regexp.Compile(`\s*\d+[::时]\d+分?[-—]`)
 	regD, _ = regexp.Compile(`([一|二|三|四|五|六|七|八|九|十|零|〇]{4})年([一|二|三|四|五|六|七|八|九|十]{1,2})月([一|二|三|四|五|六|七|八|九|十]{1,3})日([一|二|三|四|五|六|七|八|九|十]{1,3})时`)
 	regAfter, _ = regexp.Compile(`(下午D?\d{1,2}[时|:|:|h|H])`)
+	regAfterBool, _ = regexp.Compile(`(下午D?[1-2][0-9][时|:|:|h|H])`)
+
 }
 
 /*字符时间转时间戳
@@ -71,7 +73,8 @@ func ObjToTimestamp(data []interface{},spidercode ...string) []interface{} {
 	}
 	//2017年11月13日下午3时30分
 	addreptime := int64(0)
-	if regAfter.MatchString(tmp) {
+	//2021年09月10日下午15时30分
+	if regAfter.MatchString(tmp) && !regAfterBool.MatchString(tmp) {
 		addreptime = 12 * 60 * 60
 	}
 	regRepl, _ := regexp.Compile(`[,,]`)

+ 20 - 2
src/jy/extract/extract.go

@@ -26,6 +26,7 @@ import (
 
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
+    JYUrl = "https://www.jianyu360.com/article/content/%s.html"
 
 	cut           = ju.NewCut()                          //获取正文并清理
 	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
@@ -673,8 +674,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					continue
 				}
 
-				if vc.Field =="winner" {
-					log.Debug("调试抽取字段")
+				if vc.Field =="bidamount" {
+					//log.Debug("调试抽取字段")
 				}
 				////抽取-前置规则
 				//for _, v := range vc.RulePres {
@@ -2410,6 +2411,23 @@ func checkFields(tmp map[string]interface{}) map[string]interface{} {
 
 
 
+	//临时
+	//bidopentime := qu.Int64All(tmp["bidopentime"])
+	//bidendtime := qu.Int64All(tmp["bidendtime"])
+	//timeLayout := "2006-01-02 15:04:05"
+	//
+	//if bidopentime>0 {
+	//	bidopentime_str := time.Unix(bidopentime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
+	//	tmp["bidopentime"] = bidopentime_str
+	//}
+	//if bidendtime>0 {
+	//	bidendtime_str := time.Unix(bidendtime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串
+	//	tmp["bidendtime"] = bidendtime_str
+	//}
+
+	jyhref:= fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", BsonTOStringId(tmp["_id"])))
+	tmp["jyhref"] = jyhref
+
 	return tmp
 }
 //处理折扣系数

+ 11 - 5
src/jy/pretreated/analystep.go

@@ -13,10 +13,13 @@ import (
 
 	"github.com/PuerkitoBio/goquery"
 )
-
-var yjReg *regexp.Regexp = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|主要人员相关资料|投标文件格式|唱标记录|否决投标的?情况说明")
+//投标文件格式
+var yjReg *regexp.Regexp = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|主要人员相关资料|唱标记录|否决投标的?情况说明")
 var hisReg = regexp.MustCompile("(开标记录|类似业绩|历史业绩|填报项目业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</td>)")
-var hisReg2 = regexp.MustCompile("(开标记录|业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</tr>|</table>|</td>)")
+var hisReg2 = regexp.MustCompile("(开标记录|业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(.*原因及其依据.*[::]?)?[\n]?.*?[\n]?(</tr>|</table>|</td>)")
+
+
+
 var formattext = regexp.MustCompile("(投标总价)([0-9,.万元]*)")
 var formattext2 = regexp.MustCompile("中标单价.*(中标总价.*)")
 var formattext3 = regexp.MustCompile("(同类项目业绩、|[1-9].[0-9]包段划分)")
@@ -28,7 +31,7 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 	//格式化正文 -断点
 	con = formattext3.ReplaceAllString(con,"")
 	con = hisReg.ReplaceAllString(con, "${2}")
-	con = hisReg2.ReplaceAllString(con, "${2}")
+	con = hisReg2.ReplaceAllString(con, "${3}")
 	con = formattext.ReplaceAllString(con, "${1}:${2}")
 	con = formattext2.ReplaceAllString(con, "${1}")
 
@@ -63,9 +66,12 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 			}
 			FindProjectCode(bl.Text, job) //匹配项目编号
 			processTableInBlock(bl, job, isSite, codeSite) //处理表格
+
 			//新加 未分块table中未能解析到中标候选人,从正文中解析
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
-				bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
+				//表格没有划分时候:-纯文本匹配
+				tmp_text := HtmlToText(bl.Text)
+				bl.Winnerorder = winnerOrderEntity.Find(tmp_text, true, 1, isSite, codeSite)
 				job.Winnerorder = bl.Winnerorder
 			}
 			job.Block = append(job.Block, bl)

+ 15 - 2
src/jy/pretreated/analytable.go

@@ -497,7 +497,7 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 						//} else if len(tmpEntname) > 0 {
 						//fmt.Println("table winnerorder only has entname", tmpEntname)
 						//}
-						qutil.Debug("len-smap_v--", len(smap_v))
+						//qutil.Debug("len-smap_v--", len(smap_v))
 						if len(smap_v) > 2 { //只有排序信息 sort和sortstr
 							newSmap = append(newSmap, smap_v)
 						}
@@ -1514,7 +1514,9 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 		//sort := 1
 		nextdirect, nextvdirect := 0, 0
 		//开始抽取
-		for _, tr := range table.TRs {
+		//若第一排全为头-临时让第二排-新增 左临 查询,zhengkun
+		tb_first_allhead := false
+		for tr_index, tr := range table.TRs {
 			//if  kkk==18 || kkk==21 {
 			//	log.Println("调试指定tr")
 			//}
@@ -1530,6 +1532,9 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 					}
 				}
 				if numbh != 0 && numbh == len(tr.TDs) { //5e0d53ef0cf41612e0640495
+					if tr_index==0 {
+						tb_first_allhead = true
+					}
 					nextdirect, nextvdirect = 2, 1
 					continue
 				} else if nextdirect > 0 && nextvdirect > 0 {
@@ -1560,6 +1565,14 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 							//}
 						}
 					}
+					if tb_first_allhead && tr_index==1 { //临时-让第二排-向左比对
+						if !table.FindTdVal(td, 1, 2) { //table.FindTdVal()存储了table.SortKV
+							if !table.FindTdVal(td, vdirect, direct) {
+
+							}
+						}
+						tb_first_allhead = false
+					}
 					//fmt.Println("td:", td.Val, td.BH, td.HeadTd, td.KVDirect)
 				}
 			}

+ 1 - 1
src/jy/pretreated/winnerorder.go

@@ -39,7 +39,7 @@ var (
 	winnerReg7     = regexp.MustCompile("第[一二三四五六七八九十]{1}标段[::]")
 	winnerReg8     = regexp.MustCompile("(第[一二三四五六七八九十]中选候选人)[::\\s]+?[((]1[))][\\s]+?(单位名称)[::]?(.*)[\\s]+?[((]2[))][\\s]+(参选报价|投标报价(含税))[::]?(.*)")
 	//winnerReg8     = regexp.MustCompile("(第[一二三四五六七八九十]中标候选人)[::]?\n(1)单位名称:(.*)\n(2)投标报价(含税):(.*)")
-    winnerReg9     = regexp.MustCompile("(第[一二三四五六七八九十][中选]?候选人|中标人[1-9])[::\\s]+?([\u4E00-\u9FA5]{4,20})([0-9\\.\\s万元]+)")
+    winnerReg9     = regexp.MustCompile("(第[一二三四五六七八九十]中[|标]?候选人|中标人[1-9])[::\\s]+?([\u4E00-\u9FA5]{4,20})[\\s]+([0-9\\.\\s万元]+)")
     winnerReg10    = regexp.MustCompile("(第[一二三四五六七八九十]中标人)[::\\s]+?报价[¥]?([0-9\\.\\s万元]+)[;;]([\u4E00-\u9FA5]{4,20})")
 	winnerReg11     = regexp.MustCompile("([弟|第][一二三四五六七八九十]中[标|选]候选人)[::\\s]+?(单位名称|投标人名称)[::]?(.*)[\\s]+?(参选报价|投标报价[((]含税[))]|投标报价[((]元[))])[::]?(.*)")
 	winnerReg12     = regexp.MustCompile("(中[标|选]候选人[弟|第][一二三四五六七八九十0-9]名)[::\\s]+?(.*)[\\s,,]+?(投标报价)[::]?([0-9\\.\\s万元]+)")

+ 0 - 2
src/main.go

@@ -74,12 +74,10 @@ func main() {
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))
 
-
 	go func() {
 		http.ListenAndServe("localhost:10000", nil)
 	}()
 
 	lock := make(chan bool)
 	<-lock
-
 }

Some files were not shown because too many files changed in this diff