Browse Source

采集数据应用infoformat字段

maxiaoshan 2 years ago
parent
commit
0161c66502
3 changed files with 7 additions and 4 deletions
  1. 2 1
      src/spider/handler.go
  2. 2 2
      src/spider/script.go
  3. 3 1
      src/spider/spider.go

+ 2 - 1
src/spider/handler.go

@@ -197,6 +197,7 @@ func NewSpider(code, luafile string) *Spider {
 	spider.IsHistoricalMend = spider.GetBoolVar("spiderIsHistoricalMend")
 	spider.IsMustDownload = spider.GetBoolVar("spiderIsMustDownload")
 	spider.IsCompete = spider.GetBoolVar("spiderIsCompete")
+	spider.Infoformat = spider.GetIntVar("spiderInfoformat")
 	return spider
 }
 
@@ -313,7 +314,7 @@ func GetScriptByTmp(luaconfig map[string]interface{}) string {
 		} else {
 			script_content = luaconfig["str_content"].(string)
 		}
-		script += fmt.Sprintf(util.Tmp_Other, luaconfig["spidertype"], luaconfig["spiderhistorymaxpage"], luaconfig["spidermovevent"], luaconfig["spidercompete"])
+		script += fmt.Sprintf(util.Tmp_Other, luaconfig["spidertype"], luaconfig["spiderhistorymaxpage"], luaconfig["spidermovevent"], luaconfig["spidercompete"], luaconfig["infoformat"])
 		script += ` 
 			` + script_time + `
 			` + script_list + `

+ 2 - 2
src/spider/script.go

@@ -763,8 +763,8 @@ func (s *Script) LoadScript(site *string, code, script_file string, newstate boo
 		regstr := S.ToString(-1)
 		text := S.ToString(-2)
 		textReg := regexp.MustCompile(regstr)
-		spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
-		text = spaceReg.ReplaceAllString(text, "")
+		//spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
+		//text = spaceReg.ReplaceAllString(text, "")
 		result := textReg.FindString(text)
 		isMatch := false
 		if result != "" {

+ 3 - 1
src/spider/spider.go

@@ -59,6 +59,7 @@ type Spider struct {
 	IsHistoricalMend bool //是否是历史补漏爬虫
 	IsMustDownload   bool //是否强制下载
 	IsCompete        bool //区分新老爬虫
+	Infoformat       int  //区分爬虫类型 1:招标;2:拟建/审批;3:产权
 }
 
 var Es *es.Elastic
@@ -253,7 +254,8 @@ func DownloadHighDetail(code string) {
 					data["comeintime"] = time.Now().Unix()
 					data["spidercode"] = sp.Code
 					data["dataging"] = 0
-					data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
+					data["iscompete"] = sp.IsCompete   //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
+					data["infoformat"] = sp.Infoformat //爬虫类型
 					Store(sp.StoreMode, sp.StoreToMsgEvent, sp.Collection, sp.CoverAttr, data, true)
 					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
 					MgoS.Update("spider_highlistdata", query, set, false, false)