浏览代码

采集应用infoformat字段

maxiaoshan 2 年之前
父节点
当前提交
8a958e806a
共有 5 个文件被更改,包括 19 次插入和 13 次删除
  1. 3 1
      src/spider/handler.go
  2. 2 2
      src/spider/script.go
  3. 1 0
      src/spider/spider.go
  4. 12 10
      src/task/task.go
  5. 1 0
      src/util/util.go

+ 3 - 1
src/spider/handler.go

@@ -59,6 +59,8 @@ func NewSpider(code, luafile string) (*Spider, string) {
 	spider.IsMustDownload = spider.GetBoolVar("spiderIsMustDownload")
 	//新老爬虫
 	spider.IsCompete = spider.GetBoolVar("spiderIsCompete")
+	//爬虫类型
+	spider.Infoformat = spider.GetIntVar("spiderInfoformat")
 	return spider, ""
 }
 
@@ -128,7 +130,7 @@ func GetScriptByTmp(luaconfig map[string]interface{}) string {
 		} else { //专家模式
 			script_content = luaconfig["str_content"].(string)
 		}
-		script += fmt.Sprintf(util.Tmp_Other, luaconfig["spidertype"], luaconfig["spiderhistorymaxpage"], luaconfig["spidermovevent"], luaconfig["spidercompete"])
+		script += fmt.Sprintf(util.Tmp_Other, luaconfig["spidertype"], luaconfig["spiderhistorymaxpage"], luaconfig["spidermovevent"], luaconfig["spidercompete"], luaconfig["infoformat"])
 		script += ` 
 			` + script_time + `
 			` + script_list + `

+ 2 - 2
src/spider/script.go

@@ -789,8 +789,8 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		regstr := S.ToString(-1)
 		text := S.ToString(-2)
 		textReg := regexp.MustCompile(regstr)
-		spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
-		text = spaceReg.ReplaceAllString(text, "")
+		//spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
+		//text = spaceReg.ReplaceAllString(text, "")
 		result := textReg.FindString(text)
 		isMatch := false
 		if result != "" {

+ 1 - 0
src/spider/spider.go

@@ -47,6 +47,7 @@ type Spider struct {
 	IsHistoricalMend bool //是否是历史补漏爬虫
 	IsMustDownload   bool //是否强制下载
 	IsCompete        bool //区分新老爬虫
+	Infoformat       int  //区分爬虫类型 1:招标;2:拟建/审批;3:产权
 	IsMainThread     bool //是否为主线程(多线程采集时区分是否为主线程)
 }
 

+ 12 - 10
src/task/task.go

@@ -197,8 +197,7 @@ func (t *Task) StartJob() {
 							tmp["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
 							//spider.Store(sp.StoreMode, sp.StoreToMsgEvent, sp.Collection, sp.CoverAttr, result, true)
 						}
-						tmp["state"] = 1
-						result := t.CheckField(tmp) //校验字段
+						result := t.CheckField(tmp, sp) //校验字段
 						//下载成功
 						update = append(update, query)
 						update = append(update, map[string]interface{}{"$set": result})
@@ -221,7 +220,7 @@ func (t *Task) StartJob() {
 }
 
 //字段校验
-func (t *Task) CheckField(tmp map[string]interface{}) map[string]interface{} {
+func (t *Task) CheckField(tmp map[string]interface{}, sp *spider.Spider) map[string]interface{} {
 	defer qu.Catch()
 	result := map[string]interface{}{}
 	if len(t.CheckFields) > 0 {
@@ -251,17 +250,20 @@ func (t *Task) CheckField(tmp map[string]interface{}) map[string]interface{} {
 				}
 			}
 			if !fieldOk { //字段值下载出错,该条数据下载失败
-				result["state"] = -1
-				return result
+				return map[string]interface{}{"state": -1}
 			} else if fieldOk && t.IsBidding { //bidding数据指定更新某字段
 				result[field] = tmp[field]
 			}
 		}
-		if t.IsBidding {
-			result["state"] = 1
-			return result
-		}
 	}
+	if t.IsBidding { //bidding数据
+		result["state"] = 1
+		return result
+	}
+	//非bidding信息
+	tmp["state"] = 1
+	tmp["T"] = sp.Collection
+	tmp["infoformat"] = sp.Infoformat
 	return tmp
 }
 
@@ -303,7 +305,7 @@ func (t *Task) SendNotBiddingData() {
 			id := tmp["_id"]
 			delete(tmp, "_id")
 			tmp["dataging"] = 0 //补充dataging字段
-			tmp["T"] = "bidding"
+			//tmp["T"] = "bidding"
 			ok, id, coll := SaveObj(4002, "title", tmp)
 			update := []map[string]interface{}{
 				map[string]interface{}{"_id": id},

+ 1 - 0
src/util/util.go

@@ -28,6 +28,7 @@ var Fields = map[string]interface{}{
 	"projectinfo": 1,
 	"T":           1,
 	"dataging":    1,
+	"infoformat":  1,
 	//"publishdept": 1,
 	//"type":        1,
 }