|
@@ -70,6 +70,8 @@ var SP = make(chan bool, 5)
|
|
|
var TimeChan = make(chan bool, 1) // buffered bool channel with capacity 1; its senders and receivers are outside this view
|
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`) // matches an http or https scheme, one or more dot-terminated word-character host labels, a final label, and an optional trailing slash
|
|
|
var DelaySiteMap map[string]*DelaySite // collection of sites whose collection is delayed (string key -> *DelaySite; key meaning is not visible here)
|
|
|
+var RestrictAccessReg = regexp.MustCompile(`访问被拒绝`) // matches the literal text "access denied" in Chinese; AnalysisProjectInfo returns "ip" when the detail field contains it
|
|
|
+
|
|
|
type DelaySite struct {
|
|
|
DelayTime int
|
|
|
Compete bool
|
|
@@ -159,39 +161,6 @@ func DownloadHighDetail(code string) {
|
|
|
}()
|
|
|
_id := tmp["_id"]
|
|
|
query := map[string]interface{}{"_id": _id}
|
|
|
- href := qu.ObjToString(tmp["href"])
|
|
|
- hashHref := HexText(href)
|
|
|
- //由于目前列表页redis判重是href+code可能导致同一条href有多条不同code采集的数据存在
|
|
|
- //为了避免重复下载,进行全量redis判重
|
|
|
- isExist := util.RedisClusterExists(hashHref)
|
|
|
- if isExist {
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
- MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- return
|
|
|
- }
|
|
|
- //if code == "a_gcy_mcgg" { //竞品数据es title判重
|
|
|
- // title := qu.ObjToString(tmp["title"])
|
|
|
- // eTime := time.Now().Unix()
|
|
|
- // sTime := eTime - int64(7*86400)
|
|
|
- // esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
|
|
|
- // count := Es.Count(EsIndex, EsType, esQuery)
|
|
|
- // if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
|
|
|
- // set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
- // MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- // util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
- // return
|
|
|
- // }
|
|
|
- //}
|
|
|
- //competehref := qu.ObjToString(tmp["competehref"])
|
|
|
- //if competehref != "" { //验证三方网站数据剑鱼是否已采集
|
|
|
- // title := qu.ObjToString(tmp["title"])
|
|
|
- // one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
|
|
|
- // if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
|
|
|
- // set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
- // MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
- // return
|
|
|
- // }
|
|
|
- //}
|
|
|
times := qu.IntAll(tmp["times"])
|
|
|
success := true //数据是否下载成功的标志
|
|
|
delete(tmp, "_id")
|
|
@@ -215,9 +184,9 @@ func DownloadHighDetail(code string) {
|
|
|
} /*else if data == nil && times >= 3 { //下载问题,建editor任务
|
|
|
DownloadErrorData(s.Code, tmp)
|
|
|
}*/
|
|
|
- } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
+ } /* else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
util.RedisClusterSet(hashHref, "", -1)
|
|
|
- }
|
|
|
+ }*/
|
|
|
if !success { //下载失败更新次数和状态
|
|
|
ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
|
|
|
if times >= 3 { //3次下载失败今天不再下载,state置为1
|
|
@@ -227,27 +196,40 @@ func DownloadHighDetail(code string) {
|
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
return
|
|
|
} else if data["delete"] != nil { //三级页过滤
|
|
|
- util.RedisClusterSet(hashHref, "", -1) //过滤掉的数据存值全量redis
|
|
|
//更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "delete", "updatetime": time.Now().Unix()}}
|
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
return
|
|
|
}
|
|
|
//正文、附件分析,下载异常数据重新下载
|
|
|
- if AnalysisProjectInfo(data) {
|
|
|
+ if r := AnalysisProjectInfo(data); r != "" {
|
|
|
times++
|
|
|
ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
|
|
|
if times >= 3 { //3次下载失败今天不再下载,state置为1
|
|
|
ss["state"] = -1
|
|
|
- ss["detailfilerr"] = true
|
|
|
+ ss["detailfilerr"] = r
|
|
|
}
|
|
|
set := map[string]interface{}{"$set": ss}
|
|
|
MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
return
|
|
|
}
|
|
|
- t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
- if t1 > time.Now().Unix() { //防止发布时间超前
|
|
|
- data["publishtime"] = time.Now().Unix()
|
|
|
+ //t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
+ //if t1 > time.Now().Unix() { //防止发布时间超前
|
|
|
+ // data["publishtime"] = time.Now().Unix()
|
|
|
+ //}
|
|
|
+ tmphref := qu.ObjToString(data["href"])
|
|
|
+ publishtime := qu.Int64All(data["l_np_publishtime"])
|
|
|
+ if publishtime < time.Now().AddDate(-1, 0, 0).Unix() {
|
|
|
+ isExist, _ := util.ExistsBloomRedis("href", tmphref)
|
|
|
+ if isExist {
|
|
|
+ set := map[string]interface{}{"$set": map[string]interface{}{
|
|
|
+ "state": 1,
|
|
|
+ "updatetime": time.Now().Unix(),
|
|
|
+ "exist": "bloom_href",
|
|
|
+ }}
|
|
|
+ MgoS.Update("spider_highlistdata", query, set, false, false)
|
|
|
+ return
|
|
|
+ }
|
|
|
}
|
|
|
delete(data, "exit")
|
|
|
delete(data, "checkpublishtime")
|
|
@@ -271,9 +253,12 @@ func DownloadHighDetail(code string) {
|
|
|
}
|
|
|
|
|
|
//detail含“详情请访问原网页!”且附件未下成功的,不计入下载成功
|
|
|
-func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
+func AnalysisProjectInfo(data map[string]interface{}) string {
|
|
|
defer qu.Catch()
|
|
|
detail := qu.ObjToString(data["detail"])
|
|
|
+ if RestrictAccessReg.MatchString(detail) { //限制访问
|
|
|
+ return "ip"
|
|
|
+ }
|
|
|
if detail == "详情请访问原网页!" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
|
|
|
if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
|
|
|
if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
|
|
@@ -282,20 +267,21 @@ func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
if d, ok := data.(map[string]interface{}); ok {
|
|
|
fid := qu.ObjToString(d["fid"])
|
|
|
if fid != "" { //附件上传成功
|
|
|
- fileOk = true
|
|
|
- break
|
|
|
+ return ""
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- return !fileOk
|
|
|
+ if !fileOk {
|
|
|
+ return "detail_file"
|
|
|
+ }
|
|
|
} else {
|
|
|
- return true
|
|
|
+ return "detail_file"
|
|
|
}
|
|
|
} else {
|
|
|
- return true
|
|
|
+ return "detail_file"
|
|
|
}
|
|
|
}
|
|
|
- return false
|
|
|
+ return ""
|
|
|
}
|
|
|
|
|
|
//下载解析内容页
|
|
@@ -340,7 +326,7 @@ func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[strin
|
|
|
if value, ok := v.(lua.LString); ok {
|
|
|
data[key] = string(value)
|
|
|
} else if value, ok := v.(lua.LNumber); ok {
|
|
|
- data[key] = value
|
|
|
+ data[key] = int64(value)
|
|
|
} else if value, ok := v.(*lua.LTable); ok {
|
|
|
tmp := util.TableToMap(value)
|
|
|
data[key] = tmp
|