|
@@ -148,6 +148,24 @@ func (s *Spider) DownloadDetailItem(paramdata, tmp map[string]interface{}) {
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ if data["delete"] != nil { //三级页过滤
|
|
|
|
+ //增量
|
|
|
|
+ util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
|
|
|
|
+ //全量
|
|
|
|
+ db := HexToBigIntMod(href)
|
|
|
|
+ hashHref := HexText(href)
|
|
|
|
+ util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
|
|
|
|
+ //更新数据状态
|
|
|
|
+ UpdateState(id, map[string]interface{}{"state": 3, "remark": "Detail Delete"})
|
|
|
|
+ //下载成功根据href更新spider_highlistdata中state
|
|
|
|
+ Mgo.Update("spider_highlistdata", map[string]interface{}{"href": tmp["href"]}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "by": "regather", "updatetime": time.Now().Unix()}}, false, true)
|
|
|
|
+ Mgo.Update("spider_listdata", map[string]interface{}{"href": tmp["href"]}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "by": "regather", "updatetime": time.Now().Unix()}}, false, true)
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+ //正文、附件分析,下载异常数据重新下载
|
|
|
|
+ if AnalysisProjectInfo(data) {
|
|
|
|
+ return
|
|
|
|
+ }
|
|
//jsondata处理
|
|
//jsondata处理
|
|
if jsondata, ok := data["jsondata"].(string); ok {
|
|
if jsondata, ok := data["jsondata"].(string); ok {
|
|
jsondataMap := map[string]interface{}{}
|
|
jsondataMap := map[string]interface{}{}
|
|
@@ -201,6 +219,34 @@ func (s *Spider) DownloadDetailItem(paramdata, tmp map[string]interface{}) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+//detail含“详情请访问原网页!”且附件未下成功的,不计入下载成功
|
|
|
|
+func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ detail := qu.ObjToString(data["detail"])
|
|
|
|
+ if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
|
|
|
|
+ if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
|
|
|
|
+ if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
|
|
|
|
+ fileOk := false
|
|
|
|
+ for _, data := range attachments {
|
|
|
|
+ if d, ok := data.(map[string]interface{}); ok {
|
|
|
|
+ fid := qu.ObjToString(d["fid"])
|
|
|
|
+ if fid != "" { //附件上传成功
|
|
|
|
+ fileOk = true
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return !fileOk
|
|
|
|
+ } else {
|
|
|
|
+ return true
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ return true
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return false
|
|
|
|
+}
|
|
|
|
+
|
|
//下载解析内容页
|
|
//下载解析内容页
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
defer mu.Catch()
|
|
defer mu.Catch()
|