|
@@ -1014,10 +1014,22 @@ func (s *Spider) DownloadDetail(stype string) {
|
|
|
updateArr = append(updateArr, update)
|
|
|
spLock.Unlock()
|
|
|
return
|
|
|
- //deleteData := FilterByDetail(href, query, data) //针对列表页无法过滤需要在详情页过滤的数据,进行过滤处理
|
|
|
- //if deleteData {
|
|
|
- // return
|
|
|
- //}
|
|
|
+ }
|
|
|
+ //正文、附件分析,下载异常数据重新下载
|
|
|
+ if AnalysisProjectInfo(data) {
|
|
|
+ times++
|
|
|
+ ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
|
|
|
+ if times >= 3 { //3次下载失败今天不再下载,state置为1
|
|
|
+ ss["state"] = -1
|
|
|
+ ss["detailfilerr"] = true
|
|
|
+ }
|
|
|
+ set := map[string]interface{}{"$set": ss}
|
|
|
+ update = append(update, query)
|
|
|
+ update = append(update, set)
|
|
|
+ spLock.Lock()
|
|
|
+ updateArr = append(updateArr, update)
|
|
|
+ spLock.Unlock()
|
|
|
+ return
|
|
|
}
|
|
|
t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
@@ -1087,6 +1099,34 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+// AnalysisProjectInfo reports whether data should NOT be counted as a successful download: it returns true when data["detail"] is exactly one of the two "详情请访问原网页" placeholder strings compared below and no entry of data["projectinfo"]["attachments"] has a non-empty "fid"; otherwise it returns false.
|
|
|
+func AnalysisProjectInfo(data map[string]interface{}) bool {
|
|
|
+ defer qu.Catch() // NOTE(review): presumably recovers from panics; qu.Catch is defined outside this view — confirm
|
|
|
+ detail := qu.ObjToString(data["detail"]) // presumably "" when the key is missing or not a string — verify against qu.ObjToString
|
|
|
+ if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { // Exact match instead of a substring check: some records are assembled from JSON with incomplete fields and get the placeholder appended to real content. NOTE(review): only the second literal ends with "!" — confirm both spellings are intended.
|
|
|
+ if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 { // projectinfo must be a non-empty map
|
|
|
+ if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 { // attachments must be a non-empty map
|
|
|
+ fileOk := false // becomes true once any attachment carries a fid
|
|
|
+ for _, data := range attachments { // note: the loop variable shadows the data parameter
|
|
|
+ if d, ok := data.(map[string]interface{}); ok { // entries that are not maps are skipped
|
|
|
+ fid := qu.ObjToString(d["fid"])
|
|
|
+ if fid != "" { // a non-empty fid means the attachment was uploaded successfully
|
|
|
+ fileOk = true
|
|
|
+ break // one uploaded attachment is enough
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return !fileOk // true when no attachment has a fid
|
|
|
+ } else {
|
|
|
+ return true // attachments missing, not a map, or empty
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ return true // projectinfo missing, not a map, or empty
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false // detail is not one of the placeholder strings
|
|
|
+}
|
|
|
+
|
|
|
//打印线程数
|
|
|
func AllThreadLog() {
|
|
|
logger.Info("Detail Download All Thread:", AllThreadNum)
|