|
@@ -11,8 +11,9 @@ import (
|
|
)
|
|
)
|
|
|
|
|
|
const (
|
|
const (
|
|
- DETAIL_TEXT = "详情请访问原网页!"
|
|
|
|
- DETAIL_FILE = "详情请下载附件!"
|
|
|
|
|
|
+ DETAIL_TEXT = "详情请访问原网页!"
|
|
|
|
+ DETAIL_FILE = "详情请下载附件!"
|
|
|
|
+ NEWS_INFOFORMAT = 5
|
|
)
|
|
)
|
|
|
|
|
|
var DetailReg = regexp.MustCompile(DETAIL_TEXT)
|
|
var DetailReg = regexp.MustCompile(DETAIL_TEXT)
|
|
@@ -124,27 +125,29 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
|
|
|
|
|
|
detail := qutil.ObjToString(tmp["detail"])
|
|
detail := qutil.ObjToString(tmp["detail"])
|
|
uuid := sp.Sha(detail) //拟建数据2022-04-26也改为detail判重
|
|
uuid := sp.Sha(detail) //拟建数据2022-04-26也改为detail判重
|
|
|
|
+ //2、重点字段校验
|
|
|
|
+ //2.1数据类型infoformat校验
|
|
|
|
+ infoformat := qutil.IntAll(tmp["infoformat"])
|
|
|
|
+ if infoformat == 0 {
|
|
|
|
+ errorData(LEVEL_ERROR, true, "infoformat", "Field Value Is Null", href, title, &warn, tmp)
|
|
|
|
+ infoformat = 1
|
|
|
|
+ tmp["infoformat"] = infoformat //设置默认值
|
|
|
|
+ }
|
|
//数据判重
|
|
//数据判重
|
|
if tmp["repeat"] == nil { //判断repeat,为了异常数据重推时不进行redis判重
|
|
if tmp["repeat"] == nil { //判断repeat,为了异常数据重推时不进行redis判重
|
|
- b, res = dataRepeat(tmp, href, hashHref, uuid, downloadFileOk)
|
|
|
|
|
|
+ b, res = dataRepeat(infoformat, tmp, href, hashHref, uuid, downloadFileOk)
|
|
if b && res == 4 {
|
|
if b && res == 4 {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- //2、重点字段校验
|
|
|
|
- //2.1保存表T校验
|
|
|
|
|
|
+
|
|
|
|
+ //2.2保存表T校验
|
|
T := qutil.ObjToString(tmp["T"])
|
|
T := qutil.ObjToString(tmp["T"])
|
|
- if T != "bidding" && T != SaveOtherColl && T != SaveYqColl {
|
|
|
|
|
|
+ if T != SaveBiddingColl && T != SaveOtherColl && T != SaveYqColl && T != SaveNewsColl {
|
|
errorData(LEVEL_ERROR, true, "T", "Save Coll Error", href, title, &warn, tmp)
|
|
errorData(LEVEL_ERROR, true, "T", "Save Coll Error", href, title, &warn, tmp)
|
|
res = 2
|
|
res = 2
|
|
T = SaveColl //设置默认值
|
|
T = SaveColl //设置默认值
|
|
}
|
|
}
|
|
- //2.2数据类型infoformat校验
|
|
|
|
- infoformat := qutil.IntAll(tmp["infoformat"])
|
|
|
|
- if infoformat == 0 {
|
|
|
|
- errorData(LEVEL_ERROR, true, "infoformat", "Field Value Is Null", href, title, &warn, tmp)
|
|
|
|
- tmp["infoformat"] = 1 //设置默认值
|
|
|
|
- }
|
|
|
|
//2.3title和s_title替换
|
|
//2.3title和s_title替换
|
|
if s_title := tmp["s_title"]; s_title != nil { //三级页有获取标题
|
|
if s_title := tmp["s_title"]; s_title != nil { //三级页有获取标题
|
|
stitle := qutil.ObjToString(s_title)
|
|
stitle := qutil.ObjToString(s_title)
|
|
@@ -344,57 +347,72 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
|
|
mgoid, mgocoll = saveData(T, result, downloadFileOk, iscompete)
|
|
mgoid, mgocoll = saveData(T, result, downloadFileOk, iscompete)
|
|
b = true
|
|
b = true
|
|
//5、推送成功的数据修改列表href redis值(用于去除重复采集,推送保存服务的数据,包含lua、python采集的相同数据)
|
|
//5、推送成功的数据修改列表href redis值(用于去除重复采集,推送保存服务的数据,包含lua、python采集的相同数据)
|
|
- sp.RedisSet("list", "list_"+hashHref, "1", 86400*365*2)
|
|
|
|
|
|
+ if infoformat == NEWS_INFOFORMAT {
|
|
|
|
+ tools.AddStr("list", href)
|
|
|
|
+ } else {
|
|
|
|
+ sp.RedisSet("list", "list_"+hashHref, "1", 86400*365*2)
|
|
|
|
+ }
|
|
return
|
|
return
|
|
}
|
|
}
|
|
|
|
|
|
// 数据判重
|
|
// 数据判重
|
|
-func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string, downloadFileOk bool) (b bool, res int) {
|
|
|
|
|
|
+func dataRepeat(infoformat int, tmp map[string]interface{}, href, hashHref, uuid string, downloadFileOk bool) (b bool, res int) {
|
|
detail := qutil.ObjToString(tmp["detail"])
|
|
detail := qutil.ObjToString(tmp["detail"])
|
|
hanArr := reg_han.FindAllString(detail, -1) //获取汉字集合
|
|
hanArr := reg_han.FindAllString(detail, -1) //获取汉字集合
|
|
hanArrLen := len(hanArr) //汉字个数
|
|
hanArrLen := len(hanArr) //汉字个数
|
|
filterDetail := sp.FilterDetail(detail) //只保留文本内容
|
|
filterDetail := sp.FilterDetail(detail) //只保留文本内容
|
|
//新版数据判重
|
|
//新版数据判重
|
|
publishtime := qutil.Int64All(tmp["publishtime"])
|
|
publishtime := qutil.Int64All(tmp["publishtime"])
|
|
- isExist := false //记录判重结果
|
|
|
|
- repeatby := "" //判重字段
|
|
|
|
- RedisLock.Lock() //redis判重锁
|
|
|
|
- defer RedisLock.Unlock() //
|
|
|
|
- if publishtime < time.Now().AddDate(-1, 0, 0).Unix() { //一年前数据判重或异常发布时间数据
|
|
|
|
- isExist, _ = sp.ExistsBloomRedis("href", href)
|
|
|
|
- if !isExist {
|
|
|
|
- sp.AddBloomRedis("href", href)
|
|
|
|
- //链接未判重,进行正文detail判重校验(特殊文本除外)
|
|
|
|
- if uuid != ANNEX_DETAIL_SHA && filterDetail != "无" && !DetailExists(filterDetail) {
|
|
|
|
- isExist, _ = sp.ExistsBloomRedis("detail", filterDetail)
|
|
|
|
- }
|
|
|
|
- if isExist { //历史数据被detail全量判重
|
|
|
|
- repeatby = "bloom_detail"
|
|
|
|
- } else { //未被判重,存入redis
|
|
|
|
- sp.AddBloomRedis("detail", filterDetail)
|
|
|
|
- sp.RedisSet("sha", uuid, "", 86400*365*2)
|
|
|
|
|
|
+ isExist := false //记录判重结果
|
|
|
|
+ repeatby := "" //判重字段
|
|
|
|
+ if infoformat == NEWS_INFOFORMAT { //新闻数据判重
|
|
|
|
+ if uuid != ANNEX_DETAIL_SHA && filterDetail != "无" && !DetailExists(filterDetail) {
|
|
|
|
+ isExist = tools.CheckStr("detail", uuid)
|
|
|
|
+ if isExist {
|
|
|
|
+ repeatby = "db_detail"
|
|
|
|
+ } else {
|
|
|
|
+ tools.AddStr("detail", uuid)
|
|
}
|
|
}
|
|
- } else { //历史数据被href全量判重
|
|
|
|
- repeatby = "bloom_href"
|
|
|
|
}
|
|
}
|
|
- } else { //一年内数据判重(只进行正文hash判重 问题:detail含有详情请访问原网页等字眼时,不会走正文判重)
|
|
|
|
- //增量href判重(为了去除lua、python采集的相同数据)
|
|
|
|
- r, err := sp.RedisGet("list", "list_"+hashHref)
|
|
|
|
- if err == nil && r != "" { //此条href数据已入库
|
|
|
|
- isExist = true
|
|
|
|
- repeatby = "list_href"
|
|
|
|
- } else {
|
|
|
|
- if uuid != ANNEX_DETAIL_SHA && filterDetail != "无" && !DetailExists(filterDetail) {
|
|
|
|
- isExist = sp.RedisExist("sha", uuid)
|
|
|
|
|
|
+ } else {
|
|
|
|
+ RedisLock.Lock() //redis判重锁
|
|
|
|
+ if publishtime < time.Now().AddDate(-1, 0, 0).Unix() { //一年前数据判重或异常发布时间数据
|
|
|
|
+ isExist, _ = sp.ExistsBloomRedis("href", href)
|
|
|
|
+ if !isExist {
|
|
|
|
+ sp.AddBloomRedis("href", href)
|
|
|
|
+ //链接未判重,进行正文detail判重校验(特殊文本除外)
|
|
|
|
+ if uuid != ANNEX_DETAIL_SHA && filterDetail != "无" && !DetailExists(filterDetail) {
|
|
|
|
+ isExist, _ = sp.ExistsBloomRedis("detail", filterDetail)
|
|
|
|
+ }
|
|
|
|
+ if isExist { //历史数据被detail全量判重
|
|
|
|
+ repeatby = "bloom_detail"
|
|
|
|
+ } else { //未被判重,存入redis
|
|
|
|
+ sp.AddBloomRedis("detail", filterDetail)
|
|
|
|
+ sp.RedisSet("sha", uuid, "", 86400*365*2)
|
|
|
|
+ }
|
|
|
|
+ } else { //历史数据被href全量判重
|
|
|
|
+ repeatby = "bloom_href"
|
|
}
|
|
}
|
|
- if isExist { //增量detail sha判重
|
|
|
|
- repeatby = "sha_detail"
|
|
|
|
- } else if qutil.ObjToString(tmp["competehref"]) == "" { //竞品数据sha不存
|
|
|
|
- sp.RedisSet("sha", uuid, "", 86400*365*2)
|
|
|
|
|
|
+ } else { //一年内数据判重(只进行正文hash判重 问题:detail含有详情请访问原网页等字眼时,不会走正文判重)
|
|
|
|
+ //增量href判重(为了去除lua、python采集的相同数据)
|
|
|
|
+ r, err := sp.RedisGet("list", "list_"+hashHref)
|
|
|
|
+ if err == nil && r != "" { //此条href数据已入库
|
|
|
|
+ isExist = true
|
|
|
|
+ repeatby = "list_href"
|
|
|
|
+ } else {
|
|
|
|
+ if uuid != ANNEX_DETAIL_SHA && filterDetail != "无" && !DetailExists(filterDetail) {
|
|
|
|
+ isExist = sp.RedisExist("sha", uuid)
|
|
|
|
+ }
|
|
|
|
+ if isExist { //增量detail sha判重
|
|
|
|
+ repeatby = "sha_detail"
|
|
|
|
+ } else if qutil.ObjToString(tmp["competehref"]) == "" { //竞品数据sha不存
|
|
|
|
+ sp.RedisSet("sha", uuid, "", 86400*365*2)
|
|
|
|
+ }
|
|
|
|
+ sp.AddBloomRedis("href", href)
|
|
|
|
+ sp.AddBloomRedis("detail", filterDetail)
|
|
}
|
|
}
|
|
- sp.AddBloomRedis("href", href)
|
|
|
|
- sp.AddBloomRedis("detail", filterDetail)
|
|
|
|
}
|
|
}
|
|
|
|
+ RedisLock.Unlock() //
|
|
}
|
|
}
|
|
if isExist {
|
|
if isExist {
|
|
//ReplaceFile(site, uuid, tmp)//正文判重,附件替换
|
|
//ReplaceFile(site, uuid, tmp)//正文判重,附件替换
|
|
@@ -403,8 +421,8 @@ func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string, downloa
|
|
tmp["repeatby"] = repeatby
|
|
tmp["repeatby"] = repeatby
|
|
tmp["s_sha"] = uuid
|
|
tmp["s_sha"] = uuid
|
|
tmp["hashref"] = hashHref
|
|
tmp["hashref"] = hashHref
|
|
- SaveMgoCache <- tmp //记录重复数据
|
|
|
|
- if repeatby == "bloom_detail" || repeatby == "sha_detail" { //被正文判重,保留数据,打上标记
|
|
|
|
|
|
+ SaveMgoCache <- tmp //记录重复数据
|
|
|
|
+ if repeatby == "bloom_detail" || repeatby == "sha_detail" || repeatby == "db_detail" { //被正文判重,保留数据,打上标记
|
|
/*
|
|
/*
|
|
日期:2024-04-26
|
|
日期:2024-04-26
|
|
逻辑:含有效附件且正文汉字个数小于100的数据不进行正文判重
|
|
逻辑:含有效附件且正文汉字个数小于100的数据不进行正文判重
|
|
@@ -466,8 +484,16 @@ func saveData(T string, result map[string]interface{}, dfOk, iscompete bool) (st
|
|
saveOtherMust()
|
|
saveOtherMust()
|
|
}
|
|
}
|
|
saveotherlock.Unlock()
|
|
saveotherlock.Unlock()
|
|
- } else if dfOk { //附件信息
|
|
|
|
- savecoll = "bidding_file"
|
|
|
|
|
|
+ } else if T == SaveNewsColl { //新闻数据
|
|
|
|
+ savecoll = SaveNewsColl
|
|
|
|
+ saveNewsLock.Lock()
|
|
|
|
+ SaveNewsCache = append(SaveNewsCache, result)
|
|
|
|
+ if len(SaveNewsCache) > 200 {
|
|
|
|
+ saveNewsMust()
|
|
|
|
+ }
|
|
|
|
+ saveNewsLock.Unlock()
|
|
|
|
+ } else if T == SaveBiddingColl && dfOk { //附件信息
|
|
|
|
+ savecoll = SaveFileColl
|
|
savefilelock.Lock()
|
|
savefilelock.Lock()
|
|
SaveFileCache = append(SaveFileCache, result)
|
|
SaveFileCache = append(SaveFileCache, result)
|
|
if len(SaveFileCache) > 200 {
|
|
if len(SaveFileCache) > 200 {
|