|
@@ -82,11 +82,14 @@ var EsIndex string
|
|
var EsType string
|
|
var EsType string
|
|
var UpdataMgoCache = make(chan []map[string]interface{}, 1000) // updates the status of data that has to be re-downloaded
|
|
var UpdataMgoCache = make(chan []map[string]interface{}, 1000) // updates the status of data that has to be re-downloaded
|
|
var UpdataHeartCache = make(chan []map[string]interface{}, 1000) // updates spider heartbeat information
|
|
var UpdataHeartCache = make(chan []map[string]interface{}, 1000) // updates spider heartbeat information
|
|
|
|
+var SaveMgoCache = make(chan map[string]interface{}, 1000) // saves records of items a spider collected that do not belong to its own site
|
|
var SP = make(chan bool, 5)
|
|
var SP = make(chan bool, 5)
|
|
var SPH = make(chan bool, 5)
|
|
var SPH = make(chan bool, 5)
|
|
|
|
+var SPS = make(chan bool, 5) // NOTE(review): presumably a concurrency limiter for the SaveMgoCache consumer, like SP/SPH — confirm
|
|
var Mgo *mgo.MongodbSim
|
|
var Mgo *mgo.MongodbSim
|
|
var TimeChan = make(chan bool, 1)
|
|
var TimeChan = make(chan bool, 1)
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`) // matches an http/https scheme, a dotted host name and an optional trailing "/"
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`) // matches an http/https scheme, a dotted host name and an optional trailing "/"
|
|
|
|
+var DomainNameReg = regexp.MustCompile(`(?://).+?(?:)[::/]`) // matches "//", then lazily up to and including the first ':' or '/'; ThisSiteData uses it to compare domains
|
|
|
|
|
|
//心跳
|
|
//心跳
|
|
func UpdateHeart(site, channel, code, user, t string) {
|
|
func UpdateHeart(site, channel, code, user, t string) {
|
|
@@ -286,7 +289,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
for i := 1; i <= tabLen; i++ {
|
|
for i := 1; i <= tabLen; i++ {
|
|
v := tbl.RawGetInt(i).(*lua.LTable)
|
|
v := tbl.RawGetInt(i).(*lua.LTable)
|
|
tmp := util.TableToMap(v)
|
|
tmp := util.TableToMap(v)
|
|
- //新增历史补漏
|
|
|
|
|
|
+ s.ThisSiteData(tmp) //统计当前下载数据是否是本站点数据
|
|
if !s.IsHistoricalMend { //不是历史补漏
|
|
if !s.IsHistoricalMend { //不是历史补漏
|
|
tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
if s.DownDetail {
|
|
if s.DownDetail {
|
|
@@ -415,6 +418,24 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
return errs
|
|
return errs
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+func (s *Spider) ThisSiteData(tmp map[string]interface{}) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ href := qu.ObjToString(tmp["href"])
|
|
|
|
+ url_dn := DomainNameReg.FindString(s.TargetChannelUrl)
|
|
|
|
+ href_dn := DomainNameReg.FindString(href)
|
|
|
|
+ if url_dn != href_dn {
|
|
|
|
+ SaveMgoCache <- map[string]interface{}{
|
|
|
|
+ "site": s.Name,
|
|
|
|
+ "channel": s.Channel,
|
|
|
|
+ "spidercode": s.Code,
|
|
|
|
+ "url": s.TargetChannelUrl,
|
|
|
|
+ "href": href,
|
|
|
|
+ "modifyuser": s.MUserName,
|
|
|
|
+ "comeintime": time.Now().Unix(),
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
//遍历,开启三级页下载(历史补漏)
|
|
//遍历,开启三级页下载(历史补漏)
|
|
func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
//qu.Debug("--------------历史下载-----------------")
|
|
//qu.Debug("--------------历史下载-----------------")
|