|
@@ -15,6 +15,7 @@ import (
|
|
|
)
|
|
|
|
|
|
const SPIDER_MAXPAGENUM = 2 //默认列表页采集页数
|
|
|
+// DOWNLOADNODE is the download node used for spider quality checks.
+// Valid values per the replaced local (formerly `downloadnode` in luaDownload): "test", "bid", "comm".
+const DOWNLOADNODE = "bid"
|
|
|
|
|
|
var (
|
|
|
htmlModelReg = regexp.MustCompile(`{{[a-zA-z.()\d,:]{5,}}}|^(\$)`) //过滤模板语言
|
|
@@ -31,7 +32,9 @@ var (
|
|
|
//以关键词结尾
|
|
|
reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
|
|
|
|
|
|
- CheckStart int
|
|
|
+ CheckLuaChan = make(chan map[string]interface{}, 1000) // queue of spider configs awaiting machine inspection
+ CheckLuaMap = make(map[string]bool) // spider codes currently queued or being checked (dedupe set)
+ CheckLuaMapLock = &sync.Mutex{} // guards CheckLuaMap (written by both producer and consumers)
|
|
|
)
|
|
|
|
|
|
type Data struct {
|
|
@@ -43,16 +46,8 @@ type Data struct {
|
|
|
Base64Type bool
|
|
|
}
|
|
|
|
|
|
-func TimeTaskLuaCheck() {
|
|
|
- defer qu.Catch()
|
|
|
- if CheckStart != 0 {
|
|
|
- return
|
|
|
- }
|
|
|
- CheckStart = 1
|
|
|
- defer func() {
|
|
|
- CheckStart = 0
|
|
|
- }()
|
|
|
- qu.Debug("爬虫质检开始...", time.Now().Unix())
|
|
|
+// TimeTaskGetLua 定时取爬虫放入通道
|
|
|
+func TimeTaskGetLua() {
|
|
|
query := map[string]interface{}{
|
|
|
"event": map[string]interface{}{
|
|
|
"$ne": 7410,
|
|
@@ -67,16 +62,38 @@ func TimeTaskLuaCheck() {
|
|
|
},
|
|
|
}
|
|
|
list, _ := util.MgoEB.Find("luaconfig", query, nil, nil, false, -1, -1)
|
|
|
+ for _, lua := range *list {
+ if qu.ObjToString(lua["report"]) != "" { // spiders with a pending problem report are under manual review; skip machine inspection
+ continue
+ }
+ code := qu.ObjToString(lua["code"])
+ CheckLuaMapLock.Lock()
+ if !CheckLuaMap[code] {
+ // NOTE(review): this send happens while CheckLuaMapLock is held. The
+ // consumer (LuaCheckStart) takes the same lock in its deferred cleanup,
+ // so if CheckLuaChan is full (cap 1000) neither side can make progress
+ // -> potential deadlock. Consider setting CheckLuaMap[code] under the
+ // lock, unlocking, and sending to the channel outside the lock.
+ CheckLuaChan <- lua
+ CheckLuaMap[code] = true
+ }
+ CheckLuaMapLock.Unlock()
+ }
+ // NOTE(review): len(CheckLuaMap) is read here without CheckLuaMapLock while
+ // consumer goroutines may be deleting entries — data race; take the lock or
+ // accept an approximate count explicitly.
+ qu.Debug("当前待质检爬虫个数:", len(CheckLuaMap))
+}
|
|
|
+
|
|
|
+// LuaCheckStart runs the quality-inspection loop: it consumes spider configs
+// from CheckLuaChan and machine-checks them with at most 5 concurrent workers.
+// NOTE(review): the for{} below never terminates, so this function never returns.
+func LuaCheckStart() {
wg := &sync.WaitGroup{}
ch := make(chan bool, 5) // concurrency limiter: at most 5 checks in flight
- for _, l := range *list {
+ for {
+ lua := <-CheckLuaChan // blocking take of the next spider to inspect
ch <- true
wg.Add(1)
go func(lua map[string]interface{}) {
defer func() {
<-ch
wg.Done()
+ CheckLuaMapLock.Lock()
+ delete(CheckLuaMap, qu.ObjToString(lua["code"])) // allow the code to be re-queued by TimeTaskGetLua
+ CheckLuaMapLock.Unlock()
}()
+ qu.Debug("开始机检爬虫:", lua["code"], time.Now().Unix())
warnMap := map[string]string{} // collected warnings (non-fatal findings)
errMap := map[string]string{} // collected errors (fatal findings)
LuaCheck(lua, errMap, warnMap)
|
|
@@ -99,10 +116,11 @@ func TimeTaskLuaCheck() {
|
|
|
},
|
|
|
}
|
|
|
util.MgoEB.UpdateById("luaconfig", lua["_id"], set)
|
|
|
- }(l)
|
|
|
+ qu.Debug("结束机检爬虫:", lua["code"], time.Now().Unix())
|
|
|
+ }(lua)
|
|
|
}
|
|
|
// NOTE(review): the enclosing for{} has no exit path, so wg.Wait() and the
// log below are unreachable dead code; reaching here would indicate an
// abnormal end of the inspection loop (hence the log text).
wg.Wait()
- qu.Debug("爬虫质检结束...", time.Now().Unix())
+ qu.Debug("爬虫质检异常结束...", time.Now().Unix())
}
|
|
|
|
|
|
// LuaCheck 爬虫机检
|
|
@@ -217,24 +235,29 @@ func LuaCheck(lua map[string]interface{}, err, warn map[string]string) {
|
|
|
|
|
|
// luaDownload builds a spider from the lua config's script and downloads up to
// SPIDER_MAXPAGENUM list pages (each page retried up to 3 times), recording an
// error if nothing at all was downloaded. Results are keyed by page number.
func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]string) {
defer qu.Catch()
- downloadnode := "bid" //使用下载节点test、bid、comm
var script string
if lua["oldlua"] == nil {
script, _, _ = spider.GetScript(lua) // current-format config: generate the script
} else {
script = lua["luacontent"].(string) // legacy config: raw lua content is the script
}
- s := spider.CreateSpider(downloadnode, script)
+ s := spider.CreateSpider(DOWNLOADNODE, script)
s.SpiderMaxPage = SPIDER_MAXPAGENUM // total number of list pages to collect
s.Timeout = 60
result := map[int64][]map[string]interface{}{}
downloadNum := 0
- for i := 1; i <= 3; i++ { //列表页重试三次
- result, downloadNum, _ = s.DownListPageItem() //列表页采集结果
- if downloadNum > 0 {
- break
+ for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
+ for i := 1; i <= 3; i++ { // retry each list page up to three times
+ s.SpiderStartPage = int64(page)
+ result_page, downloadNum_page, _ := s.DownListPageItem() // list-page collection result
+ result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
+ if downloadNum_page > 0 {
+ downloadNum += downloadNum_page // count only successful attempts; stop retrying this page
+ break
+ }
}
}
+
if downloadNum == 0 {
err["列表页下载异常"] = "列表页下载量为0"
return
|
|
@@ -255,39 +278,73 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
|
warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
|
|
|
}
|
|
|
var fileNum, downloadOkNum int //附件下载量,详情页下载量
|
|
|
+ n := 0
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
+ lock := &sync.Mutex{}
|
|
|
+ ch := make(chan bool, 10)
|
|
|
for _, list := range lists {
|
|
|
for _, l := range list {
|
|
|
- //校验title
|
|
|
- title := qu.ObjToString(l["title"])
|
|
|
- titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
|
- if len(titleRandomArr) > 0 {
|
|
|
- warn["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
|
|
|
+ // NOTE(review): this break only exits the inner per-page loop, so with
+ // multiple list pages up to ~50 items PER PAGE are checked, not 50 total.
+ // Use a labeled break (or check n in the outer loop too) if a global cap
+ // is intended.
+ if n > 50 {
+ break
+ }
|
|
|
- if !util.TitleHanReg.MatchString(title) {
|
|
|
- warn["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
|
|
|
- } else if str := util.TitleFilterReg.FindString(title); str != "" {
|
|
|
- str = "列表页title中包含异常信息(" + title + "):" + str
|
|
|
- err["列表页title中含异常信息"] = str
|
|
|
- }
|
|
|
- //校验发布时间
|
|
|
- publishtime := qu.ObjToString(l["publishtime"])
|
|
|
- if publishtime == "0" || publishtime == "" {
|
|
|
- warn["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
|
|
|
- } else {
|
|
|
- t, err_p := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
|
|
|
- if err_p != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
|
|
|
- warn["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
|
|
|
+ n++
|
|
|
+ ch <- true
|
|
|
+ wg.Add(1)
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
+ defer func() {
|
|
|
+ <-ch
|
|
|
+ wg.Done()
|
|
|
+ }()
|
|
|
+ spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
|
|
|
+ warnTmp := map[string]string{}
|
|
|
+ errTmp := map[string]string{}
|
|
|
+ fileNumTmp := 0
|
|
|
+ downloadOkNumTmp := 0
|
|
|
+ //校验title
|
|
|
+ title := qu.ObjToString(tmp["title"])
|
|
|
+ titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
|
+ if len(titleRandomArr) > 0 {
|
|
|
+ warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
|
|
|
}
|
|
|
- }
|
|
|
- //href
|
|
|
- href := qu.ObjToString(l["href"])
|
|
|
- if str := util.HrefReg.FindString(href); str != "" {
|
|
|
- err["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
|
- }
|
|
|
- //详情页下载校验
|
|
|
- downloadDetail(&fileNum, &downloadOkNum, err, warn, l, s)
|
|
|
+ if !util.TitleHanReg.MatchString(title) {
|
|
|
+ warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
|
|
|
+ } else if str := util.TitleFilterReg.FindString(title); str != "" {
|
|
|
+ str = "列表页title中包含异常信息(" + title + "):" + str
|
|
|
+ errTmp["列表页title中含异常信息"] = str
|
|
|
+ }
|
|
|
+ //校验发布时间
|
|
|
+ publishtime := qu.ObjToString(tmp["publishtime"])
|
|
|
+ if publishtime == "0" || publishtime == "" {
|
|
|
+ warnTmp["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
|
|
|
+ } else {
|
|
|
+ t, err_p := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
|
|
|
+ if err_p != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
|
|
|
+ warnTmp["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //href
|
|
|
+ href := qu.ObjToString(tmp["href"])
|
|
|
+ if str := util.HrefReg.FindString(href); str != "" {
|
|
|
+ errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
|
+ }
|
|
|
+ //详情页下载校验
|
|
|
+ downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
|
|
|
+ spTmp.L.Close()
|
|
|
+
|
|
|
+ lock.Lock()
|
|
|
+ fileNum += fileNumTmp
|
|
|
+ downloadOkNum += downloadOkNumTmp
|
|
|
+ for k, v := range warnTmp {
|
|
|
+ warn[k] = v
|
|
|
+ }
|
|
|
+ for k, v := range errTmp {
|
|
|
+ err[k] = v
|
|
|
+ }
|
|
|
+ lock.Unlock()
|
|
|
+ }(l)
|
|
|
}
|
|
|
}
|
|
|
+ wg.Wait()
|
|
|
if fileNum == 0 {
|
|
|
warn["未下载任何附件"] = "未下载任何附件"
|
|
|
}
|
|
@@ -306,7 +363,7 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
|
|
|
}
|
|
|
var downloadDetailOk bool
|
|
|
for i := 1; i <= 3; i++ { //重试三次
|
|
|
- _, rep_err := s.DownloadDetailPageTest(param, data)
|
|
|
+ _, rep_err := s.DownloadDetailPage(param, data)
|
|
|
if rep_err == nil && len(data) > 0 {
|
|
|
downloadDetailOk = true
|
|
|
*downloadOkNum++ //记录详情页下载成功个数
|