123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190 |
- package util
- import (
- "fmt"
- mgo "mongodb"
- qu "qfw/util"
- "regexp"
- sp "spiderutil"
- "strings"
- "time"
- "github.com/yuin/gopher-lua"
- )
- var (
- MgoE *mgo.MongodbSim
- Province map[string][]string
- DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
- DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
- CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
- TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
- TitleFilterReg2 = regexp.MustCompile(`((上|下)一页|阅读次数)`)
- CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
- JsonDataMap = map[string]bool{ //jsondata
- "extweight": true,
- "projecthref": true,
- "sourcewebsite": true,
- "sourcehref": true,
- "area_city_district": true,
- "projectname": true,
- "projectcode": true,
- "approvalno": true,
- "projectscope": true,
- "item": true,
- "buyer": true,
- "agency": true,
- "budget": true,
- "buyer_info": true,
- "buyerperson": true,
- "buyertel": true,
- "buyeraddr": true,
- "projectaddr": true,
- "publishdept": true,
- "funds": true,
- "paymenttype": true,
- "projectscale": true,
- "bidmethod": true,
- "bidopentime": true,
- "agency_info": true,
- "agencyperson": true,
- "agencytel": true,
- "agencyaddr": true,
- "isppp": true,
- "winner": true,
- "winneraddr": true,
- "winnerperson": true,
- "winnertel": true,
- "bidamount": true,
- "currency": true,
- "experts": true,
- "bidamounttype": true,
- "contractname": true,
- "countryprojectcode": true,
- "contractnumber": true,
- "projectperiod": true,
- "signaturedate": true,
- "multipackage": true,
- "package": true,
- "supervisorrate": true,
- "jsoncontent": true,
- "purchasinglist": true,
- "toptype": true,
- "subtype": true,
- "winnerorder": true,
- }
- )
- func InitMgo() {
- defer qu.Catch()
- MgoE = &mgo.MongodbSim{
- MongodbAddr: sp.Config.Dbaddr,
- DbName: sp.Config.Dbname,
- Size: 5,
- }
- MgoE.InitPool()
- }
- //初始化省市行政区划信息
- func InitAreaCity() {
- qu.ReadConfig("areacity.json", &Province)
- }
- //爬虫整体测试时校验爬虫代码
- func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) string {
- msg := []string{}
- //1.检测spidercode、site、channel
- if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
- spidercode := qu.ObjToString(param[0])
- site := qu.ObjToString(param[1])
- channel := qu.ObjToString(param[2])
- checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
- if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
- msg = append(msg, "检查代码spidercode、site、channel字段值")
- }
- }
- //2.检测https
- isHttps := false
- for _, text := range DomainNameReg.FindAllString(liststr, -1) {
- if strings.Contains(text, "https") {
- isHttps = true
- }
- }
- if isHttps {
- downLoadText := DownLoadReg.FindString(contentstr)
- if downLoadText != "" {
- textArr := strings.Split(downLoadText, ",")
- if len(textArr) < 4 {
- msg = append(msg, "download方法添加下载参数")
- } else if len(textArr) == 4 {
- if !CodeTypeReg.MatchString(textArr[0]) || textArr[1] != "true" {
- msg = append(msg, "download方法添加下载参数")
- }
- }
- }
- }
- //3.检测title
- if strings.Contains(liststr, `item["title"]="a"`) {
- if !strings.Contains(contentstr, `data["title"]`) {
- msg = append(msg, "检查代码title的完整性")
- }
- }
- return strings.Join(msg, ",")
- }
- //爬虫整体测试时校验列表页和详情页内容
- func SpiderPassCheckListAndDetail(list []map[string]interface{}, data map[string]interface{}) string {
- msg := []string{}
- if len(list) > 0 {
- p_zero := 0
- h_flag := true
- n_flag := true
- l_flag := true
- for _, l := range list {
- //校验title
- title := qu.ObjToString(l["title"])
- if !TitleFilterReg1.MatchString(title) && h_flag {
- msg = append(msg, "列表页title中无汉字")
- h_flag = false
- } else if TitleFilterReg2.MatchString(title) && n_flag {
- msg = append(msg, "列表页title中含有上(下)一页")
- n_flag = false
- }
- publishtime := qu.ObjToString(l["publishtime"])
- if publishtime == "0" {
- p_zero++
- } else if l_flag {
- t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
- if t.Unix() <= 0 {
- msg = append(msg, "列表页数据发布时间异常")
- l_flag = false
- }
- }
- }
- if len(data) > 0 {
- //校验publishtime
- if l_np_publishtime := data["l_np_publishtime"].(lua.LNumber); l_np_publishtime <= 0 {
- msg = append(msg, "三级页发布时间小于0")
- } else if p_zero == len(list) && l_np_publishtime == 0 {
- msg = append(msg, "三级页发布时间异常")
- }
- contenthtml := qu.ObjToString(data["contenthtml"])
- if strings.Contains(contenthtml, "img") {
- msg = append(msg, "contenthtml中含有img是否下载")
- }
- detail := qu.ObjToString(data["detail"])
- if TitleFilterReg2.MatchString(detail) {
- msg = append(msg, "三级页正文提取异常")
- }
- //校验jsondata
- if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
- for field, _ := range jsondata {
- if !JsonDataMap[field] {
- msg = append(msg, "jsondata中"+field+"属性错误")
- }
- }
- }
- }
- }
- return strings.Join(msg, ",")
- }
|