package util

import (
	"fmt"
	mgo "mongodb"
	qu "qfw/util"
	"regexp"
	sp "spiderutil"
	"strings"
	"time"

	"github.com/yuin/gopher-lua"
)

var (
	// MgoE is the shared MongoDB connection pool, initialized by InitMgo.
	MgoE *mgo.MongodbSim
	// Province maps province names to their city lists, loaded by InitAreaCity.
	Province map[string][]string

	// DomainNameReg matches an http/https scheme followed by ASCII or full-width colons.
	DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
	// DownLoadReg matches a download(...) call inside spider script source.
	DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
	// CodeTypeReg matches the supported page-encoding identifiers.
	CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
	// TitleFilterReg1 matches any Han (Chinese) character.
	TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
	// TitleFilterReg2 matches pagination / read-count artifacts that must not
	// appear in a title or an extracted article body.
	TitleFilterReg2 = regexp.MustCompile(`((上|下)一页|阅读次数)`)

	// CheckText is the expected assignment snippet for the spidercode/site/channel
	// fields in a list-page script (filled in via fmt.Sprintf).
	CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`

	// JsonDataMap is the whitelist of field names allowed inside jsondata.
	JsonDataMap = map[string]bool{ //jsondata
		"extweight":          true,
		"projecthref":        true,
		"sourcewebsite":      true,
		"sourcehref":         true,
		"area_city_district": true,
		"projectname":        true,
		"projectcode":        true,
		"approvalno":         true,
		"projectscope":       true,
		"item":               true,
		"buyer":              true,
		"agency":             true,
		"budget":             true,
		"buyer_info":         true,
		"buyerperson":        true,
		"buyertel":           true,
		"buyeraddr":          true,
		"projectaddr":        true,
		"publishdept":        true,
		"funds":              true,
		"paymenttype":        true,
		"projectscale":       true,
		"bidmethod":          true,
		"bidopentime":        true,
		"agency_info":        true,
		"agencyperson":       true,
		"agencytel":          true,
		"agencyaddr":         true,
		"isppp":              true,
		"winner":             true,
		"winneraddr":         true,
		"winnerperson":       true,
		"winnertel":          true,
		"bidamount":          true,
		"currency":           true,
		"experts":            true,
		"bidamounttype":      true,
		"contractname":       true,
		"countryprojectcode": true,
		"contractnumber":     true,
		"projectperiod":      true,
		"signaturedate":      true,
		"multipackage":       true,
		"package":            true,
		"supervisorrate":     true,
		"jsoncontent":        true,
		"purchasinglist":     true,
		"toptype":            true,
		"subtype":            true,
		"winnerorder":        true,
	}
)

// InitMgo initializes the global MongoDB connection pool from the spider
// configuration (address and database name come from sp.Config).
func InitMgo() {
	defer qu.Catch()
	MgoE = &mgo.MongodbSim{
		MongodbAddr: sp.Config.Dbaddr,
		DbName:      sp.Config.Dbname,
		Size:        5,
	}
	MgoE.InitPool()
}

// InitAreaCity loads the province/city administrative-division data from
// areacity.json into the package-level Province map.
func InitAreaCity() {
	qu.ReadConfig("areacity.json", &Province)
}

// SpiderPassCheckLua validates spider lua source during a full spider test.
// liststr and contentstr are the list-page and detail-page script sources;
// lua is the parsed spider configuration. It returns a comma-joined list of
// problem descriptions, or "" when every check passes.
func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) string {
	msg := []string{}
	// 1. Verify the spidercode/site/channel assignments match param_common.
	if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
		spidercode := qu.ObjToString(param[0])
		site := qu.ObjToString(param[1])
		channel := qu.ObjToString(param[2])
		checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
		if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
			msg = append(msg, "检查代码spidercode、site、channel字段值")
		}
	}
	// 2. For https sites, the detail-page download() call must carry the
	// encoding and TLS parameters.
	isHttps := false
	for _, text := range DomainNameReg.FindAllString(liststr, -1) {
		if strings.Contains(text, "https") {
			isHttps = true
		}
	}
	if isHttps {
		downLoadText := DownLoadReg.FindString(contentstr)
		if downLoadText != "" {
			textArr := strings.Split(downLoadText, ",")
			if len(textArr) < 4 {
				msg = append(msg, "download方法添加下载参数")
			} else if len(textArr) == 4 {
				// NOTE(review): textArr[0] still carries the "download(" prefix, so
				// this assumes the encoding argument sits in the first comma-separated
				// segment and the TLS flag in the second — confirm against the spider
				// script convention.
				if !CodeTypeReg.MatchString(textArr[0]) || textArr[1] != "true" {
					msg = append(msg, "download方法添加下载参数")
				}
			}
		}
	}
	// 3. If the list script stubs title with "a", the detail script must set
	// data["title"] itself.
	if strings.Contains(liststr, `item["title"]="a"`) {
		if !strings.Contains(contentstr, `data["title"]`) {
			msg = append(msg, "检查代码title的完整性")
		}
	}
	return strings.Join(msg, ",")
}

// SpiderPassCheckListAndDetail validates the crawled list rows and the detail
// record during a full spider test. list holds the list-page items; data holds
// the detail-page result. It returns a comma-joined list of problem
// descriptions, or "" when every check passes.
func SpiderPassCheckListAndDetail(list []map[string]interface{}, data map[string]interface{}) string {
	msg := []string{}
	if len(list) > 0 {
		p_zero := 0
		// One-shot flags so each list-level complaint is reported at most once.
		h_flag := true
		n_flag := true
		l_flag := true
		for _, l := range list {
			// Check the title: must contain Chinese and no pagination artifacts.
			title := qu.ObjToString(l["title"])
			if !TitleFilterReg1.MatchString(title) && h_flag {
				msg = append(msg, "列表页title中无汉字")
				h_flag = false
			} else if TitleFilterReg2.MatchString(title) && n_flag {
				msg = append(msg, "列表页title中含有上(下)一页")
				n_flag = false
			}
			publishtime := qu.ObjToString(l["publishtime"])
			if publishtime == "0" {
				p_zero++
			} else if l_flag {
				// A parse failure yields the zero time, whose Unix() is negative,
				// so the ignored error is still caught by the check below.
				t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
				if t.Unix() <= 0 {
					msg = append(msg, "列表页数据发布时间异常")
					l_flag = false
				}
			}
		}
		if len(data) > 0 {
			// Check the detail publish time.
			// FIX: the type assertion is now guarded (a missing or mistyped value
			// used to panic), and the first branch uses "< 0" — the original "<= 0"
			// made the "== 0" branch below unreachable.
			if l_np_publishtime, ok := data["l_np_publishtime"].(lua.LNumber); ok {
				if l_np_publishtime < 0 {
					msg = append(msg, "三级页发布时间小于0")
				} else if p_zero == len(list) && l_np_publishtime == 0 {
					msg = append(msg, "三级页发布时间异常")
				}
			}
			// Images in contenthtml may indicate un-downloaded attachments.
			contenthtml := qu.ObjToString(data["contenthtml"])
			if strings.Contains(contenthtml, "img") {
				msg = append(msg, "contenthtml中含有img是否下载")
			}
			// Pagination artifacts in the extracted body mean extraction went wrong.
			detail := qu.ObjToString(data["detail"])
			if TitleFilterReg2.MatchString(detail) {
				msg = append(msg, "三级页正文提取异常")
			}
			// Every jsondata field must be on the whitelist.
			if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
				for field := range jsondata {
					if !JsonDataMap[field] {
						msg = append(msg, "jsondata中"+field+"属性错误")
					}
				}
			}
		}
	}
	return strings.Join(msg, ",")
}