// Package util holds shared helpers for the spider editor: MongoDB pool
// setup, area/city configuration loading, static checks run against spider
// Lua code and its test output, and a small map-by-value sorter.
package util

import (
	"encoding/json"
	"fmt"
	mgo "mongodb"
	qu "qfw/util"
	"regexp"
	"sort"
	sp "spiderutil"
	"strings"
	"time"

	"github.com/yuin/gopher-lua"
)

var (
	// MgoE (editor, "87") is currently disabled; kept for reference.
	//MgoE *mgo.MongodbSim

	// MgoEB is the editor database pool ("163" in the original note),
	// configured from sp.Config.BidEditor.
	MgoEB *mgo.MongodbSim
	// MgoS is the pool configured from sp.Config.Dbaddr / sp.Config.Dbname2.
	MgoS *mgo.MongodbSim
	// Province is filled from areacity.json by InitAreaCity.
	Province map[string][]string

	// DomainNameReg matches an http/https scheme followed by colon characters.
	DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
	// DownLoadReg matches the first, shortest "download(...)" call text.
	DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
	// CodeTypeReg matches the character-set names accepted by download().
	CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
	// TitleFilterReg1 matches any Han character.
	TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
	// TitleFilterReg2 matches pagination / read-count boilerplate text.
	TitleFilterReg2 = regexp.MustCompile(`((上|下)一页|阅读次数)`)
	// Area lists the province names (keys of Province except "全国").
	Area []string
	// DomainReg matches "//host" up to and including the first ':' or '/'.
	DomainReg = regexp.MustCompile(`(?://).+?(?:[::/])`)
	// SymbolReg matches runs of commas and assorted whitespace characters.
	SymbolReg = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")
	// ReplaceReg matches runs of ']', ':' and '/' (a leading ']' inside a
	// character class is a literal in Go's regexp syntax).
	ReplaceReg = regexp.MustCompile(`[]::/]+`)
	// CheckText is the format of the spidercode/site/channel assignment that
	// list-page code is expected to contain.
	CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
	// JsonDataMap is the set of field names allowed inside "jsondata".
	JsonDataMap = map[string]bool{
		"extweight":          true,
		"projecthref":        true,
		"sourcewebsite":      true,
		"sourcehref":         true,
		"area_city_district": true,
		"projectname":        true,
		"projectcode":        true,
		"approvalno":         true,
		"projectscope":       true,
		"item":               true,
		"buyer":              true,
		"agency":             true,
		"budget":             true,
		"buyer_info":         true,
		"buyerperson":        true,
		"buyertel":           true,
		"buyeraddr":          true,
		"projectaddr":        true,
		"publishdept":        true,
		"funds":              true,
		"paymenttype":        true,
		"projectscale":       true,
		"bidmethod":          true,
		"bidopentime":        true,
		"agency_info":        true,
		"agencyperson":       true,
		"agencytel":          true,
		"agencyaddr":         true,
		"isppp":              true,
		"winner":             true,
		"winneraddr":         true,
		"winnerperson":       true,
		"winnertel":          true,
		"bidamount":          true,
		"currency":           true,
		"experts":            true,
		"bidamounttype":      true,
		"contractname":       true,
		"countryprojectcode": true,
		"contractnumber":     true,
		"projectperiod":      true,
		"signaturedate":      true,
		"multipackage":       true,
		"package":            true,
		"supervisorrate":     true,
		"jsoncontent":        true,
		"purchasinglist":     true,
		"toptype":            true,
		"subtype":            true,
		"winnerorder":        true,
		"bidopendate":        true,
	}
)

// InitMgo builds the MgoEB and MgoS connection pools from sp.Config and
// initializes them.
func InitMgo() {
	// NOTE(review): qu.Catch presumably recovers and logs panics — confirm.
	defer qu.Catch()

	// The MgoE pool is currently disabled; kept for reference.
	//MgoE = &mgo.MongodbSim{
	//	MongodbAddr: sp.Config.Dbaddr,
	//	DbName:      sp.Config.Dbname,
	//	Size:        10,
	//}
	//MgoE.InitPool()

	MgoEB = &mgo.MongodbSim{
		MongodbAddr: sp.Config.BidEditor.Addr,
		DbName:      sp.Config.BidEditor.Db,
		Size:        sp.Config.BidEditor.Size,
		UserName:    sp.Config.BidEditor.Username,
		Password:    sp.Config.BidEditor.Password,
	}
	MgoEB.InitPool()

	MgoS = &mgo.MongodbSim{
		MongodbAddr: sp.Config.Dbaddr,
		DbName:      sp.Config.Dbname2,
		Size:        10,
	}
	MgoS.InitPool()
}

// InitAreaCity loads the province/city administrative division data from
// areacity.json into Province and appends every province name except the
// nationwide entry "全国" to Area.
func InitAreaCity() {
	qu.ReadConfig("areacity.json", &Province)
	for area := range Province {
		if area == "全国" {
			continue
		}
		Area = append(Area, area)
	}
}

// SpiderPassCheckLua statically checks spider code during a whole-spider
// test run.
//
// liststr is the list-page code, contentstr the detail-page ("三级页") code,
// and lua the spider configuration document; only lua["param_common"] is
// read. The result is the comma-joined list of problems found, or "" when
// every check passes.
func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) string {
	msg := []string{}

	// A stringFind filter must be accompanied by the marker comment
	// "--关键词过滤" (keyword filter).
	if strings.Contains(liststr, "stringFind") && !strings.Contains(liststr, "--关键词过滤") {
		msg = append(msg, "列表页代码有过滤方法stringFind但缺少注释:--关键词过滤")
	}
	// Detail-page code that carries the marker must also mention "delete".
	if strings.Contains(contentstr, "--关键词过滤") && !strings.Contains(contentstr, "delete") {
		msg = append(msg, `三级页代码有过滤方法但缺少data["delete"]="true"`)
	}

	// 1. spidercode, site and channel must match param_common[0..2].
	if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
		spidercode := qu.ObjToString(param[0])
		site := qu.ObjToString(param[1])
		channel := qu.ObjToString(param[2])
		checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
		if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
			msg = append(msg, "检查代码spidercode、site、channel字段值")
		}
	}

	// 2. When the list page references an https URL, the first download()
	// call on each page must carry the extra download parameters.
	isHTTPS := false
	for _, text := range DomainNameReg.FindAllString(liststr, -1) {
		if strings.Contains(text, "https") {
			isHTTPS = true
			break
		}
	}
	if isHTTPS {
		// A slice (not a map) keeps the message order deterministic.
		pages := []struct{ label, code string }{
			{"列表页", liststr},
			{"三级页", contentstr},
		}
		for _, page := range pages {
			downLoadText := DownLoadReg.FindString(page.code)
			if downLoadText == "" {
				continue
			}
			textArr := strings.Split(downLoadText, ",")
			switch {
			case len(textArr) < 4:
				msg = append(msg, page.label+"download方法添加下载参数")
			case len(textArr) == 4:
				// First piece must name a charset, second must be a boolean.
				if !CodeTypeReg.MatchString(textArr[0]) || (textArr[1] != "true" && textArr[1] != "false") {
					msg = append(msg, page.label+"download方法添加下载参数")
				}
			}
		}
	}

	// 3. A placeholder list title "a" requires the detail page to set the
	// real title.
	if strings.Contains(liststr, `item["title"]="a"`) && !strings.Contains(contentstr, `data["title"]`) {
		msg = append(msg, "检查代码title的完整性")
	}

	// 4. sendListNum must be called somewhere in the list-page code.
	if !strings.Contains(liststr, "sendListNum") {
		msg = append(msg, "sendListNum方法缺失")
	}
	return strings.Join(msg, ",")
}

// SpiderPassCheckListAndDetail validates the list-page rows and the
// detail-page data produced by a whole-spider test run.
//
// list holds the list-page rows (their "title" and "publishtime" are
// checked); data is the detail-page result ("l_np_publishtime",
// "contenthtml", "detail" and "jsondata" are checked). Nothing is checked
// when list is empty. The result is the comma-joined list of problems found,
// or "" when there are none.
func SpiderPassCheckListAndDetail(list []map[string]interface{}, data map[string]interface{}) string {
	msg := []string{}
	if len(list) == 0 {
		return strings.Join(msg, ",")
	}

	zeroPublish := 0 // rows whose publishtime is the literal "0"
	// Each kind of list-page problem is reported at most once.
	reportNoHan, reportPager, reportTime := true, true, true
	for _, row := range list {
		title := qu.ObjToString(row["title"])
		if !TitleFilterReg1.MatchString(title) && reportNoHan {
			msg = append(msg, "列表页title中无汉字")
			reportNoHan = false
		} else if TitleFilterReg2.MatchString(title) && reportPager {
			msg = append(msg, "列表页title中含有上(下)一页")
			reportPager = false
		}

		publishtime := qu.ObjToString(row["publishtime"])
		if publishtime == "0" {
			zeroPublish++
		} else if reportTime {
			// An unparsable value is reported the same way as a time at or
			// before the Unix epoch.
			t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
			if err != nil || t.Unix() <= 0 {
				msg = append(msg, "列表页数据发布时间异常")
				reportTime = false
			}
		}
	}

	if len(data) > 0 {
		// Checked assertion: a missing or non-LNumber value must not panic.
		// Other numeric types go through qu.Int64All; anything else counts
		// as 0 and is therefore reported below.
		publishTime, ok := data["l_np_publishtime"].(lua.LNumber)
		if !ok {
			publishTime = lua.LNumber(qu.Int64All(data["l_np_publishtime"]))
		}
		// NOTE(review): the second branch can never run because 0 is already
		// caught by "<= 0"; the first condition may have been meant as "< 0".
		// Behavior is left unchanged until that is confirmed.
		if publishTime <= 0 {
			msg = append(msg, "三级页发布时间小于0")
		} else if zeroPublish == len(list) && publishTime == 0 {
			msg = append(msg, "三级页发布时间异常")
		}

		contenthtml := qu.ObjToString(data["contenthtml"])
		if strings.Contains(contenthtml, "img") {
			msg = append(msg, "contenthtml中含有img是否下载")
		}

		detail := qu.ObjToString(data["detail"])
		if TitleFilterReg2.MatchString(detail) {
			msg = append(msg, "三级页正文提取异常")
		}

		// Every jsondata field must be listed in JsonDataMap. Fields are
		// visited in sorted order so the message order is stable.
		if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
			fields := make([]string, 0, len(jsondata))
			for field := range jsondata {
				fields = append(fields, field)
			}
			sort.Strings(fields)
			for _, field := range fields {
				if !JsonDataMap[field] {
					msg = append(msg, "jsondata中"+field+"属性错误")
				}
			}
		}
	}
	return strings.Join(msg, ",")
}

// mostFrequentKey returns the key with the highest positive count in counts,
// breaking ties by the lexicographically smallest key so the result does not
// depend on map iteration order. It returns "" when no count is positive.
func mostFrequentKey(counts map[string]int) string {
	best, bestNum := "", 0
	for key, num := range counts {
		if num > bestNum || (num > 0 && num == bestNum && key < best) {
			best, bestNum = key, num
		}
	}
	return best
}

// GetLuasInfoBySite aggregates the "luaconfig" documents whose
// param_common[1] equals site.
//
// Results:
//   - domain: most frequent host extracted from param_common[11]
//   - status: "<documents with state 5>/<total documents>"
//   - event, platform: comma-joined distinct values, in sorted order
//   - area, city, district: most frequent model.area, then the most frequent
//     city recorded for that area, then the most frequent district recorded
//     for that city
//   - remarktime: the smallest non-zero comeintime, or the current Unix time
//     when no document has an earlier one
//
// When the query yields no result set, status is "0/0", the other strings are
// empty and remarktime is the current Unix time.
func GetLuasInfoBySite(site string) (domain, status, event, platform, area, city, district string, remarktime int64) {
	remarktime = time.Now().Unix()
	status = "0/0"

	// JSON-encode site so quotes or backslashes in it cannot break or alter
	// the query document.
	siteJSON, err := json.Marshal(site)
	if err != nil {
		return
	}
	//luas, _ := MgoE.Find("luaconfig", ...) // previous data source, disabled
	luas, _ := MgoEB.Find("luaconfig", `{"param_common.1":`+string(siteJSON)+`}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
	if luas == nil {
		return
	}

	shelveUp := 0
	eventSet := map[int]bool{}
	platformSet := map[string]bool{}
	areaMap := map[string]int{}
	areaCityMap := map[string]map[string]int{}
	cityDistrictMap := map[string]map[string]int{}
	domainMap := map[string]int{}

	for _, l := range *luas {
		// remarktime: keep the earliest non-zero comeintime.
		if comeintime := qu.Int64All(l["comeintime"]); comeintime != 0 && comeintime < remarktime {
			remarktime = comeintime
		}

		// domain: taken from param_common[11]. The checked assertion leaves
		// the slice nil (length 0) when the field is missing or mistyped.
		paramCommon, _ := l["param_common"].([]interface{})
		if len(paramCommon) >= 12 {
			href := qu.ObjToString(paramCommon[11])
			if host := DomainReg.FindString(href); host != "" {
				host = ReplaceReg.ReplaceAllString(host, "")
				domainMap[host]++
			}
		}

		// state, event, platform
		if qu.IntAll(l["state"]) == 5 {
			shelveUp++
		}
		eventSet[qu.IntAll(l["event"])] = true
		platformSet[qu.ObjToString(l["platform"])] = true

		// area, city, district: a city is only counted under a non-empty
		// area, and a district only under a non-empty city.
		model, ok := l["model"].(map[string]interface{})
		if !ok || model == nil {
			continue
		}
		a := qu.ObjToString(model["area"])
		c := qu.ObjToString(model["city"])
		d := qu.ObjToString(model["district"])
		if a == "" {
			continue
		}
		areaMap[a]++
		if c == "" {
			continue
		}
		if areaCityMap[a] == nil {
			areaCityMap[a] = map[string]int{}
		}
		areaCityMap[a][c]++
		if d == "" {
			continue
		}
		if cityDistrictMap[c] == nil {
			cityDistrictMap[c] = map[string]int{}
		}
		cityDistrictMap[c][d]++
	}

	// Sort before joining so the output does not depend on map order.
	events := make([]int, 0, len(eventSet))
	for e := range eventSet {
		events = append(events, e)
	}
	sort.Ints(events)
	eventArr := make([]string, 0, len(events))
	for _, e := range events {
		eventArr = append(eventArr, fmt.Sprint(e))
	}
	event = strings.Join(eventArr, ",")

	platformArr := make([]string, 0, len(platformSet))
	for p := range platformSet {
		platformArr = append(platformArr, p)
	}
	sort.Strings(platformArr)
	platform = strings.Join(platformArr, ",")

	domain = mostFrequentKey(domainMap)
	status = fmt.Sprintf("%d/%d", shelveUp, len(*luas))

	area = mostFrequentKey(areaMap)
	if area != "" {
		city = mostFrequentKey(areaCityMap[area])
	}
	if city != "" {
		district = mostFrequentKey(cityDistrictMap[city])
	}
	return
}

// StringValSorter holds the keys and values of a map[string]string as two
// parallel slices and implements sort.Interface ordered by value.
type StringValSorter struct {
	Keys []string
	Vals []string
}

// MapStringValueSort returns the entries of m sorted by value in ascending
// order.
func MapStringValueSort(m map[string]string) *StringValSorter {
	vs := NewStringValSorter(m)
	vs.Sort()
	return vs
}

// NewStringValSorter copies the entries of m into a new, unsorted
// StringValSorter.
func NewStringValSorter(m map[string]string) *StringValSorter {
	vs := &StringValSorter{
		Keys: make([]string, 0, len(m)),
		Vals: make([]string, 0, len(m)),
	}
	for k, v := range m {
		vs.Keys = append(vs.Keys, k)
		vs.Vals = append(vs.Vals, v)
	}
	return vs
}

// Sort orders the entries by value in ascending order.
func (vs *StringValSorter) Sort() {
	sort.Sort(vs)
}

// Len returns the number of entries.
func (vs *StringValSorter) Len() int { return len(vs.Vals) }

// Less reports whether the value at index i sorts before the value at j.
func (vs *StringValSorter) Less(i, j int) bool { return vs.Vals[i] < vs.Vals[j] }

// Swap exchanges entries i and j, keeping Keys and Vals aligned.
func (vs *StringValSorter) Swap(i, j int) {
	vs.Vals[i], vs.Vals[j] = vs.Vals[j], vs.Vals[i]
	vs.Keys[i], vs.Keys[j] = vs.Keys[j], vs.Keys[i]
}