// Package util holds shared state and helpers used by the spider editor:
// MongoDB handles, region lookup tables, regular expressions, and the checks
// that are run against spider code and spider test output.
package util

import (
	"bytes"
	"fmt"
	mgo "mongodb"
	qu "qfw/util"
	"regexp"
	"sort"
	sp "spiderutil"
	"strings"
	"time"

	"github.com/yuin/gopher-lua"
)

// User roles: administrator, examiner (reviewer), developer.
const Role_Admin, Role_Examine, Role_Dev = 3, 2, 1

var (
	//MgoE *mgo.MongodbSim // editor 87 (disabled)

	// MgoEB is the MongoDB handle for editor 163; it is set by InitMgo.
	MgoEB *mgo.MongodbSim

	// MgoS is the MongoDB handle built from sp.Config.Dbaddr and
	// sp.Config.Dbname2; it is set by InitMgo.
	MgoS *mgo.MongodbSim

	// Province maps a province name to its cities; filled by InitAreaCity.
	Province map[string][]string

	// City maps a city name to its districts; filled by InitAreaCity.
	City map[string][]string

	// DomainNameReg matches "http" or "https" followed by one or more colons.
	DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)

	// DownLoadReg matches a download(...) call, non-greedy up to the first ')'.
	DownLoadReg = regexp.MustCompile(`download\(.*?\)`)

	// CodeTypeReg matches one of the charset names utf8, utf-8 or gbk.
	CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)

	// TitleFilterReg1 matches a single Han character.
	TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)

	// TitleFilterReg2 matches "previous/next page (or article)" and
	// "read count" phrases that indicate a badly extracted title.
	TitleFilterReg2 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数)`)

	// DetailFilterReg1 matches navigation/sharing boilerplate phrases that
	// should not appear in an extracted detail text.
	DetailFilterReg1 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)

	// Area lists the provinces; InitAreaCity appends to it.
	Area []string

	// DomainReg matches the shortest run that starts at "//" and ends at the
	// next ':' or '/' (inclusive).
	DomainReg = regexp.MustCompile(`(?://).+?(?:[::/])`)

	// SymbolReg matches runs of commas and whitespace, including U+3000,
	// U+2003 and U+00A0.
	SymbolReg = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")

	// ReplaceReg matches runs of ']', ':' and '/'.
	ReplaceReg = regexp.MustCompile(`[]::/]+`)

	// CheckText is the format of the spidercode/site/channel assignment
	// expected in list-page code.
	CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`

	// CheckText_Code is the format of the expected spidercode assignment.
	CheckText_Code = `item["spidercode"]="%s"`

	// CheckText_Site is the format of the expected site assignment.
	CheckText_Site = `item["site"]="%s"`

	// CheckText_Channel is the format of the expected channel assignment.
	CheckText_Channel = `item["channel"]="%s"`

	// JsonDataMap is the set of field names accepted inside jsondata.
	JsonDataMap = map[string]bool{
		"extweight":          true,
		"projecthref":        true,
		"sourcewebsite":      true,
		"sourcehref":         true,
		"area_city_district": true,
		"projectname":        true,
		"projectcode":        true,
		"approvalno":         true,
		"projectscope":       true,
		"item":               true,
		"buyer":              true,
		"agency":             true,
		"budget":             true,
		"buyer_info":         true,
		"buyerperson":        true,
		"buyertel":           true,
		"buyeraddr":          true,
		"projectaddr":        true,
		"publishdept":        true,
		"funds":              true,
		"paymenttype":        true,
		"projectscale":       true,
		"bidmethod":          true,
		"bidopentime":        true,
		"agency_info":        true,
		"agencyperson":       true,
		"agencytel":          true,
		"agencyaddr":         true,
		"isppp":              true,
		"winner":             true,
		"winneraddr":         true,
		"winnerperson":       true,
		"winnertel":          true,
		"bidamount":          true,
		"currency":           true,
		"experts":            true,
		"bidamounttype":      true,
		"contractname":       true,
		"countryprojectcode": true,
		"contractnumber":     true,
		"projectperiod":      true,
		"signaturedate":      true,
		"multipackage":       true,
		"package":            true,
		"supervisorrate":     true,
		"jsoncontent":        true,
		"purchasinglist":     true,
		"toptype":            true,
		"subtype":            true,
		"winnerorder":        true,
		"bidopendate":        true,
		"bidtype":            true,
	}

	// Bu is the suffix used when creating a history-collection spider.
	Bu = "_bu"
)

// InitMgo builds the package-level MongoDB handles MgoEB and MgoS from
// sp.Config and calls InitPool on each of them.
func InitMgo() {
	defer qu.Catch()
	// Disabled: legacy editor-87 handle.
	//MgoE = &mgo.MongodbSim{
	//	MongodbAddr: sp.Config.Dbaddr,
	//	DbName:      sp.Config.Dbname,
	//	Size:        10,
	//}
	//MgoE.InitPool()
	MgoEB = &mgo.MongodbSim{
		MongodbAddr: sp.Config.BidEditor.Addr,
		DbName:      sp.Config.BidEditor.Db,
		Size:        sp.Config.BidEditor.Size,
		UserName:    sp.Config.BidEditor.Username,
		Password:    sp.Config.BidEditor.Password,
	}
	MgoEB.InitPool()
	MgoS = &mgo.MongodbSim{
		MongodbAddr: sp.Config.Dbaddr,
		DbName:      sp.Config.Dbname2,
		Size:        10,
	}
	MgoS.InitPool()
}

// InitAreaCity loads the province/city/district administrative divisions
// from the "address" collection of MgoEB.
//
// It resets Province and City, appends "全国" to Area, and then classifies
// each record: province only -> Area, province+city -> Province[province],
// province+city+district -> City[city]. If the query yields no result set the
// function returns with the tables left empty instead of panicking.
func InitAreaCity() {
	// Disabled: previous implementation that read areacity.json.
	//qu.ReadConfig("areacity.json", &Province)
	//Area = append(Area, "全国")
	//for area, _ := range Province {
	//	if area == "全国" {
	//		continue
	//	}
	//	Area = append(Area, area)
	//}
	Province = map[string][]string{}
	City = map[string][]string{}
	Area = append(Area, "全国")
	list, _ := MgoEB.Find("address", nil, nil, nil, false, -1, -1)
	if list == nil {
		// Nothing to load; dereferencing a nil result would panic.
		return
	}
	for _, tmp := range *list {
		province := qu.ObjToString(tmp["province"])
		city := qu.ObjToString(tmp["city"])
		district := qu.ObjToString(tmp["district"])
		if province == "" {
			continue
		}
		switch {
		case city == "" && district == "": // province-level record
			Area = append(Area, province)
		case city != "" && district == "": // city-level record
			Province[province] = append(Province[province], city)
		case city != "" && district != "": // district-level record
			City[city] = append(City[city], district)
		}
	}
}

// SpiderPassCheckLua validates spider code during an overall spider test.
//
// liststr is the list-page code, contentstr the detail-page code and lua the
// spider configuration; only contentstr is inspected by the checks that are
// currently enabled. It returns one message per failed check (nil when all
// checks pass).
func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) (msg []string) {
	// Disabled: code uses the stringFind filter but lacks the "--关键词过滤" comment.
	//if strings.Contains(liststr, "stringFind") && !strings.Contains(liststr, "--关键词过滤") {
	//	msg = append(msg, "列表页代码有过滤方法stringFind但缺少注释:--关键词过滤")
	//}
	//if strings.Contains(contentstr, "--关键词过滤") && !strings.Contains(contentstr, "delete") {
	//	msg = append(msg, `三级页代码有过滤方法但缺少data["delete"]="true"`)
	//}
	//if !strings.Contains(contentstr, "s_title") {
	//	msg = append(msg, "三级页缺少s_title")
	//}

	// The detail page must call one of the attachment download methods.
	if !strings.Contains(contentstr, "getFileAttachmentsArrayWithTag") && !strings.Contains(contentstr, "downloadFile") {
		msg = append(msg, "三级页缺少下载附件方法")
	}

	// Disabled: 1. check spidercode, site and channel.
	//if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
	//	spidercode := qu.ObjToString(param[0])
	//	site := qu.ObjToString(param[1])
	//	channel := qu.ObjToString(param[2])
	//	checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
	//	if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
	//		msg = append(msg, "检查代码spidercode、site、channel字段值")
	//	}
	//}
	// Disabled: 2. check https download parameters.
	//isHttps := false
	//for _, text := range DomainNameReg.FindAllString(liststr, -1) {
	//	if strings.Contains(text, "https") {
	//		isHttps = true
	//	}
	//}
	//if isHttps {
	//	for tmpStr, tmpText := range map[string]string{"列表页": liststr, "三级页": contentstr} {
	//		downLoadText := DownLoadReg.FindString(tmpText)
	//		if downLoadText != "" {
	//			textArr := strings.Split(downLoadText, ",")
	//			if len(textArr) < 4 {
	//				msg = append(msg, tmpStr+"download方法添加下载参数")
	//			} else if len(textArr) == 4 {
	//				if !CodeTypeReg.MatchString(textArr[0]) || (textArr[1] != "true" && textArr[1] != "false") {
	//					msg = append(msg, tmpStr+"download方法添加下载参数")
	//				}
	//			}
	//		}
	//	}
	//}
	// Disabled: 3. check title.
	//if strings.Contains(liststr, `item["title"]="a"`) {
	//	if !strings.Contains(contentstr, `data["title"]`) {
	//		msg = append(msg, "检查代码title的完整性")
	//	}
	//}
	// Disabled: 4. check sendListNum.
	//if !strings.Contains(liststr, "sendListNum") {
	//	msg = append(msg, "sendListNum方法缺失")
	//}
	return
}

// SpiderPassCheckListAndDetail validates the list-page and detail-page data
// produced by an overall spider test.
//
// result holds the list-page items grouped under an int64 key; data is the
// detail-page record (skipped when empty). Each distinct problem is reported
// once. The returned messages are sorted so the output is deterministic; the
// result is nil when nothing is wrong.
func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, data map[string]interface{}) (msg []string) {
	// Timestamps below this value (seconds) are treated as invalid.
	const minPublishTime = 1000000000

	msgMap := map[string]bool{}
	// List-page checks.
	for _, list := range result {
		for _, l := range list {
			// Title must contain a Han character and no paging boilerplate.
			title := qu.ObjToString(l["title"])
			if !TitleFilterReg1.MatchString(title) {
				msgMap["列表页title中无汉字"] = true
			} else if TitleFilterReg2.MatchString(title) {
				msgMap["列表页title中含有上(下)一页"] = true
			}
			// Publish time must be present and parse to a positive timestamp.
			publishtime := qu.ObjToString(l["publishtime"])
			if publishtime == "0" || publishtime == "" {
				msgMap["列表页publishtime取值异常"] = true
			} else {
				t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
				if err != nil || t.Unix() <= 0 {
					msgMap["列表页publishtime取值异常"] = true
				}
			}
		}
	}
	if len(data) > 0 {
		// Detail-page publish time: must be a number of a known type and
		// at least minPublishTime.
		switch publishTime := data["l_np_publishtime"].(type) {
		case lua.LNumber:
			if publishTime < minPublishTime {
				msgMap["三级页publishtime取值异常"] = true
			}
		case int64:
			if publishTime < minPublishTime {
				msgMap["三级页publishtime取值异常"] = true
			}
		default:
			msgMap["三级页publishtime值类型异常"] = true
		}
		contenthtml := qu.ObjToString(data["contenthtml"])
		if strings.Contains(contenthtml, "img") {
			msgMap["contenthtml中含有img是否下载"] = true
		}
		if strings.Contains(contenthtml, "iframe") {
			msgMap["contenthtml中含有iframe是否下载"] = true
		}
		detail := qu.ObjToString(data["detail"])
		if DetailFilterReg1.MatchString(detail) {
			msgMap["三级页正文提取包含无效内容"] = true
		}
		// Every jsondata field must be a known one.
		if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
			for field := range jsondata {
				if !JsonDataMap[field] {
					msgMap["jsondata中"+field+"属性错误"] = true
				}
			}
		}
	}
	for text := range msgMap {
		msg = append(msg, text)
	}
	// Map iteration order is random; sort for a stable result.
	sort.Strings(msg)
	return
}

// SpiderPassCheckListAndDetail_back is the earlier variant of
// SpiderPassCheckListAndDetail that takes a flat list of list-page items.
//
// Nothing is checked when list is empty. Each list-page problem is reported
// at most once. A detail-page publish time that is missing or not a
// lua.LNumber is reported as a message rather than causing a panic.
func SpiderPassCheckListAndDetail_back(list []map[string]interface{}, data map[string]interface{}) (msg []string) {
	if len(list) == 0 {
		return
	}
	noHanReported, pagingReported, timeReported := false, false, false
	for _, l := range list {
		// Title checks.
		title := qu.ObjToString(l["title"])
		if !TitleFilterReg1.MatchString(title) && !noHanReported {
			msg = append(msg, "列表页title中无汉字")
			noHanReported = true
		} else if TitleFilterReg2.MatchString(title) && !pagingReported {
			msg = append(msg, "列表页title中含有上(下)一页")
			pagingReported = true
		}
		// A publish time of "0" is skipped here, as in the original logic.
		publishtime := qu.ObjToString(l["publishtime"])
		if publishtime != "0" && !timeReported {
			// The parse error is deliberately ignored: on failure t is the
			// zero time, whose Unix value is negative and is reported below.
			t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
			if t.Unix() <= 0 {
				msg = append(msg, "列表页数据发布时间异常")
				timeReported = true
			}
		}
	}
	if len(data) == 0 {
		return
	}
	// Detail-page publish time.
	publishTime, ok := data["l_np_publishtime"].(lua.LNumber)
	switch {
	case !ok:
		msg = append(msg, "三级页发布时间异常")
	case publishTime <= 0:
		msg = append(msg, "三级页发布时间小于0")
	}
	contenthtml := qu.ObjToString(data["contenthtml"])
	if strings.Contains(contenthtml, "img") {
		msg = append(msg, "contenthtml中含有img是否下载")
	}
	detail := qu.ObjToString(data["detail"])
	if TitleFilterReg2.MatchString(detail) {
		msg = append(msg, "三级页正文提取异常")
	}
	// Every jsondata field must be a known one.
	if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
		for field := range jsondata {
			if !JsonDataMap[field] {
				msg = append(msg, "jsondata中"+field+"属性错误")
			}
		}
	}
	return
}

// luaRegionSet builds the $set document that writes area/city/district into
// a spider's model. It returns an empty map when area is empty. For the
// nationwide area "全国" the city and district are cleared.
func luaRegionSet(area, city, district string) map[string]interface{} {
	set := map[string]interface{}{}
	if area == "" {
		return set
	}
	set["model.area"] = area
	if area == "全国" {
		set["model.city"] = ""
		set["model.district"] = ""
		return set
	}
	if city != "" {
		set["model.city"] = city
	}
	if district != "" {
		set["model.district"] = district
	}
	return set
}

// infoFormatText maps an infoformat code to its display label; any code other
// than 2, 3 or 4 is labelled "招标".
func infoFormatText(infoformat int) string {
	switch infoformat {
	case 2:
		return "拟建/审批"
	case 3:
		return "产权"
	case 4:
		return "舆情"
	default:
		return "招标"
	}
}

// GetLuasInfoBySite summarises all spiders in "luaconfig" that belong to site
// and, as a side effect, bulk-updates their model.area/city/district from the
// given area, city and district (see luaRegionSet).
//
// Returned values:
//   - domain: distinct domains taken from param_common[11], joined by ";"
//   - status: "<shelved>/<total>", where shelved counts states 5 and 11
//   - event: distinct event numbers in ascending order, joined by ","
//   - platform: distinct platforms, sorted, joined by ","
//   - infotype: distinct infoformat labels, sorted, joined by ","
//   - specialtype: "含流程数据" when any spider has projecthref == true
//   - remarktime: the smallest non-zero comeintime that is earlier than now,
//     otherwise the current Unix time
//
// When the query yields no result set the function returns the same values
// as for an empty spider list.
func GetLuasInfoBySite(site, area, city, district string) (domain, status, event, platform, infotype, specialtype string, remarktime int64) {
	shelveUp := 0
	eventSet := map[int]bool{}
	platformSet := map[string]bool{}
	infoTypeSet := map[string]bool{}
	domainSet := map[string]bool{}
	var domains []string
	// Disabled: per-spider area/city/district statistics.
	//areaMap := map[string]int{}
	//areaCityMap := map[string]map[string]int{}
	//cityDistrictMap := map[string]map[string]int{}
	remarktime = time.Now().Unix()
	//luas, _ := MgoE.Find("luaconfig", `{"param_common.1":"`+site+`"}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
	// NOTE(review): site is spliced into the JSON query unescaped; a site
	// name containing '"' or '\' would produce a malformed query.
	luas, _ := MgoEB.Find("luaconfig", `{"site":"`+site+`"}`, ``, `{"projecthref":1,"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1,"infoformat":1}`, false, -1, -1)
	if luas == nil {
		// No result set: report zero spiders instead of dereferencing nil.
		status = fmt.Sprintf("%d/%d", 0, 0)
		return
	}
	var updates [][]map[string]interface{}
	for _, l := range *luas {
		// The spider collects process-type (workflow) information.
		if b, ok := l["projecthref"].(bool); ok && b {
			specialtype = "含流程数据"
		}
		// Queue the area/city/district update for this spider.
		if set := luaRegionSet(area, city, district); len(set) > 0 {
			updates = append(updates, []map[string]interface{}{
				{"_id": l["_id"]},
				{"$set": set},
			})
		}
		// remarktime: keep the earliest non-zero comeintime.
		if comeintime := qu.Int64All(l["comeintime"]); comeintime != int64(0) && comeintime < remarktime {
			remarktime = comeintime
		}
		// domain: extracted from the href stored at param_common[11]. The
		// comma-ok assertion skips spiders without a usable param_common.
		if paramCommon, ok := l["param_common"].([]interface{}); ok && len(paramCommon) >= 12 {
			href := qu.ObjToString(paramCommon[11])
			if d := DomainReg.FindString(href); d != "" {
				d = ReplaceReg.ReplaceAllString(d, "")
				if !domainSet[d] {
					domains = append(domains, d)
					domainSet[d] = true
				}
			}
		}
		// state: 5 = lua spider on the shelf; 11 = python spider online.
		if state := qu.IntAll(l["state"]); state == 5 || state == 11 {
			shelveUp++
		}
		eventSet[qu.IntAll(l["event"])] = true
		platformSet[qu.ObjToString(l["platform"])] = true
		// Collect labels rather than codes so that codes sharing a label
		// do not produce duplicates in infotype.
		infoTypeSet[infoFormatText(qu.IntAll(l["infoformat"]))] = true
		// Disabled: area/city/district statistics.
		//if model, ok := l["model"].(map[string]interface{}); ok && model != nil {
		//	a := qu.ObjToString(model["area"])
		//	c := qu.ObjToString(model["city"])
		//	d := qu.ObjToString(model["district"])
		//	if a != "" {
		//		areaMap[a] = areaMap[a] + 1
		//		if c != "" {
		//			if cityNum := areaCityMap[a]; cityNum != nil {
		//				cityNum[c] = cityNum[c] + 1
		//			} else {
		//				areaCityMap[a] = map[string]int{c: 1}
		//			}
		//			if d != "" {
		//				if distrctNum := cityDistrictMap[c]; distrctNum != nil {
		//					distrctNum[d] = distrctNum[d] + 1
		//				} else {
		//					cityDistrictMap[c] = map[string]int{d: 1}
		//				}
		//			}
		//		}
		//
		//	}
		//}
	}
	domain = strings.Join(domains, ";")

	// Events are sorted numerically so the joined string is deterministic.
	events := make([]int, 0, len(eventSet))
	for e := range eventSet {
		events = append(events, e)
	}
	sort.Ints(events)
	eventArr := make([]string, 0, len(events))
	for _, e := range events {
		eventArr = append(eventArr, fmt.Sprint(e))
	}
	event = strings.Join(eventArr, ",")

	platformArr := make([]string, 0, len(platformSet))
	for p := range platformSet {
		platformArr = append(platformArr, p)
	}
	sort.Strings(platformArr)
	platform = strings.Join(platformArr, ",")

	infoformatArr := make([]string, 0, len(infoTypeSet))
	for text := range infoTypeSet {
		infoformatArr = append(infoformatArr, text)
	}
	sort.Strings(infoformatArr)
	infotype = strings.Join(infoformatArr, ",")

	status = fmt.Sprintf("%d/%d", shelveUp, len(*luas))
	// Bulk update of the queued region changes.
	if len(updates) > 0 {
		MgoEB.UpdateBulk("luaconfig", updates...)
	}
	// Disabled: pick the most frequent area/city/district.
	//an, cn, dn := 0, 0, 0
	//for at, num := range areaMap {
	//	if num > an {
	//		area = at
	//		an = num
	//	}
	//}
	//if area != "" {
	//	for ct, num := range areaCityMap[area] {
	//		if num > cn {
	//			city = ct
	//			cn = num
	//		}
	//	}
	//}
	//if city != "" {
	//	for dt, num := range cityDistrictMap[city] {
	//		if num > dn {
	//			district = dt
	//			dn = num
	//		}
	//	}
	//}
	return
}

// StringValSorter holds the keys and values of a map[string]string as two
// parallel slices and implements sort.Interface, ordering by value.
type StringValSorter struct {
	Keys []string
	Vals []string
}

// MapStringValueSort returns the entries of m as a StringValSorter sorted by
// value in ascending order.
func MapStringValueSort(m map[string]string) *StringValSorter {
	vs := NewStringValSorter(m)
	vs.Sort()
	return vs
}

// NewStringValSorter copies the entries of m into an unsorted
// StringValSorter; Keys[i] and Vals[i] belong to the same entry.
func NewStringValSorter(m map[string]string) *StringValSorter {
	vs := &StringValSorter{
		Keys: make([]string, 0, len(m)),
		Vals: make([]string, 0, len(m)),
	}
	for k, v := range m {
		vs.Keys = append(vs.Keys, k)
		vs.Vals = append(vs.Vals, v)
	}
	return vs
}

// Sort orders the entries by value in ascending order.
func (vs *StringValSorter) Sort() {
	sort.Sort(vs)
}

// Len returns the number of entries.
func (vs *StringValSorter) Len() int {
	return len(vs.Vals)
}

// Less reports whether the value at i sorts before the value at j.
func (vs *StringValSorter) Less(i, j int) bool {
	return vs.Vals[i] < vs.Vals[j]
}

// Swap exchanges entries i and j, keeping keys and values paired.
func (vs *StringValSorter) Swap(i, j int) {
	vs.Vals[i], vs.Vals[j] = vs.Vals[j], vs.Vals[i]
	vs.Keys[i], vs.Keys[j] = vs.Keys[j], vs.Keys[i]
}

// MyWrite wraps a bytes.Buffer. Byte must be non-nil before Write or Reader
// is called.
type MyWrite struct {
	Byte *bytes.Buffer
}

// Write appends p to the underlying buffer.
func (m *MyWrite) Write(p []byte) (n int, err error) {
	return m.Byte.Write(p)
}

// Reader reads from the underlying buffer into p.
func (m *MyWrite) Reader(p []byte) (n int, err error) {
	return m.Byte.Read(p)
}

// GetTime returns the Unix timestamp of local midnight of the day that is
// day days away from today (negative values go back in time).
func GetTime(day int) int64 {
	defer qu.Catch()
	nowTime := time.Now().AddDate(0, 0, day)
	timeStr := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
	// The error is ignored because timeStr was produced with the same layout.
	t, _ := time.ParseInLocation(qu.Date_Short_Layout, timeStr, time.Local)
	return t.Unix()
}