123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536 |
- package util
- import (
- "bytes"
- "fmt"
- mgo "mongodb"
- qu "qfw/util"
- "regexp"
- "sort"
- sp "spiderutil"
- "strings"
- "time"
- "github.com/yuin/gopher-lua"
- )
- const Role_Admin, Role_Examine, Role_Dev = 3, 2, 1 //管理员,审核员,开发员
- var (
- //MgoE *mgo.MongodbSim //编辑器87
- MgoEB *mgo.MongodbSim //编辑器163
- MgoS *mgo.MongodbSim
- Province map[string][]string
- City map[string][]string
- DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
- DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
- CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
- TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
- TitleFilterReg2 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数)`)
- DetailFilterReg1 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)
- Area []string //省份
- DomainReg = regexp.MustCompile(`(?://).+?(?:[::/])`)
- SymbolReg = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")
- ReplaceReg = regexp.MustCompile(`[]::/]+`)
- CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
- CheckText_Code = `item["spidercode"]="%s"`
- CheckText_Site = `item["site"]="%s"`
- CheckText_Channel = `item["channel"]="%s"`
- JsonDataMap = map[string]bool{ //jsondata
- "extweight": true,
- "projecthref": true,
- "sourcewebsite": true,
- "sourcehref": true,
- "area_city_district": true,
- "projectname": true,
- "projectcode": true,
- "approvalno": true,
- "projectscope": true,
- "item": true,
- "buyer": true,
- "agency": true,
- "budget": true,
- "buyer_info": true,
- "buyerperson": true,
- "buyertel": true,
- "buyeraddr": true,
- "projectaddr": true,
- "publishdept": true,
- "funds": true,
- "paymenttype": true,
- "projectscale": true,
- "bidmethod": true,
- "bidopentime": true,
- "agency_info": true,
- "agencyperson": true,
- "agencytel": true,
- "agencyaddr": true,
- "isppp": true,
- "winner": true,
- "winneraddr": true,
- "winnerperson": true,
- "winnertel": true,
- "bidamount": true,
- "currency": true,
- "experts": true,
- "bidamounttype": true,
- "contractname": true,
- "countryprojectcode": true,
- "contractnumber": true,
- "projectperiod": true,
- "signaturedate": true,
- "multipackage": true,
- "package": true,
- "supervisorrate": true,
- "jsoncontent": true,
- "purchasinglist": true,
- "toptype": true,
- "subtype": true,
- "winnerorder": true,
- "bidopendate": true,
- "bidtype": true,
- }
- Bu = "_bu" //创建采历史爬虫后缀
- )
- func InitMgo() {
- defer qu.Catch()
- //MgoE = &mgo.MongodbSim{
- // MongodbAddr: sp.Config.Dbaddr,
- // DbName: sp.Config.Dbname,
- // Size: 10,
- //}
- //MgoE.InitPool()
- MgoEB = &mgo.MongodbSim{
- MongodbAddr: sp.Config.BidEditor.Addr,
- DbName: sp.Config.BidEditor.Db,
- Size: sp.Config.BidEditor.Size,
- UserName: sp.Config.BidEditor.Username,
- Password: sp.Config.BidEditor.Password,
- }
- MgoEB.InitPool()
- MgoS = &mgo.MongodbSim{
- MongodbAddr: sp.Config.Dbaddr,
- DbName: sp.Config.Dbname2,
- Size: 10,
- }
- MgoS.InitPool()
- }
- // 初始化省市行政区划信息
- func InitAreaCity() {
- //qu.ReadConfig("areacity.json", &Province)
- //Area = append(Area, "全国")
- //for area, _ := range Province {
- // if area == "全国" {
- // continue
- // }
- // Area = append(Area, area)
- //}
- Province = map[string][]string{}
- City = map[string][]string{}
- Area = append(Area, "全国")
- list, _ := MgoEB.Find("address", nil, nil, nil, false, -1, -1)
- for _, tmp := range *list {
- province := qu.ObjToString(tmp["province"])
- city := qu.ObjToString(tmp["city"])
- district := qu.ObjToString(tmp["district"])
- if province != "" && city == "" && district == "" { //area
- Area = append(Area, province)
- } else if province != "" && city != "" && district == "" { //city
- cityArr := Province[province]
- cityArr = append(cityArr, city)
- Province[province] = cityArr
- } else if province != "" && city != "" && district != "" { //district
- districtArr := City[city]
- districtArr = append(districtArr, district)
- City[city] = districtArr
- }
- }
- }
// SpiderPassCheckLua validates spider code during the overall spider test
// and returns one message per problem found (nil when there is none).
//
// Only one check is currently active: the detail-page code (contentstr) must
// mention an attachment download helper, either
// "getFileAttachmentsArrayWithTag" or "downloadFile".
//
// liststr and lua are accepted but not inspected at the moment. The checks
// that used them are disabled: keyword-filter annotations, s_title presence,
// spidercode/site/channel values, https download parameters, title
// completeness and the sendListNum call.
func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) (msg []string) {
	hasAttachmentHelper := strings.Contains(contentstr, "getFileAttachmentsArrayWithTag") ||
		strings.Contains(contentstr, "downloadFile")
	if hasAttachmentHelper {
		return nil
	}
	return []string{"三级页缺少下载附件方法"}
}
- // 爬虫整体测试时校验列表页和详情页内容
- func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, data map[string]interface{}) (msg []string) {
- msgMap := map[string]bool{}
- //校验列表页信息
- for _, list := range result {
- for _, l := range list {
- //校验title
- title := qu.ObjToString(l["title"])
- if !TitleFilterReg1.MatchString(title) {
- msgMap["列表页title中无汉字"] = true
- } else if TitleFilterReg2.MatchString(title) {
- msgMap["列表页title中含有上(下)一页"] = true
- }
- //校验发布时间
- publishtime := qu.ObjToString(l["publishtime"])
- if publishtime == "0" || publishtime == "" {
- msgMap["列表页publishtime取值异常"] = true
- } else {
- t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
- if err != nil || t.Unix() <= 0 {
- msgMap["列表页publishtime取值异常"] = true
- }
- }
- }
- }
- if len(data) > 0 {
- //校验publishtime
- if l_np_publishtime, ok := data["l_np_publishtime"].(lua.LNumber); ok {
- if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
- msgMap["三级页publishtime取值异常"] = true
- }
- } else if l_np_publishtime, ok := data["l_np_publishtime"].(int64); ok {
- if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
- msgMap["三级页publishtime取值异常"] = true
- }
- } else {
- msgMap["三级页publishtime值类型异常"] = true
- }
- contenthtml := qu.ObjToString(data["contenthtml"])
- if strings.Contains(contenthtml, "img") {
- msgMap["contenthtml中含有img是否下载"] = true
- }
- if strings.Contains(contenthtml, "iframe") {
- msgMap["contenthtml中含有iframe是否下载"] = true
- }
- detail := qu.ObjToString(data["detail"])
- if DetailFilterReg1.MatchString(detail) {
- msgMap["三级页正文提取包含无效内容"] = true
- }
- //校验jsondata
- if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
- for field, _ := range jsondata {
- if !JsonDataMap[field] {
- msgMap["jsondata中"+field+"属性错误"] = true
- }
- }
- }
- }
- for text, _ := range msgMap {
- msg = append(msg, text)
- }
- return
- }
- // 爬虫整体测试时校验列表页和详情页内容
- func SpiderPassCheckListAndDetail_back(list []map[string]interface{}, data map[string]interface{}) (msg []string) {
- if len(list) > 0 {
- p_zero := 0
- h_flag := true
- n_flag := true
- l_flag := true
- for _, l := range list {
- //校验title
- title := qu.ObjToString(l["title"])
- if !TitleFilterReg1.MatchString(title) && h_flag {
- msg = append(msg, "列表页title中无汉字")
- h_flag = false
- } else if TitleFilterReg2.MatchString(title) && n_flag {
- msg = append(msg, "列表页title中含有上(下)一页")
- n_flag = false
- }
- publishtime := qu.ObjToString(l["publishtime"])
- if publishtime == "0" {
- p_zero++
- } else if l_flag {
- t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
- if t.Unix() <= 0 {
- msg = append(msg, "列表页数据发布时间异常")
- l_flag = false
- }
- }
- }
- if len(data) > 0 {
- //校验publishtime
- if l_np_publishtime := data["l_np_publishtime"].(lua.LNumber); l_np_publishtime <= 0 {
- msg = append(msg, "三级页发布时间小于0")
- } else if p_zero == len(list) && l_np_publishtime == 0 {
- msg = append(msg, "三级页发布时间异常")
- }
- contenthtml := qu.ObjToString(data["contenthtml"])
- if strings.Contains(contenthtml, "img") {
- msg = append(msg, "contenthtml中含有img是否下载")
- }
- detail := qu.ObjToString(data["detail"])
- if TitleFilterReg2.MatchString(detail) {
- msg = append(msg, "三级页正文提取异常")
- }
- //校验jsondata
- if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
- for field, _ := range jsondata {
- if !JsonDataMap[field] {
- msg = append(msg, "jsondata中"+field+"属性错误")
- }
- }
- }
- }
- }
- return
- }
- func GetLuasInfoBySite(site, area, city, district string) (domain, status, event, platform, infotype, specialtype string, remarktime int64) {
- shelveUp := 0
- eventMap, platformMap := map[int]interface{}{}, map[string]interface{}{}
- infoformatMap := map[int]bool{}
- eventArr, platformArr, infoformatArr := []string{}, []string{}, []string{}
- //areaMap := map[string]int{}
- //areaCityMap := map[string]map[string]int{}
- //cityDistrictMap := map[string]map[string]int{}
- domainMap := map[string]bool{}
- domainArr := []string{}
- remarktime = time.Now().Unix()
- //luas, _ := MgoE.Find("luaconfig", `{"param_common.1":"`+site+`"}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
- luas, _ := MgoEB.Find("luaconfig", `{"site":"`+site+`"}`, ``, `{"projecthref":1,"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1,"infoformat":1}`, false, -1, -1)
- arr := [][]map[string]interface{}{}
- for _, l := range *luas {
- update := []map[string]interface{}{}
- set := map[string]interface{}{}
- if b, ok := l["projecthref"].(bool); ok && b { //爬虫采集的数据是流程性信息
- specialtype = "含流程数据"
- }
- //更新爬虫area、city、district
- if area != "" {
- set["model.area"] = area
- }
- if area == "全国" {
- set["model.city"] = ""
- set["model.district"] = ""
- } else if area != "" {
- if city != "" {
- set["model.city"] = city
- }
- if district != "" {
- set["model.district"] = district
- }
- }
- if len(set) > 0 {
- update = append(update, map[string]interface{}{"_id": l["_id"]})
- update = append(update, map[string]interface{}{"$set": set})
- arr = append(arr, update)
- }
- //remarktime
- if comeintime := qu.Int64All(l["comeintime"]); comeintime != int64(0) && comeintime < remarktime {
- remarktime = comeintime
- }
- //domain
- paramCommon := l["param_common"].([]interface{})
- if len(paramCommon) >= 12 {
- href := qu.ObjToString(paramCommon[11])
- domain := DomainReg.FindString(href)
- if domain != "" {
- domain = ReplaceReg.ReplaceAllString(domain, "")
- if !domainMap[domain] {
- domainArr = append(domainArr, domain)
- domainMap[domain] = true
- }
- }
- }
- //state、event、platform
- state := qu.IntAll(l["state"])
- event := qu.IntAll(l["event"])
- platform := qu.ObjToString(l["platform"])
- if state == 5 || state == 11 { //5:lua已上架;11:python已上线
- shelveUp++
- }
- eventMap[event] = true
- platformMap[platform] = true
- //infoformat
- infoformat := qu.IntAll(l["infoformat"])
- infoformatMap[infoformat] = true
- //area、city、district
- //if model, ok := l["model"].(map[string]interface{}); ok && model != nil {
- // a := qu.ObjToString(model["area"])
- // c := qu.ObjToString(model["city"])
- // d := qu.ObjToString(model["district"])
- // if a != "" {
- // areaMap[a] = areaMap[a] + 1
- // if c != "" {
- // if cityNum := areaCityMap[a]; cityNum != nil {
- // cityNum[c] = cityNum[c] + 1
- // } else {
- // areaCityMap[a] = map[string]int{c: 1}
- // }
- // if d != "" {
- // if distrctNum := cityDistrictMap[c]; distrctNum != nil {
- // distrctNum[d] = distrctNum[d] + 1
- // } else {
- // cityDistrictMap[c] = map[string]int{d: 1}
- // }
- // }
- // }
- //
- // }
- //}
- }
- //domain
- domain = strings.Join(domainArr, ";")
- for e, _ := range eventMap {
- eventArr = append(eventArr, fmt.Sprint(e))
- }
- event = strings.Join(eventArr, ",")
- for p, _ := range platformMap {
- platformArr = append(platformArr, p)
- }
- sort.Strings(platformArr)
- platform = strings.Join(platformArr, ",")
- for infoformat, _ := range infoformatMap {
- text := "招标"
- if infoformat == 2 {
- text = "拟建/审批"
- } else if infoformat == 3 {
- text = "产权"
- } else if infoformat == 4 {
- text = "舆情"
- }
- infoformatArr = append(infoformatArr, text)
- }
- sort.Strings(infoformatArr)
- infotype = strings.Join(infoformatArr, ",")
- //
- status = fmt.Sprintf("%d%s%d", shelveUp, "/", len(*luas))
- //批量更新
- if len(arr) > 0 {
- MgoEB.UpdateBulk("luaconfig", arr...)
- arr = [][]map[string]interface{}{}
- }
- //an, cn, dn := 0, 0, 0
- //for at, num := range areaMap {
- // if num > an {
- // area = at
- // an = num
- // }
- //}
- //if area != "" {
- // for ct, num := range areaCityMap[area] {
- // if num > cn {
- // city = ct
- // cn = num
- // }
- // }
- //}
- //if city != "" {
- // for dt, num := range cityDistrictMap[city] {
- // if num > dn {
- // district = dt
- // dn = num
- // }
- // }
- //}
- return
- }
// StringValSorter holds the entries of a map[string]string as two parallel
// slices (Keys[i] belongs to Vals[i]) so that they can be ordered by value
// through sort.Interface.
type StringValSorter struct {
	Keys []string
	Vals []string
}

// Compile-time check that *StringValSorter satisfies sort.Interface.
var _ sort.Interface = (*StringValSorter)(nil)

// MapStringValueSort returns the entries of m ordered by ascending value;
// entries with equal values are ordered by ascending key.
func MapStringValueSort(m map[string]string) *StringValSorter {
	vs := NewStringValSorter(m)
	vs.Sort()
	return vs
}

// NewStringValSorter copies the entries of m into a new, still unsorted
// StringValSorter. A nil or empty map yields empty slices.
func NewStringValSorter(m map[string]string) *StringValSorter {
	vs := &StringValSorter{
		Keys: make([]string, 0, len(m)),
		Vals: make([]string, 0, len(m)),
	}
	for k, v := range m {
		vs.Keys = append(vs.Keys, k)
		vs.Vals = append(vs.Vals, v)
	}
	return vs
}

// Sort orders the entries in place by value, then by key.
func (vs *StringValSorter) Sort() {
	sort.Sort(vs)
}

// Len reports the number of entries.
func (vs *StringValSorter) Len() int {
	return len(vs.Vals)
}

// Less orders by value first. Ties are broken by key so that the outcome
// does not depend on map iteration order or on sort.Sort being unstable.
func (vs *StringValSorter) Less(i, j int) bool {
	if vs.Vals[i] != vs.Vals[j] {
		return vs.Vals[i] < vs.Vals[j]
	}
	return vs.Keys[i] < vs.Keys[j]
}

// Swap exchanges entries i and j, keeping Keys and Vals aligned.
func (vs *StringValSorter) Swap(i, j int) {
	vs.Vals[i], vs.Vals[j] = vs.Vals[j], vs.Vals[i]
	vs.Keys[i], vs.Keys[j] = vs.Keys[j], vs.Keys[i]
}
// MyWrite exposes a bytes.Buffer through a Write method and a read method
// named Reader. Byte must be non-nil before either method is called.
type MyWrite struct {
	Byte *bytes.Buffer
}

// Write appends p to the underlying buffer and returns the buffer's result.
func (m *MyWrite) Write(p []byte) (n int, err error) {
	return m.Byte.Write(p)
}

// Reader reads from the underlying buffer into p and returns the buffer's
// result. Despite its name it has the shape of a Read method; because it is
// not called Read, *MyWrite does not satisfy io.Reader.
func (m *MyWrite) Reader(p []byte) (n int, err error) {
	return m.Byte.Read(p)
}
|