util.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. package util
  2. import (
  3. "fmt"
  4. mgo "mongodb"
  5. qu "qfw/util"
  6. "regexp"
  7. "sort"
  8. sp "spiderutil"
  9. "strings"
  10. "time"
  11. "github.com/yuin/gopher-lua"
  12. )
  13. var (
  14. //MgoE *mgo.MongodbSim //编辑器87
  15. MgoEB *mgo.MongodbSim //编辑器163
  16. MgoS *mgo.MongodbSim
  17. Province map[string][]string
  18. DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
  19. DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
  20. CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
  21. TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
  22. TitleFilterReg2 = regexp.MustCompile(`((上|下)一页|阅读次数)`)
  23. Area []string //省份
  24. DomainReg = regexp.MustCompile(`(?://).+?(?:[::/])`)
  25. SymbolReg = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")
  26. ReplaceReg = regexp.MustCompile(`[]::/]+`)
  27. CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
  28. JsonDataMap = map[string]bool{ //jsondata
  29. "extweight": true,
  30. "projecthref": true,
  31. "sourcewebsite": true,
  32. "sourcehref": true,
  33. "area_city_district": true,
  34. "projectname": true,
  35. "projectcode": true,
  36. "approvalno": true,
  37. "projectscope": true,
  38. "item": true,
  39. "buyer": true,
  40. "agency": true,
  41. "budget": true,
  42. "buyer_info": true,
  43. "buyerperson": true,
  44. "buyertel": true,
  45. "buyeraddr": true,
  46. "projectaddr": true,
  47. "publishdept": true,
  48. "funds": true,
  49. "paymenttype": true,
  50. "projectscale": true,
  51. "bidmethod": true,
  52. "bidopentime": true,
  53. "agency_info": true,
  54. "agencyperson": true,
  55. "agencytel": true,
  56. "agencyaddr": true,
  57. "isppp": true,
  58. "winner": true,
  59. "winneraddr": true,
  60. "winnerperson": true,
  61. "winnertel": true,
  62. "bidamount": true,
  63. "currency": true,
  64. "experts": true,
  65. "bidamounttype": true,
  66. "contractname": true,
  67. "countryprojectcode": true,
  68. "contractnumber": true,
  69. "projectperiod": true,
  70. "signaturedate": true,
  71. "multipackage": true,
  72. "package": true,
  73. "supervisorrate": true,
  74. "jsoncontent": true,
  75. "purchasinglist": true,
  76. "toptype": true,
  77. "subtype": true,
  78. "winnerorder": true,
  79. "bidopendate": true,
  80. }
  81. )
  82. func InitMgo() {
  83. defer qu.Catch()
  84. //MgoE = &mgo.MongodbSim{
  85. // MongodbAddr: sp.Config.Dbaddr,
  86. // DbName: sp.Config.Dbname,
  87. // Size: 10,
  88. //}
  89. //MgoE.InitPool()
  90. MgoEB = &mgo.MongodbSim{
  91. MongodbAddr: sp.Config.BidEditor.Addr,
  92. DbName: sp.Config.BidEditor.Db,
  93. Size: sp.Config.BidEditor.Size,
  94. UserName: sp.Config.BidEditor.Username,
  95. Password: sp.Config.BidEditor.Password,
  96. }
  97. MgoEB.InitPool()
  98. MgoS = &mgo.MongodbSim{
  99. MongodbAddr: sp.Config.Dbaddr,
  100. DbName: sp.Config.Dbname2,
  101. Size: 10,
  102. }
  103. MgoS.InitPool()
  104. }
  105. //初始化省市行政区划信息
  106. func InitAreaCity() {
  107. qu.ReadConfig("areacity.json", &Province)
  108. for area, _ := range Province {
  109. if area == "全国" {
  110. continue
  111. }
  112. Area = append(Area, area)
  113. }
  114. }
  115. //爬虫整体测试时校验爬虫代码
  116. func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) string {
  117. msg := []string{}
  118. //校验含过滤方法stringFind但没有过滤注释“--关键词过滤”
  119. if strings.Contains(liststr, "stringFind") && !strings.Contains(liststr, "--关键词过滤") {
  120. msg = append(msg, "列表页代码有过滤方法stringFind但缺少注释:--关键词过滤")
  121. }
  122. if strings.Contains(contentstr, "--关键词过滤") && !strings.Contains(contentstr, "delete") {
  123. msg = append(msg, `三级页代码有过滤方法但缺少data["delete"]="true"`)
  124. }
  125. //1.检测spidercode、site、channel
  126. if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
  127. spidercode := qu.ObjToString(param[0])
  128. site := qu.ObjToString(param[1])
  129. channel := qu.ObjToString(param[2])
  130. checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
  131. if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
  132. msg = append(msg, "检查代码spidercode、site、channel字段值")
  133. }
  134. }
  135. //2.检测https
  136. isHttps := false
  137. for _, text := range DomainNameReg.FindAllString(liststr, -1) {
  138. if strings.Contains(text, "https") {
  139. isHttps = true
  140. }
  141. }
  142. if isHttps {
  143. for tmpStr, tmpText := range map[string]string{"列表页": liststr, "三级页": contentstr} {
  144. downLoadText := DownLoadReg.FindString(tmpText)
  145. if downLoadText != "" {
  146. textArr := strings.Split(downLoadText, ",")
  147. if len(textArr) < 4 {
  148. msg = append(msg, tmpStr+"download方法添加下载参数")
  149. } else if len(textArr) == 4 {
  150. if !CodeTypeReg.MatchString(textArr[0]) || (textArr[1] != "true" && textArr[1] != "false") {
  151. msg = append(msg, tmpStr+"download方法添加下载参数")
  152. }
  153. }
  154. }
  155. }
  156. }
  157. //3.检测title
  158. if strings.Contains(liststr, `item["title"]="a"`) {
  159. if !strings.Contains(contentstr, `data["title"]`) {
  160. msg = append(msg, "检查代码title的完整性")
  161. }
  162. }
  163. // 4.检测sendListNum
  164. if !strings.Contains(liststr, "sendListNum") {
  165. msg = append(msg, "sendListNum方法缺失")
  166. }
  167. return strings.Join(msg, ",")
  168. }
  169. //爬虫整体测试时校验列表页和详情页内容
  170. func SpiderPassCheckListAndDetail(list []map[string]interface{}, data map[string]interface{}) string {
  171. msg := []string{}
  172. if len(list) > 0 {
  173. p_zero := 0
  174. h_flag := true
  175. n_flag := true
  176. l_flag := true
  177. for _, l := range list {
  178. //校验title
  179. title := qu.ObjToString(l["title"])
  180. if !TitleFilterReg1.MatchString(title) && h_flag {
  181. msg = append(msg, "列表页title中无汉字")
  182. h_flag = false
  183. } else if TitleFilterReg2.MatchString(title) && n_flag {
  184. msg = append(msg, "列表页title中含有上(下)一页")
  185. n_flag = false
  186. }
  187. publishtime := qu.ObjToString(l["publishtime"])
  188. if publishtime == "0" {
  189. p_zero++
  190. } else if l_flag {
  191. t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
  192. if t.Unix() <= 0 {
  193. msg = append(msg, "列表页数据发布时间异常")
  194. l_flag = false
  195. }
  196. }
  197. }
  198. if len(data) > 0 {
  199. //校验publishtime
  200. if l_np_publishtime := data["l_np_publishtime"].(lua.LNumber); l_np_publishtime <= 0 {
  201. msg = append(msg, "三级页发布时间小于0")
  202. } else if p_zero == len(list) && l_np_publishtime == 0 {
  203. msg = append(msg, "三级页发布时间异常")
  204. }
  205. contenthtml := qu.ObjToString(data["contenthtml"])
  206. if strings.Contains(contenthtml, "img") {
  207. msg = append(msg, "contenthtml中含有img是否下载")
  208. }
  209. detail := qu.ObjToString(data["detail"])
  210. if TitleFilterReg2.MatchString(detail) {
  211. msg = append(msg, "三级页正文提取异常")
  212. }
  213. //校验jsondata
  214. if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
  215. for field, _ := range jsondata {
  216. if !JsonDataMap[field] {
  217. msg = append(msg, "jsondata中"+field+"属性错误")
  218. }
  219. }
  220. }
  221. }
  222. }
  223. return strings.Join(msg, ",")
  224. }
  225. func GetLuasInfoBySite(site string) (domain, status, event, platform, area, city, district string, remarktime int64) {
  226. shelveUp := 0
  227. eventMap, platformMap := map[int]interface{}{}, map[string]interface{}{}
  228. eventArr, platformArr := []string{}, []string{}
  229. areaMap := map[string]int{}
  230. areaCityMap := map[string]map[string]int{}
  231. cityDistrictMap := map[string]map[string]int{}
  232. domainMap := map[string]int{}
  233. remarktime = time.Now().Unix()
  234. //luas, _ := MgoE.Find("luaconfig", `{"param_common.1":"`+site+`"}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
  235. luas, _ := MgoEB.Find("luaconfig", `{"param_common.1":"`+site+`"}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
  236. for _, l := range *luas {
  237. //remarktime
  238. if comeintime := qu.Int64All(l["comeintime"]); comeintime != int64(0) && comeintime < remarktime {
  239. remarktime = comeintime
  240. }
  241. //domain
  242. paramCommon := l["param_common"].([]interface{})
  243. if len(paramCommon) >= 12 {
  244. href := qu.ObjToString(paramCommon[11])
  245. domain := DomainReg.FindString(href)
  246. if domain != "" {
  247. domain = ReplaceReg.ReplaceAllString(domain, "")
  248. domainMap[domain] = domainMap[domain] + 1
  249. }
  250. }
  251. //state、event、platform
  252. state := qu.IntAll(l["state"])
  253. event := qu.IntAll(l["event"])
  254. platform := qu.ObjToString(l["platform"])
  255. if state == 5 {
  256. shelveUp++
  257. }
  258. eventMap[event] = true
  259. platformMap[platform] = true
  260. //area、city、district
  261. if model, ok := l["model"].(map[string]interface{}); ok && model != nil {
  262. a := qu.ObjToString(model["area"])
  263. c := qu.ObjToString(model["city"])
  264. d := qu.ObjToString(model["district"])
  265. if a != "" {
  266. areaMap[a] = areaMap[a] + 1
  267. if c != "" {
  268. if cityNum := areaCityMap[a]; cityNum != nil {
  269. cityNum[c] = cityNum[c] + 1
  270. } else {
  271. areaCityMap[a] = map[string]int{c: 1}
  272. }
  273. if d != "" {
  274. if distrctNum := cityDistrictMap[c]; distrctNum != nil {
  275. distrctNum[d] = distrctNum[d] + 1
  276. } else {
  277. cityDistrictMap[c] = map[string]int{d: 1}
  278. }
  279. }
  280. }
  281. }
  282. }
  283. }
  284. //
  285. for e, _ := range eventMap {
  286. eventArr = append(eventArr, fmt.Sprint(e))
  287. }
  288. event = strings.Join(eventArr, ",")
  289. for p, _ := range platformMap {
  290. platformArr = append(platformArr, p)
  291. }
  292. platform = strings.Join(platformArr, ",")
  293. //
  294. n := 0
  295. for tmpDomain, num := range domainMap {
  296. if num > n {
  297. n = num
  298. domain = tmpDomain
  299. }
  300. }
  301. status = fmt.Sprintf("%d%s%d", shelveUp, "/", len(*luas))
  302. //
  303. an, cn, dn := 0, 0, 0
  304. for at, num := range areaMap {
  305. if num > an {
  306. area = at
  307. an = num
  308. }
  309. }
  310. if area != "" {
  311. for ct, num := range areaCityMap[area] {
  312. if num > cn {
  313. city = ct
  314. cn = num
  315. }
  316. }
  317. }
  318. if city != "" {
  319. for dt, num := range cityDistrictMap[city] {
  320. if num > dn {
  321. district = dt
  322. dn = num
  323. }
  324. }
  325. }
  326. return
  327. }
  328. type StringValSorter struct {
  329. Keys []string
  330. Vals []string
  331. }
  332. func MapStringValueSort(m map[string]string) *StringValSorter {
  333. vs := NewStringValSorter(m)
  334. vs.Sort()
  335. return vs
  336. }
  337. func NewStringValSorter(m map[string]string) *StringValSorter {
  338. vs := &StringValSorter{
  339. Keys: make([]string, 0, len(m)),
  340. Vals: make([]string, 0, len(m)),
  341. }
  342. for k, v := range m {
  343. vs.Keys = append(vs.Keys, k)
  344. vs.Vals = append(vs.Vals, v)
  345. }
  346. return vs
  347. }
  348. func (vs *StringValSorter) Sort() {
  349. sort.Sort(vs)
  350. }
  351. func (vs *StringValSorter) Len() int {
  352. return len(vs.Vals)
  353. }
  354. func (vs *StringValSorter) Less(i, j int) bool {
  355. return vs.Vals[i] < vs.Vals[j]
  356. }
  357. func (vs *StringValSorter) Swap(i, j int) {
  358. vs.Vals[i], vs.Vals[j] = vs.Vals[j], vs.Vals[i]
  359. vs.Keys[i], vs.Keys[j] = vs.Keys[j], vs.Keys[i]
  360. }