util.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. package util
  2. import (
  3. "fmt"
  4. mgo "mongodb"
  5. qu "qfw/util"
  6. "regexp"
  7. sp "spiderutil"
  8. "strings"
  9. "time"
  10. "github.com/yuin/gopher-lua"
  11. )
var (
	// MgoE is the shared MongoDB connection pool; it is created by InitMgo
	// and used package-wide.
	MgoE *mgo.MongodbSim
	// Province maps a province name to its city/district names; it is
	// populated from areacity.json by InitAreaCity.
	Province map[string][]string
	// DomainNameReg finds URL scheme prefixes ("http"/"https" followed by
	// one or more colons). NOTE(review): `[::]` is a character class that
	// contains only ':' — the author may have intended `://`; current
	// callers only test matches for the substring "https", so the looser
	// pattern happens to work. Confirm before tightening.
	DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
	// DownLoadReg extracts the first (non-greedy) download(...) call from a
	// detail-page script.
	DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
	// CodeTypeReg recognizes the supported charset names expected among the
	// download(...) arguments.
	CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
	// TitleFilterReg1 matches any Han (Chinese) character; list titles are
	// expected to contain at least one.
	TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
	// TitleFilterReg2 matches pagination/noise text ("previous/next page",
	// "read count") that must not leak into titles or extracted detail text.
	TitleFilterReg2 = regexp.MustCompile(`((上|下)一页|阅读次数)`)
	// CheckText is the fmt template of the spidercode/site/channel
	// assignment that must appear verbatim in the list-page script.
	CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
	// JsonDataMap whitelists the field names allowed inside an item's
	// jsondata; anything else is reported as an error by
	// SpiderPassCheckListAndDetail.
	JsonDataMap = map[string]bool{
		"extweight":          true,
		"projecthref":        true,
		"sourcewebsite":      true,
		"sourcehref":         true,
		"area_city_district": true,
		"projectname":        true,
		"projectcode":        true,
		"approvalno":         true,
		"projectscope":       true,
		"item":               true,
		"buyer":              true,
		"agency":             true,
		"budget":             true,
		"buyer_info":         true,
		"buyerperson":        true,
		"buyertel":           true,
		"buyeraddr":          true,
		"projectaddr":        true,
		"publishdept":        true,
		"funds":              true,
		"paymenttype":        true,
		"projectscale":       true,
		"bidmethod":          true,
		"bidopentime":        true,
		"agency_info":        true,
		"agencyperson":       true,
		"agencytel":          true,
		"agencyaddr":         true,
		"isppp":              true,
		"winner":             true,
		"winneraddr":         true,
		"winnerperson":       true,
		"winnertel":          true,
		"bidamount":          true,
		"currency":           true,
		"experts":            true,
		"bidamounttype":      true,
		"contractname":       true,
		"countryprojectcode": true,
		"contractnumber":     true,
		"projectperiod":      true,
		"signaturedate":      true,
		"multipackage":       true,
		"package":            true,
		"supervisorrate":     true,
		"jsoncontent":        true,
		"purchasinglist":     true,
		"toptype":            true,
		"subtype":            true,
		"winnerorder":        true,
	}
)
  74. func InitMgo() {
  75. defer qu.Catch()
  76. MgoE = &mgo.MongodbSim{
  77. MongodbAddr: sp.Config.Dbaddr,
  78. DbName: sp.Config.Dbname,
  79. Size: 5,
  80. }
  81. MgoE.InitPool()
  82. }
// InitAreaCity loads the province/city administrative-division mapping
// from areacity.json into the package-level Province map.
func InitAreaCity() {
	qu.ReadConfig("areacity.json", &Province)
}
  87. //爬虫整体测试时校验爬虫代码
  88. func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) string {
  89. msg := []string{}
  90. //1.检测spidercode、site、channel
  91. if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
  92. spidercode := qu.ObjToString(param[0])
  93. site := qu.ObjToString(param[1])
  94. channel := qu.ObjToString(param[2])
  95. checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
  96. if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
  97. msg = append(msg, "检查代码spidercode、site、channel字段值")
  98. }
  99. }
  100. //2.检测https
  101. isHttps := false
  102. for _, text := range DomainNameReg.FindAllString(liststr, -1) {
  103. if strings.Contains(text, "https") {
  104. isHttps = true
  105. }
  106. }
  107. if isHttps {
  108. downLoadText := DownLoadReg.FindString(contentstr)
  109. if downLoadText != "" {
  110. textArr := strings.Split(downLoadText, ",")
  111. if len(textArr) < 4 {
  112. msg = append(msg, "download方法添加下载参数")
  113. } else if len(textArr) == 4 {
  114. if !CodeTypeReg.MatchString(textArr[0]) || textArr[1] != "true" {
  115. msg = append(msg, "download方法添加下载参数")
  116. }
  117. }
  118. }
  119. }
  120. //3.检测title
  121. if strings.Contains(liststr, `item["title"]="a"`) {
  122. if !strings.Contains(contentstr, `data["title"]`) {
  123. msg = append(msg, "检查代码title的完整性")
  124. }
  125. }
  126. return strings.Join(msg, ",")
  127. }
  128. //爬虫整体测试时校验列表页和详情页内容
  129. func SpiderPassCheckListAndDetail(list []map[string]interface{}, data map[string]interface{}) string {
  130. msg := []string{}
  131. if len(list) > 0 {
  132. p_zero := 0
  133. h_flag := true
  134. n_flag := true
  135. l_flag := true
  136. for _, l := range list {
  137. //校验title
  138. title := qu.ObjToString(l["title"])
  139. if !TitleFilterReg1.MatchString(title) && h_flag {
  140. msg = append(msg, "列表页title中无汉字")
  141. h_flag = false
  142. } else if TitleFilterReg2.MatchString(title) && n_flag {
  143. msg = append(msg, "列表页title中含有上(下)一页")
  144. n_flag = false
  145. }
  146. publishtime := qu.ObjToString(l["publishtime"])
  147. if publishtime == "0" {
  148. p_zero++
  149. } else if l_flag {
  150. t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
  151. if t.Unix() <= 0 {
  152. msg = append(msg, "列表页数据发布时间异常")
  153. l_flag = false
  154. }
  155. }
  156. }
  157. if len(data) > 0 {
  158. //校验publishtime
  159. if l_np_publishtime := data["l_np_publishtime"].(lua.LNumber); l_np_publishtime <= 0 {
  160. msg = append(msg, "三级页发布时间小于0")
  161. } else if p_zero == len(list) && l_np_publishtime == 0 {
  162. msg = append(msg, "三级页发布时间异常")
  163. }
  164. contenthtml := qu.ObjToString(data["contenthtml"])
  165. if strings.Contains(contenthtml, "img") {
  166. msg = append(msg, "contenthtml中含有img是否下载")
  167. }
  168. detail := qu.ObjToString(data["detail"])
  169. if TitleFilterReg2.MatchString(detail) {
  170. msg = append(msg, "三级页正文提取异常")
  171. }
  172. //校验jsondata
  173. if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
  174. for field, _ := range jsondata {
  175. if !JsonDataMap[field] {
  176. msg = append(msg, "jsondata中"+field+"属性错误")
  177. }
  178. }
  179. }
  180. }
  181. }
  182. return strings.Join(msg, ",")
  183. }