// util.go: shared helpers for the spider editor (MongoDB handles, area/city
// tables, spider script checks, and small sorting/caching utilities).
package util

import (
	"bytes"
	"fmt"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	mgo "mongodb"
	qu "qfw/util"
	sp "spiderutil"

	"github.com/yuin/gopher-lua"
)
  15. const Role_Admin, Role_Examine, Role_Dev = 3, 2, 1 //管理员,审核员,开发员
  16. var (
  17. //MgoE *mgo.MongodbSim //编辑器87
  18. MgoEB *mgo.MongodbSim //编辑器163
  19. MgoS *mgo.MongodbSim
  20. Province map[string][]string
  21. City map[string][]string
  22. DomainNameReg = regexp.MustCompile(`(http|https)[::]+`)
  23. DownLoadReg = regexp.MustCompile(`download\(.*?\)`)
  24. CodeTypeReg = regexp.MustCompile(`(utf8|utf-8|gbk)`)
  25. TitleFilterReg1 = regexp.MustCompile(`[\p{Han}]`)
  26. TitleFilterReg2 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数)`)
  27. DetailFilterReg1 = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)
  28. Area []string //省份
  29. DomainReg = regexp.MustCompile(`(?://).+?(?:[::/])`)
  30. SymbolReg = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")
  31. ReplaceReg = regexp.MustCompile(`[]::/]+`)
  32. CheckText = `item["spidercode"]="%s";item["site"]="%s";item["channel"]="%s"`
  33. CheckText_Code = `item["spidercode"]="%s"`
  34. CheckText_Site = `item["site"]="%s"`
  35. CheckText_Channel = `item["channel"]="%s"`
  36. JsonDataMap = map[string]bool{ //jsondata
  37. "extweight": true,
  38. "projecthref": true,
  39. "sourcewebsite": true,
  40. "sourcehref": true,
  41. "area_city_district": true,
  42. "projectname": true,
  43. "projectcode": true,
  44. "approvalno": true,
  45. "projectscope": true,
  46. "item": true,
  47. "buyer": true,
  48. "agency": true,
  49. "budget": true,
  50. "buyer_info": true,
  51. "buyerperson": true,
  52. "buyertel": true,
  53. "buyeraddr": true,
  54. "projectaddr": true,
  55. "publishdept": true,
  56. "funds": true,
  57. "paymenttype": true,
  58. "projectscale": true,
  59. "bidmethod": true,
  60. "bidopentime": true,
  61. "agency_info": true,
  62. "agencyperson": true,
  63. "agencytel": true,
  64. "agencyaddr": true,
  65. "isppp": true,
  66. "winner": true,
  67. "winneraddr": true,
  68. "winnerperson": true,
  69. "winnertel": true,
  70. "bidamount": true,
  71. "currency": true,
  72. "experts": true,
  73. "bidamounttype": true,
  74. "contractname": true,
  75. "countryprojectcode": true,
  76. "contractnumber": true,
  77. "projectperiod": true,
  78. "signaturedate": true,
  79. "multipackage": true,
  80. "package": true,
  81. "supervisorrate": true,
  82. "jsoncontent": true,
  83. "purchasinglist": true,
  84. "toptype": true,
  85. "subtype": true,
  86. "winnerorder": true,
  87. "bidopendate": true,
  88. "bidtype": true,
  89. }
  90. Bu = "_bu" //创建采历史爬虫后缀
  91. )
  92. func InitMgo() {
  93. defer qu.Catch()
  94. //MgoE = &mgo.MongodbSim{
  95. // MongodbAddr: sp.Config.Dbaddr,
  96. // DbName: sp.Config.Dbname,
  97. // Size: 10,
  98. //}
  99. //MgoE.InitPool()
  100. MgoEB = &mgo.MongodbSim{
  101. MongodbAddr: sp.Config.BidEditor.Addr,
  102. DbName: sp.Config.BidEditor.Db,
  103. Size: sp.Config.BidEditor.Size,
  104. UserName: sp.Config.BidEditor.Username,
  105. Password: sp.Config.BidEditor.Password,
  106. }
  107. MgoEB.InitPool()
  108. MgoS = &mgo.MongodbSim{
  109. MongodbAddr: sp.Config.Dbaddr,
  110. DbName: sp.Config.Dbname2,
  111. Size: 10,
  112. }
  113. MgoS.InitPool()
  114. }
  115. // 初始化省市行政区划信息
  116. func InitAreaCity() {
  117. //qu.ReadConfig("areacity.json", &Province)
  118. //Area = append(Area, "全国")
  119. //for area, _ := range Province {
  120. // if area == "全国" {
  121. // continue
  122. // }
  123. // Area = append(Area, area)
  124. //}
  125. Province = map[string][]string{}
  126. City = map[string][]string{}
  127. Area = append(Area, "全国")
  128. list, _ := MgoEB.Find("address", nil, nil, nil, false, -1, -1)
  129. for _, tmp := range *list {
  130. province := qu.ObjToString(tmp["province"])
  131. city := qu.ObjToString(tmp["city"])
  132. district := qu.ObjToString(tmp["district"])
  133. if province != "" && city == "" && district == "" { //area
  134. Area = append(Area, province)
  135. } else if province != "" && city != "" && district == "" { //city
  136. cityArr := Province[province]
  137. cityArr = append(cityArr, city)
  138. Province[province] = cityArr
  139. } else if province != "" && city != "" && district != "" { //district
  140. districtArr := City[city]
  141. districtArr = append(districtArr, district)
  142. City[city] = districtArr
  143. }
  144. }
  145. }
  146. // 爬虫整体测试时校验爬虫代码
  147. func SpiderPassCheckLua(liststr, contentstr string, lua map[string]interface{}) (msg []string) {
  148. //校验含过滤方法stringFind但没有过滤注释“--关键词过滤”
  149. //if strings.Contains(liststr, "stringFind") && !strings.Contains(liststr, "--关键词过滤") {
  150. // msg = append(msg, "列表页代码有过滤方法stringFind但缺少注释:--关键词过滤")
  151. //}
  152. //if strings.Contains(contentstr, "--关键词过滤") && !strings.Contains(contentstr, "delete") {
  153. // msg = append(msg, `三级页代码有过滤方法但缺少data["delete"]="true"`)
  154. //}
  155. //if !strings.Contains(contentstr, "s_title") {
  156. // msg = append(msg, "三级页缺少s_title")
  157. //}
  158. if !strings.Contains(contentstr, "getFileAttachmentsArrayWithTag") && !strings.Contains(contentstr, "downloadFile") {
  159. msg = append(msg, "三级页缺少下载附件方法")
  160. }
  161. //1.检测spidercode、site、channel
  162. //if param, ok := lua["param_common"].([]interface{}); ok && len(param) >= 3 {
  163. // spidercode := qu.ObjToString(param[0])
  164. // site := qu.ObjToString(param[1])
  165. // channel := qu.ObjToString(param[2])
  166. // checkText := fmt.Sprintf(CheckText, spidercode, site, channel)
  167. // if strings.Contains(liststr, `item["spidercode"]`) && !strings.Contains(liststr, checkText) {
  168. // msg = append(msg, "检查代码spidercode、site、channel字段值")
  169. // }
  170. //}
  171. //2.检测https
  172. //isHttps := false
  173. //for _, text := range DomainNameReg.FindAllString(liststr, -1) {
  174. // if strings.Contains(text, "https") {
  175. // isHttps = true
  176. // }
  177. //}
  178. //if isHttps {
  179. // for tmpStr, tmpText := range map[string]string{"列表页": liststr, "三级页": contentstr} {
  180. // downLoadText := DownLoadReg.FindString(tmpText)
  181. // if downLoadText != "" {
  182. // textArr := strings.Split(downLoadText, ",")
  183. // if len(textArr) < 4 {
  184. // msg = append(msg, tmpStr+"download方法添加下载参数")
  185. // } else if len(textArr) == 4 {
  186. // if !CodeTypeReg.MatchString(textArr[0]) || (textArr[1] != "true" && textArr[1] != "false") {
  187. // msg = append(msg, tmpStr+"download方法添加下载参数")
  188. // }
  189. // }
  190. // }
  191. // }
  192. //}
  193. //3.检测title
  194. //if strings.Contains(liststr, `item["title"]="a"`) {
  195. // if !strings.Contains(contentstr, `data["title"]`) {
  196. // msg = append(msg, "检查代码title的完整性")
  197. // }
  198. //}
  199. // 4.检测sendListNum
  200. //if !strings.Contains(liststr, "sendListNum") {
  201. // msg = append(msg, "sendListNum方法缺失")
  202. //}
  203. return
  204. }
  205. // 爬虫整体测试时校验列表页和详情页内容
  206. func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, data map[string]interface{}) (msg []string) {
  207. msgMap := map[string]bool{}
  208. //校验列表页信息
  209. for _, list := range result {
  210. for _, l := range list {
  211. //校验title
  212. title := qu.ObjToString(l["title"])
  213. if !TitleFilterReg1.MatchString(title) {
  214. msgMap["列表页title中无汉字"] = true
  215. } else if TitleFilterReg2.MatchString(title) {
  216. msgMap["列表页title中含有上(下)一页"] = true
  217. }
  218. //校验发布时间
  219. publishtime := qu.ObjToString(l["publishtime"])
  220. if publishtime == "0" || publishtime == "" {
  221. msgMap["列表页publishtime取值异常"] = true
  222. } else {
  223. t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
  224. if err != nil || t.Unix() <= 0 {
  225. msgMap["列表页publishtime取值异常"] = true
  226. }
  227. }
  228. }
  229. }
  230. if len(data) > 0 {
  231. //校验publishtime
  232. if l_np_publishtime, ok := data["l_np_publishtime"].(lua.LNumber); ok {
  233. if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
  234. msgMap["三级页publishtime取值异常"] = true
  235. }
  236. } else if l_np_publishtime, ok := data["l_np_publishtime"].(int64); ok {
  237. if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
  238. msgMap["三级页publishtime取值异常"] = true
  239. }
  240. } else {
  241. msgMap["三级页publishtime值类型异常"] = true
  242. }
  243. contenthtml := qu.ObjToString(data["contenthtml"])
  244. if strings.Contains(contenthtml, "img") {
  245. msgMap["contenthtml中含有img是否下载"] = true
  246. }
  247. if strings.Contains(contenthtml, "iframe") {
  248. msgMap["contenthtml中含有iframe是否下载"] = true
  249. }
  250. detail := qu.ObjToString(data["detail"])
  251. if DetailFilterReg1.MatchString(detail) {
  252. msgMap["三级页正文提取包含无效内容"] = true
  253. }
  254. //校验jsondata
  255. if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
  256. for field, _ := range jsondata {
  257. if !JsonDataMap[field] {
  258. msgMap["jsondata中"+field+"属性错误"] = true
  259. }
  260. }
  261. }
  262. }
  263. for text, _ := range msgMap {
  264. msg = append(msg, text)
  265. }
  266. return
  267. }
  268. // 爬虫整体测试时校验列表页和详情页内容
  269. func SpiderPassCheckListAndDetail_back(list []map[string]interface{}, data map[string]interface{}) (msg []string) {
  270. if len(list) > 0 {
  271. p_zero := 0
  272. h_flag := true
  273. n_flag := true
  274. l_flag := true
  275. for _, l := range list {
  276. //校验title
  277. title := qu.ObjToString(l["title"])
  278. if !TitleFilterReg1.MatchString(title) && h_flag {
  279. msg = append(msg, "列表页title中无汉字")
  280. h_flag = false
  281. } else if TitleFilterReg2.MatchString(title) && n_flag {
  282. msg = append(msg, "列表页title中含有上(下)一页")
  283. n_flag = false
  284. }
  285. publishtime := qu.ObjToString(l["publishtime"])
  286. if publishtime == "0" {
  287. p_zero++
  288. } else if l_flag {
  289. t, _ := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
  290. if t.Unix() <= 0 {
  291. msg = append(msg, "列表页数据发布时间异常")
  292. l_flag = false
  293. }
  294. }
  295. }
  296. if len(data) > 0 {
  297. //校验publishtime
  298. if l_np_publishtime := data["l_np_publishtime"].(lua.LNumber); l_np_publishtime <= 0 {
  299. msg = append(msg, "三级页发布时间小于0")
  300. } else if p_zero == len(list) && l_np_publishtime == 0 {
  301. msg = append(msg, "三级页发布时间异常")
  302. }
  303. contenthtml := qu.ObjToString(data["contenthtml"])
  304. if strings.Contains(contenthtml, "img") {
  305. msg = append(msg, "contenthtml中含有img是否下载")
  306. }
  307. detail := qu.ObjToString(data["detail"])
  308. if TitleFilterReg2.MatchString(detail) {
  309. msg = append(msg, "三级页正文提取异常")
  310. }
  311. //校验jsondata
  312. if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
  313. for field, _ := range jsondata {
  314. if !JsonDataMap[field] {
  315. msg = append(msg, "jsondata中"+field+"属性错误")
  316. }
  317. }
  318. }
  319. }
  320. }
  321. return
  322. }
  323. func GetLuasInfoBySite(site, area, city, district string) (domain, status, event, platform, infotype, specialtype string, remarktime int64) {
  324. shelveUp := 0
  325. eventMap, platformMap := map[int]interface{}{}, map[string]interface{}{}
  326. infoformatMap := map[int]bool{}
  327. eventArr, platformArr, infoformatArr := []string{}, []string{}, []string{}
  328. //areaMap := map[string]int{}
  329. //areaCityMap := map[string]map[string]int{}
  330. //cityDistrictMap := map[string]map[string]int{}
  331. domainMap := map[string]bool{}
  332. domainArr := []string{}
  333. remarktime = time.Now().Unix()
  334. //luas, _ := MgoE.Find("luaconfig", `{"param_common.1":"`+site+`"}`, ``, `{"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1}`, false, -1, -1)
  335. luas, _ := MgoEB.Find("luaconfig", `{"site":"`+site+`"}`, ``, `{"projecthref":1,"model":1,"event":1,"state":1,"platform":1,"param_common":1,"comeintime":1,"infoformat":1}`, false, -1, -1)
  336. arr := [][]map[string]interface{}{}
  337. for _, l := range *luas {
  338. update := []map[string]interface{}{}
  339. set := map[string]interface{}{}
  340. if b, ok := l["projecthref"].(bool); ok && b { //爬虫采集的数据是流程性信息
  341. specialtype = "含流程数据"
  342. }
  343. //更新爬虫area、city、district
  344. if area != "" {
  345. set["model.area"] = area
  346. }
  347. if area == "全国" {
  348. set["model.city"] = ""
  349. set["model.district"] = ""
  350. } else if area != "" {
  351. if city != "" {
  352. set["model.city"] = city
  353. }
  354. if district != "" {
  355. set["model.district"] = district
  356. }
  357. }
  358. if len(set) > 0 {
  359. update = append(update, map[string]interface{}{"_id": l["_id"]})
  360. update = append(update, map[string]interface{}{"$set": set})
  361. arr = append(arr, update)
  362. }
  363. //remarktime
  364. if comeintime := qu.Int64All(l["comeintime"]); comeintime != int64(0) && comeintime < remarktime {
  365. remarktime = comeintime
  366. }
  367. //domain
  368. paramCommon := l["param_common"].([]interface{})
  369. if len(paramCommon) >= 12 {
  370. href := qu.ObjToString(paramCommon[11])
  371. domain := DomainReg.FindString(href)
  372. if domain != "" {
  373. domain = ReplaceReg.ReplaceAllString(domain, "")
  374. if !domainMap[domain] {
  375. domainArr = append(domainArr, domain)
  376. domainMap[domain] = true
  377. }
  378. }
  379. }
  380. //state、event、platform
  381. state := qu.IntAll(l["state"])
  382. event := qu.IntAll(l["event"])
  383. platform := qu.ObjToString(l["platform"])
  384. if state == 5 || state == 11 { //5:lua已上架;11:python已上线
  385. shelveUp++
  386. }
  387. eventMap[event] = true
  388. platformMap[platform] = true
  389. //infoformat
  390. infoformat := qu.IntAll(l["infoformat"])
  391. infoformatMap[infoformat] = true
  392. //area、city、district
  393. //if model, ok := l["model"].(map[string]interface{}); ok && model != nil {
  394. // a := qu.ObjToString(model["area"])
  395. // c := qu.ObjToString(model["city"])
  396. // d := qu.ObjToString(model["district"])
  397. // if a != "" {
  398. // areaMap[a] = areaMap[a] + 1
  399. // if c != "" {
  400. // if cityNum := areaCityMap[a]; cityNum != nil {
  401. // cityNum[c] = cityNum[c] + 1
  402. // } else {
  403. // areaCityMap[a] = map[string]int{c: 1}
  404. // }
  405. // if d != "" {
  406. // if distrctNum := cityDistrictMap[c]; distrctNum != nil {
  407. // distrctNum[d] = distrctNum[d] + 1
  408. // } else {
  409. // cityDistrictMap[c] = map[string]int{d: 1}
  410. // }
  411. // }
  412. // }
  413. //
  414. // }
  415. //}
  416. }
  417. //domain
  418. domain = strings.Join(domainArr, ";")
  419. for e, _ := range eventMap {
  420. eventArr = append(eventArr, fmt.Sprint(e))
  421. }
  422. event = strings.Join(eventArr, ",")
  423. for p, _ := range platformMap {
  424. platformArr = append(platformArr, p)
  425. }
  426. sort.Strings(platformArr)
  427. platform = strings.Join(platformArr, ",")
  428. for infoformat, _ := range infoformatMap {
  429. text := "招标"
  430. if infoformat == 2 {
  431. text = "拟建/审批"
  432. } else if infoformat == 3 {
  433. text = "产权"
  434. } else if infoformat == 4 {
  435. text = "舆情"
  436. }
  437. infoformatArr = append(infoformatArr, text)
  438. }
  439. sort.Strings(infoformatArr)
  440. infotype = strings.Join(infoformatArr, ",")
  441. //
  442. status = fmt.Sprintf("%d%s%d", shelveUp, "/", len(*luas))
  443. //批量更新
  444. if len(arr) > 0 {
  445. MgoEB.UpdateBulk("luaconfig", arr...)
  446. arr = [][]map[string]interface{}{}
  447. }
  448. //an, cn, dn := 0, 0, 0
  449. //for at, num := range areaMap {
  450. // if num > an {
  451. // area = at
  452. // an = num
  453. // }
  454. //}
  455. //if area != "" {
  456. // for ct, num := range areaCityMap[area] {
  457. // if num > cn {
  458. // city = ct
  459. // cn = num
  460. // }
  461. // }
  462. //}
  463. //if city != "" {
  464. // for dt, num := range cityDistrictMap[city] {
  465. // if num > dn {
  466. // district = dt
  467. // dn = num
  468. // }
  469. // }
  470. //}
  471. return
  472. }
  473. type StringValSorter struct {
  474. Keys []string
  475. Vals []string
  476. }
  477. func MapStringValueSort(m map[string]string) *StringValSorter {
  478. vs := NewStringValSorter(m)
  479. vs.Sort()
  480. return vs
  481. }
  482. func NewStringValSorter(m map[string]string) *StringValSorter {
  483. vs := &StringValSorter{
  484. Keys: make([]string, 0, len(m)),
  485. Vals: make([]string, 0, len(m)),
  486. }
  487. for k, v := range m {
  488. vs.Keys = append(vs.Keys, k)
  489. vs.Vals = append(vs.Vals, v)
  490. }
  491. return vs
  492. }
  493. func (vs *StringValSorter) Sort() {
  494. sort.Sort(vs)
  495. }
  496. func (vs *StringValSorter) Len() int {
  497. return len(vs.Vals)
  498. }
  499. func (vs *StringValSorter) Less(i, j int) bool {
  500. return vs.Vals[i] < vs.Vals[j]
  501. }
  502. func (vs *StringValSorter) Swap(i, j int) {
  503. vs.Vals[i], vs.Vals[j] = vs.Vals[j], vs.Vals[i]
  504. vs.Keys[i], vs.Keys[j] = vs.Keys[j], vs.Keys[i]
  505. }
  506. type MyWrite struct {
  507. Byte *bytes.Buffer
  508. }
  509. func (m *MyWrite) Write(p []byte) (n int, err error) {
  510. n, err = m.Byte.Write(p)
  511. return
  512. }
  513. func (m *MyWrite) Reader(p []byte) (n int, err error) {
  514. n, err = m.Byte.Read(p)
  515. return
  516. }
  517. // 获取第day天凌晨的时间戳
  518. func GetTime(day int) int64 {
  519. defer qu.Catch()
  520. nowTime := time.Now().AddDate(0, 0, day)
  521. timeStr := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
  522. t, _ := time.ParseInLocation(qu.Date_Short_Layout, timeStr, time.Local)
  523. return t.Unix()
  524. }
  525. var CodeTimeCache = &TimeCache{
  526. Data: map[string]interface{}{},
  527. mutex: sync.Mutex{},
  528. }
  529. type TimeCache struct {
  530. Data map[string]interface{}
  531. mutex sync.Mutex
  532. }
  533. func (c *TimeCache) Set(key string, value interface{}, duration time.Duration) {
  534. c.mutex.Lock()
  535. defer c.mutex.Unlock()
  536. c.Data[key] = value
  537. go func() {
  538. time.Sleep(duration)
  539. c.mutex.Lock()
  540. defer c.mutex.Unlock()
  541. delete(c.Data, key)
  542. }()
  543. }
  544. func (c *TimeCache) Get(key string) (interface{}, bool) {
  545. c.mutex.Lock()
  546. defer c.mutex.Unlock()
  547. value, ok := c.Data[key]
  548. return value, ok
  549. }
  550. func (c *TimeCache) Del(key string) {
  551. c.mutex.Lock()
  552. defer c.mutex.Unlock()
  553. delete(c.Data, key)
  554. }