采集列表页(关键词).py 6.4 KB

# -*- coding: utf-8 -*-
"""
Created on 2024-06-01
---------
@summary: Qianlima list-page collection (keyword search)
---------
@author: Dzr
"""
import json
import math
import random
import time
from pathlib import Path

import requests
from loguru import logger
from pybloom_live import BloomFilter
from pymongo import MongoClient

from login import auto_login, account_pool

# Per-account session state, populated by setup_cfg().
_cookies = None
_headers = None
_proxies = None


def send_wechat_warning(msg, send=True):
    """Push an alert to the WeChat Work webhook; with send=False only log it locally."""
    markdown = 'Collection interrupted by an exception, please switch to d-mode to handle it.'
    markdown += f'\n>Error details: <font color="warning">**{msg}**</font>'
    if not send:
        logger.info(markdown)
        return

    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=079193d8-1856-443e-9f6d-ecc5c883bf11'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown, 'mentioned_mobile_list': ['17610673271']}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())
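
# Usage sketch: during debugging the webhook call can be skipped and the alert only
# logged locally. The message text below is illustrative, not taken from the script:
#
#     send_wechat_warning('search.vip.qianlima.com returned 403', send=False)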


def setup_cfg(username):
    """Load cookies/headers/proxies for the given account from account/<username>.json."""
    global _cookies, _headers, _proxies
    file = (Path(__file__).parent / f'account/{username}.json').absolute()
    with open(file, encoding='utf-8') as rp:
        json_data = json.load(rp)

    _cookies = json_data['cookies']
    _headers = json_data['headers']
    _proxies = json_data['proxies']
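
# The account JSON file itself is not part of this listing; judging only from how
# setup_cfg() consumes it (presumably it is written by auto_login()), it would look
# roughly like the sketch below. All values are placeholders, not real data:
#
# {
#     "cookies": {"...": "..."},
#     "headers": {"User-Agent": "...", "Referer": "..."},
#     "proxies": {"http": "http://...", "https": "http://..."}
# }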


def launch_filter():
    """Create (or restore) the Bloom filter used for URL de-duplication."""
    logger.debug('Creating Bloom filter...')
    backup = Path(__file__).parent / 'backup'
    if not backup.exists():
        backup.mkdir(exist_ok=True)

    file = backup / 'bloomfilter.f'
    if not file.exists():
        file.touch()  # create the backing file on first run
        # Expect up to 1,000,000 inserts with a 0.1% false-positive rate.
        bf = BloomFilter(capacity=1000000, error_rate=0.001)
    else:
        if file.stat().st_size == 0:
            bf = BloomFilter(capacity=1000000, error_rate=0.001)
        else:
            with file.open('rb') as rp:
                bf = BloomFilter.fromfile(rp)

    return file, bf
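
# Usage sketch (mirrors main()): the filter lives in backup/bloomfilter.f and must be
# written back explicitly, otherwise URLs seen in this run are forgotten next time.
# The URL below is hypothetical:
#
#     file, bf = launch_filter()
#     if 'http://example.com/notice/1' not in bf:
#         bf.add('http://example.com/notice/1')
#     with file.open('wb') as wp:
#         bf.tofile(wp)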


def fetch(collection, username, keywords, page, page_size, channel, bf):
    """Request one page of search results and insert non-duplicate items into MongoDB.

    Returns (ok, total, count). On success, total is the server-side row count and
    count is the number of items on this page; on failure, total carries an error
    code (-1 bad request parameters, -2 account/request rejected, -3 network error).
    """
    # rest/service/website/search/solr -> cookies
    global _cookies, _headers, _proxies
    response = None
    try:
        json_data = {
            'keywords': keywords,
            'timeType': '4',
            'beginTime': '2024-01-01',
            'endTime': '2024-12-04',
            'filtermode': 5,
            'searchMode': 1,
            'currentPage': page,
            'numPerPage': page_size,
            'sortType': 2,
            'allType': -1,
            'noticeSegmentTypeStr': '',
            'beginAmount': '',
            'endAmount': '',
            'purchasingUnitIdList': '',
            'threeClassifyTagStr': '',
            'fourLevelCategoryIdListStr': '',
            'threeLevelCategoryIdListStr': '',
            'levelId': '',
            'tab': 0,
            'searchDataType': 0,
            'types': '-1',
            'showContent': 1,
            'hasTenderTransferProject': 1,
            'newAreas': '',
            'hasChooseSortType': 1,
            'summaryType': 0,
        }
        response = requests.post(
            'https://search.vip.qianlima.com/rest/service/website/search/solr',
            cookies=_cookies,
            headers=_headers,
            json=json_data,
            proxies=_proxies,
            timeout=60
        )
        assert response.status_code == 200
        result = response.json()
        try:
            total = result['data']['rowCount']
        except TypeError:
            # result['data'] is None: the server rejected the request parameters
            return False, -1, 0

        dedup_count = 0
        count = 0
        insert_lst = []
        data = result['data']['data']
        for item in data:
            href = item.get('url')
            if href is None or href in bf:
                dedup_count += 1
                # logger.debug(f'duplicate item [{href}]')
                continue

            item['channel'] = channel
            insert_lst.append(item)
            if len(insert_lst) == page_size:
                collection.insert_many(insert_lst, ordered=False)
                count += len(insert_lst)
                insert_lst = []

            bf.add(href)

        if len(insert_lst) > 0:
            collection.insert_many(insert_lst, ordered=False)
            count += len(insert_lst)

        logger.info(f'auto paging|page {page}|inserted {count}|duplicates {dedup_count}')
        return True, total, len(data)
    except AssertionError:
        logger.error(f'{username}|account error|request failed')
        # send_wechat_warning(msg=response.content.decode())
        return False, -2, 0
    except requests.exceptions.RequestException as e:
        logger.exception(f'network request error: {e}')
        return False, -3, 0
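
# Usage sketch: fetch() reports the server-side total, so the page count could be
# derived with math.ceil (math is imported above, though the script itself only uses
# it in a commented-out sleep). The variable names below are illustrative:
#
#     ok, total, count = fetch(coll, username, keywords, 1, 100, channel, bf)
#     if ok:
#         last_page = math.ceil(total / 100)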


def spider(username, keywords, bf, coll, channel):
    """Page through the search results for one keyword under the given account."""
    setup_cfg(username)
    page = 1
    page_size = 100
    # paginate
    retries = 0
    while True:
        ok, total, count = fetch(coll, username, keywords, page, page_size, channel, bf)
        if ok is False:
            state = total
            if state == -1:
                logger.info(f'{username}|bad request parameters|adjust the parameters')
                return False
            elif state == -2:
                logger.info(f'{username}|rate limited|switching account in 3s')
                time.sleep(3)
                return
            else:
                logger.error(f'{username}|network error|retrying~{retries}')
                if retries > 3:
                    return
                else:
                    retries += 1
                    continue

        # time.sleep(math.log(random.randint(100, 2400), 2))
        time.sleep(.5)
        if ok is True and count < page_size:
            # a short page means the last page has been reached
            logger.info(f'collection finished|saved {total} items')
            break
        else:
            page += 1

    return True
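
# Sketch of how the rate-limited path might be consumed: spider() returns None in that
# case, so a caller could rotate to the next account in account_pool and retry the same
# keyword. This loop is an assumption for illustration, not part of the original main():
#
#     while account_pool:
#         username, password = account_pool.pop(0)
#         auto_login(username, password, proxy=True, headless=True, auto_quit=True)
#         if spider(username, keywords, bf, coll, channel) is not None:
#             break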


def main():
    f, bf = launch_filter()  # Bloom filter: ~1,000,000 expected inserts, 0.1% error rate
    client = MongoClient('192.168.3.182', 27017)
    coll = client['zjb_poc']['qlm_data_lst']
    channel = '综合'  # "comprehensive" channel label stored with every record
    keywords = '黑龙江省八目科技开发有限公司'  # search keyword: a company name
    try:
        username, password = account_pool.pop(0)
        auto_login(username, password, proxy=True, headless=True, auto_quit=True)
        spider(username, keywords, bf, coll, channel)
    except KeyboardInterrupt:
        pass
    finally:
        # Persist the Bloom filter so the next run still skips already-collected URLs.
        with f.open('wb') as wp:
            bf.tofile(wp)
        logger.info('collection finished')


if __name__ == '__main__':
    main()