list_spider.py

import random
import time
from collections import namedtuple
from urllib.parse import quote

import requests
from lxml.html import fromstring, HtmlElement

from config.load import crawler_url, region
from crawler.crawl_scheduler import Scheduler
from crawler.login import login, load_login_cookies, login_session_check
from utils.databases import mongo_table, int2long, es_query
from utils.execptions import CustomCheckError, VoidCrawlError, JyBasicException
from utils.log import logger
from utils.socks5 import Proxy
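
# CrawlMenu bundles a channel display name, the spidercode stored with each
# record, and the table_type query value used when building the list URL.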
CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])

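
# ListSpider pulls the list pages of chinabidding.cn (元博网) for every
# configured region and menu, and writes one MongoDB document per row.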
class ListSpider:

    def __init__(self, db: str, crawl_tab: str, crawl_max_page=None):
        self.crawl_menus = [
            # CrawlMenu('企业采购', 'a_ybwcgyzbw_qycg', '7%2C'),
            CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
            CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
            CrawlMenu('中标公示', 'a_ybwcgyzbw_zbgs', '4%2C'),
            CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
            CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
            CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
        ]
        self.crawl_max_page = crawl_max_page or 1
        self.crawl_tab = mongo_table(db, crawl_tab)
        self.user = None
        self.session = None
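
    # Fetch a single list page. Retries up to 5 times across timeouts and
    # login/renew popups, re-logging in (through a SOCKS5 proxy once the
    # account is rate limited) and refreshing cookies along the way.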
    def crawl_request(self, url: str, refer: str, **kwargs):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': refer
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 60)
        if kwargs.get('cookies') is not None:
            request_params.setdefault('cookies', kwargs.get('cookies'))

        retries = 0
        proxy, proxies = None, None
        while retries < 5:
            try:
                response = self.session.get(url, **request_params)
            except requests.exceptions.Timeout:
                time.sleep(10)
                retries += 1
                continue

            element = fromstring(response.text)
            feature = '//div[@id="pages"]/following-sibling::table//tr'
            if element.xpath('//*[@id="password"]|//*[@id="renew_pop"]'):
                # A login/registration popup appeared: check the account
                # session and attach fresh identity cookies before retrying.
                retry_login = login_session_check(self.session, self.user.phone)
                if retry_login:
                    logger.info(f"[重新登录]{self.user.phone}")
                    self.session, code = login(*self.user, proxies=proxies)
                    if code != 200:
                        # Frequent logins within an hour get the IP rate
                        # limited; switch to a proxy and log in again.
                        if proxy is None:
                            proxy = Proxy(True)
                        else:
                            proxy.switch()
                        proxies = proxy.proxies
                login_cookies = load_login_cookies(self.user.phone)
                request_params.update({'cookies': login_cookies})
                # Count every pass through this branch so a popup that never
                # goes away cannot loop forever.
                retries += 1
            elif element.xpath('//*[@id="pages"]') and len(element.xpath(feature)) > 0:
                return response
            else:
                # The search returned no results at all.
                return None
        raise VoidCrawlError(code=100020, reason='列表页访问失败')
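
    # Parse the rows of a list page into documents and bulk-insert them into
    # the crawl table; returns how many rows were saved.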
    def crawl_response(self, response, menu: CrawlMenu):
        results = []
        element: HtmlElement = fromstring(response.text)
        feature = '//div[@id="pages"]/following-sibling::table//tr'
        for node in element.xpath(feature):
            publish_time = "".join(node.xpath('./td[6]/text()')).strip()
            if '-' not in publish_time:
                publish_time = "".join(node.xpath('./td[7]/text()')).strip()
            area = "".join("".join(node.xpath('./td[5]/text()')).split())
            title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
            item = {
                "site": "元博网(采购与招标网)",
                "channel": menu.channel,
                "area": area if area != '跨省' else '全国',
                "_d": "comeintime",
                "comeintime": int2long(int(time.time())),
                "T": "bidding",
                "sendflag": "false",
                "spidercode": menu.spidercode,
                "city": "",
                "type": "",
                "publishdept": "",
                "title": title,
                "competehref": competehref,
                "href": "#",
                "publishtime": publish_time,
                "l_np_publishtime": int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))),
            }
            if not title:
                # Reject rows whose title cell is empty.
                raise CustomCheckError(code=10107, reason='发布标题为空')
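            # 'count' stores the es_query hit count for this title and publish
            # date (presumably a duplicate signal); 'crawl' marks the row as
            # not yet fetched in detail.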
            item['count'] = es_query(item["title"], item["l_np_publishtime"])
            item['crawl'] = False
            # print(f'>>> {title} - {competehref}')
            results.append(item)

        if len(results) > 0:
            self.crawl_tab.insert_many(results)
        return len(results)
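
    # For one menu, walk every region and page of yesterday's announcements,
    # attaching login cookies once the pages that require an account begin.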
    def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
        for region_id, region_name in region.items():
            previous_url = None
            crawl_total, cookies = 1, None
            self.session = requests.session()
            # An ordinary account can only query 4,000 records; with the
            # maximum page size of 100 that is 40 pages in total.
            page_size = 30
            for page in range(1, self.crawl_max_page + 1):
                # Build the url and referer for this page.
                if page == 1:
                    url = crawler_url['home_page'].format(
                        region_id,
                        sc.yesterday,
                        sc.yesterday,
                        page_size,
                        menu.table_type
                    )
                    previous_url = url
                    refer = crawler_url['refer'].format(quote(region_name))
                else:
                    url = crawler_url['list_url'].format(
                        region_id,
                        sc.yesterday,
                        sc.yesterday,
                        page,
                        page_size,
                        menu.table_type
                    )
                    refer = previous_url
                    previous_url = url
                print(">>> ", url)
                sc.crawl_url = url
                sc.spider_code = menu.spidercode
                # Attach identity cookies: from the 4th page on, list data is
                # only returned to a logged-in ordinary account.
                if crawl_total >= 4:
                    cookies = load_login_cookies(self.user.phone)
                # Fetch and parse the page.
                try:
                    response = self.crawl_request(url, refer, cookies=cookies)
                    if response is None:
                        logger.info(f'[采集成功]{menu.channel}-{region_name}-第{page}页-0条')
                        break
                    item_size = self.crawl_response(response, menu)
                    logger.info(f'[采集成功]{menu.channel}-{region_name}-第{page}页-{item_size}条')
                    sc.crawl_counter(item_size)
                    if item_size < page_size:
                        # Fewer rows than the fixed page size means there is
                        # no next page; stop paging this region.
                        break
                    else:
                        crawl_total += 1
                except (JyBasicException, Exception) as e:
                    sc.crawl_counter(0)
                    logger.error('[采集失败]{}-{}-第{}页, 错误类型:{}'.format(
                        menu.channel,
                        region_name,
                        page,
                        e.__class__.__name__,
                    ))
                    sc.err_record(e)
                finally:
                    sc.wait_for_next_task(random.choice(range(2, 6)))
            self.session.close()
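
    # Entry point: one Scheduler context per crawl menu; the scheduler
    # supplies the login account and collects run statistics.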
    def start(self):
        for menu in self.crawl_menus:
            with Scheduler(site='元博网', crawl_type='list') as scheduler:
                if scheduler.crawl_start:
                    self.user = scheduler.user
                    self.crawl_spider(scheduler, menu)
                    scheduler.finished()


if __name__ == '__main__':
    ListSpider(
        db='py_spider',
        crawl_tab='ybw_list',
        crawl_max_page=134,
    ).start()