# ListPageSpider.py

import random
import time
import re
import json
from collections import namedtuple

import requests
from lxml.html import fromstring, HtmlElement

from config.load import region
from crawler.crawl_scheduler import Scheduler
from utils.databases import mongo_table, int2long, redis_client, es_query
from utils.log import logger
from utils.tools import sha1, check_crawl_title, get_proxy
from utils.execptions import JyBasicException, CustomCheckError
from login import get_cookies
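
# CrawlMenu describes one list channel on bidizhaobiao.com: the display name stored
# with each record, the spider code, and the path/query fragment of its search endpoint.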
CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])


class ListSpider:
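    """List-page spider for bidizhaobiao.com (比地招标网).

    Iterates every channel in ``crawl_menus`` for every region in ``config.load.region``,
    deduplicates announcements through a Redis hash keyed by sha1(competehref), and
    writes the remaining rows into the MongoDB collection named by ``crawl_tab``.
    """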

    def __init__(self, db: str, crawl_tab: str, crawl_max_page=None, enable_proxy=False, allow_show_exception=False):
        self.crawl_menus = [
            CrawlMenu('招标预告', 'a_bdzbw_zbyg', 'retrieval_list.do?single=true&ChannelIds=102'),
            CrawlMenu('招标公告', 'a_bdzbw_zbgg', 'retrieval_list.do?single=true&ChannelIds=52'),
            CrawlMenu('公告变更', 'a_bdzbw_ggbg', 'retrieval_list.do?single=true&ChannelIds=51'),
            CrawlMenu('招标答疑', 'a_bdzbw_zbdy', 'retrieval_list.do?single=true&ChannelIds=103'),
            CrawlMenu('资审结果', 'a_bdzbw_zsjg', 'retrieval_list.do?single=true&ChannelIds=105'),
            CrawlMenu('招标文件', 'a_bdzbw_zbwj', 'retrieval_list.do?single=true&ChannelIds=104'),
            CrawlMenu('中标公告', 'a_bdzbw_zhbgg', 'retrieval_list.do?single=true&ChannelIds=101'),
            CrawlMenu('采购意向', 'a_bdzbw_cgyx', 'retrieval_list.do?single=true&ChannelIds=114'),
            CrawlMenu('审批项目', 'a_bdzbw_spxm', 'spxm_list.do'),
            CrawlMenu('拍卖出让', 'a_bdzbw_pmcr', 'retrieval_list.do?single=true&ChannelIds=115'),
            CrawlMenu('土地矿产', 'a_bdzbw_tdkc', 'retrieval_list.do?single=true&ChannelIds=116'),
            CrawlMenu('产权交易', 'a_bdzbw_cqjy', 'retrieval_list.do?single=true&ChannelIds=117'),
        ]
        self.total = 0
        self.crawl_max_page = crawl_max_page or 1
        self.crawl_tab = mongo_table(db, crawl_tab)
        self.r = redis_client()
        self.session = requests.session()
        self.proxy = get_proxy()
        self.redis_key = 'bdzbw_2024'
        self.allow_show_exception = allow_show_exception
        self.cookies = None

    def read_cookies(self):
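        """Load cookies saved by the login step.

        ``login_cookie.json`` is assumed to hold a plain name-to-value mapping
        (as written by ``login.get_cookies``) that ``requests`` accepts directly
        through its ``cookies=`` argument.
        """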
        with open('./login_cookie.json', 'r') as f:
            cookies = f.read()
        return json.loads(cookies)

    def crawl_request(self, url: str, data):
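        """POST a list-search request, retrying at most twice.

        A network error switches the current proxy; HTTP 403 swaps in a fresh proxy
        and re-logs-in through ``login.get_cookies``; any other status also swaps the
        proxy. The response is returned only when the page actually contains a result
        list, otherwise ``None``.
        """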
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "http://www.bidizhaobiao.com",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 120)
        retries = 0
        while retries < 2:
            try:
                self.cookies = self.read_cookies()
                # get_proxy() is assumed to return a wrapper whose .proxies is the
                # requests-style mapping (it is passed that way to get_cookies below)
                response = self.session.post(url, data=data, cookies=self.cookies,
                                             proxies=self.proxy.proxies, **request_params)
            except Exception:
                # network error: switch to the next proxy and retry after a pause
                self.proxy.switch()
                retries += 1
                time.sleep(20)
                continue
            if response.status_code == 403:
                # access denied: get a fresh proxy and re-login to refresh cookies
                self.proxy = get_proxy()
                get_cookies(self.session, self.proxy.proxies)
                retries += 1
            elif response.status_code == 200:
                element = fromstring(response.text)
                time.sleep(2)
                if element.xpath('//*[@id="searchResultList"]') or element.xpath('//*[@id="ulList"]'):
                    return response
                else:
                    # no result list found on the page
                    return None
            else:
                self.proxy = get_proxy()
                retries += 1
        return None

    def crawl_response(self, response, menu: CrawlMenu, pro_area):
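        """Parse one list page and store the rows that are new.

        Both result layouts are handled: the generic search list and the
        approval-project (审批项目) table. Rows whose href is already in the Redis
        hash are skipped; the rest are inserted into MongoDB. Returns the counts
        ``(inserted, published_since_yesterday, unseen)``, which the caller uses to
        decide whether to keep paging.
        """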
        results = []
        last_page = []
        increase = []
        element: HtmlElement = fromstring(response.text)
        feature = '//div[@id="searchResultList"]/div[2]/div|//div[@id="ulList"]/table[@class="tableList"]/tbody/tr'
        for node in element.xpath(feature):
            try:
                if node.xpath('./div[1]/div[1]/p/a[1]'):
                    # generic search-result layout
                    competehref = "".join(node.xpath('./div[1]/div[1]/p/a[1]/@href')).strip()
                    title = "".join("".join(node.xpath('./div[1]/div[1]/p/a[1]//text()')).split())
                    area = "".join("".join(node.xpath('./div[1]/div[2]/div[2]/a/span/text()')).split())
                    publish_time = "".join("".join(node.xpath('./div[1]/div[2]/div[4]/p/text()')).split())
                else:
                    # approval-project table layout: the detail id is hidden in an onclick handler
                    href_info = "".join(node.xpath('./td[@class="projectName"]/a/@onclick')).strip()
                    href_params = "".join(re.findall(r"spxmInfo\('(.*?)'", href_info, re.S)).strip()
                    competehref = f"http://www.bidizhaobiao.com/spxm-{href_params}.html"
                    title = "".join(node.xpath('./td[@class="projectName"]/a/text()')).strip()
                    area = "".join(node.xpath('./td[@class="address"]/span/text()')).strip()
                    publish_time = "".join(node.xpath('./td[@class="time"]/span/text()')).strip()
            except Exception:
                # malformed row: skip it
                continue
            if not title or not publish_time:
                raise CustomCheckError(code=10107, reason='empty title or publish time')
            item = {
                "site": "比地招标网",
                "channel": menu.channel,
                "area": pro_area,
                "_d": "comeintime",
                "comeintime": int2long(int(time.time())),
                "T": "bidding",
                "sendflag": "false",
                "spidercode": menu.spidercode,
                "city": area,
                "type": "",
                "publishdept": "",
                "title": title,
                "competehref": competehref,
                "href": "#",
                "publishtime": publish_time,
                "l_np_publishtime": int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))),
            }
            # count rows published yesterday or later (feeds the stop-paging check in crawl_spider)
            present_time = time.strftime("%Y-%m-%d 00:00:00", time.localtime(int(round(time.time()))))
            timeArray = time.strptime(present_time, "%Y-%m-%d %H:%M:%S")
            start_date = round(time.mktime(timeArray)) - 86400
            if item.get('l_np_publishtime') >= start_date:
                last_page.append(item)
            # logger.debug(item)
            item['crawl'] = False
            sign = sha1(item['competehref'])
            if not self.r.hexists(self.redis_key, sign):
                increase.append(item)
                if check_crawl_title(title):
                    # item["count"] = 0
                    item["count"] = es_query(item["title"], item["l_np_publishtime"])
                    results.append(item)
                self.r.hset(self.redis_key, sign, '')
        if len(results) > 0:
            self.crawl_tab.insert_many(results)
        return len(results), len(last_page), len(increase)

    def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
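        """Crawl one channel across every configured region, page by page.

        Paging stops early when fewer than a full page of rows were published since
        yesterday (later pages hold only older data) or when a page contains nothing
        the Redis dedup hash has not already seen (the incremental crawl has caught up).
        """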
        for region_id, region_name in region.items():
            page_size = 22
            for page in range(1, self.crawl_max_page + 1):
                url = f'http://www.bidizhaobiao.com/advsearch/{menu.table_type}'
                data = {
                    "pageNum": f"{page}",
                    "province_id": f"{region_id}",
                    "provinceCityJson": '{' + f'{region_name}' + ":[]}",
                    "searchCondition.dtype": "50",
                    "searchCondition.SearchType": "any",
                    "searchCondition.infosources": "",
                    "searchCondition.regionId": "",
                    "provinceState": f"{region_name}",
                    "searchCondition.Pattern": "30",
                    "searchCondition.isOr": "false",
                    "isSelectDtype": "0",
                    "isSelectPattern": "0",
                }
                sc.crawl_url = url
                sc.spider_code = menu.spidercode
                try:
                    response = self.crawl_request(url, data)
                    if response is None:
                        logger.info(f'[no result list]{menu.channel}-{region_name}-page {page}-0 items')
                        break
                    item_size = self.crawl_response(response, menu, region_name)
                    self.total += item_size[0]
                    logger.info(f'[crawl succeeded]{menu.channel}-{region_name}-page {page}-{item_size[0]} items')
                    # full (first-time) crawl: nothing newer after the current page
                    if item_size[1] < page_size:
                        break
                    # incremental crawl: this page and the following ones were already collected
                    if item_size[2] == 0:
                        break
                except (JyBasicException, Exception) as e:
                    logger.error('[crawl failed]{}-{}-page {}, error type: {}'.format(
                        menu.channel,
                        region_name,
                        page,
                        e.__class__.__name__,
                    ))
                finally:
                    sc.wait_for_next_task(random.choice(range(2, 6)))
            logger.debug(f'[{menu.channel}]-[region]-{region_name}-{self.total} items collected so far')
        self.session.close()

    def start(self):
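        """Run every menu under the site Scheduler, then log the overall total."""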
        with Scheduler(site='比地招标网', crawl_type='list') as scheduler:
            for menu in self.crawl_menus:
                if scheduler.crawl_start:
                    self.crawl_spider(scheduler, menu)
            scheduler.finished(5)
        logger.info(f'{self.total} items collected in this run')
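

# Entry point: crawl only the first result page of every channel into the
# ``py_spider.bdzbw_list`` collection.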
if __name__ == '__main__':
    ListSpider(
        db='py_spider',
        crawl_tab='bdzbw_list',
        crawl_max_page=1,
    ).start()