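"""
List-page spider for 元博网 (chinabidding.cn, procurement & bidding announcements).

Walks the site's search results per channel and region for yesterday's
announcements, re-authenticates (optionally through a SOCKS5 proxy) when the
site demands a login, and stores the parsed rows in MongoDB.
"""
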
import random
import time
from collections import namedtuple
from urllib.parse import quote

import requests
from lxml.html import fromstring, HtmlElement

from config.load import crawler_url, region
from crawler.crawl_scheduler import Scheduler
from crawler.login import login, load_login_cookies, login_session_check
from utils.databases import mongo_table, int2long, es_query
from utils.execptions import CustomCheckError, VoidCrawlError, JyBasicException
from utils.log import logger
from utils.socks5 import Proxy

CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
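# CrawlMenu fields:
#   channel     - channel (category) name written into each crawled item
#   spidercode  - spider identifier written into each crawled item
#   table_type  - URL-encoded table-type parameter for the list URL (e.g. '6%2C' == '6,')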


class ListSpider:

    def __init__(self, db: str, crawl_tab: str, crawl_max_page=None):
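        """
        :param db: MongoDB database name
        :param crawl_tab: MongoDB collection that receives the list-page items
        :param crawl_max_page: maximum number of list pages to crawl per region (defaults to 1)
        """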
        self.crawl_menus = [
            # CrawlMenu('企业采购', 'a_ybwcgyzbw_qycg', '7%2C'),
            CrawlMenu('政府采购', 'a_ybwcgyzbw_zfcg', '6%2C'),
            CrawlMenu('招标预告', 'a_ybwcgyzbw_zbyg', '5%2C'),
            CrawlMenu('中标公示', 'a_ybwcgyzbw_zbgs', '4%2C'),
            CrawlMenu('服务招标', 'a_ybwcgyzbw_fwzb', '3%2C'),
            CrawlMenu('货物招标', 'a_ybwcgyzbw_hwzb', '2%2C'),
            CrawlMenu('工程招标', 'a_ybwcgyzbw_gczb', '1%2C'),
        ]
        self.crawl_max_page = crawl_max_page or 1
        self.crawl_tab = mongo_table(db, crawl_tab)
        self.user = None
        self.session = None

    def crawl_request(self, url: str, refer: str, **kwargs):
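        """Fetch a list page, re-logging in (optionally through a proxy) when the
        site answers with a login/registration popup instead of the result table."""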
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': refer
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 60)
        if kwargs.get('cookies') is not None:
            request_params.setdefault('cookies', kwargs.get('cookies'))

        retries = 0
        proxy, proxies = None, None
        while retries < 5:
            try:
                response = self.session.get(url, **request_params)
            except requests.exceptions.Timeout:
                time.sleep(10)
                retries += 1
                continue

            element = fromstring(response.text)
            feature = '//div[@id="pages"]/following-sibling::table//tr'
            if element.xpath('//*[@id="password"]|//*[@id="renew_pop"]'):
                # A login/registration popup was returned: attach account credentials
                # and check whether the session is still valid.
                retry_login = login_session_check(self.session, self.user.phone)
                if retry_login:
                    logger.info(f"[re-login]{self.user.phone}")
                    self.session, code = login(*self.user, proxies=proxies)
                    if code != 200:
                        # Frequent logins within one hour get the IP rate-limited;
                        # switch to (or rotate) a proxy before logging in again.
                        if proxy is None:
                            proxy = Proxy(True)
                        else:
                            proxy.switch()
                        proxies = proxy.proxies
                        retries += 1
                login_cookies = load_login_cookies(self.user.phone)
                request_params.update({'cookies': login_cookies})
            elif element.xpath('//*[@id="pages"]') and len(element.xpath(feature)) > 0:
                return response
            else:
                # The search returned no results for this page.
                return None
        raise VoidCrawlError(code=100020, reason='failed to fetch the list page')

    def crawl_response(self, response, menu: CrawlMenu):
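        """Parse the result table of a list page, build one document per row,
        bulk-insert the batch into MongoDB and return the number of items saved."""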
        results = []
        element: HtmlElement = fromstring(response.text)
        feature = '//div[@id="pages"]/following-sibling::table//tr'
        for node in element.xpath(feature):
            publish_time = "".join(node.xpath('./td[6]/text()')).strip()
            if '-' not in publish_time:
                publish_time = "".join(node.xpath('./td[7]/text()')).strip()
            area = "".join("".join(node.xpath('./td[5]/text()')).split())
            title = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            if not title:
                raise CustomCheckError(code=10107, reason='publish title is empty')
            competehref = 'https://www.chinabidding.cn{}'.format("".join(node.xpath('./td[2]/a/@href')))
            item = {
                "site": "元博网(采购与招标网)",
                "channel": menu.channel,
                "area": area if area != '跨省' else '全国',
                "_d": "comeintime",
                "comeintime": int2long(int(time.time())),
                "T": "bidding",
                "sendflag": "false",
                "spidercode": menu.spidercode,
                "city": "",
                "type": "",
                "publishdept": "",
                "title": title,
                "competehref": competehref,
                "href": "#",
                "publishtime": publish_time,
                "l_np_publishtime": int2long(int(time.mktime(time.strptime(publish_time, "%Y-%m-%d")))),
            }
            # Count of similar documents already indexed, used downstream for de-duplication.
            item['count'] = es_query(item["title"], item["l_np_publishtime"])
            item['crawl'] = False
            # print(f'>>> {title} - {competehref}')
            results.append(item)
        if len(results) > 0:
            self.crawl_tab.insert_many(results)
        return len(results)

    def crawl_spider(self, sc: Scheduler, menu: CrawlMenu):
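        """Crawl one menu across all configured regions, paging through yesterday's
        list results and reporting counters and errors back to the scheduler."""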
        for region_id, region_name in region.items():
            previous_url = None
            crawl_total, cookies = 1, None
            self.session = requests.session()
            # An ordinary account can only query 4,000 records in total;
            # at 30 rows per page that allows at most 134 list pages.
            page_size = 30
            for page in range(1, self.crawl_max_page + 1):
                # Build the url and referer for this page.
                if page == 1:
                    url = crawler_url['home_page'].format(
                        region_id,
                        sc.yesterday,
                        sc.yesterday,
                        page_size,
                        menu.table_type
                    )
                    previous_url = url
                    refer = crawler_url['refer'].format(quote(region_name))
                else:
                    url = crawler_url['list_url'].format(
                        region_id,
                        sc.yesterday,
                        sc.yesterday,
                        page,
                        page_size,
                        menu.table_type
                    )
                    refer = previous_url
                    previous_url = url
                print(">>> ", url)
                sc.crawl_url = url
                sc.spider_code = menu.spidercode
                # Attach login cookies: from roughly the 4th page onward the site
                # only returns list data to a logged-in ordinary account.
                if crawl_total >= 4:
                    cookies = load_login_cookies(self.user.phone)
                # Fetch and parse the page.
                try:
                    response = self.crawl_request(url, refer, cookies=cookies)
                    if response is None:
                        logger.info(f'[crawl succeeded]{menu.channel}-{region_name}-page {page}-0 items')
                        break
                    item_size = self.crawl_response(response, menu)
                    logger.info(f'[crawl succeeded]{menu.channel}-{region_name}-page {page}-{item_size} items')
                    sc.crawl_counter(item_size)
                    if item_size < page_size:
                        # Fewer rows than the page size means there is no next page; stop.
                        break
                    else:
                        crawl_total += 1
                except (JyBasicException, Exception) as e:
                    sc.crawl_counter(0)
                    logger.error('[crawl failed]{}-{}-page {}, error type: {}'.format(
                        menu.channel,
                        region_name,
                        page,
                        e.__class__.__name__,
                    ))
                    sc.err_record(e)
                finally:
                    sc.wait_for_next_task(random.choice(range(2, 6)))
            self.session.close()

    def start(self):
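        """Run every crawl menu under its own scheduler session."""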
        for menu in self.crawl_menus:
            with Scheduler(site='元博网', crawl_type='list') as scheduler:
                if scheduler.crawl_start:
                    self.user = scheduler.user
                    self.crawl_spider(scheduler, menu)
                    scheduler.finished()


if __name__ == '__main__':
    ListSpider(
        db='py_spider',
        crawl_tab='ybw_list',
        crawl_max_page=134,
    ).start()