from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from urllib.parse import urljoin

from lxml.html import fromstring, HtmlElement

from config.load import crawl_sites
from crawler.defaults import fetch_page_by_post, fetch_page_by_get, crawl_request
from crawler.fields import (
    SaveCompanyInformation,
    BulletinBasicFields,
)

class SHSpider:
    """上海事业单位编制网 - registration bulletins (establishment / modification / cancellation)."""

    def __init__(self):
        self.sign = 'sh'
        self.enable_proxy = None
        self.site = 'http://www.sydjsh.cn/'

    def extract_text_and_save(self, url, yw_type):
        """Fetch one bulletin detail page and save a record for each table row."""
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        nodes = element.xpath('//table[@id="content"]//tr[position()>1]')
        for node in nodes:
            if yw_type in ['SL', 'BZ']:
                # 'SL'/'BZ' rows carry the full detail columns and are saved as new registrations.
                item = BulletinBasicFields(
                    company="".join(node.xpath('./td[3]/text()')),
                    legal_person="".join(node.xpath('./td[5]/text()')),
                    capital="".join(node.xpath('./td[7]/text()')) + '万元',
                    capital_origin="".join(node.xpath('./td[6]/text()')),
                    purpose_and_business="".join(node.xpath('./td[8]/text()')),
                    address="".join(node.xpath('./td[4]/text()')),
                    social_id="".join(node.xpath('./td[2]/text()')),
                    status='create',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type in ['BG', 'JGQTXS', 'JGQTBG', 'JGQTCX', 'JGQTBL', 'JGQTGQ']:
                # Modification-type bulletins only expose the credit code and name.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[2]/text()')),
                    company="".join(node.xpath('./td[3]/text()')),
                    status='modify',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type == 'ZX':
                # Cancellation ('ZX') rows have the code and name shifted one column right.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[3]/text()')),
                    company="".join(node.xpath('./td[4]/text()')),
                    status='cancellation',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)

    def generate_snapshot_links(self, url, data):
        """Yield absolute detail-page links found on one paginated list page."""
        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        nodes = element.xpath('//div[@class="center1"]/ul/li')
        for node in nodes:
            href = "".join(node.xpath('./a/@href'))
            if data['yw_type'] == 'JGQTCX':
                # The official site joins the detail URLs on the agency-cancellation
                # bulletin list incorrectly, so the path has to be patched here.
                href = href.replace('jgqtXc', 'jgqtCx')
            elif data['yw_type'] == 'JGQTXS':
                href = href.replace('JgqtCl', 'jgqtCl')
            yield urljoin(self.site, href)

    def crawl_spider(self, task: tuple):
        url, data = task  # list-page URL and its POST form data
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(self.extract_text_and_save, link, data['yw_type'])
                for link in self.generate_snapshot_links(url, data)
            ]
            wait(futures, return_when=ALL_COMPLETED)

    def task_list(self):
        """Expand every configured list page into one (url, form data) task per page."""
        for spider in crawl_sites.get(self.sign):
            # Each configured entry maps a single list-page URL to its request parameters.
            url = "".join(spider.keys())
            data: dict = spider.get(url)
            total_page = int(data.get('pageTotal'))
            for page in range(1, total_page + 1):
                item = {
                    "pageIndex": str(page),
                    "yw_type": data.get('yw_type'),
                    "vl": "item",
                    "type": data.get('type'),
                    "pageSize": data.get('pageSize')
                }
                yield url, item

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.task_list())
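

# Usage sketch (illustrative, not part of the original module): it assumes
# config.load.crawl_sites maps the 'sh' sign to a list of
# {list_page_url: form_data} entries shaped the way task_list() reads them
# ('pageTotal', 'yw_type', 'type', 'pageSize').
#
#     spider = SHSpider()
#     spider.run(enable_proxy=False, max_workers=4)
#
# run() spreads the paginated list pages across `max_workers` threads, and
# each crawl_spider() call opens its own 5-thread pool for the detail links
# it discovers.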


class SHNDSpider:
    """上海事业单位编制网 - annual reports (年度报告)."""

    def __init__(self):
        self.enable_proxy = None
        self.sign = 'sh'
        # District code -> region tuple plus the number of list pages to crawl
        # for that district.
        self.district_mapping = {
            '310000': {'region': ('上海', '上海市', '市属'), 'max_page_number': 622},
            '310106': {'region': ('上海', '上海市', '静安区'), 'max_page_number': 281},
            '310104': {'region': ('上海', '上海市', '徐汇区'), 'max_page_number': 208},
            '310113': {'region': ('上海', '上海市', '宝山区'), 'max_page_number': 361},
            '310109': {'region': ('上海', '上海市', '虹口区'), 'max_page_number': 186},
            '310112': {'region': ('上海', '上海市', '闵行区'), 'max_page_number': 361},
            '310230': {'region': ('上海', '上海市', '崇明区'), 'max_page_number': 317},
            '310105': {'region': ('上海', '上海市', '长宁区'), 'max_page_number': 170},
            '310107': {'region': ('上海', '上海市', '普陀区'), 'max_page_number': 231},
            '310117': {'region': ('上海', '上海市', '松江区'), 'max_page_number': 314},
            '310115': {'region': ('上海', '上海市', '浦东新区'), 'max_page_number': 741},
            '310101': {'region': ('上海', '上海市', '黄浦区'), 'max_page_number': 225},
            '310110': {'region': ('上海', '上海市', '杨浦区'), 'max_page_number': 210},
            '310114': {'region': ('上海', '上海市', '嘉定区'), 'max_page_number': 284},
            '310116': {'region': ('上海', '上海市', '金山区'), 'max_page_number': 265},
            '310226': {'region': ('上海', '上海市', '奉贤区'), 'max_page_number': 265},
            '310118': {'region': ('上海', '上海市', '青浦区'), 'max_page_number': 273}
        }
        self.url = 'http://www.sydjsh.cn/ndbg.do'

    def extract_text_and_save(self, element: HtmlElement, code: str, **request_params):
        """Save one annual-report record per result row of the parsed page."""
        province, city, county = self.district_mapping.get(code).get('region')
        nodes = element.xpath('//*[@class="cursor"]')
        for node in nodes:
            social_id = "".join(node.xpath('./td[1]/text()'))
            company = "".join(node.xpath('./td[2]/text()'))
            if not social_id and not company:
                # Skip placeholder rows with neither a credit code nor a name.
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                url=request_params.get('url'),
                request_data=request_params.get('request_data'),
                page=request_params.get('page')
            )
            SaveCompanyInformation(item, self.sign)

    def generate_request_tasks(self):
        """Yield one POST payload per district page listed in district_mapping."""
        for geo_code, data in self.district_mapping.items():
            max_page_number = data.get('max_page_number')
            for page in range(1, max_page_number + 1):
                yield {
                    "pageIndex": str(page),
                    "keyword": "",
                    "type": "4",
                    "year": "",
                    "geo_code": geo_code
                }

    def crawl_spider(self, data: dict):
        geo_code = data.get('geo_code')
        page = data.get('pageIndex')
        response = crawl_request(fetch_page_by_post, self.url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        self.extract_text_and_save(element, geo_code, page=page, url=self.url, request_data=data)

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
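

# Minimal entry-point sketch, not in the original module: it assumes crawl_sites
# is already configured for the 'sh' sign and that the target site tolerates two
# concurrent list-page workers; adjust max_workers before a real run.
if __name__ == '__main__':
    SHSpider().run(enable_proxy=False, max_workers=2)    # registration bulletins
    SHNDSpider().run(enable_proxy=False, max_workers=2)  # annual reports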