|
@@ -0,0 +1,186 @@
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
+
|
|
|
+from lxml.html import fromstring, HtmlElement
|
|
|
+
|
|
|
+from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
|
|
|
+from crawler.fields import BulletinBasicFields, SaveCompanyInformation
|
|
|
+
|
|
|
+
|
|
|
class HBSpider:
    """Spider for the Hubei public-institution staffing registry website.

    For every district code in ``district_mapping``, every year in ``years``
    and every page from 1 to ``max_page_number`` a listing URL is built,
    fetched, parsed, and each table row found on the page is saved through
    ``SaveCompanyInformation``.
    """

    def __init__(self):
        # Tag passed to SaveCompanyInformation together with every saved item.
        self.sign = 'hb'
        # Overwritten by run(); forwarded to crawl_request for each page.
        self.enable_proxy = None
        # Pages 1..max_page_number are requested for each district/year pair.
        self.max_page_number = 40
        self.years = ['2020', '2019', '2018', '2017', '2016', '2015']
        # District code -> {'region': (province, city, county)}.
        # County-level entries leave the city slot empty.
        self.district_mapping = {
            '420000': {'region': ('湖北省', '', '')},
            '420100': {'region': ('湖北省', '武汉市', '')},
            '420102': {'region': ('湖北省', '', '江岸区')},
            '420103': {'region': ('湖北省', '', '江汉区')},
            '420104': {'region': ('湖北省', '', '硚口区')},
            '420105': {'region': ('湖北省', '', '汉阳区')},
            '420106': {'region': ('湖北省', '', '武昌区')},
            '420107': {'region': ('湖北省', '', '青山区')},
            '420111': {'region': ('湖北省', '', '洪山区')},
            '420112': {'region': ('湖北省', '', '东西湖区')},
            '420113': {'region': ('湖北省', '', '武汉市汉南区')},
            '420114': {'region': ('湖北省', '', '蔡甸区')},
            '420115': {'region': ('湖北省', '', '江夏区')},
            '420116': {'region': ('湖北省', '', '黄陂区')},
            '420117': {'region': ('湖北省', '', '新洲区')},
            '420118': {'region': ('湖北省', '', '武汉经济技术开发区')},
            '420119': {'region': ('湖北省', '', '武汉东湖新技术开发区')},
            '420200': {'region': ('湖北省', '黄石市', '')},
            '420202': {'region': ('湖北省', '', '黄石港区')},
            '420203': {'region': ('湖北省', '', '西塞山区(石灰窑区)')},
            '420204': {'region': ('湖北省', '', '下陆区')},
            '420205': {'region': ('湖北省', '', '铁山区')},
            '420222': {'region': ('湖北省', '', '阳新县')},
            '420281': {'region': ('湖北省', '', '大冶市')},
            '420300': {'region': ('湖北省', '十堰市', '')},
            '420302': {'region': ('湖北省', '', '茅箭区')},
            '420303': {'region': ('湖北省', '', '张湾区')},
            '420321': {'region': ('湖北省', '', '十堰市郧阳区')},
            '420322': {'region': ('湖北省', '', '郧西县')},
            '420323': {'region': ('湖北省', '', '竹山县')},
            '420324': {'region': ('湖北省', '', '竹溪县')},
            '420325': {'region': ('湖北省', '', '房县')},
            '420381': {'region': ('湖北省', '', '丹江口市')},
            '420500': {'region': ('湖北省', '宜昌市', '')},
            '420502': {'region': ('湖北省', '', '西陵区')},
            '420503': {'region': ('湖北省', '', '伍家岗区')},
            '420504': {'region': ('湖北省', '', '点军区')},
            '420505': {'region': ('湖北省', '', '猇亭区')},
            '420521': {'region': ('湖北省', '', '夷陵区')},
            '420525': {'region': ('湖北省', '', '远安县')},
            '420526': {'region': ('湖北省', '', '兴山县')},
            '420527': {'region': ('湖北省', '', '秭归县')},
            '420528': {'region': ('湖北省', '', '长阳土家族自治县')},
            '420529': {'region': ('湖北省', '', '五峰土家族自治县')},
            '420581': {'region': ('湖北省', '', '宜都市')},
            '420582': {'region': ('湖北省', '', '当阳市')},
            '420583': {'region': ('湖北省', '', '枝江市')},
            '420600': {'region': ('湖北省', '襄阳市', '')},
            '420602': {'region': ('湖北省', '', '襄城区')},
            '420606': {'region': ('湖北省', '', '樊城区')},
            '420621': {'region': ('湖北省', '', '襄州区')},
            '420624': {'region': ('湖北省', '', '南漳县')},
            '420625': {'region': ('湖北省', '', '谷城县')},
            '420626': {'region': ('湖北省', '', '保康县')},
            '420682': {'region': ('湖北省', '', '老河口市')},
            '420683': {'region': ('湖北省', '', '枣阳市')},
            '420684': {'region': ('湖北省', '', '宜城市')},
            '420700': {'region': ('湖北省', '鄂州市', '')},
            '420702': {'region': ('湖北省', '', '梁子湖区')},
            '420703': {'region': ('湖北省', '', '华容区')},
            '420704': {'region': ('湖北省', '', '鄂城区')},
            '420800': {'region': ('湖北省', '荆门市', '')},
            '420802': {'region': ('湖北省', '', '荆门市东宝区')},
            '420803': {'region': ('湖北省', '', '荆门市掇刀区')},
            '420804': {'region': ('湖北省', '', '荆门市屈家岭管理区')},
            '420821': {'region': ('湖北省', '', '京山市')},
            '420822': {'region': ('湖北省', '', '沙洋县')},
            '420881': {'region': ('湖北省', '', '钟祥市')},
            '420900': {'region': ('湖北省', '孝感市', '')},
            '420902': {'region': ('湖北省', '', '孝南区')},
            '420921': {'region': ('湖北省', '', '孝昌县')},
            '420922': {'region': ('湖北省', '', '大悟县')},
            '420923': {'region': ('湖北省', '', '云梦县')},
            '420981': {'region': ('湖北省', '', '应城市')},
            '420982': {'region': ('湖北省', '', '安陆市')},
            '420984': {'region': ('湖北省', '', '汉川市')},
            '421000': {'region': ('湖北省', '荆州市', '')},
            '421002': {'region': ('湖北省', '', '沙市区')},
            '421003': {'region': ('湖北省', '', '荆州区')},
            '421022': {'region': ('湖北省', '', '公安县')},
            '421023': {'region': ('湖北省', '', '监利县')},
            '421024': {'region': ('湖北省', '', '江陵县')},
            '421081': {'region': ('湖北省', '', '石首市')},
            '421083': {'region': ('湖北省', '', '洪湖市')},
            '421087': {'region': ('湖北省', '', '松滋市')},
            '421100': {'region': ('湖北省', '黄冈市', '')},
            '421102': {'region': ('湖北省', '', '黄州区')},
            '421121': {'region': ('湖北省', '', '团风县')},
            '421122': {'region': ('湖北省', '', '红安县')},
            '421123': {'region': ('湖北省', '', '罗田县')},
            '421124': {'region': ('湖北省', '', '英山县')},
            '421125': {'region': ('湖北省', '', '浠水县')},
            '421126': {'region': ('湖北省', '', '蕲春县')},
            '421127': {'region': ('湖北省', '', '黄梅县')},
            '421181': {'region': ('湖北省', '', '麻城市')},
            '421182': {'region': ('湖北省', '', '武穴市')},
            '421200': {'region': ('湖北省', '咸宁市', '')},
            '421202': {'region': ('湖北省', '', '咸安区')},
            '421221': {'region': ('湖北省', '', '嘉鱼县')},
            '421222': {'region': ('湖北省', '', '通城县')},
            '421223': {'region': ('湖北省', '', '崇阳县')},
            '421224': {'region': ('湖北省', '', '通山县')},
            '421281': {'region': ('湖北省', '', '赤壁市')},
            '421300': {'region': ('湖北省', '随州市', '')},
            '421302': {'region': ('湖北省', '', '曾都区')},
            '421304': {'region': ('湖北省', '', '随县')},
            '421381': {'region': ('湖北省', '', '广水市')},
            '422800': {'region': ('湖北省', '恩施土家族苗族自治州', '')},
            '422801': {'region': ('湖北省', '', '恩施市')},
            '422802': {'region': ('湖北省', '', '利川市')},
            '422822': {'region': ('湖北省', '', '建始县')},
            '422823': {'region': ('湖北省', '', '巴东县')},
            '422825': {'region': ('湖北省', '', '宣恩县')},
            '422826': {'region': ('湖北省', '', '咸丰县')},
            '422827': {'region': ('湖北省', '', '来凤县')},
            '422828': {'region': ('湖北省', '', '鹤峰县')},
            '429004': {'region': ('湖北省', '', '仙桃市')},
            '429005': {'region': ('湖北省', '', '潜江市')},
            '429006': {'region': ('湖北省', '', '天门市')},
            '429021': {'region': ('湖北省', '', '神农架林区')},
        }

    @staticmethod
    def _compact_text(fragments):
        """Join text fragments and drop every whitespace character."""
        return "".join("".join(fragments).split())

    def extract_text_and_save(self, element: "HtmlElement", region: tuple, code: str, **request_params):
        """Save one item per data row of the listing table found in ``element``.

        Args:
            element: Parsed page to search for listing rows.
            region: ``(province, city, county)`` tuple stored on every item.
            code: District code stored on every item as ``district_code``.
            **request_params: Optional ``page`` and ``url`` values recorded
                on every item; each defaults to ``''`` when absent.
        """
        # position()>1 skips the first row of the inner table.
        rows = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
        province, city, county = region
        page = request_params.get('page', '')
        url = request_params.get('url', '')
        for row in rows:
            social_id = self._compact_text(row.xpath('./td[2]/a/text()'))
            company = self._compact_text(row.xpath('./td[3]/a/text()'))
            if not social_id and not company:
                # Row has neither an id nor a name: nothing worth saving.
                continue

            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                page=page,
                url=url,
            )
            SaveCompanyInformation(item, self.sign)

    def generate_request_tasks(self):
        """Yield one ``(link, region, district_code, page)`` task per request.

        Tasks are produced district by district, then year by year, then
        page by page, and are yielded as they are built rather than being
        collected into a list first.
        """
        url_template = crawl_params('general')
        for district_code, data in self.district_mapping.items():
            region = data.get('region')
            for year in self.years:
                for page in range(1, self.max_page_number + 1):
                    link = url_template.format(
                        page=page,
                        district_code=district_code,
                        year=year,
                        select_page=page,
                    )
                    yield link, region, district_code, page

    def crawl_spider(self, task: tuple):
        """Fetch the page described by ``task``, parse it and save its rows.

        Args:
            task: ``(url, region, district_code, page)`` as yielded by
                ``generate_request_tasks``.
        """
        url, region, district_code, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        self.extract_text_and_save(element, region, district_code, page=page, url=url)

    def run(self, enable_proxy=None, max_workers: int = 1):
        """Crawl every generated task on a thread pool.

        A task that raises does not stop the crawl; its failure is logged
        with the traceback instead of being silently dropped.

        Args:
            enable_proxy: Stored on the instance (falsy values become
                ``False``) and forwarded to every request.
            max_workers: Size of the thread pool.
        """
        # Imported here because the module's import block does not provide it.
        import logging

        logger = logging.getLogger(__name__)
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            submitted = [
                (executor.submit(self.crawl_spider, task), task)
                for task in self.generate_request_tasks()
            ]
            for future, task in submitted:
                # exception() waits for the task and returns its error, if any.
                error = future.exception()
                if error is not None:
                    logger.error('crawl task %r failed', task, exc_info=error)
|