GDSpider.py

from concurrent.futures import ThreadPoolExecutor

from lxml.html import fromstring, HtmlElement

from crawler.defaults import fetch_page_by_post, crawl_request, crawl_params
from crawler.fields import BulletinBasicFields, SaveCompanyInformation


class GDSpider:
    """Spider for the Guangdong ('gd') company bulletin listing pages."""

    def __init__(self):
        self.sign = 'gd'
        self.enable_proxy = None

    def extract_text_and_save(self, element: HtmlElement, **request_params):
        # Every table row after the header row holds one company record.
        nodes = element.xpath('//*[@name="frm"]/div/table[2]//tr[position()>1]')
        for node in nodes:
            social_id = "".join(node.xpath('./td[2]/text()')).strip()
            item = BulletinBasicFields(
                social_id=social_id,
                company="".join(node.xpath('./td[3]//text()')).strip(),
                # Characters 3-8 of the unified social credit code carry the district code.
                district_code=social_id[2:8],
                province='广东省',  # Guangdong Province
                url=request_params.get('url'),
                request_data=request_params.get('request_data'),
                page=request_params.get('page')
            )
            SaveCompanyInformation(item, self.sign)

    def crawl_spider(self, task: tuple):
        url, data, page = task
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        }
        # POST the paging form, parse the returned HTML, and persist the rows.
        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, headers=headers, data=data)
        element = fromstring(response.text)
        self.extract_text_and_save(element, url=url, page=page, request_data=data)

    def generate_request_tasks(self):
        # crawl_params(self.sign) yields one-entry dicts mapping a URL to its base
        # form parameters, including the total page count ('pageInfo.pageTotal').
        results = []
        for spider in crawl_params(self.sign):
            url = "".join(spider.keys())
            params: dict = spider.get(url)
            total_page = int(params.get('pageInfo.pageTotal'))
            # Build one (url, form data, page number) task per page.
            for page in range(1, total_page + 1):
                item = {**params}
                item.update({
                    'pageInfo.switchingPage': 'true',
                    'pageInfo.pageIndex': str(page)
                })
                results.append((url, item, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
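

# --- Usage sketch (not part of the original file) ---
# A minimal example of how this spider might be driven, assuming the
# crawler.defaults and crawler.fields helpers imported above are available
# on the import path. The proxy flag and worker count below are illustrative
# values, not settings taken from the original code.
if __name__ == '__main__':
    spider = GDSpider()
    # Run without a proxy and with four worker threads; crawl_spider is mapped
    # over every (url, form data, page) task the generator yields.
    spider.run(enable_proxy=False, max_workers=4)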