SXSpider.py

from concurrent.futures import ThreadPoolExecutor

from lxml.html import fromstring, HtmlElement

from crawler.defaults import fetch_page_by_post, crawl_request, crawl_params
from crawler.fields import (
    SaveCompanyInformation,
    BulletinBasicFields,
)


class SXSpider:
    """Spider for the Shaanxi ('sx') bulletin listings."""

    def __init__(self):
        self.sign = 'sx'
        self.enable_proxy = None

    def extract_text_and_save(self, element: HtmlElement, **request_params):
        # Pull every company entry out of the listing page and persist it.
        nodes = element.xpath('//ul[@class="listLeft-item"]/li')
        for node in nodes:
            name = "".join(node.xpath('./a/text()')).strip()
            item = BulletinBasicFields(
                company=name,
                province='陕西省',
                url=request_params.get('url'),
                request_data=request_params.get('request_data'),
                page=request_params.get('page'),
            )
            SaveCompanyInformation(item, self.sign)

    def crawl_spider(self, task: tuple):
        # Fetch a single page via POST, parse it, and save the extracted records.
        url, data, page = task
        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        self.extract_text_and_save(element, url=url, page=page, request_data=data)

    def generate_request_tasks(self):
        # Expand the configured crawl parameters into one (url, data, page) task per page.
        results = []
        for spider in crawl_params(self.sign):
            url = "".join(spider.keys())
            data: dict = spider.get(url)
            total_page = int(data.get('pageTotal'))
            for page in range(1, total_page + 1):
                item = {
                    'code': '',
                    'year': data.get('year'),
                    'contentUrlPage.pageSize': data.get('pageSize'),
                    'contentUrlPage.currentPage': str(page),
                }
                results.append((url, item, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
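

# Example entry point (a minimal sketch; the proxy flag and worker count below
# are illustrative assumptions, not values taken from the original module).
if __name__ == '__main__':
    spider = SXSpider()
    spider.run(enable_proxy=False, max_workers=4)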