detail_spider.py

import time

import requests
from lxml.html import fromstring, HtmlElement, tostring
from lxml.html.clean import Cleaner
from pymongo.errors import DuplicateKeyError

from crawler.check_utils import CheckText, CheckTask
from crawler.clean_html import clean_html
from crawler.crawl_record import update_records, load_records
from crawler.crawl_scheduler import Scheduler
from crawler.login import login, load_login_cookies, login_check
from utils.databases import MongoDBS
from utils.execptions import VoidCrawlError, JyBasicException
from utils.log import logger
from utils.socks5 import Proxy
from utils.tools import int2long


def iter_node(element: HtmlElement):
    # Depth-first traversal: yield the element itself, then every HtmlElement descendant.
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element)


def pre_parse(element: HtmlElement):
    """Pre-processing may break the original HTML structure, so XPath expressions
    written against the raw HTML may no longer match (see the illustrative sketch
    after page_source below)."""
    # id/class/style values whose nodes are stripped before extraction
    pre_remove = {
        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
        'display:none'
    }
    for node in iter_node(element):
        id_attr = node.attrib.get('id')
        class_attr = node.attrib.get('class')
        style_attr = node.attrib.get('style')
        if any([id_attr in pre_remove,
                class_attr in pre_remove,
                style_attr in pre_remove]):
            node.drop_tree()
    return element


def page_source(element: HtmlElement):
    # Drop <style> content (but keep forms) and return the cleaned HTML as a string.
    clear = Cleaner(
        forms=False,
        style=True
    )
    return clear.clean_html(tostring(element, encoding="utf-8").decode())
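

# Illustrative only, not used by the spider: a minimal sketch of how iter_node,
# pre_parse and page_source compose. The HTML fragment and the _demo_pre_parse
# name are assumptions made up for this example; real pages come from
# www.chinabidding.cn.
def _demo_pre_parse():
    sample = (
        '<div id="infoDescription">'
        '<div class="toolbar">share buttons</div>'
        '<p>bid notice body</p>'
        '</div>'
    )
    node = fromstring(sample)
    node = pre_parse(node)    # the <div class="toolbar"> node is dropped
    return page_source(node)  # cleaned, style-free HTML of what remains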


class DetailSpider:

    def __init__(
            self,
            db: str,
            crawl_tab: str,
            save_tab: str,
            crawl_total=None,
    ):
        self.crawl_tab = MongoDBS(db, crawl_tab).coll
        self.save_tab = MongoDBS(db, save_tab).coll
        self.crawl_total = crawl_total or 6000
        self.user = None

    def crawl_request(self, url):
        headers = {
            'Host': 'www.chinabidding.cn',
            'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 60)

        retries = 0
        proxy, proxies = None, None
        while retries < 3:
            login_cookies = load_login_cookies(self.user.phone)
            if login_cookies is None:
                login(*self.user)
                continue
            elif 'cookies' not in request_params:
                request_params.setdefault('cookies', login_cookies)
            else:
                request_params.update({'cookies': login_cookies})
            try:
                r = requests.get(url, **request_params)
                # check whether the account is still logged in
                retry_login = login_check(self.user.phone, url, False)
                if retry_login:
                    logger.info(f"[re-login] {self.user.phone}")
                    _, code = login(*self.user, proxies=proxies)
                    if code == 200:
                        retries += 1
                    else:
                        # login failed: acquire or rotate a proxy before the next attempt
                        if proxy is None:
                            proxy = Proxy(True)
                        else:
                            proxy.switch()
                        proxies = proxy.proxies
                        retries += 1
                    continue
                element = fromstring(r.text)
                nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                if len(nodes) != 1:
                    raise VoidCrawlError
                else:
                    node = nodes[0]
                    logger.info(f'[detail crawled] id={node.attrib.get("id")}')
                    return r
            except requests.RequestException:
                retries += 1
                continue
        return None

    def crawl_response(self, response, item):
        element: HtmlElement = fromstring(response.text)
        node = element.xpath('//*[@id="infoDescription"]')[0]
        node = pre_parse(node)
        # candidate content containers, tried in order; fall back to the whole node
        features = (
            './div[@class="ckgys_cont"]',
            './/div[@class="detail-title ng-scope"]',
            './/table[@class="detail_Table"]',
        )
        for feature in features:
            extract_node = node.xpath(feature)
            if len(extract_node) > 0:
                valid_node = extract_node[0]
                break
        else:
            valid_node = node
        html = page_source(valid_node)
        # validate the extracted text
        CheckText(html)
        item["contenthtml"] = html
        item["detail"] = clean_html(html)
        item["comeintime"] = int2long(int(time.time()))
        del item['count'], item['crawl']
        if 'crawl_status' in item:
            del item['crawl_status']
        try:
            self.save_tab.insert_one(item)
        except DuplicateKeyError:
            # duplicate documents are skipped; see the index sketch below the class
            pass
        logger.info('[crawl succeeded]{}-{}'.format(item['title'], item['publishtime']))

    def set_crawl_status(self, item: dict, status: bool):
        self.crawl_tab.update_one(
            {'_id': item['_id']},
            {'$set': {'crawl': status}}
        )

    def crawl_spider(self, sc: Scheduler):
        while True:
            if load_records(self.user.phone, sc.today) >= self.crawl_total:
                return True
            item = sc.crawl_task
            if len(item) == 0:
                return False
            self.set_crawl_status(item, True)
            # record the current task on the scheduler; on failure the error is written to the database
            sc.spider_code = item['spidercode']
            sc.crawl_url = item['competehref']
            try:
                # validate the crawl task
                CheckTask(item)
                url = item['competehref']
                response = self.crawl_request(url)
                if response is not None:
                    self.crawl_response(response, item)
                    self.crawl_tab.update_one(
                        {"_id": item["_id"]},
                        {'$set': {'crawl_status': 'finished'}}
                    )
                    update_records(self.user.phone, 1)
            except JyBasicException as e:
                if e.code == 10105:
                    # when this exception is raised, the ES query result is written back to the crawl table
                    self.crawl_tab.update_one(
                        {"_id": item["_id"]},
                        {'$set': {'count': item['count']}}
                    )
                else:
                    sc.err_record(e)
                    self.crawl_tab.update_one(
                        {"_id": item["_id"]},
                        {'$set': {'crawl_status': 'error'}}
                    )
            finally:
                self.set_crawl_status(item, False)
                sc.wait_for_next_task()

    def start(self):
        query = {'used': False, 'site': '元博网', 'class': 'detail'}
        while True:
            with Scheduler(query) as scheduler:
                scheduler.crawl_type = 'detail'
                if scheduler.crawl_start:
                    self.user = scheduler.user
                    finished = self.crawl_spider(scheduler)
                    if finished:
                        # crawl task finished (daily quota reached)
                        scheduler.finished()
                    else:
                        # no crawl tasks at the moment
                        scheduler.wait_for_next_task()
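

# Illustrative only, never called by the spider: crawl_response swallows
# DuplicateKeyError, which presumes a unique index on the save collection.
# This is a minimal sketch of creating such an index; the local MongoDB URI
# and the 'href' de-duplication field are assumptions, not taken from this
# project.
def _ensure_save_tab_unique_index():
    from pymongo import MongoClient
    coll = MongoClient('mongodb://127.0.0.1:27017')['py_spider']['data_bak']
    coll.create_index('href', unique=True)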


if __name__ == '__main__':
    DetailSpider(
        db='py_spider',
        crawl_tab='ybw_list',
        save_tab='data_bak',
        crawl_total=6000,
    ).start()