detail_spider.py

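"""元博网 (www.chinabidding.cn) detail-page spider.

Pulls crawl tasks from the MongoDB crawl table, fetches each detail page with a
logged-in session (JSON endpoint or plain HTML), cleans the content and stores
the result in the save table.
"""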
import random
import re
import time

import requests
from lxml.html import fromstring, HtmlElement, tostring
from lxml.html.clean import Cleaner

from crawler.check_utils import CheckText, CheckTask
from crawler.clean_html import cleaner
from crawler.crawl_scheduler import Scheduler
from crawler.login import login, load_login_cookies, login_check
from utils.databases import mongo_table, int2long
from utils.execptions import YbwCrawlError
from utils.log import logger


def iter_node(element: HtmlElement):
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element)


def pre_parse(element: HtmlElement):
    """Pre-clean the HTML; this may break the original structure, so XPath
    expressions written against the raw HTML can become invalid."""
    pre_remove = {
        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
        'display:none'
    }
    # Drop every node whose id, class or style attribute matches the block list.
    for node in iter_node(element):
        id_attr = node.attrib.get('id')
        class_attr = node.attrib.get('class')
        style_attr = node.attrib.get('style')
        if any([id_attr in pre_remove,
                class_attr in pre_remove,
                style_attr in pre_remove]):
            node.drop_tree()
    return element


def page_source(element: HtmlElement):
    clear = Cleaner(
        forms=False,
        style=True
    )
    return clear.clean_html(tostring(element, encoding="utf-8").decode())


class DetailSpider:

    def __init__(
            self,
            db: str,
            crawl_tab: str,
            save_tab: str,
    ):
        self.crawl_tab = mongo_table(db, crawl_tab)
        self.save_tab = mongo_table(db, save_tab)
        self.save_url = mongo_table("editor", "source_url")
        self.user = None

    def _update_crawl_task(self, tid, **kwargs):
        self.crawl_tab.update_one({'_id': tid}, {'$set': kwargs})

    def _lock_task(self, task: dict):
        update = {'crawl': True}
        self._update_crawl_task(task['_id'], **update)

    def _release_task(self, task: dict):
        update = {'crawl': False}
        self._update_crawl_task(task['_id'], **update)

    def json_request(self, fid, request_params):
        url = "https://www.chinabidding.cn/agency.info.Detail/show"
        params = {"fid": f"{fid}"}
        res = requests.get(url, params=params, **request_params)
        return res

    def crawl_request(self, item: dict):
        url = item['competehref']
        headers = {
            'Host': 'www.chinabidding.cn',
            'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 60)
        retries = 0
        retries_502, max_retries_502 = 0, 15
        proxy, proxies = None, None
        while retries < 3:
            if retries_502 > max_retries_502:
                # The site has removed this record
                self._update_crawl_task(item['_id'], crawl_status='remove')
                break
            login_cookies = load_login_cookies(self.user.phone)
            if login_cookies is None:
                login(*self.user)
                continue
            elif 'cookies' not in request_params:
                request_params.setdefault('cookies', login_cookies)
            else:
                request_params.update({'cookies': login_cookies})
            fid = "".join(re.findall(r'\?fid=(.*)', url))
            if fid:
                try:
                    r = self.json_request(fid, request_params)
                    # Check whether the account is still logged in
                    retry_login = login_check(self.user.phone, url, False)
                    if retry_login:
                        logger.info(f"[重新登录]{self.user.phone}")
                        _, code = login(*self.user, proxies=proxies)
                        if code == 200:
                            retries += 1
                        else:
                            time.sleep(1800)
                            retries += 1
                        continue
                    logger.info(f'[采集正文] fid_{fid}')
                    return r
                except Exception:
                    retries += 1
                    continue
            else:
                try:
                    r = requests.get(url, **request_params)
                    # Check whether the account is still logged in
                    retry_login = login_check(self.user.phone, url, False)
                    if retry_login:
                        logger.info(f"[重新登录]{self.user.phone}")
                        _, code = login(*self.user, proxies=proxies)
                        if code == 200:
                            retries += 1
                        else:
                            time.sleep(1800)
                            retries += 1
                        continue
                    element = fromstring(r.text)
                    nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                    if len(nodes) != 1:
                        retries_502 += 1
                        logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
                        continue
                    else:
                        node = nodes[0]
                        logger.info(f'[采集正文] id={node.attrib.get("id")}')
                        return r
                except requests.RequestException:
                    retries += 1
                    continue
        return None

    def crawl_response(self, response, item):
        if re.match(r'^\{', response.text):
            # JSON response: the page body is in c_info.content
            html = response.json().get('c_info').get('content')
        else:
            element: HtmlElement = fromstring(response.text)
            node = element.xpath('//*[@id="infoDescription"]')[0]
            node = pre_parse(node)
            features = {
                './div[@class="ckgys_cont"]',
                './/div[@class="detail-title ng-scope"]',
                './/table[@class="detail_Table"]',
            }
            for feature in features:
                extract_node = node.xpath(feature)
                if len(extract_node) > 0:
                    valid_node = extract_node[0]
                    break
            else:
                valid_node = node
            html = page_source(valid_node)
        # Check the raw page content for an external source link
        source_url = re.search(r'.*<a href="(.*?)">点击查看内容', html)
        if source_url:
            self.save_url.insert_one({
                "site": "元博网",
                "title": item['title'],
                "source_url": source_url.group(1),
                "comeintime": int2long(int(time.time()))
            })
        CheckText(html)
        item["contenthtml"] = html
        special = {
            '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
            # 'DD000E;|EE000F;|FF000E;': '',
            r'[(]?[)]?[A-Z]{2}000[A-Z]{1};[(]?[\d{1,4}]*[;]?[)]?[;]?': '',
        }
        item["detail"] = cleaner(html, special)
        item["comeintime"] = int2long(int(time.time()))
        # Check the cleaned detail text for an external source link
        source_url = re.search(r'.*<a href="(.*?)">点击查看内容', item["detail"])
        if source_url:
            self.save_url.insert_one({
                "site": "元博网",
                "title": item['title'],
                "source_url": source_url.group(1),
                "comeintime": int2long(int(time.time()))
            })
        CheckText(item["detail"])
        insert = {}
        for key, val in item.items():
            if key not in ['crawl_status', 'crawl', 'count', '_id']:
                insert[key] = val
        self.save_tab.insert_one(insert)
        logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))
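
    # Note (assumption inferred from how fields are used below): a crawl-task
    # document from `crawl_tab` is expected to carry at least `_id`,
    # `competehref`, `title`, `publishtime`, `spidercode` and `count`; the
    # spider itself maintains the `crawl` (lock) and `crawl_status` fields.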
    def crawl_spider(self, sc: Scheduler):
        while True:
            next_task_interval = None
            logger.info(f"[count:]{str(sc.count)}")
            if sc.count >= sc.total:
                return True
            item = sc.crawl_task
            if len(item) == 0:
                return False
            self._lock_task(item)
            # Record the spider code and source URL for error reporting
            sc.spider_code = item['spidercode']
            sc.crawl_url = item['competehref']
            try:
                # Validate the crawl task before requesting the page
                CheckTask(item)
                response = self.crawl_request(item)
                if response is not None:
                    self.crawl_response(response, item)
                    self._update_crawl_task(item["_id"], crawl_status='finished')
                    sc.crawl_counter(1)
                    next_task_interval = random.choice(range(3, 9))
            except (YbwCrawlError, Exception) as e:
                if getattr(e, 'code', None) is None:
                    err = YbwCrawlError(unknown_err=e)
                    sc.err_record(err)
                elif e.code == 10105:
                    # Duplicate data: update the count from the ES query result
                    self._update_crawl_task(item["_id"], count=item['count'])
                    logger.info('[重复数据]{}-{}'.format(item['title'], item['publishtime']))
                else:
                    sc.err_record(e)
                    self._update_crawl_task(item["_id"], crawl_status='error')
                    logger.info('[问题数据]{}-{}'.format(item['title'], item['publishtime']))
                sc.crawl_counter(0)
                next_task_interval = 0.1
            finally:
                self._release_task(item)
                sc.wait_for_next_task(next_task_interval)

    def start(self):
        while True:
            with Scheduler(site='元博网', crawl_type='detail') as scheduler:
                if scheduler.crawl_start:
                    self.user = scheduler.user
                    self.crawl_spider(scheduler)
                    scheduler.finished(10)


if __name__ == '__main__':
    DetailSpider(
        db='py_spider',
        crawl_tab='ybw_list',
        save_tab='data_bak',
    ).start()