ybw_details.py

# -*- coding: utf-8 -*-
"""
Created on 2024-06-17
---------
@summary: 元博网 (chinabidding.cn) detail-page crawler
---------
@author: Lzz
"""
import random
import re
import time

import requests
from lxml.html import fromstring, HtmlElement, tostring
from lxml.html.clean import Cleaner
from pymongo import MongoClient

import setting
import utils.tools as tool
from dbs.RedisDB import RedisFilter
from log import logger
from utils.check_utils import CheckText, CheckTask
from utils.clean_html import cleaner
from utils.login import User, load_login_cookies, login, login_check

_proxies = setting.PROXIES


def iter_node(element: HtmlElement):
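    """Depth-first traversal that yields the element itself and every HtmlElement below it."""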
    yield element
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element)


def pre_parse(element: HtmlElement):
  30. """对 HTML 进行预处理可能会破坏 HTML 原有的结构,导致根据原始 HTML 编写的 XPath 不可用"""
    pre_remove = {
        'log_col2', 'log_col1', 'cz', 'iconfont closei', 'p2 p1', 'cnxh_b',
        'msg_error', 'r_gg TB-focus', 'april', 'cont2', 'to_login', 'regtxt',
        'shouchang an_n sc', 'april_title red', 'cn_lt', 'dayin an_n',
        'dl_zc vip_t free_member', 'rmbq', 'login-form cl', 'dian_g fr',
        'di_n', 'd_fx', 'd_tub', 'd_dy', 'anniu1', 'cnxh_list', 'btns cl',
        'active', 'close', 'd_an fr', 'avatar', 'toolbar', 'deng_l',
        'cen_right fr', 'log_col5', 'agreement', 'log_col3',
        'shouchang_af an_n sc_after', 'fast_box', 'di_nr fl', 'xgfj', 'dianh',
        'cnxh_list tab_b2 city_list', 'contract cl', 'zb_cen_r fr', 'd_zsms',
        'sc_after active', 'dl_k', 'ewm_b', 'fl', 'wypj', 'rukou', 'p1',
        'dl_zc', 'success', 'daoh h_30', 'bd', 'april_content', 'print',
        'foot', 'cnxh zbgg', 'april_first', 'fastlog', 'tx_mc user_name',
        'tab_h2', 'fanding an_n', 'toux', 'log_col4 cl', 'hangy rem_1', 'red',
        'regshadow', 'bottom', 'dl_zc vip_t fee_member', 'xszn fl', 'no-print',
        'cnxh_b zbgg_b', 'rem rem_1', 'logshadowz', 'd_pj fl', 'tjgjc',
        'spdujaiwlohh', 'di_ewm fr', 'dian_h fl',
        'tab_h2 zbgg_b_gray', 'fanshou an_n fs', 'login-btn', 'fl gjc',
        'agreeshadow', 'guang_db', 'footer_1', 'log_p', 'cnxh_list tab_b2',
        'd_sw', 'april_close', 'd_sc', 'erweima no-print', 'qgzx', 'p2', 'sc',
        'hd', 'log_col6', 'dh_b', 'dian_guang', 'zhu_c', 'ck cai_k', 'april_box',
        'display:none'
    }
    for node in iter_node(element):
        id_attr = node.attrib.get('id')
        class_attr = node.attrib.get('class')
        style_attr = node.attrib.get('style')
        if any([id_attr in pre_remove,
                class_attr in pre_remove,
                style_attr in pre_remove]):
            node.drop_tree()
    return element


def page_source(element: HtmlElement):
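    """Serialize the element back to an HTML string, stripping <style> content while keeping form elements."""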
    clear = Cleaner(
        forms=False,
        style=True
    )
    return clear.clean_html(tostring(element, encoding="utf-8").decode())


class DetailSpider:
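    """元博网 detail-page spider.

    Pulls pending tasks from MongoDB, fetches each detail page with a logged-in account,
    cleans the HTML and writes the result into the ``data_bak`` collection.
    """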

    def __init__(self):
        _mgo = MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
        self.ybw_list = _mgo[setting.MONGO_DB]["ybw_list"]
        self.ybw_info = _mgo[setting.MONGO_DB]["ybw_info"]
        self.save_tab = _mgo[setting.MONGO_DB]["data_bak"]
        self.dedup = RedisFilter()
        self.user = User(phone=setting.ACCOUNT, passwd=setting.PASSWORD)
        self.login_times = 0

    def json_request(self, fid, request_params):
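        """Request the detail record as JSON from the agency.info.Detail/show endpoint."""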
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Referer": "https://www.chinabidding.cn/public/bidagency/index.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        }
        url = "https://www.chinabidding.cn/agency.info.Detail/show"
        params = {"fid": f"{fid}"}
        return requests.get(url, headers=headers, params=params, **request_params)

    def crawl_request(self, item: dict):
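        """Fetch the detail page for one task, re-logging in when the session has expired.

        If the URL carries a ``fid`` query parameter the JSON endpoint is used, otherwise the
        HTML page is requested directly. Gives up after 3 failed attempts, or marks the record
        as removed when the expected ``main_dom`` node keeps missing.
        """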
        url = item["competehref"]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Host': 'www.chinabidding.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        }
        request_params = {}
        request_params.setdefault('headers', headers)
        request_params.setdefault('timeout', 30)
        request_params.setdefault('proxies', _proxies)
        # proxy settings used for login requests
        proxy_params = dict(proxies=_proxies, timeout=180)

        retries = 0
        retries_502, max_retries_502 = 0, 3
        while retries < 3:
            if retries_502 > max_retries_502:
                # the site has removed this record
                self.ybw_list.update_one({'_id': item["_id"]}, {'$set': {"crawl_status": "remove"}})
                break

            login_cookies = load_login_cookies(self.user.phone)
            if login_cookies is None:
                login(*self.user, **proxy_params)
                self.login_times += 1
                self.update_account_login_times()
                continue

            if 'cookies' not in request_params:
                request_params.setdefault('cookies', login_cookies)
            else:
                request_params.update({'cookies': login_cookies})

            fid = "".join(re.findall(r'\?fid=(.*)', url)).split('&')[0]
            if fid:
                try:
                    request_params.pop('headers', None)
                    r = self.json_request(fid, request_params)
                    # check the account's login state
                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
                    if retry_login:
                        logger.info(f"[重新登录]{self.user.phone}")
                        _, code = login(*self.user, **proxy_params)
                        self.login_times += 1
                        retries += 1
                        if code != 200:
                            time.sleep(600)
                        continue
                    logger.info(f'[采集正文] fid_{fid}')
                    return r
                except Exception:
                    retries += 1
                finally:
                    self.update_account_login_times()
            else:
                try:
                    r = requests.get(url, **request_params)
                    # check the account's login state
                    retry_login = login_check(self.user.phone, url, False, **proxy_params)
                    if retry_login:
                        logger.info(f"[重新登录]{self.user.phone}")
                        _, code = login(*self.user, **proxy_params)
                        self.login_times += 1
                        retries += 1
                        if code != 200:
                            time.sleep(1800)
                        continue
                    element = fromstring(r.content.decode())
                    nodes = element.xpath('//*[@id="main_dom"]/div[1]')
                    if len(nodes) != 1:
                        retries_502 += 1
                        logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
                    else:
                        node = nodes[0]  # list index out of range
                        logger.info(f'[采集正文] id={node.attrib.get("id")}')
                        return r
                except requests.RequestException:
                    retries += 1
                finally:
                    self.update_account_login_times()

        return None

    def crawl_response(self, response, item):
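        """Parse the response (JSON or HTML), clean the detail content and store it in MongoDB."""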
        if re.match(r'^\{', response.text):
            html = response.json().get('c_info').get('content')
        else:
            element: HtmlElement = fromstring(response.text)
            node = element.xpath('//*[@id="infoDescription"]')[0]
            node = pre_parse(node)
            features = {
                './div[@class="ckgys_cont"]',
                './/div[@class="detail-title ng-scope"]',
                './/table[@class="detail_Table"]',
            }
            for feature in features:
                extract_node = node.xpath(feature)
                if len(extract_node) > 0:
                    valid_node = extract_node[0]
                    break
            else:
                valid_node = node
            html = page_source(valid_node)

        # check the raw page content
        CheckText(html)
        item["contenthtml"] = html
        special = {
            '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
            # 'DD000E;|EE000F;|FF000E;': '',
            r'[(]?[)]?[A-Z]{2}000[A-Z]{1};[(]?[\d{1,4}]*[;]?[)]?[;]?': '',
        }
        item["detail"] = cleaner(html, special)
        item["comeintime"] = tool.int2long(int(time.time()))
        # check the cleaned detail content
        CheckText(item["detail"])

        insert = {}
        for key, val in item.items():
            if key not in ['crawl_status', 'crawl', 'count', '_id']:
                insert[key] = val
        self.save_tab.insert_one(insert)
        logger.info('[采集成功]{}-{}'.format(item['title'], item['publishtime']))

    def update_account_login_times(self):
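        """Persist the current login counter for this account into the ybw_info collection."""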
        self.ybw_info.update_one(
            {"account": self.user.phone},
            {"$set": {
                "login_times": self.login_times,
                "update_time": tool.get_current_date()
            }}
        )

    def crawl_spider(self, account, item):
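        """Run the full crawl for one task; return True on success, False (with the error
        recorded in ``crawl_status``) on failure."""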
        _id = item["_id"]
        err = "unknown error"
        try:
            CheckTask(item)  # validate the crawl task
            response = self.crawl_request(item)
            if response is not None:
                self.crawl_response(response, item)
                self.ybw_list.update_one({"_id": _id}, {"$set": {"crawl_status": "finished"}})
                self.ybw_info.update_one(
                    {"account": self.user.phone},
                    {"$set": {
                        "count": account["count"] + 1,
                        "update_time": tool.get_current_date(),
                    }}
                )
            return True
        except Exception as e:
            err = e
            logger.error(f"请求错误:{err}")
            self.ybw_list.update_one({'_id': _id}, {'$set': {'crawl_status': f'{err}'}})
            return False

    def start(self):
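        """Pull up to 100 pending list entries and crawl their detail pages, respecting the
        account's login-count and daily download limits."""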
        logger.info(" *** start ***")
        query = {"crawl_status": {"$exists": False}, "es_count": 0}
        sort = [('publishtime', -1)]
        limit = 100
        with self.ybw_list.find(query, sort=sort).limit(limit) as cursor:
            tasks = [doc for doc in cursor]

        download_count = 0
        for item in tasks:
            # look up the account
            account = self.ybw_info.find_one({"account": self.user.phone})
            if account is None:
                logger.error(f"数据库无此账号信息|{self.user.phone}")
                return
            # check the login count
            self.login_times = account["login_times"]
            if self.login_times >= 3:
                logger.warning(f"账号限制|{self.user.phone}")
                return
            # delay crawling: skip items published less than 12 hours ago
            publish_ts = tool.date_to_timestamp(item["publishtime"])
            if publish_ts > int(time.time()) - 43200:
                logger.warning("未到采集时间")
                continue

            fp = "detail_" + item.get("competehref")
            if not self.dedup.get(fp):
                self.dedup.add(fp)
                download_count += 1
                rst = self.crawl_spider(account, item)
                if not rst:
                    self.dedup.delete(fp)  # allow a failed item to be retried later
                if download_count >= account["total"]:
                    logger.warning("当日采集数量已达上限")
                    break
                time.sleep(random.randint(80, 180))

        logger.info(" *** end ***")


if __name__ == '__main__':
    DetailSpider().start()