# coding: utf-8
import time
import uuid

import httpx
import requests
from loguru import logger
from lxml.html import fromstring

from tools import dedup
from tools import get_proxy, ua
from tools import news_keyword_coll, news_list_coll


def analysis_info(site, page, search_word, response):
    data_count = 0
    select_lst = []
    if response:
        html_parse = fromstring(response.content.decode())
        select_lst = html_parse.xpath("//div[@id='content_left']//h3/a")
    for elem in select_lst:
        # The aria-label looks like "标题:<title>"; strip the prefix.
        title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
        url = elem.xpath("./@href")[0]
        if not dedup.get(url):
            item = dict(
                _id=str(uuid.uuid4()),
                url=url,
                list_title=title,
                searchengine="baidu",
                searchwords=search_word,
                site=site,
            )
            news_list_coll.insert_one(item)
            dedup.add(url)
            data_count += 1
    tips = [
        f"page {page} -- {search_word}",
        f"scraped: {len(select_lst)}",
        f"stored: {data_count}",
    ]
    logger.info(", ".join(tips))


def get_list_response(key, page_num, follow_redirects=False):
    url = "https://www.baidu.com/s"
    headers = {
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3;q=0.7"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "User-Agent": ua.random,
    }
    params = {
        "tn": "news",
        "rtt": "4",
        "bsst": "1",
        "cl": "2",
        "wd": key,
        "medium": "0",
        "tngroupname": "organic_news",
        "newVideo": "12",
        "goods_entry_switch": "1",
        "rsv_dl": "news_b_pn",
        "pn": page_num * 20,  # Baidu news paginates 20 results per page
    }
    try:
        proxy = get_proxy()
    except requests.RequestException as e:
        logger.error(f"HTTP proxy exception: {e}")
        raise
    request_kwargs = dict(
        headers=headers,
        params=params,
        timeout=10,
        proxy=proxy,  # `proxy=` requires httpx >= 0.26; older versions use `proxies=`
        # Baidu serves its captcha challenge via a 302 redirect; keeping
        # follow_redirects=False lets the challenge surface as a redirect
        # status instead of silently landing on the captcha page.
        follow_redirects=follow_redirects,
    )
    try:
        response = httpx.get(url, **request_kwargs)
        response.raise_for_status()
        return response
    except httpx.HTTPError as exc:
        logger.error(f"HTTP exception for {exc.request.url} - {exc}")


def baidu_search(document):
    titles = document["key"]
    site = document["site"]
    for pn in range(0, 4):  # first four result pages per keyword
        try:
            response = get_list_response(f"intitle:{titles}", pn)
            analysis_info(site, pn + 1, titles, response)
        except Exception:
            # get_list_response re-raises proxy failures; give up on this
            # keyword and move on to the next one.
            break
    news_keyword_coll.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})


def start():
    # Reset the `down` flag on every keyword so the full list is re-crawled.
    news_keyword_coll.update_many({}, {"$unset": {"down": ""}})
    search_items = list(news_keyword_coll.find({"down": {"$exists": 0}}))
    while search_items:
        item = search_items.pop(0)
        baidu_search(item)


if __name__ == "__main__":
    while True:
        start()
        logger.info("Round finished; sleeping for 3 hours before the next run.")
        time.sleep(3 * 3600)
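
# ---------------------------------------------------------------------------
# Note: the `tools` module imported above is not part of this file. Based
# purely on how its helpers are used here, a minimal stand-in might look like
# the sketch below. Every implementation detail in it is an assumption for
# illustration (names, endpoint, storage), not the project's actual code.
#
#     import random
#     import requests
#
#     class _Dedup:
#         """In-memory URL dedup; the real one is likely Redis-backed."""
#         def __init__(self):
#             self._seen = set()
#         def get(self, url):
#             return url in self._seen  # truthy => already crawled
#         def add(self, url):
#             self._seen.add(url)
#     dedup = _Dedup()
#
#     def get_proxy():
#         # Assumed to fetch a proxy URL such as "http://host:port" from a
#         # proxy pool over HTTP, which is why the caller catches
#         # requests.RequestException. Hypothetical endpoint:
#         return requests.get("http://127.0.0.1:5010/get", timeout=5).text
#
#     class _UA:
#         @property
#         def random(self):
#             # Stand-in for something like fake_useragent.UserAgent().
#             return random.choice(["Mozilla/5.0 (Windows NT 10.0; Win64; x64)"])
#     ua = _UA()
#
#     # news_keyword_coll and news_list_coll behave like pymongo collections:
#     # the script only relies on insert_one, update_one, update_many and find.
# ---------------------------------------------------------------------------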