news_list.py

# coding: utf-8
import time
import uuid

import httpx
import requests
from loguru import logger
from lxml.html import fromstring

from tools import dedup
from tools import get_proxy, ua
from tools import news_keyword_coll, news_list_coll
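
# The helpers imported from `tools` are not shown in this file; based on how they
# are used below they are assumed to provide roughly:
#   dedup             - a URL de-duplication store with .get(url) / .add(url)
#   get_proxy()       - returns a proxy URL string for outgoing requests
#   ua                - a user-agent pool exposing .random (fake_useragent style)
#   news_keyword_coll / news_list_coll - MongoDB-like collections exposing
#                       insert_one, update_one, update_many and find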


def analysis_info(site, page, search_word, response):
    data_count = 0
    select_lst = []
    if response:
        html_parse = fromstring(response.content.decode())
        select_lst = html_parse.xpath("//div[@id='content_left']//h3/a")
    for elem in select_lst:
        # The aria-label carries the headline as "标题:<title>"; strip the Chinese "Title:" prefix.
        title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
        url = elem.xpath("./@href")[0]
        if not dedup.get(url):
            item = dict(
                _id=str(uuid.uuid4()),
                url=url,
                list_title=title,
                searchengine="baidu",
                searchwords=search_word,
                site=site,
            )
            news_list_coll.insert_one(item)
            dedup.add(url)
            data_count += 1
    tips = [
        f"page {page} -- {search_word}",
        f"scraped: {len(select_lst)}",
        f"stored: {data_count}"
    ]
    logger.info(",".join(tips))
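
# Illustrative only: the XPath in analysis_info assumes each result anchor looks roughly like
#   <div id="content_left"> ... <h3><a href="https://..." aria-label="标题:Some headline">...</a></h3> ... </div>
# i.e. the headline is carried in the aria-label attribute with a "标题:" ("Title:") prefix.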


def get_list_response(key, page_num, follow_redirects=False):
    url = "https://www.baidu.com/s"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "User-Agent": ua.random,
    }
    params = {
        "tn": "news",
        "rtt": "4",
        "bsst": "1",
        "cl": "2",
        "wd": key,
        "medium": "0",
        "tngroupname": "organic_news",
        "newVideo": "12",
        "goods_entry_switch": "1",
        "rsv_dl": "news_b_pn",
        "pn": page_num * 20
    }
    try:
        proxy = get_proxy()
    except requests.RequestException as e:
        logger.error(f"HTTP proxy exception: {e}")
        raise e
    request_kwargs = dict(
        headers=headers,
        params=params,
        timeout=10,
        proxy=proxy,
        follow_redirects=follow_redirects  # Baidu's captcha check answers with a 302 redirect; don't follow it by default
    )
    try:
        response = httpx.get(url, **request_kwargs)
        response.raise_for_status()
        return response
    except httpx.HTTPError as exc:
        logger.error(f"HTTP Exception for {exc.request.url} - {exc}")
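
# Each keyword document pulled from news_keyword_coll is assumed to look roughly like
#   {"_id": ..., "key": "<search keyword>", "site": "<source site>", "down": 1}
# where "down" is set once the keyword has been crawled in this round (see start() below).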


def baidu_search(document):
    titles = document["key"]
    site = document["site"]
    for pn in range(0, 4):
        try:
            response = get_list_response(f"intitle:{titles}", pn)
            analysis_info(site, pn + 1, titles, response)
        except Exception:
            break
    news_keyword_coll.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})


def start():
    news_keyword_coll.update_many({}, {"$unset": {"down": ""}})  # reset the "down" flag on every keyword
    search_items = [item for item in news_keyword_coll.find({"down": {"$exists": 0}})]
    while search_items:
        item = search_items.pop(0)
        baidu_search(item)


if __name__ == '__main__':
    while 1:
        start()
        logger.info("Round finished; sleeping for 3 hours before the next run.")
        time.sleep(3 * 3600)