# coding: utf-8
"""Baidu News keyword crawler.

Pulls keywords from ``news_keyword_coll``, searches Baidu News for each one,
stores previously unseen result links in ``news_list_coll``, and repeats the
whole pass every three hours.
"""
import time
import uuid

import httpx
import requests
from loguru import logger
from lxml.html import fromstring

from tools import dedup, get_proxy, ua
from tools import news_keyword_coll, news_list_coll
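
# ``tools`` is a local module that is not shown here. From the way it is used
# below it is assumed to provide roughly:
#   dedup              -- a persistent de-duplication store with get(url) / add(url)
#   get_proxy()        -- returns a proxy URL usable by httpx; the caller expects
#                         it may raise requests.RequestException
#   ua.random          -- a random User-Agent string (fake_useragent-style pool)
#   news_keyword_coll  -- MongoDB-style collection of search keywords
#   news_list_coll     -- MongoDB-style collection that receives scraped links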


def analysis_info(site, page, search_word, response):
    """Parse one Baidu News result page and insert links not seen before."""
    data_count = 0
    select_lst = []
    if response:
        html_parse = fromstring(response.content.decode())
        select_lst = html_parse.xpath("//div[@id='content_left']//h3/a")
        for elem in select_lst:
            # The aria-label attribute carries the headline prefixed with "标题:" ("Title:").
            title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
            url = elem.xpath("./@href")[0]
            if not dedup.get(url):
                item = dict(
                    _id=str(uuid.uuid4()),
                    url=url,
                    list_title=title,
                    searchengine="baidu",
                    searchwords=search_word,
                    site=site,
                )
                news_list_coll.insert_one(item)
                dedup.add(url)
                data_count += 1
    tips = [
        f"page {page} -- {search_word}",
        f"scraped: {len(select_lst)}",
        f"stored: {data_count}",
    ]
    logger.info(", ".join(tips))


def get_list_response(key, page_num, follow_redirects=False):
    """Fetch one page of Baidu News results for ``key`` (``page_num`` is 0-based)."""
    url = "https://www.baidu.com/s"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "User-Agent": ua.random,
    }
    params = {
        "tn": "news",
        "rtt": "4",
        "bsst": "1",
        "cl": "2",
        "wd": key,
        "medium": "0",
        "tngroupname": "organic_news",
        "newVideo": "12",
        "goods_entry_switch": "1",
        "rsv_dl": "news_b_pn",
        "pn": page_num * 20,  # Baidu News paginates in steps of 20 results
    }
    try:
        proxy = get_proxy()
    except requests.RequestException as e:
        logger.error(f"Failed to obtain a proxy: {e}")
        raise
    request_kwargs = dict(
        headers=headers,
        params=params,
        timeout=10,
        proxy=proxy,  # the proxy= keyword needs a recent httpx; older releases used proxies=
        follow_redirects=follow_redirects,  # Baidu's image captcha is served via a 302 redirect
    )
    try:
        response = httpx.get(url, **request_kwargs)
        response.raise_for_status()
        return response
    except httpx.HTTPError as exc:
        # Returns None on failure; analysis_info() tolerates a falsy response.
        logger.error(f"HTTP Exception for {exc.request.url} - {exc}")


def baidu_search(document):
    """Fetch up to four result pages for one keyword document, then mark it done."""
    titles = document["key"]
    site = document["site"]
    for pn in range(0, 4):
        try:
            response = get_list_response(f"intitle:{titles}", pn)
            analysis_info(site, pn + 1, titles, response)
        except Exception as exc:
            logger.error(f"Stopping pagination for {titles!r}: {exc}")
            break
    # Flag the keyword as processed for this pass.
    news_keyword_coll.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})


def start():
    # Reset the ``down`` flag on every keyword so each pass re-processes them all.
    news_keyword_coll.update_many({}, {"$unset": {"down": ""}})
    search_items = list(news_keyword_coll.find({"down": {"$exists": 0}}))
    for item in search_items:
        baidu_search(item)


if __name__ == "__main__":
    while True:
        start()
        logger.info("Pass finished; sleeping for 3 hours before the next run.")
        time.sleep(3 * 3600)