Add new project

dzr 1 year ago
commit a7da57192f
7 changed files with 415 additions and 2 deletions
  1. Dockerfile (+44 -0)
  2. README.md (+15 -2)
  3. docker-compose.yml (+39 -0)
  4. news_detail.py (+150 -0)
  5. news_list.py (+112 -0)
  6. requirements.txt (+8 -0)
  7. tools.py (+47 -0)

Dockerfile (+44 -0)

@@ -0,0 +1,44 @@
+# Base image
+FROM ubuntu:20.04
+
+# UTF-8 locale so Chinese text is handled correctly
+ENV LANG=C.UTF-8
+
+# Set the container timezone to Asia/Shanghai
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
+
+# Switch apt sources to the Aliyun mirror (ECS-internal endpoint)
+RUN sed -i s@/archive.ubuntu.com/@/mirrors.cloud.aliyuncs.com/@g /etc/apt/sources.list
+RUN sed -i s@/security.ubuntu.com/@/mirrors.cloud.aliyuncs.com/@g /etc/apt/sources.list
+RUN apt-get clean && apt-get update
+RUN apt-get install -y wget unzip curl vim
+
+# Set vim encoding (prevents garbled Chinese text)
+RUN grep -qxF 'set encoding=utf8' /etc/vim/vimrc || echo 'set encoding=utf8' >> /etc/vim/vimrc
+
+# Build dependencies (gcc toolchain and dev headers) for compiling Python 3.8.10
+WORKDIR /opt
+RUN apt-get install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev gcc g++ libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev libncurses-dev libnspr4-dev libdbus-glib-1-2
+# Download and unpack the Python 3.8.10 source
+RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
+# Enter the source directory
+WORKDIR /opt/Python-3.8.10
+# Create the install prefix and compile. Set "make -j" to the core count of the build host (check it with "nproc"); "make altinstall" avoids overwriting the default python environment.
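+# e.g. "make -j$(nproc)" would size the build to the host's core count automatically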
+RUN mkdir /usr/local/python38 && ./configure --prefix=/usr/local/python38 && make -j 8 && make altinstall
+# Point the python3 / pip3 symlinks at the new build
+RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3.8 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
+# Switch pip to the Aliyun index and upgrade pip
+RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && pip3 install --upgrade pip
+
+# Python install prefix (used below to extend PATH)
+ENV VIRTUAL_ENV=/usr/local/python38
+# Prepend it to the system PATH
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Copy the project requirements and install them
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+RUN pip3 install httpx[socks]
+
+# Working directory (the project is mounted here by docker-compose)
+WORKDIR /mnt

README.md (+15 -2)

@@ -1,3 +1,16 @@
-# news_baidu
+# Baidu News Sentiment Crawler
 
-Baidu news sentiment
+#### Build the image
+```shell
+docker build -t baidu_news:latest . 
+```
+
+#### Deploy and run
+```shell
+docker-compose up -d
+```
+
+#### Stop
+```shell
+docker-compose down
+```
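+
+#### View logs
+To follow the two crawlers while they run, something like:
+```shell
+docker-compose logs -f
+```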

docker-compose.yml (+39 -0)

@@ -0,0 +1,39 @@
+version: "3"
+services:
+  crawl-list:
+    container_name: baidu_list
+    image: baidu_news:latest
+    volumes:
+      - /mnt/news_baidu:/mnt
+    restart: always
+    privileged: true
+    shm_size: 4GB
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200k"
+        max-file: "10"
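+    # Note: the "deploy" resource limits below take effect with Compose v2 (or "docker-compose --compatibility" on v1); classic v1 otherwise ignores them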
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+    command: 'python3 news_list.py'
+
+  crawl-detail:
+    container_name: baidu_detail
+    image: baidu_news:latest
+    volumes:
+      - /mnt/news_baidu:/mnt
+    restart: always
+    privileged: true
+    shm_size: 4GB
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200k"
+        max-file: "10"
+    deploy:
+      resources:
+        limits:
+          memory: 10G
+    command: 'python3 news_detail.py'

news_detail.py (+150 -0)

@@ -0,0 +1,150 @@
+# coding:utf-8
+import re
+import time
+import warnings
+from concurrent.futures import ThreadPoolExecutor, wait
+from urllib.parse import urlparse
+
+import httpx
+import urllib3
+from gne import GeneralNewsExtractor
+from loguru import logger
+
+from tools import news_list, news_detail, UserAgent, requests
+
+warnings.simplefilter("ignore", UserWarning)
+urllib3.disable_warnings()
+extractor = GeneralNewsExtractor()
+UA = UserAgent()
+
+
+def extract_chinese(text):
+    """Return True if the text contains any Chinese (CJK) characters."""
+    pattern = re.compile(r'[\u4e00-\u9fff]+')  # CJK Unified Ideographs range
+    return bool(pattern.search(text))
+
+
+def date_to_timestamp(pubulishtime):
+    """Convert a "YYYY-MM-DD" date string to a Unix timestamp (container-local time)."""
+    timeArray = time.strptime(pubulishtime, "%Y-%m-%d")
+    timestamp = int(time.mktime(timeArray))
+    return timestamp
+
+
+def httpx_get_url(info):
+    # Force the link to https (this path handles baijiahao articles, see run())
+    info["url"] = info["url"] if str(info["url"]).count("https") else str(info["url"]).replace("http", "https")
+
+    headers = {
+        "Accept": "text/html,application/xhtml+xml, application/xml;q=0.9, image/avif, image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    }
+
+    req = httpx.get(str(info["url"]), timeout=10, headers=headers)
+    if req.status_code == 200:
+        item = {}
+        res = urlparse(info["url"])
+        result = extractor.extract(req.text, with_body_html=False)
+        item["title"] = result["title"]
+        if "list_title" in info:
+            item["list_title"] = info["list_title"]
+
+        item["detail"] = result["content"]
+        item["contenthtml"] = req.text
+        # Normalize publish_time to YYYY-MM-DD (drop time of day, convert Chinese and slash dates)
+        new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+        item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+        item["infourl"] = info["url"]
+        item["domain"] = res.netloc
+        item["searchwords"] = info["searchwords"]
+        item["searchengine"] = "baidu"
+        item["comeintime"] = int(time.time())
+        # item["project"] = info["project"]
+        item["type"] = True
+        news_detail.insert_one(item)
+        logger.info(f"下载信息: {item['title']}")
+
+
+def get_url(info):
+    headers = {
+        "Accept": "application/json",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": UA.random,
+    }
+    try:
+        req = requests.get(info["url"],  headers=headers, timeout=10, verify=False)
+        req.encoding = req.apparent_encoding
+        if req.status_code == 200:
+            item = {}
+            res = urlparse(info["url"])
+            result = extractor.extract(req.text, with_body_html=False)
+            item["title"] = result["title"]
+            if "list_title" in info:
+                item["list_title"] = info["list_title"]
+
+            item["detail"] = result["content"]
+            item["contenthtml"] = req.text
+            # Normalize publish_time to YYYY-MM-DD (drop time of day, convert Chinese and slash dates)
+            new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+            item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+            item["infourl"] = info["url"]
+            item["domain"] = res.netloc
+            item["searchwords"] = info["searchwords"]
+            item["searchengine"] = "baidu"
+            item["comeintime"] = int(time.time())
+            item["site"] = info["site"]
+            item["type"] = True
+            news_detail.insert_one(item)
+            logger.info(f"下载信息:{item['title']}")
+    except Exception:
+        logger.error(f"下载失败:{info.get('list_title', info['url'])}")
+
+
+def run(task):
+    # baijiahao links are fetched with httpx (forced to https); everything else uses requests
+    if task["url"].count("baijiahao"):
+        httpx_get_url(task)
+    else:
+        get_url(task)
+
+    news_list.delete_one({"_id": task["_id"]})
+
+
+def spider(workers=1):
+    with ThreadPoolExecutor(max_workers=workers) as p:
+        fs = [p.submit(run, task) for task in news_list.find()]
+        wait(fs)
+
+
+def Isvalid():
+    """Post-process new detail records: repair truncated titles and flag publish-time validity."""
+    q = {"isvalid": {"$exists": 0}}
+    f = {"contenthtml": 0, "detail": 0}
+    with news_detail.find(q, projection=f, no_cursor_timeout=True) as cursor:
+        for info in cursor:
+            # Truncated or garbled titles ("...", "…") and titles without Chinese fall back to the list title
+            seqs = ["...", "…"]
+            title = info["title"]
+            if any(seq in title for seq in seqs) or not extract_chinese(title):
+                news_detail.update_one({"_id": info["_id"]}, {"$set": {"title": info["list_title"]}})
+
+            # Valid only if the publish time is between 2023-07-01 (epoch 1688140800, UTC+8) and now
+            isvalid = False
+            try:
+                if 1688140800 <= date_to_timestamp(info["pubulishtime"]) <= int(time.time()):
+                    isvalid = True
+            except ValueError:
+                pass
+
+            news_detail.update_one({"_id": info["_id"]}, {"$set": {"isvalid": isvalid}})
+            logger.info(f"数据校验:{info['title']}")
+
+
+if __name__ == '__main__':
+    while 1:
+        spider(workers=10)
+        Isvalid()
+        logger.info("本轮执行完成, 将延时5分钟后执行.")
+        time.sleep(300)

news_list.py (+112 -0)

@@ -0,0 +1,112 @@
+# coding: utf-8
+
+import time
+import uuid
+
+import httpx
+from loguru import logger
+from lxml import etree
+
+from tools import client, news_list, get_proxy, r, redis_key, sha1, ua
+
+
+def analysis_info(site, page, searchword, select_lst):
+    """Parse result anchors, skip URLs already seen in Redis, and batch-insert new items into Mongo."""
+    data_count = 0
+    data_lst = []
+    for elem in select_lst:
+        title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
+        url = elem.xpath("./@href")[0]
+        href_sign = sha1(url)
+        if not r.hexists(redis_key, href_sign):
+            data_lst.append(dict(
+                _id=str(uuid.uuid4()),
+                url=url,
+                list_title=title,
+                searchengine="baidu",
+                searchwords=searchword,
+                site=site,
+            ))
+            r.hset(redis_key, href_sign, 1)
+
+        if len(data_lst) >= 50:
+            news_list.insert_many(data_lst)
+            data_count += len(data_lst)
+            data_lst = []
+
+    if data_lst:
+        news_list.insert_many(data_lst)
+        data_count += len(data_lst)
+
+    tips = [
+        f"第{page}页--{searchword}",
+        f"采集量:{len(select_lst)}",
+        f"入库量:{data_count}"
+    ]
+    logger.info(",".join(tips))
+
+
+def get_url(key, page):
+    proxies = get_proxy()
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": ua.random,
+    }
+    params = {
+        "tn": "news",
+        "rtt": "4",
+        "bsst": "1",
+        "cl": "2",
+        "wd": key,
+        "medium": "0",
+        "tngroupname": "organic_news",
+        "newVideo": "12",
+        "goods_entry_switch": "1",
+        "rsv_dl": "news_b_pn",
+        "pn": page * 20
+    }
+    request_kwargs = dict(
+        headers=headers,
+        timeout=10,
+        proxies=proxies,
+        params=params,
+        follow_redirects=True
+    )
+    url = "https://www.baidu.com/s"
+    try:
+        req = httpx.get(url, **request_kwargs)
+        html = etree.HTML(req.text)
+        li_list = html.xpath("//div[@id='content_left']//h3/a")
+        return li_list
+    except Exception as e:
+        logger.exception(e)
+        return []
+
+
+def baidu_search(document):
+    searchwords = document["key"]
+    site = document["site"]
+
+    for i in range(0, 4):
+        # "intitle:" limits matches to the article title; only the first 4 result pages are crawled
+        extract_items = get_url(f"intitle:{searchwords}", i)
+        analysis_info(site, i + 1, searchwords, extract_items)
+
+    client.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})
+
+
+def start():
+    client.update_many({}, {"$unset": {"down": ""}})  # reset the "down" flag on every keyword
+    search_items = list(client.find({"down": {"$exists": 0}}))
+
+    while search_items:
+        items = search_items.pop(0)
+        baidu_search(items)
+
+
+if __name__ == '__main__':
+    while 1:
+        start()
+        logger.info("本轮执行完成, 将延时3小时后执行.")
+        time.sleep(3 * 3600)

requirements.txt (+8 -0)

@@ -0,0 +1,8 @@
+gne
+httpx==0.24.0
+loguru
+lxml
+pymongo==3.12.0
+redis==3.5.3
+requests==2.31.0
+fake-useragent

tools.py (+47 -0)

@@ -0,0 +1,47 @@
+# coding: utf-8
+
+import hashlib
+import logging
+
+import redis
+import requests
+from fake_useragent import UserAgent
+from pymongo import MongoClient
+
+ua = UserAgent()
+
+
+dbm = MongoClient("172.17.4.87", 27080).hp_news
+client = dbm.news_Keywords    # search keywords (documents carry "key", "site" and a "down" flag)
+news_list = dbm.news_list     # queue of list-page hits waiting for detail download
+news_detail = dbm.news_detail  # downloaded article details
+
+r = redis.Redis(
+    host='172.17.162.28',
+    password='k5ZJR5KV4q7DRZ92DQ',
+    port=7361,
+    db=19
+)
+redis_key = "news"
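+# "news" is a Redis hash of sha1(url) fingerprints; news_list.py uses it to skip already-seen links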
+
+
+def sha1(text: str):
+    """
+    Return the SHA-1 digest of the text as a hex string.
+    @param text: input string
+    @return: hex digest
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def get_proxy():
+    """Fetch a proxy from the internal proxy service; returns the URL under its "http" key, or None."""
+    headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
+    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+    proxy = proxy.get("data")
+    logging.info("切换代理:{}".format(proxy))
+    if not proxy:
+        return
+
+    return proxy.get("http")