Add new project

dzr 1 year ago
commit a7da57192f
7 changed files with 415 additions and 2 deletions
  1. Dockerfile (+44 -0)
  2. README.md (+15 -2)
  3. docker-compose.yml (+39 -0)
  4. news_detail.py (+150 -0)
  5. news_list.py (+112 -0)
  6. requirements.txt (+8 -0)
  7. tools.py (+47 -0)

Dockerfile (+44 -0)

@@ -0,0 +1,44 @@
+# Base image
+FROM ubuntu:20.04
+
+# UTF-8 locale so Chinese text is handled correctly
+ENV LANG=C.UTF-8
+
+# Set the container timezone to Asia/Shanghai
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
+
+# Switch apt sources to the Aliyun mirror (ECS-internal endpoint)
+RUN sed -i s@/archive.ubuntu.com/@/mirrors.cloud.aliyuncs.com/@g /etc/apt/sources.list
+RUN sed -i s@/security.ubuntu.com/@/mirrors.cloud.aliyuncs.com/@g /etc/apt/sources.list
+RUN apt-get clean && apt-get update
+RUN apt-get install -y wget unzip curl vim
+
+# Set vim encoding (prevents garbled Chinese text)
+RUN grep -qxF 'set encoding=utf8' /etc/vim/vimrc || echo 'set encoding=utf8' >> /etc/vim/vimrc
+
+# Build dependencies (gcc toolchain and dev headers) for compiling Python 3.8.10
+WORKDIR /opt
+RUN apt-get install -y build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev gcc g++ libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev libncurses-dev libnspr4-dev libdbus-glib-1-2
+# Download and unpack the Python 3.8.10 source
+RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
+# Enter the source directory
+WORKDIR /opt/Python-3.8.10
+# Create the install prefix and compile. Set "make -j" to the core count of the build host (check it with "nproc"); "make altinstall" avoids overwriting the default python environment.
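+# e.g. "make -j$(nproc)" would size the build to the host's core count automatically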
+RUN mkdir /usr/local/python38 && ./configure --prefix=/usr/local/python38 && make -j 8 && make altinstall
+# Point the python3 / pip3 symlinks at the new build
+RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3.8 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
+# Switch pip to the Aliyun index and upgrade pip
+RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && pip3 install --upgrade pip
+
+# Python install prefix (used below to extend PATH)
+ENV VIRTUAL_ENV=/usr/local/python38
+# Prepend it to the system PATH
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Copy the project requirements and install them
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+RUN pip3 install httpx[socks]
+
+# Working directory (the project is mounted here by docker-compose)
+WORKDIR /mnt

README.md (+15 -2)

@@ -1,3 +1,16 @@
-# news_baidu
+# Baidu News Sentiment Crawler
 
-Baidu news sentiment
+#### Build the image
+```shell
+docker build -t baidu_news:latest . 
+```
+
+#### Deploy and run
+```shell
+docker-compose up -d
+```
+
+#### Stop
+```shell
+docker-compose down
+```
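+
+#### View logs
+To follow the two crawlers while they run, something like:
+```shell
+docker-compose logs -f
+```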

docker-compose.yml (+39 -0)

@@ -0,0 +1,39 @@
+version: "3"
+services:
+  crawl-list:
+    container_name: baidu_list
+    image: baidu_news:latest
+    volumes:
+      - /mnt/news_baidu:/mnt
+    restart: always
+    privileged: true
+    shm_size: 4GB
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200k"
+        max-file: "10"
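+    # Note: the "deploy" resource limits below take effect with Compose v2 (or "docker-compose --compatibility" on v1); classic v1 otherwise ignores them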
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+    command: 'python3 news_list.py'
+
+  crawl-detail:
+    container_name: baidu_detail
+    image: baidu_news:latest
+    volumes:
+      - /mnt/news_baidu:/mnt
+    restart: always
+    privileged: true
+    shm_size: 4GB
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200k"
+        max-file: "10"
+    deploy:
+      resources:
+        limits:
+          memory: 10G
+    command: 'python3 news_detail.py'

news_detail.py (+150 -0)

@@ -0,0 +1,150 @@
+# coding:utf-8
+import re
+import time
+import warnings
+from concurrent.futures import ThreadPoolExecutor, wait
+from urllib.parse import urlparse
+
+import httpx
+import urllib3
+from gne import GeneralNewsExtractor
+from loguru import logger
+
+from tools import news_list, news_detail, UserAgent, requests
+
+warnings.simplefilter("ignore", UserWarning)
+urllib3.disable_warnings()
+extractor = GeneralNewsExtractor()
+UA = UserAgent()
+
+
+def extract_chinese(text):
+    """Return True if the text contains any Chinese (CJK) characters."""
+    pattern = re.compile(r'[\u4e00-\u9fff]+')  # CJK Unified Ideographs range
+    return bool(pattern.search(text))
+
+
+def date_to_timestamp(pubulishtime):
+    """Convert a "YYYY-MM-DD" date string to a Unix timestamp (container-local time)."""
+    timeArray = time.strptime(pubulishtime, "%Y-%m-%d")
+    timestamp = int(time.mktime(timeArray))
+    return timestamp
+
+
+def httpx_get_url(info):
+    # Force the link to https (this path handles baijiahao articles, see run())
+    info["url"] = info["url"] if str(info["url"]).count("https") else str(info["url"]).replace("http", "https")
+
+    headers = {
+        "Accept": "text/html,application/xhtml+xml, application/xml;q=0.9, image/avif, image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    }
+
+    req = httpx.get(str(info["url"]), timeout=10, headers=headers)
+    if req.status_code == 200:
+        item = {}
+        res = urlparse(info["url"])
+        result = extractor.extract(req.text, with_body_html=False)
+        item["title"] = result["title"]
+        if "list_title" in info:
+            item["list_title"] = info["list_title"]
+
+        item["detail"] = result["content"]
+        item["contenthtml"] = req.text
+        # Normalize publish_time to YYYY-MM-DD (drop time of day, convert Chinese and slash dates)
+        new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+        item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+        item["infourl"] = info["url"]
+        item["domain"] = res.netloc
+        item["searchwords"] = info["searchwords"]
+        item["searchengine"] = "baidu"
+        item["comeintime"] = int(time.time())
+        # item["project"] = info["project"]
+        item["type"] = True
+        news_detail.insert_one(item)
+        logger.info(f"下载信息: {item['title']}")
+
+
+def get_url(info):
+    headers = {
+        "Accept": "application/json",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": UA.random,
+    }
+    try:
+        req = requests.get(info["url"],  headers=headers, timeout=10, verify=False)
+        req.encoding = req.apparent_encoding
+        if req.status_code == 200:
+            item = {}
+            res = urlparse(info["url"])
+            result = extractor.extract(req.text, with_body_html=False)
+            item["title"] = result["title"]
+            if "list_title" in info:
+                item["list_title"] = info["list_title"]
+
+            item["detail"] = result["content"]
+            item["contenthtml"] = req.text
+            # Normalize publish_time to YYYY-MM-DD (drop time of day, convert Chinese and slash dates)
+            new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+            item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+            item["infourl"] = info["url"]
+            item["domain"] = res.netloc
+            item["searchwords"] = info["searchwords"]
+            item["searchengine"] = "baidu"
+            item["comeintime"] = int(time.time())
+            item["site"] = info["site"]
+            item["type"] = True
+            news_detail.insert_one(item)
+            logger.info(f"下载信息:{item['title']}")
+    except Exception:
+        logger.error(f"下载失败:{info.get('list_title', info['url'])}")
+
+
+def run(task):
+    # baijiahao links are fetched with httpx (forced to https); everything else uses requests
+    if task["url"].count("baijiahao"):
+        httpx_get_url(task)
+    else:
+        get_url(task)
+
+    news_list.delete_one({"_id": task["_id"]})
+
+
+def spider(workers=1):
+    with ThreadPoolExecutor(max_workers=workers) as p:
+        fs = [p.submit(run, task) for task in news_list.find()]
+        wait(fs)
+
+
+def Isvalid():
+    """Post-process new detail records: repair truncated titles and flag publish-time validity."""
+    q = {"isvalid": {"$exists": 0}}
+    f = {"contenthtml": 0, "detail": 0}
+    with news_detail.find(q, projection=f, no_cursor_timeout=True) as cursor:
+        for info in cursor:
+            # Truncated or garbled titles ("...", "…") and titles without Chinese fall back to the list title
+            seqs = ["...", "…"]
+            title = info["title"]
+            if any(seq in title for seq in seqs) or not extract_chinese(title):
+                news_detail.update_one({"_id": info["_id"]}, {"$set": {"title": info["list_title"]}})
+
+            # Valid only if the publish time is between 2023-07-01 (epoch 1688140800, UTC+8) and now
+            isvalid = False
+            try:
+                if 1688140800 <= date_to_timestamp(info["pubulishtime"]) <= int(time.time()):
+                    isvalid = True
+            except ValueError:
+                pass
+
+            news_detail.update_one({"_id": info["_id"]}, {"$set": {"isvalid": isvalid}})
+            logger.info(f"数据校验:{info['title']}")
+
+
+if __name__ == '__main__':
+    while 1:
+        spider(workers=10)
+        Isvalid()
+        logger.info("本轮执行完成, 将延时5分钟后执行.")
+        time.sleep(300)

news_list.py (+112 -0)

@@ -0,0 +1,112 @@
+# coding: utf-8
+
+import time
+import uuid
+
+import httpx
+from loguru import logger
+from lxml import etree
+
+from tools import client, news_list, get_proxy, r, redis_key, sha1, ua
+
+
+def analysis_info(site, page, searchword, select_lst):
+    """Parse result anchors, skip URLs already seen in Redis, and batch-insert new items into Mongo."""
+    data_count = 0
+    data_lst = []
+    for elem in select_lst:
+        title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
+        url = elem.xpath("./@href")[0]
+        href_sign = sha1(url)
+        if not r.hexists(redis_key, href_sign):
+            data_lst.append(dict(
+                _id=str(uuid.uuid4()),
+                url=url,
+                list_title=title,
+                searchengine="baidu",
+                searchwords=searchword,
+                site=site,
+            ))
+            r.hset(redis_key, href_sign, 1)
+
+        if len(data_lst) >= 50:
+            news_list.insert_many(data_lst)
+            data_count += len(data_lst)
+            data_lst = []
+
+    if data_lst:
+        news_list.insert_many(data_lst)
+        data_count += len(data_lst)
+
+    tips = [
+        f"第{page}页--{searchword}",
+        f"采集量:{len(select_lst)}",
+        f"入库量:{data_count}"
+    ]
+    logger.info(",".join(tips))
+
+
+def get_url(key, page):
+    proxies = get_proxy()
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": ua.random,
+    }
+    params = {
+        "tn": "news",
+        "rtt": "4",
+        "bsst": "1",
+        "cl": "2",
+        "wd": key,
+        "medium": "0",
+        "tngroupname": "organic_news",
+        "newVideo": "12",
+        "goods_entry_switch": "1",
+        "rsv_dl": "news_b_pn",
+        "pn": page * 20
+    }
+    request_kwargs = dict(
+        headers=headers,
+        timeout=10,
+        proxies=proxies,
+        params=params,
+        follow_redirects=True
+    )
+    url = "https://www.baidu.com/s"
+    try:
+        req = httpx.get(url, **request_kwargs)
+        html = etree.HTML(req.text)
+        li_list = html.xpath("//div[@id='content_left']//h3/a")
+        return li_list
+    except Exception as e:
+        logger.exception(e)
+        return []
+
+
+def baidu_search(document):
+    searchwords = document["key"]
+    site = document["site"]
+
+    for i in range(0, 4):
+        # "intitle:" limits matches to the article title; only the first 4 result pages are crawled
+        extract_items = get_url(f"intitle:{searchwords}", i)
+        analysis_info(site, i + 1, searchwords, extract_items)
+
+    client.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})
+
+
+def start():
+    client.update_many({}, {"$unset": {"down": ""}})  # reset the "down" flag on every keyword
+    search_items = list(client.find({"down": {"$exists": 0}}))
+
+    while search_items:
+        items = search_items.pop(0)
+        baidu_search(items)
+
+
+if __name__ == '__main__':
+    while 1:
+        start()
+        logger.info("本轮执行完成, 将延时3小时后执行.")
+        time.sleep(3 * 3600)

requirements.txt (+8 -0)

@@ -0,0 +1,8 @@
+gne
+httpx==0.24.0
+loguru
+lxml
+pymongo==3.12.0
+redis==3.5.3
+requests==2.31.0
+fake-useragent

tools.py (+47 -0)

@@ -0,0 +1,47 @@
+# coding: utf-8
+
+import hashlib
+import logging
+
+import redis
+import requests
+from fake_useragent import UserAgent
+from pymongo import MongoClient
+
+ua = UserAgent()
+
+
+dbm = MongoClient("172.17.4.87", 27080).hp_news
+client = dbm.news_Keywords    # search keywords (documents carry "key", "site" and a "down" flag)
+news_list = dbm.news_list     # queue of list-page hits waiting for detail download
+news_detail = dbm.news_detail  # downloaded article details
+
+r = redis.Redis(
+    host='172.17.162.28',
+    password='k5ZJR5KV4q7DRZ92DQ',
+    port=7361,
+    db=19
+)
+redis_key = "news"
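+# "news" is a Redis hash of sha1(url) fingerprints; news_list.py uses it to skip already-seen links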
+
+
+def sha1(text: str):
+    """
+    Return the SHA-1 digest of the text as a hex string.
+    @param text: input string
+    @return: hex digest
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()
+
+
+def get_proxy():
+    """Fetch a proxy from the internal proxy service; returns the URL under its "http" key, or None."""
+    headers = {"Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"}
+    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+    proxy = proxy.get("data")
+    logging.info("切换代理:{}".format(proxy))
+    if not proxy:
+        return
+
+    return proxy.get("http")