@@ -6,16 +6,19 @@ from concurrent.futures import ThreadPoolExecutor, wait
 from urllib.parse import urlparse
 
 import httpx
+import requests
 import urllib3
 from gne import GeneralNewsExtractor
 from loguru import logger
 
-from tools import news_list, news_detail, UserAgent, requests
+from tools import news_list_coll, news_detail_coll
+from tools import ua
 
 warnings.simplefilter("ignore", UserWarning)
 urllib3.disable_warnings()
+
+# gne
 extractor = GeneralNewsExtractor()
-UA = UserAgent()
 
 
 def extract_chinese(text):
@@ -23,28 +26,28 @@ def extract_chinese(text):
     return True if re.findall(pattern, text) else False
 
 
-def date_to_timestamp(pubulishtime):
-    timeArray = time.strptime(pubulishtime, "%Y-%m-%d")
-    timestamp = int(time.mktime(timeArray))
-    return timestamp
-
+def date_to_timestamp(publish_time):
+    time_array = time.strptime(publish_time, "%Y-%m-%d")
+    return int(time.mktime(time_array))
 
 
-def httpx_get_url(info):
-    info["url"] = info["url"] if str(info["url"]).count("https") else str(info["url"]).replace("http", "https")
+def get_detail_by_httpx(info):
+    url = info["url"] if str(info["url"]).count("https") else str(info["url"]).replace("http", "https")
     headers = {
         "Accept": "text/html,application/xhtml+xml, application/xml;q=0.9, image/avif, image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "no-cache",
         "Connection": "keep-alive",
-        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+        "User-Agent": ua.random,
     }
 
-    req = httpx.get(str(info["url"]), timeout=10, headers=headers)
+    req = httpx.get(str(url), timeout=10, headers=headers)
     if req.status_code == 200:
+        html = req.content.decode()
+        result = extractor.extract(html, with_body_html=False)
+        url_parser = urlparse(url)
+
         item = {}
-        res = urlparse(info["url"])
-        result = extractor.extract(req.text, with_body_html=False)
         item["title"] = result["title"]
         try:
             item["list_title"] = info["list_title"]
@@ -52,35 +55,37 @@ def httpx_get_url(info):
             pass
 
         item["detail"] = result["content"]
-        item["contenthtml"] = req.text
+        item["contenthtml"] = html
         new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日","").replace( "/", "-")
         item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
-        item["infourl"] = info["url"]
-        item["domain"] = res.netloc
+        item["infourl"] = url
+        item["domain"] = url_parser.netloc
         item["searchwords"] = info["searchwords"]
         item["searchengine"] = "baidu"
         item["comeintime"] = int(time.time())
-        # item["project"] = info["project"]
         item["type"] = True
-        news_detail.insert_one(item)
+        news_detail_coll.insert_one(item)
         logger.info(f"下载信息: {item['title']}")
 
 
-def get_url(info):
+def get_detail_by_requests(info):
+    url = info["url"]
     headers = {
         "Accept": "application/json",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "no-cache",
         "Connection": "keep-alive",
-        "User-Agent": UA.random,
+        "User-Agent": ua.random,
    }
    try:
-        req = requests.get(info["url"], headers=headers, timeout=10, verify=False)
+        req = requests.get(url, headers=headers, timeout=10, verify=False)
        req.encoding = req.apparent_encoding
        if req.status_code == 200:
+            url_parser = urlparse(url)
+            html = req.content.decode()
+            result = extractor.extract(html, with_body_html=False)
+
            item = {}
-            res = urlparse(info["url"])
-            result = extractor.extract(req.text, with_body_html=False)
            item["title"] = result["title"]
            try:
                item["list_title"] = info["list_title"]
@@ -88,17 +93,17 @@ def get_url(info):
                 pass
 
             item["detail"] = result["content"]
-            item["contenthtml"] = req.text
+            item["contenthtml"] = html
             new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日","").replace("/", "-")
             item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
-            item["infourl"] = info["url"]
-            item["domain"] = res.netloc
+            item["infourl"] = url
+            item["domain"] = url_parser.netloc
             item["searchwords"] = info["searchwords"]
             item["searchengine"] = "baidu"
             item["comeintime"] = int(time.time())
             item["site"] = info["site"]
             item["type"] = True
-            news_detail.insert_one(item)
+            news_detail_coll.insert_one(item)
             logger.info(f"下载信息:{item['title']}")
     except:
         logger.error(f"下载失败:{info['list_title']}")
@@ -106,29 +111,29 @@
 
 def run(task):
     if task["url"].count("baijiahao"):
-        httpx_get_url(task)
+        get_detail_by_httpx(task)
     else:
-        get_url(task)
+        get_detail_by_requests(task)
 
-    news_list.delete_one({"_id": task["_id"]})
+    news_list_coll.delete_one({"_id": task["_id"]})
 
 
-def spider(workers=1):
+def spider(workers):
     with ThreadPoolExecutor(max_workers=workers) as p:
-        fs = [p.submit(run, task) for task in news_list.find()]
+        fs = [p.submit(run, task) for task in news_list_coll.find()]
         wait(fs)
 
 
 def Isvalid():
     q = {"isvalid": {"$exists": 0}}
     f = {"contenthtml": 0, "detail": 0}
-    with news_detail.find(q, projection=f, no_cursor_timeout=True) as cursor:
+    with news_detail_coll.find(q, projection=f, no_cursor_timeout=True) as cursor:
         for info in cursor:
             # 标题乱码
             seqs = ["...", "…"]
             title = info["title"]
             if list(filter(lambda x: x in seqs, title)) or not extract_chinese(title):
-                news_detail.update_one({"_id": info["_id"]}, {"$set": {"title": info["list_title"]}})
+                news_detail_coll.update_one({"_id": info["_id"]}, {"$set": {"title": info["list_title"]}})
 
             # 发布时间大于2023/7/1
             isvalid = False
@@ -138,7 +143,7 @@ def Isvalid():
             except ValueError:
                 pass
 
-            news_detail.update_one({"_id": info["_id"]}, {"$set": {"isvalid": isvalid}})
+            news_detail_coll.update_one({"_id": info["_id"]}, {"$set": {"isvalid": isvalid}})
             logger.info(f"数据校验:{info['title']}")
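
For context on the renamed imports: the collection objects and the ua helper live in tools and are not shown in this diff. Judging from the calls used here (insert_one, find with projection and no_cursor_timeout, update_one, delete_one, and ua.random), tools most likely exposes pymongo collections and a fake-useragent instance. A minimal sketch of that assumed tools module follows; the connection string, database name, and collection names are hypothetical and not taken from this change:

    # tools.py -- assumed shape, not part of this diff
    from fake_useragent import UserAgent
    from pymongo import MongoClient

    client = MongoClient("mongodb://127.0.0.1:27017")  # hypothetical connection string
    db = client["news"]                                # hypothetical database name

    news_list_coll = db["news_list"]                   # hypothetical collection names
    news_detail_coll = db["news_detail"]

    # random User-Agent provider used for the request headers (ua.random)
    ua = UserAgent()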