@@ -0,0 +1,150 @@
+# coding:utf-8
+import re
+import time
+import warnings
+from concurrent.futures import ThreadPoolExecutor, wait
+from urllib.parse import urlparse
+
+import httpx
+import urllib3
+from gne import GeneralNewsExtractor
+from loguru import logger
+
+from tools import news_list, news_detail, UserAgent, requests
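+# NOTE: news_list / news_detail are presumed to be MongoDB collections and UserAgent /
+# requests helpers re-exported by the local tools module; this is inferred only from how
+# they are used below (find / insert_one / update_one, UA.random).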
+
+warnings.simplefilter("ignore", UserWarning)
+urllib3.disable_warnings()
+extractor = GeneralNewsExtractor()
+UA = UserAgent()
+
+
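+# Return True if the text contains at least one Chinese (CJK) character.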
+def extract_chinese(text):
+    pattern = re.compile(r'[\u4e00-\u9fff]+')  # CJK Unified Ideographs range
+    return bool(pattern.search(text))
+
+
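+# Convert a "YYYY-MM-DD" date string to a Unix timestamp (local time).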
+def date_to_timestamp(publishtime):
+    time_array = time.strptime(publishtime, "%Y-%m-%d")
+    timestamp = int(time.mktime(time_array))
+    return timestamp
+
+
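+# Download a detail page with httpx (used for baijiahao links, see run() below),
+# extract it with gne and store the result in news_detail.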
+def httpx_get_url(info):
+    # make sure the request goes over https
+    url = str(info["url"])
+    if "https" not in url:
+        url = url.replace("http", "https")
+    info["url"] = url
+
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
+    }
+
+    req = httpx.get(info["url"], timeout=10, headers=headers)
+    if req.status_code == 200:
+        item = {}
+        res = urlparse(info["url"])
+        result = extractor.extract(req.text, with_body_html=False)
+        item["title"] = result["title"]
+        if "list_title" in info:
+            item["list_title"] = info["list_title"]
+
+        item["detail"] = result["content"]
+        item["contenthtml"] = req.text
+        # normalize the publish time to "YYYY-MM-DD"
+        new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+        item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+        item["infourl"] = info["url"]
+        item["domain"] = res.netloc
+        item["searchwords"] = info["searchwords"]
+        item["searchengine"] = "baidu"
+        item["comeintime"] = int(time.time())
+        # item["project"] = info["project"]
+        item["type"] = True
+        news_detail.insert_one(item)
+        logger.info(f"downloaded: {item['title']}")
+
+
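+# Download any other detail page with requests (SSL verification disabled),
+# extract it with gne and store the result in news_detail.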
+def get_url(info):
+    headers = {
+        "Accept": "application/json",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "User-Agent": UA.random,
+    }
+    try:
+        req = requests.get(info["url"], headers=headers, timeout=10, verify=False)
+        req.encoding = req.apparent_encoding
+        if req.status_code == 200:
+            item = {}
+            res = urlparse(info["url"])
+            result = extractor.extract(req.text, with_body_html=False)
+            item["title"] = result["title"]
+            if "list_title" in info:
+                item["list_title"] = info["list_title"]
+
+            item["detail"] = result["content"]
+            item["contenthtml"] = req.text
+            # normalize the publish time to "YYYY-MM-DD"
+            new_pubulishtime = str(result["publish_time"]).split(" ")[0].replace("年", "-").replace("月", "-").replace("日", "").replace("/", "-")
+            item["pubulishtime"] = new_pubulishtime.split("T")[0] if len(new_pubulishtime) > 11 else new_pubulishtime
+            item["infourl"] = info["url"]
+            item["domain"] = res.netloc
+            item["searchwords"] = info["searchwords"]
+            item["searchengine"] = "baidu"
+            item["comeintime"] = int(time.time())
+            item["site"] = info["site"]
+            item["type"] = True
+            news_detail.insert_one(item)
+            logger.info(f"downloaded: {item['title']}")
+    except Exception as exc:
+        logger.error(f"download failed: {info.get('list_title', info['url'])} ({exc})")
+
+
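+# Route a single list task to the matching downloader, then remove it from the queue.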
+def run(task):
+    if "baijiahao" in task["url"]:
+        httpx_get_url(task)
+    else:
+        get_url(task)
+
+    news_list.delete_one({"_id": task["_id"]})
+
+
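+# Drain the news_list queue with a thread pool and wait for every download to finish.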
+def spider(workers=1):
+    with ThreadPoolExecutor(max_workers=workers) as p:
+        fs = [p.submit(run, task) for task in news_list.find()]
+        wait(fs)
+
+
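+# Post-process downloaded items: repair truncated or garbled titles and mark documents
+# whose publish time falls between 2023-07-01 and now as valid.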
+def Isvalid():
+    q = {"isvalid": {"$exists": 0}}
+    f = {"contenthtml": 0, "detail": 0}
+    with news_detail.find(q, projection=f, no_cursor_timeout=True) as cursor:
+        for info in cursor:
+            # garbled or truncated title: fall back to the title from the list page
+            seqs = ["...", "…"]
+            title = info["title"]
+            if any(seq in title for seq in seqs) or not extract_chinese(title):
+                list_title = info.get("list_title")
+                if list_title:
+                    news_detail.update_one({"_id": info["_id"]}, {"$set": {"title": list_title}})
+
+            # publish time must fall between 2023-07-01 (1688140800, Beijing time) and now
+            isvalid = False
+            try:
+                if 1688140800 <= date_to_timestamp(info["pubulishtime"]) <= int(time.time()):
+                    isvalid = True
+            except ValueError:
+                pass
+
+            news_detail.update_one({"_id": info["_id"]}, {"$set": {"isvalid": isvalid}})
+            logger.info(f"validated: {info['title']}")
+
+
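+# Crawl, validate, then sleep five minutes and repeat.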
+if __name__ == '__main__':
+    while 1:
+        spider(workers=10)
+        Isvalid()
+        logger.info("Round finished; next run in 5 minutes.")
+        time.sleep(300)