|
@@ -68,10 +68,10 @@ def page_source(element: HtmlElement):
|
|
|
class DetailSpider:
|
|
|
|
|
|
def __init__(
|
|
|
- self,
|
|
|
- db: str,
|
|
|
- crawl_tab: str,
|
|
|
- save_tab: str,
|
|
|
+ self,
|
|
|
+ db: str,
|
|
|
+ crawl_tab: str,
|
|
|
+ save_tab: str,
|
|
|
):
|
|
|
self.crawl_tab = mongo_table(db, crawl_tab)
|
|
|
self.save_tab = mongo_table(db, save_tab)
|
|
@@ -90,16 +90,13 @@ class DetailSpider:
|
|
|
self._update_crawl_task(task['_id'], **update)
|
|
|
|
|
|
def json_request(self, fid, request_params):
    """Send a GET request to the ``agency.info.Detail/show`` endpoint.

    Args:
        fid: Identifier of the record; it is string-formatted and sent as
            the ``fid`` query parameter.
        request_params: Mapping of extra keyword arguments, unpacked
            unchanged into ``requests.get``.

    Returns:
        The response object returned by ``requests.get``.
    """
    endpoint = "https://www.chinabidding.cn/agency.info.Detail/show"
    query = {"fid": f"{fid}"}
    return requests.get(endpoint, params=query, **request_params)
|
|
|
|
|
|
-
|
|
|
def crawl_request(self, item: dict):
|
|
|
url = item['competehref']
|
|
|
headers = {
|
|
@@ -212,8 +209,12 @@ class DetailSpider:
|
|
|
'''检查原始页面内容'''
|
|
|
source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', html)
|
|
|
if source_url:
|
|
|
- self.save_url.insert_one({"site":"元博网", "title": item['title'], "source_url":source_url.group(1),
|
|
|
- "comeintime":int2long(int(time.time()))})
|
|
|
+ self.save_url.insert_one({
|
|
|
+ "site": "元博网",
|
|
|
+ "title": item['title'],
|
|
|
+ "source_url": source_url.group(1),
|
|
|
+ "comeintime": int2long(int(time.time()))
|
|
|
+ })
|
|
|
CheckText(html)
|
|
|
item["contenthtml"] = html
|
|
|
special = {
|
|
@@ -226,8 +227,12 @@ class DetailSpider:
|
|
|
'''检查清洗之后的详情'''
|
|
|
source_url = re.search('.*<a href=\"(.*?)\">点击查看内容', item["detail"])
|
|
|
if source_url:
|
|
|
- self.save_url.insert_one({"site": "元博网", "title": item['title'], "source_url": source_url.group(1),
|
|
|
- "comeintime":int2long(int(time.time()))})
|
|
|
+ self.save_url.insert_one({
|
|
|
+ "site": "元博网",
|
|
|
+ "title": item['title'],
|
|
|
+ "source_url": source_url.group(1),
|
|
|
+ "comeintime": int2long(int(time.time()))
|
|
|
+ })
|
|
|
CheckText(item["detail"])
|
|
|
insert = {}
|
|
|
for key, val in item.items():
|