|
@@ -0,0 +1,172 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on 2024-02-27
|
|
|
+---------
|
|
|
+@summary:
|
|
|
+---------
|
|
|
+@author: Dzr
|
|
|
+"""
|
|
|
+
|
|
|
+import datetime
|
|
|
+import time
|
|
|
+from collections import namedtuple
|
|
|
+from concurrent.futures import ThreadPoolExecutor, wait
|
|
|
+from threading import Thread
|
|
|
+from urllib.parse import urljoin
|
|
|
+
|
|
|
+import requests
|
|
|
+from loguru import logger
|
|
|
+from pymongo import MongoClient
|
|
|
+
|
|
|
+from config.load import region
|
|
|
+from utils.databases import int2long, es_query
|
|
|
+from utils.socks5 import get_proxy
|
|
|
+
|
|
|
+from utils.RedisDB import rexists, radd, RedisFilter
|
|
|
+
|
|
|
+
|
|
|
def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
    """Convert a formatted date string into an integer Unix timestamp.

    The string is parsed with ``time.strptime`` and converted with
    ``time.mktime``, so it is interpreted in the machine's local time zone.

    Args:
        date: Date/time text matching ``time_format``.
        time_format: ``strptime`` format describing ``date``.

    Returns:
        int: Seconds since the epoch, truncated to a whole number.

    Raises:
        ValueError: If ``date`` does not match ``time_format``.
    """
    parsed = time.strptime(date, time_format)
    seconds = time.mktime(parsed)
    return int(seconds)
|
|
|
+
|
|
|
+
|
|
|
def spider(collection, dedup, page, task):
    """Fetch one page of today's listings for ``task`` and store the new ones.

    Sends a GET request to the chinabidding.cn list endpoint, filtered by the
    task's ``table_type`` and ``areaid`` and restricted to today's date. Each
    returned item that has a title and whose URL is not already present in
    ``dedup`` is inserted into ``collection`` and then recorded in ``dedup``.

    Args:
        collection: Storage target; only ``insert_one(dict)`` is called on it.
        dedup: Deduplication handle passed through to ``rexists`` / ``radd``.
        page: Page number sent as the ``page`` query parameter.
        task: Object exposing ``table_type``, ``areaid``, ``channel``,
            ``area`` and ``spidercode`` attributes.

    Raises:
        Any exception from the HTTP request, JSON decoding, item parsing or
        the insert propagates to the caller; only a failing ``es_query``
        lookup is handled here (the item is stored with ``count = 0``).
    """
    import threading

    # Name of the thread actually running this call (``Thread().getName()``
    # would create a brand-new, unstarted thread and report *its* name).
    t_name = threading.current_thread().name

    base_url = 'https://www.chinabidding.cn'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    page_size = 1000  # items requested per page
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    params = {
        'device': 'es',
        'cpcode': 'es001',
        'keywords': '',
        'table_type': f'{task.table_type},',
        'search_type': 'CONTEXT',
        'areaid': f'{task.areaid},',
        'categoryid': '',
        'b_date': 'custom',
        # Same start and end date: only today's publications are requested.
        'time_start': date,
        'time_end': date,
        'page': page,
        'rp': page_size,
        'usrecord_id': '',
    }
    response = requests.get(
        'https://www.chinabidding.cn/302e302e7379675f73736f/datax/json/gj_zbcg_daylimit',
        params=params,
        headers=headers,
        proxies=get_proxy(),
        timeout=60
    )
    logger.debug(response)

    if response.status_code != 200:
        # Do not report the page as collected when the request failed.
        logger.warning(f"[{t_name}]第{page}页请求失败，状态码 {response.status_code}")
        return

    resp_json = response.json()
    data_items = resp_json['result']['list']
    logger.info(f"[{t_name}]第{page}页{len(data_items)}条数据")

    data_count = 0
    for items in data_items:
        fields = items['fields']
        title = fields['title']
        publish_date = fields['publish_date']
        url = urljoin(base_url, fields['url'])

        # Skip untitled entries and URLs that were already collected.
        if not title or rexists(dedup, url):
            continue

        # Parsed only for items that will actually be stored.
        l_np_publishtime = date_to_timestamp(publish_date)
        data = {
            "site": "元博网(采购与招标网)",
            "channel": task.channel,
            "area": task.area if task.area != '跨省' else '全国',
            "_d": "comeintime",
            "comeintime": int2long(int(time.time())),
            "T": "bidding",
            "sendflag": "false",
            "spidercode": task.spidercode,
            "city": "",
            "infoformat": 1,
            "type": "",
            "publishdept": "",
            "title": title,
            "competehref": url,
            "href": "#",
            "publishtime": publish_date,
            "l_np_publishtime": int2long(l_np_publishtime),
            "crawl": False
        }

        # Best effort: a failing lookup must not drop the item, so fall back
        # to 0. ``Exception`` (not a bare except) keeps Ctrl-C/SystemExit
        # working.
        try:
            count = es_query(title, l_np_publishtime)
        except Exception as exc:
            logger.warning(f"[{t_name}]es_query 查询失败，count 置 0: {exc}")
            count = 0
        data['count'] = count

        collection.insert_one(data)
        data_count += 1
        # Mark the URL as seen only after the insert succeeded.
        radd(dedup, url)
        if data_count % 100 == 0:
            logger.info(f"[{t_name}]已保存 {data_count} 条数据")

    logger.info(f'[{t_name}]完成第{page}页数据采集')
|
|
|
+
|
|
|
+
|
|
|
def get_tasks():
    """Build the crawl task list: one task per (menu, region) combination.

    Returns:
        list: ``Task`` namedtuples with the fields ``channel``,
        ``spidercode``, ``table_type``, ``areaid`` and ``area``. Tasks are
        grouped by menu (in the order listed below) and, within a menu,
        follow the iteration order of ``region.items()``.
    """
    Menu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
    Task = namedtuple('Task', ['channel', 'spidercode', 'table_type', 'areaid', 'area'])

    # (channel name, spider code, site-side table_type value)
    menus = [
        Menu('政府采购', 'a_ybwcgyzbw_zfcg', '6'),
        Menu('招标预告', 'a_ybwcgyzbw_zbyg', '5'),
        Menu('中标公示', 'a_ybwcgyzbw_zbgs', '4'),
        Menu('服务招标', 'a_ybwcgyzbw_fwzb', '3'),
        Menu('货物招标', 'a_ybwcgyzbw_hwzb', '2'),
        Menu('工程招标', 'a_ybwcgyzbw_gczb', '1'),
    ]

    return [
        Task(
            channel=menu.channel,
            spidercode=menu.spidercode,
            table_type=menu.table_type,
            areaid=area_id,
            area=area_name,
        )
        for menu in menus
        for area_id, area_name in region.items()
    ]
|
|
|
+
|
|
|
+
|
|
|
def error(future):
    """Done-callback that logs the exception raised by a finished future.

    Does nothing when the future was cancelled or completed without raising.

    Args:
        future: The ``concurrent.futures.Future`` that has just finished.
    """
    import threading

    # ``Future.exception()`` raises CancelledError for a cancelled future, so
    # bail out first instead of letting the callback itself fail.
    if future.cancelled():
        return

    err = future.exception()
    if err is None:
        return

    # This callback does not run inside an ``except`` block, so the exception
    # is handed to loguru explicitly to get its traceback into the log.
    thread_name = threading.current_thread().name
    logger.opt(exception=err).error(f'[{thread_name}]{err}')
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Crawl page 1 of every task from ``get_tasks`` and store the results.

    Creates the Redis-backed dedup filter and the MongoDB client, runs
    ``spider`` for each task on a pool of 4 worker threads, waits for all of
    them, and closes the MongoDB client even if scheduling fails part-way.
    Exceptions raised inside a worker are reported through the ``error``
    callback rather than re-raised here.
    """
    tasks = get_tasks()

    expire_time = 86400 * 365 * 1  # 1 year (86400 s/day * 365 days)
    # NOTE(review): the Redis URL (including its password) and the MongoDB
    # host are hard-coded; they should come from configuration instead.
    dedup = RedisFilter(
        redis_url='redis://:k5ZJR5KV4q7DRZ92DQ@172.17.4.240:8361/0',
        expire_time=expire_time)  # entries expire after one year by default

    to_mongodb = MongoClient('172.17.4.87', 27080)
    try:
        collection = to_mongodb['py_spider']['ybw_list']

        with ThreadPoolExecutor(max_workers=4) as tpool:
            fs = []
            for task in tasks:
                # Only the first page of each task is requested.
                f = tpool.submit(spider, collection, dedup, 1, task)
                f.add_done_callback(error)
                fs.append(f)
            wait(fs)
    finally:
        # Release the connection even when something above raised.
        to_mongodb.close()

    logger.info("列表页采集结束")
|
|
|
+
|
|
|
+
|
|
|
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|