소스 검색

维护列表页采集爬虫,废除无效栏目

dongzhaorui 2 년 전
부모
커밋
b540eb55c7
1개의 변경된 파일에서 149개의 줄이 추가되고 108개의 줄이 삭제되었습니다
  1. 149 108
      zgztb_cookie/zgzbtb_spider.py

+ 149 - 108
zgztb_cookie/zgzbtb_spider.py

@@ -2,141 +2,182 @@
 #  中国招标投标公共服务平台
 #  @CreatDate    : 4/11/2021 上午 10:04
 #  @Author  : 马国鹏
-#  @File    : qgzb_spider.py
-import sys
-sys.path.append('/mnt/FworkSpider')
-
-import datetime
+import json
 import time
 from collections import namedtuple
 
 import requests
 
-from utils.databases import redis_cluster, mongo_table, int2long
+from utils.databases import mongo_table, int2long, redis_client
 from utils.log import logger
 from utils.tools import redis_exists, redis_set
-from feapder.network.proxy_pool import swordfish_proxy
 
-Menu = namedtuple('Menu', ['channel', 'code', 'id', 'crawl_page', "businessKeyWord"])
+Menu = namedtuple('Menu', ['channel', 'code', 'type', 'businessKeyWord'])
+
+
def socks_proxy():
    """Fetch a fresh SOCKS5 proxy from the Jianyu proxy service.

    :return: the ``data`` field of the service reply — a ``proxies``
        mapping usable by ``requests`` — or ``None`` when absent.
    """
    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
    headers = {"Authorization": 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
    # BUGFIX: requests has no default timeout; without one a dead proxy
    # service would hang the whole crawler indefinitely.
    proxy = requests.get(url, headers=headers, timeout=10).json()
    proxies = proxy.get('data')
    logger.info(f"切换代理:{proxies}")
    return proxies
+
 
def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
    """Convert a local-time date string into an integer Unix timestamp.

    :param date: date string, e.g. ``"2011-09-28 10:00:00"``
    :param time_format: ``strptime`` format describing *date*
    :return: seconds since the epoch, as ``int`` (local time zone)
    """
    parsed = time.strptime(date, time_format)
    return int(time.mktime(parsed))
+
+
class CebPubServiceListPageSpider:
    """List-page spider for 中国招标投标公共服务平台 (cebpubservice.com)."""

    def __init__(self):
        # List-search endpoint (POST, form-encoded payload).
        self.url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getStringMethod.do'
        # MongoDB collection that receives newly discovered list rows.
        self.crawl_list = mongo_table('py_spider', 'zgzb_list')
        # Redis connection used to de-duplicate rows by href.
        self.r = redis_client()
        # Channels still being crawled; decommissioned ones stay commented out.
        self.menus = [
            Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
            Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
            Menu('未按数据规范-评标公示', 'a_zgzbtbggfwpt_wasjgf_pbgs', '评标公示', 'winCandidateBulletin'),
            Menu('未按数据规范-中标公告', 'a_zgzbtbggfwpt_wasjgf_zhbgg', '中标公告', 'winBidBulletin'),
            # Menu('未按数据规范-签约履行', 'a_zgzbtbggfwpt_wasjgf_qylx', "签约履行", "tenderBulletin"),
            # Menu('未按数据规范-招标项目', 'a_zgzbtbggfwpt_wasjgf_zbxm', '招标项目',  'tenderProject'),  # decommissioned
        ]
 
     def start(self):
-        for menu in self.menus:
-            self.list_page(menu)
-
-    def list_page(self, menu):
-        header = {
-            "Origin": "http://www.cebpubservice.com",
-            "Host": "www.cebpubservice.com",
-            "Content-Length": "228",
-            "X-Requested-With": "XMLHttpRequest",
-            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+        headers = {
+            'Accept': 'application/json, text/javascript, */*; q=0.01',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+            'Origin': 'http://www.cebpubservice.com',
+            'Pragma': 'no-cache',
+            'X-Requested-With': 'XMLHttpRequest',
             "Referer": "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4506.400"
         }
-        data = "searchName=&searchArea=&searchIndustry=&centerPlat=&businessType=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&searchTimeStart=&searchTimeStop=&timeTypeParam=&bulletinIssnTime=&bulletinIssnTimeStop="
-        data = {i.split("=")[0]: i.split("=")[-1] for i in data.split("&")}
-        data["businessType"] = menu.id
+        start_time = '2天'
+        page_size = 1000
 
-        for page_no in range(1, 10):
+        # today = datetime.date.today()
+        # end_day = today - datetime.timedelta(days=-3)
+        for menu in self.menus:
+            business_type = menu.type
+            for page in range(1, 10):
+                msg = f'{business_type}-第{page}页'
+                data = {
+                    'searchName': '',
+                    'searchArea': '',
+                    'searchIndustry': '',
+                    'centerPlat': '',
+                    'businessType': business_type,
+                    'searchTimeStart': '',
+                    'searchTimeStop': '',
+                    'timeTypeParam': '',
+                    'bulletinIssnTime': start_time,
+                    'bulletinIssnTimeStart': '',
+                    'bulletinIssnTimeStop': '',
+                    'pageNo': page,
+                    'row': page_size,
+                }
+                response = self.request(data, headers, msg=msg)
+                if not response:
+                    logger.info(f'{msg}-接口无数据')
+                    break
+
+                resp_json = response.json()
+                items = resp_json.get("object")
+                if not items:
+                    logger.info(f'{msg}-无列表数据')
+                    break
+
+                return_list = items.get("returnlist")
+                logger.info(f"{msg}-采集{len(return_list)}条数据")
+                total_page = items.get("page").get("totalPage", 0)
+                logger.info(f'{business_type}-共{total_page}页')
+                self.parse(return_list, menu)
+
+    def request(self, data, headers, **kwargs):
+        logger.info(f"开始请求{kwargs.get('msg')}")
+        while True:
             try:
-                today = datetime.date.today()
-                endday = today - datetime.timedelta(days=-3)
-                data["pageNo"] = str(page_no)
-                data["row"] = '1000'
-                data["bulletinIssnTime"] = "2天"
-
-                if menu.id == '招标公告':
-                    # data["bulletinIssnTimeStart"] = str(endday)
-                    # data["bulletinIssnTimeStart"] = str(today)
-                    data["bulletinIssnTimeStart"] = ""
-                    data["bulletinIssnTimeStart"] = ""
-                else:
-                    # data["searchTimeStop"] = str(endday)
-                    #
-                    # data["searchTimeStart"] = str(today)
-                    data["searchTimeStart"] = ""
-                    data["searchTimeStop"] = ""
-
-                logger.info(f"开始请求第{page_no}页")
-                # 发起请求
-                res = requests.post(
-                    self.url,
-                    headers=header,
+                request_param = dict(
+                    headers=headers,
                     data=data,
+                    proxies=socks_proxy(),
                     timeout=5,
-                    proxies=swordfish_proxy()
                 )
-                res = res.json()
-                # 解析结果
-                max_page = res.get("object").get("page").get("totalPage") or 0
-                list_page_datas = res.get("object").get("returnlist")
-                logger.info(f"请求成功,最大{max_page}页-{len(list_page_datas)}条数据")
-                # 数据处理
-                for jtme in list_page_datas:
-                    businessid = jtme.get("businessId")
-                    tenderprojectcode = jtme.get("tenderProjectCode")
-                    businessobjectname = jtme.get("businessObjectName")
-                    transactionplatfcode = jtme.get("transactionPlatfCode")
-                    transactionplatfname = jtme.get("transactionPlatfName")
-
-                    regionname = jtme.get("regionName")
-                    city = ''
-                    if regionname is not None:
-                        city = "" if "市" in regionname else regionname.split(" ")[-1]
-
-                    if jtme.get("businessObjectName") is None:
-                        continue
-                    if jtme.get("businessObjectName") == '':
-                        continue
-
-                    item = {
-                        "schemaVersion": jtme.get("schemaVersion"),
-                        "type": jtme.get("type"),
-                        "businessKeyWord": menu.businessKeyWord,
-                        "rowGuid": jtme.get("rowGuid"),
-                        "site": "中国招标投标公共服务平台",
-                        "channel": menu.channel,
-                        "area": jtme.get("regionName"),
-                        "_d": "comeintime",
-                        "comeintime": int2long(int(time.time())),
-                        "T": "bidding",
-                        "sendflag": "false",
-                        "spidercode": menu.code,
-                        "city": city,
-                        "iscompete": "true",
-                        "publishdept": "",
-                        "title": jtme.get("businessObjectName"),
-                        "href": businessid + "&" + tenderprojectcode + "&" + transactionplatfcode,
-                        "publishtime": str(jtme.get("receiveTime")) + " 00:00:00",
-                        "l_np_publishtime": int2long(int(time.mktime(time.strptime(jtme.get("receiveTime"), "%Y-%m-%d")))),
-                        "detail": "",
-                        "contenthtml": "",
-                        "infoformat": 1
-                    }
-                    feature = businessid + "&" + tenderprojectcode + "&" + transactionplatfcode
-                    if not redis_exists(feature, self.r):
-                        result = self.crawl_list.insert_one(item)
-                        logger.info(f"{menu.channel} >>> {result.inserted_id}-{item['title']} --上传成功")
-                        redis_set(feature, self.r)
+                response = requests.post(self.url, **request_param)
+                logger.info(f'{kwargs.get("msg")}--请求成功')
+                response.json()  # 检测数据是否请求成功
+                return response
+            except json.decoder.JSONDecodeError:
+                logger.error(f"{kwargs.get('msg')}--代理受限,等待重试")
             except Exception as e:
-                print(e)
-                logger.error(f"请求失败,原因:{e.args}")
+                logger.error(f"{kwargs.get('msg')}--请求失败")
+                logger.exception(f'异常原因:{e}')
+                return
+
+    def parse(self, items, menu):
+        for jtme in items:
+            businessid = jtme.get("businessId")
+            tenderprojectcode = jtme.get("tenderProjectCode")
+            platf_code = jtme.get("transactionPlatfCode")
+            href = "&".join([businessid, tenderprojectcode, platf_code])
+            publish_time = jtme.get("receiveTime")
+            title = jtme.get("businessObjectName")
+            if not title:
+                continue
+
+            region = jtme.get('regionName', '') or ''
+            if region and len(region.split(' ')) >= 2:
+                area, city = region.split(' ')
+            elif len(region.split(' ')) == 1:
+                area, city = region, ''
+            else:
+                area, city = '', ''
+
+            item = {
+                "schemaVersion": jtme.get("schemaVersion"),
+                "type": jtme.get("type"),
+                "businessKeyWord": menu.businessKeyWord,
+                "rowGuid": jtme.get("rowGuid"),
+                "title": title,
+                "href": href,
+                "site": "中国招标投标公共服务平台",
+                "channel": menu.channel,
+                "spidercode": menu.code,
+                "area": area,
+                "city": city,
+                "district": "",
+                "comeintime": int2long(int(time.time())),
+                "publishtime": publish_time,
+                "l_np_publishtime": int2long(date_to_timestamp(publish_time, '%Y-%m-%d')),
+                "detail": "",
+                "contenthtml": "",
+                "T": "bidding",
+                "sendflag": "false",
+                "iscompete": True,
+                "_d": "comeintime",
+                "publishdept": "",
+                "infoformat": 1
+            }
+            if not redis_exists(href, self.r):
+                result = self.crawl_list.insert_one(item)
+                redis_set(href, self.r)
+                msg = f"{item['title']} - ObjectId('{result.inserted_id}')"
+                logger.info(f"{menu.channel} >>> {msg} --上传成功")
 
 
 if __name__ == '__main__':