dongzhaorui 2 years ago
Parent commit 62bfb76f34

+ 0 - 140
zgztb_cookie/FworkSpider/items/spider_item.py

@@ -1,140 +0,0 @@
-from feapder import Item
-from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
-import time
-from feapder.utils.log import log
-from feapder.utils.tools import get_current_date
-from datetime import datetime
-import os
-from feapder import setting
-# module-level counter: incremented per MgpListItem, reset by ListItem.pre_to_db
-xxc = 0
-
-class DataBakItem(Item):
-
-    def __init__(self):
-        self.title = ""  # 文章标题
-        self.publishtime = ""   # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.spidercode = ""   # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""   # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""   # 采集的版块(编辑器爬虫平台定义)
-        self.area = "全国"   # 省
-        self.city = ""   # 市
-        self.competehref = None   # 竞品快照页地址
-        self.href = ""   # 非竞品快照页地址
-        self.publishdept = ""
-        self.iscompete=True
-        self.type = ""
-        self.T = "bidding"
-        self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
-        self.comeintime = ""  # 入库时间戳(秒级), 需定义为long型
-        self.sendflag = "false"
-        self._d = "comeintime"
-        self.contenthtml = ""  # 快照页源码
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.projectinfo = None  # 快照页源码清洗之后招投标文本
-        self.save = True
-    def stop(self):
-        self.save=False
-        raise HtmlEmptyError
-
-    def pre_to_db(self):
-        # insertion timestamp (seconds), stored as long
-        self.comeintime = int2long(time.time())
-        # build the publish-time timestamp (seconds) from the article publish time, stored as long;
-        # if no publish time can be parsed, consider backfilling one
-        if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
-        else:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
-
-        # fetch-failure handling: log an error and stop the item
-        if self.contenthtml is None and self.projectinfo is None:
-            log.error(f"{self.href}, failed to fetch the body for this link")
-            # self.sendflag = "true"
-            self.stop()
-        if not self.title or not self.publishtime or not self.href:
-            # self.sendflag = "true"
-            log.error(f"partial fetch failure, details:\n link: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
-            self.stop()
-        # clean the body html
-        if self.contenthtml is not None and self.detail == '':
-            # detail: the body with header and footer stripped
-            self.detail = substitute(self.contenthtml)
-            if text_search(self.detail).total == 0:
-                # no body text: flag the item so it is excluded from statistics
-                self.sendflag = "true"
-
-
-class MgpListItem(Item):
-    def __init__(self):
-        # self.__table_name__ = 'ggg_list'
-
-        self.parse = ""  # name of the callback method to invoke
-        self.item = ""  # parameters passed through to the callback
-        self.parser_name = ""  # name of the spider that handles the detail page
-        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current datetime
-        self.comeintime = int2long(int(time.time()))  # current timestamp
-        self.deal_detail = []  # detail-page body parsing: an xpath list for detail_get, a code snippet for detail_post
-        self.create_time = None  # xpath for the detail-page publish time; used when the list page has none
-        self.parse_url = ""  # xpath for the detail page body
-        self.request_params = {}  # parameters the callback needs (render, headers, method, data, params, ...);
-                                  # names must match the requests argument names or they are ignored
-        self.failed = 0  # failed-request counter
-        self.author = "开发及维护人员"  # developer / maintainer
-        self.ex_js = ''  # parameters for js execution: js_str, a js file path, etc.
-        self.ex_python = None  # python code to run to build params/data; headers and cookies are special, prefer the dedicated definition
-        self.pri = 1  # spider alert level, 1-9
-        self.proxies = True  # whether to use proxies
-        self.files = False  # attachment collection config
-        self.error = None
-        self.spidercode = ""
-        self.save = True
-
-    def pre_to_db(self):
-        # set the maintainer and spider code before the item is stored
-        self.author = os.path.basename(os.getcwd())
-        self.spidercode = self.item.get("spidercode")
-
-        if "通知公告" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-        elif "公告公示" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-
-        global xxc
-        xxc += 1
-
-    def open_spider(self):
-        pass
-
-class ListItem(Item):
-    def __init__(self):
-        self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
-        self.url = ''
-        self.count=0
-        self.code=-1
-        self.rel_count = 0
-        self.save=True
-
-    def pre_to_db(self):
-        time.sleep(0.1)
-        self.author = setting.author.get(os.path.basename(os.getcwd()))
-        if self.author is None:
-            self.author = os.path.basename(os.getcwd())
-        self.runtime = get_current_date(date_format="%Y-%m-%d")
-        global xxc
-        log.debug(f"xxc: {xxc}")
-        self.rel_count = xxc
-        xxc = 0
-
-
-
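For reference, a minimal sketch of how the two removed Item classes were typically populated before this deletion (the field names come from the file above; the concrete values and the surrounding spider are hypothetical):

    # Hypothetical usage of the removed classes; pre_to_db runs before storage.
    list_item = MgpListItem()
    list_item.parse = "detail_get"                        # callback name
    list_item.deal_detail = ['//div[@class="content"]']   # xpath list for the body
    list_item.request_params = {"headers": {"User-Agent": "Mozilla/5.0"}}
    list_item.item = {"spidercode": "a_example", "channel": "通知公告", "title": "Example notice"}

    data_item = DataBakItem()
    data_item.title = "Example tender notice"
    data_item.href = "http://example.com/notice/1"
    data_item.publishtime = "2023-05-01 09:30:00"   # contains ":" -> parsed with %H:%M:%S
    data_item.contenthtml = "<div>notice body</div>"
    data_item.pre_to_db()   # fills comeintime / l_np_publishtime and cleans detail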

+ 120 - 0
zgztb_cookie/FworkSpider/untils/captcha.py

@@ -0,0 +1,120 @@
+import requests
+
+__all__ = [
+    'swordfish_platform',
+    'chaojiying_platform',
+    'chaojiying_report'
+]
+
+
+def _get_click_captcha(file: bytes):
+    """Click-captcha detection (verify_det endpoint)."""
+    url = "http://123.57.163.80:2119/v1/images/verify_det"
+    files = {'image_content': file}
+    headers = {'accept': 'application/json'}
+    response = requests.post(url, headers=headers, files=files, stream=True)
+    return response.json()
+
+
+def _simple_captcha(file):
+    """
+    Plain image captcha.
+
+    @param file: the captcha - image bytes, a base64 data URI, or a file path
+    @return: recognized code in upper case, or None on failure
+    """
+    url = "http://123.57.163.80:2119/v1/images/verify"
+    headers = {'accept': 'application/json'}
+
+    if isinstance(file, bytes) or (isinstance(file, str) and file.startswith('data:image')):
+        image_file = {'file': file}  # raw bytes or a base64 data URI pass through unchanged
+    else:  # anything else is treated as a file path
+        with open(file, 'rb') as f:
+            image_file = {'file': f.read()}
+
+    r = requests.post(url, headers=headers, files=image_file, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def _arithmetic_captcha(file):
+    """算术验证码"""
+    url = "http://123.57.163.80:2119/v1/images/arithmetic"
+    headers = {'accept': 'application/json'}
+    image_file = {'file': file}
+
+    r = requests.post(url, headers=headers, files=image_file, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def swordfish_platform(file, mode='simple'):
+    """Swordfish (剑鱼) captcha recognition platform."""
+    mode = mode.lower()
+    if mode == 'arithmetic':
+        return _arithmetic_captcha(file)
+    elif mode == 'click':
+        return _get_click_captcha(file)
+    else:
+        return _simple_captcha(file)
+
+
+def chaojiying_platform(file, pic_type: int):
+    """
+    Chaojiying (超级鹰) recognition platform.
+
+    pic_type: see https://www.chaojiying.com/price.html
+    """
+    with open(file, 'rb') as f:
+        image_bytes = f.read()
+    files = {'file': image_bytes}
+
+    url = f"http://123.57.163.80:2119/v1/images/discern?pic_type={pic_type}"
+    headers = {'accept': 'application/json'}
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=10)
+    json_resp = response.json()
+    # code == 0 means the platform returned a recognition result
+    if json_resp["code"] == 0:
+        pic_id = json_resp["r"]["pic_id"]
+        print("pic_id >>", pic_id)  # pic_id is needed if chaojiying_report must be called later
+        return json_resp["r"]["pic_str"]
+    return None
+
+
+def chaojiying_report(pic_id: str):
+    """超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id"""
+    url = f"http://123.57.163.80:2119/v1/images/report_err?pic_id={pic_id}"
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    '''
+    Success response: {'msg': 'OK', 'code': 0}
+    Do not call this endpoint casually! Only call it after the recognition result
+    has been confirmed wrong; if that cannot be determined, do not call it at all.
+    '''
+    return response.json()
+
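A usage sketch for the new module (the local captcha image path and the pic_type value are assumptions; the endpoints and credentials are the ones hardcoded above):

    from untils.captcha import swordfish_platform, chaojiying_platform, chaojiying_report

    with open('captcha.png', 'rb') as f:       # hypothetical local captcha image
        img = f.read()

    code = swordfish_platform(img)                      # plain captcha
    expr = swordfish_platform(img, mode='arithmetic')   # arithmetic captcha
    boxes = swordfish_platform(img, mode='click')       # click-captcha detection

    answer = chaojiying_platform('captcha.png', pic_type=1902)  # type per the price page
    # Only after confirming the answer was wrong:
    # chaojiying_report(pic_id)

Note that chaojiying_platform only returns pic_str; a caller that wants to invoke chaojiying_report needs the pic_id the function prints.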

+ 0 - 21
zgztb_cookie/FworkSpider/untils/get_imgcode.py

@@ -1,21 +0,0 @@
-import requests
-
-
-def get_code(file_path: str) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify"
-    with open(file_path, 'rb') as f:
-        image_bytes = f.read()
-    content = {'file': image_bytes}
-    headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
-    return response.json()
-
-def get_code_det(image_bytes) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
-    content = {'image_content': image_bytes}
-    headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
-    return response.json()
-

+ 0 - 7
zgztb_cookie/config/dev.yaml

@@ -25,13 +25,6 @@ es:
   db: biddingall # es database alias
 
 
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-  endpoint: oss-cn-beijing-internal.aliyuncs.com    # intranet endpoint
-  bucket_name: jy-datafile
-
-
 proxy:
   socks5:
     url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch

+ 0 - 2
zgztb_cookie/config/load.py

@@ -7,7 +7,6 @@ __all__ = [
     'mongo_conf',
     'redis_conf',
     'redis_startup_nodes',
-    'oss_conf',
     'es_conf',
     'jy_proxy',
     'node_module_path',
@@ -29,7 +28,6 @@ with open(_yaml_conf, encoding="utf-8") as f:
     mongo_conf = _conf['mongo']
     redis_conf = _conf['redis']
     redis_startup_nodes = _conf['redis_cluster']
-    oss_conf: dict = _conf['ali_oss']
     es_conf: dict = _conf['es']
     jy_proxy: dict = _conf['proxy']
     selenium_remote_server_addr = _conf['selenium']['remote']['server_addr']
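With ali_oss removed from both YAML files and oss_conf removed from load.py, the remaining load pattern reduces to the sketch below (the yaml path and the use of yaml.safe_load are assumptions; the key names come from the diff):

    import yaml

    _yaml_conf = "zgztb_cookie/config/dev.yaml"   # assumed path
    with open(_yaml_conf, encoding="utf-8") as f:
        _conf = yaml.safe_load(f)

    es_conf: dict = _conf['es']        # still present
    jy_proxy: dict = _conf['proxy']    # still present
    # _conf['ali_oss'] no longer exists; any lingering
    # `from config.load import oss_conf` now fails at import time.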

+ 0 - 7
zgztb_cookie/config/test.yaml

@@ -19,13 +19,6 @@ redis_cluster:
     port: 2279
 
 
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-  endpoint: oss-cn-beijing.aliyuncs.com   # public endpoint
-  bucket_name: jy-datafile
-
-
 es:
   host: 192.168.3.206
   port: !!int 9800

+ 0 - 100
zgztb_cookie/utils/tools.py

@@ -1,104 +1,4 @@
 import hashlib
-import re
-from collections import namedtuple
-
-SearchText = namedtuple('SearchText', ['total'])
-
-
-def substitute(html_str):
-    """HTML 替换"""
-    patterns = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<span .*?>': '',
-        '</span> ': '',
-        '</span>': '',
-        '<span>': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '</div>': '<br>',
-        '<img .*?>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "align='.*?'": '',
-        'align=".*?"': '',
-        'border=".*?"': '',
-        "border='.*?'": '',
-        'cellpadding=".*?"': '',
-        "cellpadding='.*?'": '',
-        'cellspacing=".*?"': '',
-        "cellspacing='.*?'": '',
-        'center=".*?"': '',
-        "center='.*?'": '',
-        'width=".*?"': '',
-        "width='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body.*?>': '',
-        '</body>': '',
-        '<script.*?>': '',
-        '</script>': '',
-        '【关闭】': '',
-        '【打印】': '',
-    }
-
-    for k, v in patterns.items():
-        # note: flags must be passed by keyword; positionally they land in `count`
-        html_str = re.sub(k, v, html_str, flags=re.S | re.M)
-    return html_str
-
-
-def get_signature(content: str) -> str:
-    """
-    十六进制数字字符串形式摘要值
-
-    @param content: 字符串文本
-    @return: 摘要值
-    """
-    sha1 = hashlib.sha1()
-    sha1.update(content.encode("utf-8"))
-    return sha1.hexdigest()
-
-
-def text_search(content: str) -> SearchText:
-    """
-    Chinese-text search
-
-    :param content: text
-    :return: number of Chinese characters
-    """
-    if not content:
-        return SearchText(0)
-
-    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
-    # the length of the list is the number of Chinese characters
-    return SearchText(len(results))
 
 
 def encrypt(text: str):
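For reference, the removed text_search reduces to counting CJK characters; a standalone equivalent sketch:

    import re
    from collections import namedtuple

    SearchText = namedtuple('SearchText', ['total'])

    def text_search(content: str) -> SearchText:
        """Count characters in the CJK range \u4e00-\u9fa5."""
        if not content:
            return SearchText(0)
        return SearchText(len(re.findall(r'[\u4e00-\u9fa5]', content)))

    assert text_search("招标公告 Notice").total == 4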