dongzhaorui 2 years ago
Parent commit 62bfb76f34

+ 0 - 140
zgztb_cookie/FworkSpider/items/spider_item.py

@@ -1,140 +0,0 @@
-from feapder import Item
-from untils.tools import int2long, substitute, text_search, CheckPrePareRequest, HtmlEmptyError
-import time
-from feapder.utils.log import log
-from feapder.utils.tools import get_current_date
-from datetime import datetime
-import os
-from feapder import setting
-# module-level counter: incremented per MgpListItem, reset by ListItem.pre_to_db
-xxc = 0
-
-class DataBakItem(Item):
-
-    def __init__(self):
-        self.title = ""  # 文章标题
-        self.publishtime = ""   # 文章发布时间(日期格式 xxxx-xx-xx)
-        self.spidercode = ""   # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""   # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""   # 采集的版块(编辑器爬虫平台定义)
-        self.area = "全国"   # 省
-        self.city = ""   # 市
-        self.competehref = None   # 竞品快照页地址
-        self.href = ""   # 非竞品快照页地址
-        self.publishdept = ""
-        self.iscompete=True
-        self.type = ""
-        self.T = "bidding"
-        self.l_np_publishtime = ""  # 发布时间的时间戳(秒级), 需定义为long型
-        self.comeintime = ""  # 入库时间戳(秒级), 需定义为long型
-        self.sendflag = "false"
-        self._d = "comeintime"
-        self.contenthtml = ""  # 快照页源码
-        self.detail = ""  # 快照页源码清洗之后招投标文本
-        self.projectinfo = None  # 快照页源码清洗之后招投标文本
-        self.save = True
-    def stop(self):
-        self.save=False
-        raise HtmlEmptyError
-
-    def pre_to_db(self):
-        # insertion timestamp (seconds), stored as long
-        self.comeintime = int2long(time.time())
-        # build the publish-time timestamp (seconds) from the article publish time, stored as long;
-        # if no publish time can be parsed, consider backfilling one
-        if ":" in self.publishtime:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
-        else:
-            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
-
-        # fetch-failure handling: log an error and stop the item
-        if self.contenthtml is None and self.projectinfo is None:
-            log.error(f"{self.href}, failed to fetch the body for this link")
-            # self.sendflag = "true"
-            self.stop()
-        if not self.title or not self.publishtime or not self.href:
-            # self.sendflag = "true"
-            log.error(f"partial fetch failure, details:\n link: {self.href}\n publish time: {self.publishtime}\n title: {self.title}")
-            self.stop()
-        # clean the body html
-        if self.contenthtml is not None and self.detail == '':
-            # detail: the body with header and footer stripped
-            self.detail = substitute(self.contenthtml)
-            if text_search(self.detail).total == 0:
-                # no body text: flag the item so it is excluded from statistics
-                self.sendflag = "true"
-
-
-class MgpListItem(Item):
-    def __init__(self):
-        # self.__table_name__ = 'ggg_list'
-
-        self.parse = ""  # name of the callback method to invoke
-        self.item = ""  # parameters passed through to the callback
-        self.parser_name = ""  # name of the spider that handles the detail page
-        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current datetime
-        self.comeintime = int2long(int(time.time()))  # current timestamp
-        self.deal_detail = []  # detail-page body parsing: an xpath list for detail_get, a code snippet for detail_post
-        self.create_time = None  # xpath for the detail-page publish time; used when the list page has none
-        self.parse_url = ""  # xpath for the detail page body
-        self.request_params = {}  # parameters the callback needs (render, headers, method, data, params, ...);
-                                  # names must match the requests argument names or they are ignored
-        self.failed = 0  # failed-request counter
-        self.author = "开发及维护人员"  # developer / maintainer
-        self.ex_js = ''  # parameters for js execution: js_str, a js file path, etc.
-        self.ex_python = None  # python code to run to build params/data; headers and cookies are special, prefer the dedicated definition
-        self.pri = 1  # spider alert level, 1-9
-        self.proxies = True  # whether to use proxies
-        self.files = False  # attachment collection config
-        self.error = None
-        self.spidercode = ""
-        self.save = True
-
-    def pre_to_db(self):
-        # set the maintainer and spider code before the item is stored
-        self.author = os.path.basename(os.getcwd())
-        self.spidercode = self.item.get("spidercode")
-
-        if "通知公告" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-        elif "公告公示" in self.item.get("channel"):
-            code,reason = CheckPrePareRequest().check_crawl_title(self.item.get("title"))
-            if code == 10106:
-                log.error(f"{self.item.get('title')}----不可入库,失败原因:{reason}")
-
-        global xxc
-        xxc += 1
-
-    def open_spider(self):
-        pass
-
-class ListItem(Item):
-    def __init__(self):
-        self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
-        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
-        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
-        self.url = ''
-        self.count=0
-        self.code=-1
-        self.rel_count = 0
-        self.save=True
-
-    def pre_to_db(self):
-        time.sleep(0.1)
-        self.author = setting.author.get(os.path.basename(os.getcwd()))
-        if self.author is None:
-            self.author = os.path.basename(os.getcwd())
-        self.runtime = get_current_date(date_format="%Y-%m-%d")
-        global xxc
-        log.debug(f"xxc: {xxc}")
-        self.rel_count = xxc
-        xxc = 0
-
-
-
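For reference, a minimal sketch of how the two removed Item classes were typically populated before this deletion (the field names come from the file above; the concrete values and the surrounding spider are hypothetical):

    # Hypothetical usage of the removed classes; pre_to_db runs before storage.
    list_item = MgpListItem()
    list_item.parse = "detail_get"                        # callback name
    list_item.deal_detail = ['//div[@class="content"]']   # xpath list for the body
    list_item.request_params = {"headers": {"User-Agent": "Mozilla/5.0"}}
    list_item.item = {"spidercode": "a_example", "channel": "通知公告", "title": "Example notice"}

    data_item = DataBakItem()
    data_item.title = "Example tender notice"
    data_item.href = "http://example.com/notice/1"
    data_item.publishtime = "2023-05-01 09:30:00"   # contains ":" -> parsed with %H:%M:%S
    data_item.contenthtml = "<div>notice body</div>"
    data_item.pre_to_db()   # fills comeintime / l_np_publishtime and cleans detail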

+ 120 - 0
zgztb_cookie/FworkSpider/untils/captcha.py

@@ -0,0 +1,120 @@
+import requests
+
+__all__ = [
+    'swordfish_platform',
+    'chaojiying_platform',
+    'chaojiying_report'
+]
+
+
+def _get_click_captcha(file: bytes):
+    """Click-captcha detection (verify_det endpoint)."""
+    url = "http://123.57.163.80:2119/v1/images/verify_det"
+    files = {'image_content': file}
+    headers = {'accept': 'application/json'}
+    response = requests.post(url, headers=headers, files=files, stream=True)
+    return response.json()
+
+
+def _simple_captcha(file):
+    """
+    Plain image captcha.
+
+    @param file: the captcha - image bytes, a base64 data URI, or a file path
+    @return: recognized code in upper case, or None on failure
+    """
+    url = "http://123.57.163.80:2119/v1/images/verify"
+    headers = {'accept': 'application/json'}
+
+    if isinstance(file, bytes) or (isinstance(file, str) and file.startswith('data:image')):
+        image_file = {'file': file}  # raw bytes or a base64 data URI pass through unchanged
+    else:  # anything else is treated as a file path
+        with open(file, 'rb') as f:
+            image_file = {'file': f.read()}
+
+    r = requests.post(url, headers=headers, files=image_file, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def _arithmetic_captcha(file):
+    """算术验证码"""
+    url = "http://123.57.163.80:2119/v1/images/arithmetic"
+    headers = {'accept': 'application/json'}
+    image_file = {'file': file}
+
+    r = requests.post(url, headers=headers, files=image_file, stream=True, timeout=10)
+    json_resp = r.json()
+    if "msg" in json_resp and "success" == json_resp["msg"]:
+        return str(json_resp["r"]["code"]).upper()
+    return None
+
+
+def swordfish_platform(file, mode='simple'):
+    """Swordfish (剑鱼) captcha recognition platform."""
+    mode = mode.lower()
+    if mode == 'arithmetic':
+        return _arithmetic_captcha(file)
+    elif mode == 'click':
+        return _get_click_captcha(file)
+    else:
+        return _simple_captcha(file)
+
+
+def chaojiying_platform(file, pic_type: int):
+    """
+    Chaojiying (超级鹰) recognition platform.
+
+    pic_type: see https://www.chaojiying.com/price.html
+    """
+    with open(file, 'rb') as f:
+        image_bytes = f.read()
+    files = {'file': image_bytes}
+
+    url = f"http://123.57.163.80:2119/v1/images/discern?pic_type={pic_type}"
+    headers = {'accept': 'application/json'}
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, files=files, timeout=10)
+    json_resp = response.json()
+    # code == 0 means the platform returned a recognition result
+    if json_resp["code"] == 0:
+        pic_id = json_resp["r"]["pic_id"]
+        print("pic_id >>", pic_id)  # pic_id is needed if chaojiying_report must be called later
+        return json_resp["r"]["pic_str"]
+    return None
+
+
+def chaojiying_report(pic_id: str):
+    """超级鹰平台识别验证码错误时,提交识别错误的验证码pic_id"""
+    url = f"http://123.57.163.80:2119/v1/images/report_err?pic_id={pic_id}"
+    headers = {
+        'accept': 'application/json',
+        'Content-Type': 'application/x-www-form-urlencoded'
+    }
+    data = {
+        'grant_type': '',
+        'username': 'jianyu001',
+        'password': '123qwe!A',
+        'scope': '',
+        'client_id': '',
+        'client_secret': ''
+    }
+    response = requests.post(url, headers=headers, data=data, timeout=10)
+    '''
+    Success response: {'msg': 'OK', 'code': 0}
+    Do not call this endpoint casually! Only call it after the recognition result
+    has been confirmed wrong; if that cannot be determined, do not call it at all.
+    '''
+    return response.json()
+
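A usage sketch for the new module (the local captcha image path and the pic_type value are assumptions; the endpoints and credentials are the ones hardcoded above):

    from untils.captcha import swordfish_platform, chaojiying_platform, chaojiying_report

    with open('captcha.png', 'rb') as f:       # hypothetical local captcha image
        img = f.read()

    code = swordfish_platform(img)                      # plain captcha
    expr = swordfish_platform(img, mode='arithmetic')   # arithmetic captcha
    boxes = swordfish_platform(img, mode='click')       # click-captcha detection

    answer = chaojiying_platform('captcha.png', pic_type=1902)  # type per the price page
    # Only after confirming the answer was wrong:
    # chaojiying_report(pic_id)

Note that chaojiying_platform only returns pic_str; a caller that wants to invoke chaojiying_report needs the pic_id the function prints.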

+ 0 - 21
zgztb_cookie/FworkSpider/untils/get_imgcode.py

@@ -1,21 +0,0 @@
-import requests
-
-
-def get_code(file_path: str) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify"
-    with open(file_path, 'rb') as f:
-        image_bytes = f.read()
-    content = {'file': image_bytes}
-    headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
-    return response.json()
-
-def get_code_det(image_bytes) -> dict:
-    upload_address = "http://123.57.163.80:2119/v1/images/verify_det"
-    content = {'image_content': image_bytes}
-    headers = {'accept': 'application/json'}
-    response = requests.post(upload_address, headers=headers, files=content, stream=True)
-    return response.json()
-

+ 0 - 7
zgztb_cookie/config/dev.yaml

@@ -25,13 +25,6 @@ es:
   db: biddingall # es database alias
 
 
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-  endpoint: oss-cn-beijing-internal.aliyuncs.com    # intranet endpoint
-  bucket_name: jy-datafile
-
-
 proxy:
   socks5:
     url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch

+ 0 - 2
zgztb_cookie/config/load.py

@@ -7,7 +7,6 @@ __all__ = [
     'mongo_conf',
     'redis_conf',
     'redis_startup_nodes',
-    'oss_conf',
     'es_conf',
     'jy_proxy',
     'node_module_path',
@@ -29,7 +28,6 @@ with open(_yaml_conf, encoding="utf-8") as f:
     mongo_conf = _conf['mongo']
     redis_conf = _conf['redis']
     redis_startup_nodes = _conf['redis_cluster']
-    oss_conf: dict = _conf['ali_oss']
     es_conf: dict = _conf['es']
     jy_proxy: dict = _conf['proxy']
     selenium_remote_server_addr = _conf['selenium']['remote']['server_addr']
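With ali_oss removed from both YAML files and oss_conf removed from load.py, the remaining load pattern reduces to the sketch below (the yaml path and the use of yaml.safe_load are assumptions; the key names come from the diff):

    import yaml

    _yaml_conf = "zgztb_cookie/config/dev.yaml"   # assumed path
    with open(_yaml_conf, encoding="utf-8") as f:
        _conf = yaml.safe_load(f)

    es_conf: dict = _conf['es']        # still present
    jy_proxy: dict = _conf['proxy']    # still present
    # _conf['ali_oss'] no longer exists; any lingering
    # `from config.load import oss_conf` now fails at import time.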

+ 0 - 7
zgztb_cookie/config/test.yaml

@@ -19,13 +19,6 @@ redis_cluster:
     port: 2279
 
 
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-  endpoint: oss-cn-beijing.aliyuncs.com   # public endpoint
-  bucket_name: jy-datafile
-
-
 es:
   host: 192.168.3.206
   port: !!int 9800

+ 0 - 100
zgztb_cookie/utils/tools.py

@@ -1,104 +1,4 @@
 import hashlib
-import re
-from collections import namedtuple
-
-SearchText = namedtuple('SearchText', ['total'])
-
-
-def substitute(html_str):
-    """HTML 替换"""
-    patterns = {
-        '<!--.*?-->': '',
-        '"': "'",
-        '\n': '',
-        '\xa0': "",
-        '<span .*?>': '',
-        '</span> ': '',
-        '</span>': '',
-        '<span>': '',
-        '<p.*?>': '<br>',
-        '</p>': '<br>',
-        '<div>': '<br>',
-        '<div .*?>': '<br>',
-        '</div>': '<br>',
-        '<img .*?>': '<br>',
-        '<style.*?</style>': '',
-        '<EpointForm>': '',
-        '<html.*?</head>': '',
-        '<input .*?>': '',
-        '<!DOCTYPE.*?>': '',
-        '</meta>': '',
-        '<?xml:.*?>': '',
-        '<label.*?>': '<br>',
-        '</label>': '',
-        'style=".*?"': '',
-        "style='.*?'": '',
-        'class=".*?"': '',
-        "class='.*?'": '',
-        "align='.*?'": '',
-        'align=".*?"': '',
-        'border=".*?"': '',
-        "border='.*?'": '',
-        'cellpadding=".*?"': '',
-        "cellpadding='.*?'": '',
-        'cellspacing=".*?"': '',
-        "cellspacing='.*?'": '',
-        'center=".*?"': '',
-        "center='.*?'": '',
-        'width=".*?"': '',
-        "width='.*?'": '',
-        "bordercolor='.*?'": '',
-        'bgcolor=".*?"': '',
-        'BORDERCOLOR=".*?"': '',
-        '<a name=".*?">': '',
-        '<o:p>': '',
-        '</o:p>': '',
-        '<A name=.*?>': '',
-        '<a .*?>': '',
-        '</a>': '',
-        '<font .*?>': '',
-        '</font>': '',
-        '<body.*?>': '',
-        '</body>': '',
-        '<script.*?>': '',
-        '</script>': '',
-        '【关闭】': '',
-        '【打印】': '',
-    }
-
-    for k, v in patterns.items():
-        # note: flags must be passed by keyword; positionally they land in `count`
-        html_str = re.sub(k, v, html_str, flags=re.S | re.M)
-    return html_str
-
-
-def get_signature(content: str) -> str:
-    """
-    十六进制数字字符串形式摘要值
-
-    @param content: 字符串文本
-    @return: 摘要值
-    """
-    sha1 = hashlib.sha1()
-    sha1.update(content.encode("utf-8"))
-    return sha1.hexdigest()
-
-
-def text_search(content: str) -> SearchText:
-    """
-    Chinese-text search
-
-    :param content: text
-    :return: number of Chinese characters
-    """
-    if not content:
-        return SearchText(0)
-
-    results = re.findall('[\u4e00-\u9fa5]', content, re.S)
-    # the length of the list is the number of Chinese characters
-    return SearchText(len(results))
 
 
 def encrypt(text: str):
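For reference, the removed text_search reduces to counting CJK characters; a standalone equivalent sketch:

    import re
    from collections import namedtuple

    SearchText = namedtuple('SearchText', ['total'])

    def text_search(content: str) -> SearchText:
        """Count characters in the CJK range \u4e00-\u9fa5."""
        if not content:
            return SearchText(0)
        return SearchText(len(re.findall(r'[\u4e00-\u9fa5]', content)))

    assert text_search("招标公告 Notice").total == 4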