|
@@ -290,13 +290,24 @@ def njpc_fields_extract_special(html, data_item):
|
|
|
return data_item
|
|
|
|
|
|
|
|
|
def get_proxy(scheme=None, default=None, socks5h=False, timeout=10):
    """Fetch a proxy mapping from the proxy service.

    Args:
        scheme: Optional key to look up in the proxy mapping (for example
            ``"http"`` or ``"https"``). When falsy, the whole mapping is
            returned.
        default: Value returned when ``scheme`` is given but absent from the
            mapping.
        socks5h: When true, build a new mapping whose ``"http"`` and
            ``"https"`` entries are both the service's ``"http"`` proxy URL
            with a ``socks5://`` scheme rewritten to ``socks5h://``.
        timeout: Seconds to wait for the proxy service before ``requests``
            raises, so a stalled service cannot hang the caller forever.

    Returns:
        The proxy mapping, the single entry selected by ``scheme``, or
        ``None`` when the service returned no usable proxy data.
    """
    # NOTE(review): this credential is hard-coded in source; consider loading
    # it from configuration or the environment instead.
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy_res = requests.get(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        headers=headers,
        timeout=timeout,
    ).json()

    # Validate the payload before dereferencing it: a JSON ``null`` or a
    # non-object response must not raise AttributeError here.
    proxies = proxy_res.get('data') if isinstance(proxy_res, dict) else None
    if not proxies:
        return None

    if socks5h:
        http_proxy = proxies.get("http")
        if not http_proxy:
            # Without an "http" entry there is nothing to convert.
            return None
        # Rewrite only the URL scheme, so an address that is already
        # socks5h:// (or that contains "socks5" elsewhere) is left intact.
        if http_proxy.startswith("socks5://"):
            http_proxy = "socks5h://" + http_proxy[len("socks5://"):]
        # NOTE(review): both entries are derived from the "http" value, as in
        # the previous implementation; presumably one SOCKS endpoint serves
        # both schemes - confirm against the proxy service.
        proxies = {"http": http_proxy, "https": http_proxy}

    print(f"切换代理:{proxies}")
    if not scheme:
        return proxies
    return proxies.get(scheme, default)
|
|
|
|
|
|
|
|
|
def search(pattern, string):
|
|
@@ -364,3 +375,31 @@ def remove_htmldata(remove_info_list:list, html:str, response):
|
|
|
if extra_html:
|
|
|
html = html.replace(extra_html, '')
|
|
|
return html
|
|
|
+
|
|
|
+
|
|
|
def extract_file_type(file_name="附件名", file_url="附件地址", file_type=None):
    """Extract the attachment type (file extension) of an attachment.

    The extension is taken from ``file_url`` first; if that is not a
    recognised type, the extension of ``file_name`` is tried instead.

    Args:
        file_name: Attachment name.
        file_url: Attachment address.
        file_type: Optional extra extensions to accept in addition to the
            built-in list. May be a single string or an iterable of strings;
            case and a leading dot are ignored.

    Returns:
        The lower-cased extension when it is a recognised type, otherwise
        ``None`` (also when ``file_name`` or ``file_url`` is empty).
    """
    if not (file_name and file_url):
        return None

    known_types = {
        'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
        'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg',
    }
    if file_type:
        # A bare string would otherwise be expanded character by character.
        extras = [file_type] if isinstance(file_type, str) else file_type
        known_types.update(
            str(ext).strip().lstrip('.').lower() for ext in extras
        )

    # URL: drop the fragment and the query string before reading the suffix.
    url_path = file_url.strip().split('#')[0].split('?')[0]
    # Name: only a trailing "?..." is dropped; "#" can be part of a real name.
    name_part = file_name.strip().split('?')[0]

    for text in (url_path, name_part):
        candidate = text.split('.')[-1].lower()
        if candidate in known_types:
            return candidate
    return None
|
|
|
+
|