
Merge branch 'master' of https://jygit.jydev.jianyu360.cn/data_processing/crawlab_feader

dongzhaorui 2 years ago
commit 4dee81dd7d

+ 2 - 2
FworkSpider/feapder/templates/spider_template.tmpl

@@ -55,10 +55,10 @@ class ${spider_name}(feapder.BiddingListSpider):
             district = ""  # 区县
 
             try:
-                next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title}")]')[index]  # index guards against duplicate titles
+                next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title}")]')[0]  # take the first match
             except:
                 try:
-                    next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title[:10]}")]')[index]  # title too long; match on a prefix
+                    next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title[:10]}")]')[0]  # title too long; match on a prefix
                 except:
                     continue
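
The change above always takes the first XPath match rather than indexing by the list position. A minimal sketch of the resulting lookup-with-fallback, assuming a Selenium `driver` and a list-page `title` as in the template (the helper name `find_detail_link` is hypothetical):

    def find_detail_link(driver, title):
        """Return the first detail link whose text matches the title, or None."""
        try:
            # first anchor whose text contains the full title;
            # find_elements_by_xpath returns [], so [0] raises IndexError on no match
            return driver.find_elements_by_xpath(
                f'//a[contains(text(),"{title}")]')[0]
        except IndexError:
            try:
                # the full title may be too long to appear verbatim;
                # retry with a 10-character prefix
                return driver.find_elements_by_xpath(
                    f'//a[contains(text(),"{title[:10]}")]')[0]
            except IndexError:
                return None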
 

+ 6 - 1
FworkSpider/untils/jsl_clearance_s.py

@@ -59,7 +59,12 @@ class DTCookiePool(PageCookiePool):
         js_func = re.sub(gox, "return document['cookie']\n};", js_func)
         js_func = '''const jsdom = require("jsdom");
                     const {JSDOM} = jsdom;
-                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
+                                        {
+                                            url: "https://example.org/",
+                                            referrer: "https://example.com/",
+                                            contentType: "text/html",
+                                        });
                     window = dom.window;
                     document = window.document;''' + js_func
         ctx = execjs.compile(js_func)
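
Without the options object, JSDOM gives the document the default "about:blank" URL, so challenge scripts that read `document.location` or write `document.cookie` misbehave; passing `url`/`referrer` gives them a real origin. A minimal sketch of the evaluation path, assuming a Node runtime with the `jsdom` package is visible to execjs; `getCookie` stands in for the deobfuscated challenge function:

    import execjs  # needs Node plus the `jsdom` npm package

    js = '''const jsdom = require("jsdom");
    const {JSDOM} = jsdom;
    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
                          {url: "https://example.org/",
                           referrer: "https://example.com/",
                           contentType: "text/html"});
    window = dom.window;
    document = window.document;
    function getCookie() {
        document.cookie = "demo=1";  // the real script computes the jsl clearance cookie
        return document['cookie'];
    }'''

    ctx = execjs.compile(js)
    print(ctx.call("getCookie"))  # -> 'demo=1'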

+ 43 - 4
FworkSpider/untils/tools.py

@@ -290,13 +290,24 @@ def njpc_fields_extract_special(html, data_item):
     return data_item
 
 
-def get_proxy():
+def get_proxy(scheme=None, default=None, socks5h=False):
     headers = {
         "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
     }
-    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
-    print(f"Switching proxy: {proxy.get('data')}")
-    return proxy.get("data").get("http")
+    proxy_res = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+
+    proxies = proxy_res.get('data')
+    if proxies:
+        if socks5h:
+            # socks5h:// delegates DNS resolution to the proxy;
+            # both schemes reuse the fetched 'http' endpoint
+            proxyh = {}
+            proxyh["http"] = proxies.get("http").replace("socks5", "socks5h")
+            proxyh["https"] = proxies.get("http").replace("socks5", "socks5h")
+            proxies = proxyh
+        print(f"Switching proxy: {proxies}")
+        if not scheme:
+            return proxies
+        return proxies.get(scheme, default)
+    return default
 
 
 def search(pattern, string):
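
A usage sketch for the new signature (the target URL is a placeholder; socks5 proxies additionally require `requests[socks]`, i.e. PySocks):

    proxies = get_proxy(socks5h=True)        # e.g. {'http': 'socks5h://...', 'https': 'socks5h://...'}
    endpoint = get_proxy(scheme="http")      # a single endpoint string, or None
    if proxies:
        # socks5h:// lets the proxy resolve DNS instead of the client
        resp = requests.get("https://example.org/", proxies=proxies, timeout=30)
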
@@ -364,3 +375,31 @@ def remove_htmldata(remove_info_list:list, html:str, response):
                 if extra_html:
                     html = html.replace(extra_html, '')
     return html
+
+
+def extract_file_type(file_name=None, file_url=None, file_type=None):
+    """
+    Extract the attachment file type.
+
+    Args:
+        file_name: attachment name
+        file_url: attachment url
+        file_type: extra extensions to accept besides the built-in list
+
+    Returns:
+        The lowercase extension if it is recognised, otherwise None.
+    """
+    if file_name and file_url:
+        file_name = file_name.strip()
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
+                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
+        if file_type:
+            file_types.extend(file_type)
+
+        file_type = file_url.split('?')[0].split('.')[-1].lower()
+        if file_type not in file_types:
+            # the url has no usable extension; fall back to the attachment name
+            file_type = file_name.split('?')[0].split('.')[-1].lower()
+            if file_type in file_types:
+                return file_type
+        else:
+            return file_type
+    return None
+
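
A usage sketch for the new helper (names and urls are made up):

    extract_file_type("notice.doc", "http://host/download?id=1")              # -> 'doc' (from the name)
    extract_file_type("bid.pdf", "http://host/files/bid.pdf?token=x")         # -> 'pdf' (from the url)
    extract_file_type("page.html", "http://host/view?id=2")                   # -> None (html not accepted)
    extract_file_type("data.csv", "http://host/data.csv", file_type=["csv"])  # -> 'csv' via the extra list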