
Merge branch 'master' of https://jygit.jydev.jianyu360.cn/data_processing/crawlab_feader

dongzhaorui 2 years ago
commit 4dee81dd7d

+ 2 - 2
FworkSpider/feapder/templates/spider_template.tmpl

@@ -55,10 +55,10 @@ class ${spider_name}(feapder.BiddingListSpider):
             district = ""  # 区县
 
             try:
-                next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title}")]')[index]  # index guards against duplicate titles
+                next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title}")]')[0]  # take the first match
             except:
                 try:
-                    next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title[:10]}")]')[index]  # title too long; match on a prefix
+                    next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title[:10]}")]')[0]  # title too long; match on a prefix
                 except:
                     continue
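
The change above always takes the first XPath match rather than indexing by the list position. A minimal sketch of the resulting lookup-with-fallback, assuming a Selenium `driver` and a list-page `title` as in the template (the helper name `find_detail_link` is hypothetical):

    def find_detail_link(driver, title):
        """Return the first detail link whose text matches the title, or None."""
        try:
            # first anchor whose text contains the full title;
            # find_elements_by_xpath returns [], so [0] raises IndexError on no match
            return driver.find_elements_by_xpath(
                f'//a[contains(text(),"{title}")]')[0]
        except IndexError:
            try:
                # the full title may be too long to appear verbatim;
                # retry with a 10-character prefix
                return driver.find_elements_by_xpath(
                    f'//a[contains(text(),"{title[:10]}")]')[0]
            except IndexError:
                return None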
 

+ 6 - 1
FworkSpider/untils/jsl_clearance_s.py

@@ -59,7 +59,12 @@ class DTCookiePool(PageCookiePool):
         js_func = re.sub(gox, "return document['cookie']\n};", js_func)
         js_func = '''const jsdom = require("jsdom");
                     const {JSDOM} = jsdom;
-                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
+                                        {
+                                            url: "https://example.org/",
+                                            referrer: "https://example.com/",
+                                            contentType: "text/html",
+                                        });
                     window = dom.window;
                     document = window.document;''' + js_func
         ctx = execjs.compile(js_func)
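
Without the options object, JSDOM gives the document the default "about:blank" URL, so challenge scripts that read `document.location` or write `document.cookie` misbehave; passing `url`/`referrer` gives them a real origin. A minimal sketch of the evaluation path, assuming a Node runtime with the `jsdom` package is visible to execjs; `getCookie` stands in for the deobfuscated challenge function:

    import execjs  # needs Node plus the `jsdom` npm package

    js = '''const jsdom = require("jsdom");
    const {JSDOM} = jsdom;
    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
                          {url: "https://example.org/",
                           referrer: "https://example.com/",
                           contentType: "text/html"});
    window = dom.window;
    document = window.document;
    function getCookie() {
        document.cookie = "demo=1";  // the real script computes the jsl clearance cookie
        return document['cookie'];
    }'''

    ctx = execjs.compile(js)
    print(ctx.call("getCookie"))  # -> 'demo=1'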

+ 43 - 4
FworkSpider/untils/tools.py

@@ -290,13 +290,24 @@ def njpc_fields_extract_special(html, data_item):
     return data_item
 
 
-def get_proxy():
+def get_proxy(scheme=None, default=None, socks5h=False):
     headers = {
         "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
     }
-    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
-    print(f"Switching proxy: {proxy.get('data')}")
-    return proxy.get("data").get("http")
+    proxy_res = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
+
+    proxies = proxy_res.get('data')
+    if proxies:
+        if socks5h:
+            # socks5h:// delegates DNS resolution to the proxy;
+            # both schemes reuse the fetched 'http' endpoint
+            proxyh = {}
+            proxyh["http"] = proxies.get("http").replace("socks5", "socks5h")
+            proxyh["https"] = proxies.get("http").replace("socks5", "socks5h")
+            proxies = proxyh
+        print(f"Switching proxy: {proxies}")
+        if not scheme:
+            return proxies
+        return proxies.get(scheme, default)
+    return default
 
 
 def search(pattern, string):
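
A usage sketch for the new signature (the target URL is a placeholder; socks5 proxies additionally require `requests[socks]`, i.e. PySocks):

    proxies = get_proxy(socks5h=True)        # e.g. {'http': 'socks5h://...', 'https': 'socks5h://...'}
    endpoint = get_proxy(scheme="http")      # a single endpoint string, or None
    if proxies:
        # socks5h:// lets the proxy resolve DNS instead of the client
        resp = requests.get("https://example.org/", proxies=proxies, timeout=30)
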
@@ -364,3 +375,31 @@ def remove_htmldata(remove_info_list:list, html:str, response):
                 if extra_html:
                     html = html.replace(extra_html, '')
     return html
+
+
+def extract_file_type(file_name=None, file_url=None, file_type=None):
+    """
+    Extract the attachment file type.
+
+    Args:
+        file_name: attachment name
+        file_url: attachment url
+        file_type: extra extensions to accept besides the built-in list
+
+    Returns:
+        The lowercase extension if it is recognised, otherwise None.
+    """
+    if file_name and file_url:
+        file_name = file_name.strip()
+        file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
+                      'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
+        if file_type:
+            file_types.extend(file_type)
+
+        file_type = file_url.split('?')[0].split('.')[-1].lower()
+        if file_type not in file_types:
+            # the url has no usable extension; fall back to the attachment name
+            file_type = file_name.split('?')[0].split('.')[-1].lower()
+            if file_type in file_types:
+                return file_type
+        else:
+            return file_type
+    return None
+
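
A usage sketch for the new helper (names and urls are made up):

    extract_file_type("notice.doc", "http://host/download?id=1")              # -> 'doc' (from the name)
    extract_file_type("bid.pdf", "http://host/files/bid.pdf?token=x")         # -> 'pdf' (from the url)
    extract_file_type("page.html", "http://host/view?id=2")                   # -> None (html not accepted)
    extract_file_type("data.csv", "http://host/data.csv", file_type=["csv"])  # -> 'csv' via the extra list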