Fix several code errors

dzr, 3 weeks ago
parent
commit
e8a0f56b1c

+ 14 - 32
zbytb/crawler/account.py

@@ -16,10 +16,7 @@ JSON_ACCOUNT_RECORD = (ROOT_PATH / 'config/account_record.json').resolve()
 
 def account_record(uid, crawl_type):
     with open(JSON_ACCOUNT_RECORD, 'w+', encoding='utf-8') as wp:
-        item = {
-            "uid": uid,
-            "crawl_type": crawl_type
-        }
+        item = {"uid": uid, "crawl_type": crawl_type}
         wp.write(json.dumps(item, indent=4))
 
 
@@ -34,15 +31,9 @@ def read_account():
 
 def get_account(site, crawl_type):
     url = "http://cc.spdata.jianyu360.com/competing_goods/account/fetch"
-    params = {
-        "site": site,
-        "crawl_type": crawl_type
-    }
+    params = {"site": site, "crawl_type": crawl_type}
     try:
-        response = requests.get(url,
-                                headers=_headers,
-                                params=params,
-                                timeout=60)
+        response = requests.get(url, headers=_headers, params=params, timeout=60)
         data = response.json()['data']
         logger.info("当前账号状态:{}".format(data['crawl_detail']))
     except requests.RequestException:
@@ -53,24 +44,15 @@ def get_account(site, crawl_type):
 
 
 def release_account(uid, crawl_type, disable_log=False):
-
     url = "http://cc.spdata.jianyu360.com/competing_goods/account/release"
-    if uid is not None:
-        params = {
-            "uid": uid,
-            "crawl_type": crawl_type
-        }
-        while True:
-            try:
-                response = requests.get(url,
-                                        headers=_headers,
-                                        params=params,
-                                        timeout=60)
-                if response.status_code == 200:
-                    acc_status = mongo_table('py_spider', 'match_account').find_one({'_id': ObjectId(uid)})['crawl_detail']
-                    if not disable_log:
-                        logger.info(f"release_account >>> {response.json()}, status : {acc_status}")
-                    break
-            except requests.RequestException:
-                logger.error("网络异常,归还账号失败")
-                wait(1)
+    params = {"uid": uid, "crawl_type": crawl_type}
+    while True:
+        try:
+            response = requests.get(url, headers=_headers, params=params, timeout=60)
+            response.raise_for_status()
+            if not disable_log:
+                logger.info(f"release_account >>> {response.json()}")
+            break
+        except requests.RequestException:
+            logger.error("网络异常,归还账号失败")
+            wait(1)
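
The reworked release_account leans on response.raise_for_status(), so non-2xx replies raise requests.RequestException and are retried on the same path as network failures. A minimal, self-contained sketch of that retry pattern (the placeholder URL handling, print instead of the project's logger, and the bounded retry count are illustrative choices, not the project's code):

import time

import requests


def get_with_retry(url, params, headers=None, retries=3, backoff=1):
    """Retry a GET until it returns a 2xx response or retries run out."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=60)
            response.raise_for_status()  # 4xx/5xx raise HTTPError, a RequestException subclass
            return response
        except requests.RequestException as exc:
            print(f"attempt {attempt} failed: {exc}")
            time.sleep(backoff)
    raise RuntimeError("all retries failed")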

+ 19 - 15
zbytb/crawler/defaults.py

@@ -1,5 +1,5 @@
 import json
-import os
+from pathlib import Path
 
 import requests
 import urllib3
@@ -11,11 +11,11 @@ urllib3.disable_warnings()
 
 
 def prepare_request(
-        headers: dict = None,
-        proxies: dict = None,
-        timeout: int = None,
-        verify: bool = None,
-        cookies=None,
+    headers: dict = None,
+    proxies: dict = None,
+    timeout: int = None,
+    verify: bool = None,
+    cookies=None,
 ):
     request_params = {}
     request_params.setdefault('headers', headers)
@@ -28,17 +28,19 @@ def prepare_request(
     return request_params
 
 
-def get_cookies(url, headers, proxies=None):
-    if not os.path.isfile(f'./zbytb_ck.json'):
+def get_jsl_cookies(url, headers, proxies=None):
+    root = Path(__file__).parent.parent
+    file = root.joinpath("config/jsl_ck.json").resolve()
+    if not file.exists():
         http_session_521(url, headers, proxies)
 
-    with open(f'./zbytb_ck.json', 'r', encoding='utf-8') as fr:
-        cks = fr.read()
-    ck = json.loads(cks.replace("'", '"'))
-    return ck
+    with file.open('r', encoding='utf-8') as fr:
+        cookies = fr.read()
 
+    return json.loads(cookies.replace("'", '"'))
 
-def http_request_get(url, **kwargs):
+
+def http_request_get(url, login=False, **kwargs):
     request_params = prepare_request(**kwargs)
     headers = request_params.get('headers')
     proxies = request_params.get('proxies')
@@ -46,8 +48,10 @@ def http_request_get(url, **kwargs):
     response = Response()
     while retries < 3:
         try:
-            cks = get_cookies(url, headers, proxies)
-            request_params['cookies'] = cks
+            if not login:
+                jsl_cookies = get_jsl_cookies(url, headers, proxies)
+                request_params['cookies'] = jsl_cookies
+
             response = requests.get(url, **request_params)
             if response.status_code == 200:
                 response.encoding = response.apparent_encoding
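
With the new login flag, http_request_get only loads the JSL anti-bot cookies from config/jsl_ck.json (regenerating them via http_session_521 when the file is missing) on login=False calls, while login=True sends just the cookies the caller provides. A hedged usage sketch; the import path is assumed from the repository layout, and the URL, headers, and session cookie are placeholders:

from crawler.defaults import http_request_get  # import path assumed from the repo layout

headers = {"User-Agent": "Mozilla/5.0"}  # illustrative headers

# default path: JSL clearance cookies are read from config/jsl_ck.json and attached
success, response = http_request_get("https://example.com/list", headers=headers, timeout=60)

# login=True: skip the JSL cookie step and rely on the cookies given explicitly,
# e.g. an already authenticated member session
success, response = http_request_get(
    "https://example.com/member",
    login=True,
    headers=headers,
    cookies={"sessionid": "..."},  # hypothetical session cookie
    verify=False,
)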

+ 12 - 7
zbytb/crawler/sessions_521.py

@@ -2,6 +2,7 @@ import copy
 import json
 import re
 import time
+from pathlib import Path
 
 import execjs
 import jsbeautifier
@@ -15,8 +16,7 @@ from jsl import jsl
 def save_js_script(js_code: str, allow_beautify_code=False):
     with open('etx.js', 'w', encoding='utf-8') as f:
         if allow_beautify_code:
-            # 解压缩js代码
-            f.write(jsbeautifier.beautify(js_code))
+            f.write(jsbeautifier.beautify(js_code))  # 解压缩js代码
         f.write(js_code)
 
 
@@ -233,12 +233,17 @@ def create_cookie(page_url, headers, **kwargs):
     try:
         return jsl.get_jsl_cookies(page_url, headers, **kwargs)
     except IndexError:
-        raise requests.exceptions.ContentDecodingError('fetch jsl cookies failed')
+        raise requests.exceptions.ContentDecodingError('jsl cookies fetch failed')
+    except AssertionError:
+        return {}
 
 
-def http_session_521(url, headers, proxies=None):
+def http_session_521(url, headers, proxies=None, storage=True):
     jsl_cookies = create_cookie(page_url=url, headers=headers, proxies=proxies)
-    with open(f'./zbytb_ck.json', 'w', encoding='utf-8') as fw:
-        fw.write(json.dumps(jsl_cookies))
-    return jsl_cookies
+    if storage:
+        root = Path(__file__).parent.parent
+        file = root.joinpath("config/jsl_ck.json").resolve()
+        with file.open('w', encoding='utf-8') as fw:
+            fw.write(json.dumps(jsl_cookies))
 
+    return jsl_cookies
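
http_session_521 now writes the solved cookies to config/jsl_ck.json only when storage stays True; storage=False simply returns them. A brief usage sketch with an assumed import path and placeholder URL and headers:

from crawler.sessions_521 import http_session_521  # import path assumed from the repo layout

headers = {"User-Agent": "Mozilla/5.0"}  # illustrative headers

# default: solve the 521 challenge and persist the cookies to config/jsl_ck.json
cookies = http_session_521("https://example.com/", headers)

# storage=False: fetch fresh cookies for a one-off request without touching the cookie file
cookies = http_session_521("https://example.com/", headers, storage=False)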

+ 9 - 14
zbytb/crawler/spiders/DetailPageSpider.py

@@ -28,13 +28,7 @@ from utils.log import logger
 
 class CrawlDetailPageSpider:
 
-    def __init__(
-            self,
-            db: str,
-            crawl_tab: str,
-            save_tab: str,
-            error_tab: str,
-    ):
+    def __init__(self, db: str, crawl_tab: str, save_tab: str, error_tab: str):
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.save_tab = mongo_table(db, save_tab)
         self.crawl_error_tab = mongo_table(db, error_tab)
@@ -80,13 +74,13 @@ class CrawlDetailPageSpider:
         self._update_crawl_task(tid, account=self.senior_account)
 
     def crawl_error(
-            self,
-            *,
-            spider_code: str,
-            account: str,
-            err_msg='采集失败',
-            response=None,
-            rows=None,
+        self,
+        *,
+        spider_code: str,
+        account: str,
+        err_msg='采集失败',
+        response=None,
+        rows=None
     ):
         items = {
             'account': account,
@@ -243,6 +237,7 @@ class CrawlDetailPageSpider:
         while True:
             success, response = http_request_get(
                 url,
+                login=True,
                 headers=headers,
                 cookies=self.cookies,
                 verify=False,

+ 8 - 1
zbytb/jsl/jsl.py

@@ -16,11 +16,18 @@ import requests
 _cookies = {}
 
 
+def check_script(text):
+    ret = re.findall('cookie=(.*?);location', text)
+    assert len(ret) > 0
+
+
 def ck1(url, headers, timeout=10, proxies=None):
     global _cookies
     response = requests.get(url, headers=headers, timeout=timeout, proxies=proxies)
     _cookies.update(response.cookies)
-    script = re.findall('cookie=(.*?);location', response.content.decode())[0]
+    text = response.content.decode()
+    check_script(text)
+    script = re.findall('cookie=(.*?);location', text)[0]
     _, val = next(iter(execjs.eval(script).split(';')), '').split('=')
     # print(val)  # jsl_clearance_s
     _cookies['__jsl_clearance_s'] = val
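
check_script asserts that the 521 challenge script is present before ck1 indexes the first regex match, and create_cookie turns a failed assertion into an empty cookie dict instead of the old IndexError. A self-contained sketch of the guard on made-up sample pages:

import re


def check_script(text):
    ret = re.findall('cookie=(.*?);location', text)
    assert len(ret) > 0


# made-up page containing a JSL-style challenge script
challenge = "<script>document.cookie='__jsl_clearance_s=abc';location.href=location.pathname;</script>"
check_script(challenge)      # passes: the cookie assignment is found

plain = "<html><body>no challenge here</body></html>"
try:
    check_script(plain)
except AssertionError:
    cookies = {}             # create_cookie returns {} in this case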

+ 1 - 1
zbytb/package.json

@@ -10,6 +10,6 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
-    "crypto-js": "^4.2.0",
+    "crypto-js": "^4.2.0"
   }
 }
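
The removed trailing comma matters because strict JSON parsers reject it, so the manifest would fail to load. A quick illustration with Python's json module:

import json

json.loads('{"dependencies": {"crypto-js": "^4.2.0"}}')       # parses fine
try:
    json.loads('{"dependencies": {"crypto-js": "^4.2.0",}}')  # trailing comma
except json.JSONDecodeError as exc:
    print(exc)  # Expecting property name enclosed in double quotes ...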

+ 1 - 1
zbytb/utils/log.py

@@ -3,7 +3,7 @@ from pathlib import Path
 from loguru import logger
 
 _absolute = Path(__file__).absolute().parent.parent
-_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
+_log_path = (_absolute / 'logs/crawl_{time:YYYYMMDD}.log').resolve()
 logger.add(
     _log_path,
     format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',