lizongze 1 éve
szülő
commit
d5f9c481c3

+ 16 - 9
bdzbw/ListPageSpider.py

@@ -9,8 +9,7 @@ from config.load import region
 from crawler.crawl_scheduler import Scheduler
 from utils.databases import mongo_table, int2long,redis_client, es_query
 from utils.log import logger
-from utils.socks5 import Proxy
-from utils.tools import sha1, check_crawl_title
+from utils.tools import sha1, check_crawl_title,get_proxy
 from utils.execptions import JyBasicException,CustomCheckError
 from login import get_cookies
 
@@ -18,6 +17,10 @@ from login import get_cookies
 CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
 
 
+
+
+
+
 class ListSpider:
 
     def __init__(self, db: str, crawl_tab: str, crawl_max_page=None, enable_proxy=False, allow_show_exception=False):
@@ -40,7 +43,7 @@ class ListSpider:
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.r = redis_client()
         self.session = requests.session()
-        self.proxy = Proxy(enable_proxy)
+        self.proxy = get_proxy()
         self.redis_key = 'bdzbw_2022'
         self.allow_show_exception = allow_show_exception
         self.cookies = None
@@ -68,27 +71,32 @@ class ListSpider:
         request_params.setdefault('timeout', 120)
 
         retries = 0
-        while retries < 5:
+        while retries < 2:
             try:
                 self.cookies = self.read_cookies()
                 response = self.session.post(url, data=data, cookies=self.cookies,
-                                             proxies=self.proxy.proxies, **request_params)
+                                             proxies=self.proxy, **request_params)
             except:
                 self.proxy.switch()
                 retries += 1
                 time.sleep(20)
                 continue
             if response.status_code == 403:
-                self.proxy.switch()
+                self.proxy = get_proxy()
                 get_cookies(self.session,self.proxy.proxies)
                 retries += 1
-            else:
+            elif response.status_code == 200:
                 element = fromstring(response.text)
+                time.sleep(2)
                 if element.xpath('//*[@id="searchResultList"]') or element.xpath('//*[@id="ulList"]'):
                     return response
                 else:
                     '''没有搜索到列表页'''
                     return None
+            else:
+                self.proxy = get_proxy()
+                retries += 1
+
 
         return None
 
@@ -221,8 +229,7 @@ if __name__ == '__main__':
     ListSpider(
         db='py_spider',
         crawl_tab='bdzbw_list',
-        crawl_max_page=50,
-        enable_proxy=True,
+        crawl_max_page=1,
     ).start()
 
 

+ 2 - 2
bdzbw/login.py

@@ -8,8 +8,8 @@ count_list = [{"name":"15736702898","pwd":"qwerty123456"},
 
 def get_cookies(session,proxy=False):
 
-    username = count_list[2].get('name')
-    password = count_list[2].get('pwd')
+    username = count_list[1].get('name')
+    password = count_list[1].get('pwd')
     logger.info(f"使用账号:{username}")
 
     try:

+ 1 - 1
bdzbw/login_cookie.json

@@ -1 +1 @@
-{"JSESSIONID": "F12036FB441449BAF9BB84E8EB3FA5B8", "SessionId": "D148EC19DDD02A0A9A425722728A5C57"}
+{"JSESSIONID": "B04168C3517C9D83B2739EBD95AEECA8", "SessionId": "B04168C3517C9D83B2739EBD95AEECA8"}

+ 15 - 0
bdzbw/utils/tools.py

@@ -1,6 +1,21 @@
 import hashlib
 import socket
 import re
+import requests
+from loguru import logger
+
+
+def get_proxy():
+    """Fetch a SOCKS5 proxy from the proxy service and build a ``requests`` proxies mapping.
+
+    The service response is expected to be JSON whose ``data`` object carries
+    the proxy address under the ``http`` key. That single address is used for
+    both the ``http`` and the ``https`` entry of the returned mapping.
+
+    :return: dict of the form ``{"http": <address>, "https": <address>}``
+    :raises ValueError: if the response carries no usable proxy address
+    :raises requests.RequestException: if the proxy service cannot be reached in time
+    """
+    # NOTE(review): the Basic credentials are hard-coded; consider loading them from config.
+    headers = {
+        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+    }
+    # An explicit timeout keeps a stalled proxy service from blocking the crawler forever.
+    response = requests.get(
+        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
+        headers=headers,
+        timeout=30,
+    )
+    payload = response.json()
+    proxy = payload.get("data") if isinstance(payload, dict) else None
+    address = proxy.get("http") if isinstance(proxy, dict) else None
+    if not address:
+        # Fail with a clear message instead of an AttributeError on None.
+        raise ValueError("proxy service returned no usable proxy address")
+
+    # Rewrite only the scheme: with "socks5h" requests lets the proxy resolve
+    # hostnames. An address that already uses socks5h is left untouched.
+    if address.startswith("socks5://"):
+        address = "socks5h://" + address[len("socks5://"):]
+
+    proxyh = {"http": address, "https": address}
+    logger.info("切换代理:{}".format(proxyh))
+    return proxyh
 
 
 def sha1(text: str):

+ 1 - 0
qlm/config/conf.yaml

@@ -14,6 +14,7 @@ ali_oss:
   key_id: LTAI4G5x9aoZx8dDamQ7vfZi
   key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
   endpoint: oss-cn-beijing-internal.aliyuncs.com
+#  endpoint: oss-cn-beijing.aliyuncs.com
   bucket_name: jy-datafile
 
 

+ 2 - 2
qlm/source_qianlima.py

@@ -210,5 +210,5 @@ def start():
     select_area(date_str)
 
 
-if __name__ == '__main__':
-    start()
+# if __name__ == '__main__':
+#     start()

+ 15 - 13
qlm/utils/config_parms.py

@@ -117,24 +117,26 @@ headers = {
     "Origin": "https://search.vip.qianlima.com",
     "Pragma": "no-cache",
     "Referer": "https://search.vip.qianlima.com/index.html",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
-    "X-Auth-Token": "7da2f8b3-7034-4774-b7d1-1b96ba572e63",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
+    "X-Auth-Token": "0a4034bd-f62d-414b-a1f9-9b32496a9b4a",
 }
+
 cookies = {
     "guest_id": "b1c483bd-2170-4322-b3d8-9637644938c4",
     "seo_curUrl": "www.qianlima.com",
-    "qlm_referrer": "https://www.google.com.hk/",
-    "delClose200811": "firstGoIn",
-    "HWWAFSESID": "cf4ca73bda48cfb3bd",
-    "HWWAFSESTIME": "1686896094447",
-    "backUrl": "https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=0",
-    "accessCaptchaPermission": "9AC523BE37D9BD163FDFB01EDB25D97ABC7F1FF2F188B48661BF7484B3580BD6",
-    "qlm_rem_login": "1",
-    "qlm_username": "16637019281",
-    "qlm_password": "fm7UBUpf83uECp33f88KuE3RppECupom",
     "seo_refUrl": "",
-    "xAuthToken": "7da2f8b3-7034-4774-b7d1-1b96ba572e63",
-    "userInfo": "{%22userId%22:11970045%2C%22username%22:%2216637019281%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22companyName%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222023-06-16%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:42%2C%22customerServiceName%22:%22%E5%8D%83%E9%87%8C%E9%A9%AC%E5%AE%A2%E6%9C%8D%22%2C%22weChatIcon%22:%22https://gw-static.qianlima.com/gw/invoice/1681907621_553c9cbd28.jpeg%22%2C%22customerServicePhone%22:%22%20400-688-2000%22%2C%22customerServiceQQ%22:%22%22%2C%22customerServiceEmail%22:%22qianlima_service@qianlima.com%22%2C%22deptType%22:0}%2C%22shouji%22:%2216637019281%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22zhiwu%22:%22%E5%91%98%E5%B7%A5%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22showExpireDate%22:true%2C%22companyNature%22:null%2C%22companyArea%22:%22%22%2C%22companyType%22:null%2C%22industry%22:null%2C%22product%22:null%2C%22contacts%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22contactNumber%22:%2216637019281%22%2C%22contactAddress%22:null%2C%22mainCustomerGroups%22:null%2C%22informationTypePreferences%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
+    "HWWAFSESID": "bc3a47328f58e0ccae",
+    "HWWAFSESTIME": "1697076648565",
+    "qlm_username": "16637019281",
+    "qlm_password": "Rp87pKKCfpmBCfjoj38mjou7Kfjpu3CR",
+    "source": "1",
+    "useragent_hash": "22210ca73bf1af2ec2eace74a96ee356",
+    "xAuthToken": "0a4034bd-f62d-414b-a1f9-9b32496a9b4a",
+    "login_time": "1697076660",
+    "userInfo": "{%22userId%22:11970045%2C%22username%22:%2216637019281%22%2C%22userIcon%22:%22%22%2C%22linkName%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22companyName%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22areaId%22:%222703%22%2C%22areaName%22:%22%E5%85%A8%E5%9B%BD%22%2C%22roleId%22:1%2C%22roleName%22:%22%E7%AE%A1%E7%90%86%E5%91%98%22%2C%22sex%22:%22m%22%2C%22expireDate%22:%22%E6%97%A0%22%2C%22isExpired%22:null%2C%22maxChildCount%22:0%2C%22isUsedCount%22:0%2C%22userStatus%22:1%2C%22memberLevel%22:5%2C%22memberLevelName%22:%22%E5%85%8D%E8%B4%B9%E4%BC%9A%E5%91%98%22%2C%22registerTime%22:%222023-06-16%22%2C%22isSuperSupplier%22:0%2C%22isNewUser%22:1%2C%22welcomeMsg%22:%22%E6%AC%A2%E8%BF%8E%E8%BF%9B%E5%85%A5%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91%EF%BD%9E%22%2C%22customerServiceInfo%22:{%22id%22:537%2C%22customerServiceName%22:%22%E6%9D%8E%E9%9B%AA%E5%B1%B1%22%2C%22weChatIcon%22:null%2C%22customerServicePhone%22:null%2C%22customerServiceQQ%22:null%2C%22customerServiceEmail%22:null%2C%22deptType%22:0}%2C%22shouji%22:%2216637019281%22%2C%22email%22:%22%22%2C%22dwmc%22:%22%E5%8C%97%E4%BA%AC%E8%B5%9E%E5%8D%9A%E6%81%92%E5%AE%89%22%2C%22zhiwu%22:%22%E5%91%98%E5%B7%A5%22%2C%22types%22:1%2C%22isPayBefore%22:0%2C%22memberOpenTime%22:null%2C%22showExpireDate%22:true%2C%22companyNature%22:null%2C%22companyArea%22:%22%22%2C%22companyType%22:null%2C%22industry%22:null%2C%22product%22:null%2C%22contacts%22:%22%E7%8E%8B%E5%BC%BA%22%2C%22contactNumber%22:%2216637019281%22%2C%22contactAddress%22:null%2C%22mainCustomerGroups%22:null%2C%22informationTypePreferences%22:null%2C%22businessUserType%22:null%2C%22businessCompanyName%22:null%2C%22isBusinessUser%22:null}"
 }
 
 

+ 1 - 42
qlm/utils/databases.py

@@ -1,9 +1,8 @@
 import bson
 import pymongo
 import redis
-from elasticsearch import Elasticsearch
 
-from config.load import mongo_conf, redis_conf, es_conf
+from config.load import mongo_conf, redis_conf
 
 # ---------------------------------- mongo ----------------------------------
 MONGO_URI_CLIENTS = {}    # a dictionary hold all client with uri as key
@@ -47,46 +46,6 @@ def object_id(_id: str):
     return bson.objectid.ObjectId(_id)
 
 
-# ---------------------------------- es ----------------------------------
-def es_client(cfg=None):
-    if cfg is None:
-        cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
-
-
-def es_query(title: str, publish_time: int):
-    """
-    查询es
-
-    :param title: 标题
-    :param publish_time: 发布时间
-    :return:
-    """
-    client = es_client()
-    stime = publish_time - 432000  # 往前推5天
-    etime = publish_time + 432000
-    # 通过发布标题和发布时间范围查询
-    query = {
-        "query": {
-            "bool": {
-                "must": [
-                    {
-                        "multi_match": {
-                            "query": title,
-                            "type": "phrase",
-                            "fields": ["title"]
-                        }
-                    },
-                    {"range": {'publishtime': {"from": stime, "to": etime}}}
-                ]
-            }
-        }
-    }
-    result = client.search(index=es_conf['db'], body=query, request_timeout=100)
-    total = int(result['hits']['total'])
-    return total
-
-
 # ---------------------------------- redis ----------------------------------
 def redis_client(cfg=None):
     if cfg is None: