@@ -9,8 +9,7 @@ from config.load import region
 from crawler.crawl_scheduler import Scheduler
 from utils.databases import mongo_table, int2long,redis_client, es_query
 from utils.log import logger
-from utils.socks5 import Proxy
-from utils.tools import sha1, check_crawl_title
+from utils.tools import sha1, check_crawl_title, get_proxy
 from utils.execptions import JyBasicException,CustomCheckError
 from login import get_cookies
 
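The diff swaps the utils.socks5.Proxy wrapper for a get_proxy() helper from utils.tools. Judging by the call sites below (the return value is passed straight to requests as proxies=self.proxy), get_proxy() is assumed to return a requests-style proxies mapping. A minimal hypothetical sketch of such a helper; the pool URL and response shape are placeholders, not part of this diff:

    # Hypothetical sketch of utils.tools.get_proxy -- the real implementation
    # is not shown in this diff. Assumes a proxy-pool service that hands back
    # one endpoint per call; PROXY_POOL_URL and the JSON shape are placeholders.
    import requests

    PROXY_POOL_URL = 'http://127.0.0.1:5010/get'  # placeholder pool address

    def get_proxy():
        """Fetch one proxy and return it as a requests-style proxies mapping."""
        ip_port = requests.get(PROXY_POOL_URL, timeout=10).json()['proxy']
        return {'http': 'socks5://' + ip_port, 'https': 'socks5://' + ip_port}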
@@ -18,6 +17,10 @@ from login import get_cookies
 CrawlMenu = namedtuple('CrawlMenu', ['channel', 'spidercode', 'table_type'])
+
+
+
+
 
 
 class ListSpider:
 
     def __init__(self, db: str, crawl_tab: str, crawl_max_page=None, enable_proxy=False, allow_show_exception=False):
@@ -40,7 +43,7 @@ class ListSpider:
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.r = redis_client()
         self.session = requests.session()
-        self.proxy = Proxy(enable_proxy)
+        self.proxy = get_proxy()
         self.redis_key = 'bdzbw_2022'
         self.allow_show_exception = allow_show_exception
         self.cookies = None
@@ -68,27 +71,32 @@ class ListSpider:
         request_params.setdefault('timeout', 120)
 
         retries = 0
-        while retries < 5:
+        while retries < 2:
             try:
                 self.cookies = self.read_cookies()
                 response = self.session.post(url, data=data, cookies=self.cookies,
-                                             proxies=self.proxy.proxies, **request_params)
+                                             proxies=self.proxy, **request_params)
             except:
-                self.proxy.switch()
+                self.proxy = get_proxy()
                 retries += 1
                 time.sleep(20)
                 continue
             if response.status_code == 403:
-                self.proxy.switch()
-                get_cookies(self.session,self.proxy.proxies)
+                self.proxy = get_proxy()
+                get_cookies(self.session, self.proxy)
                 retries += 1
-            else:
+            elif response.status_code == 200:
                 element = fromstring(response.text)
+                time.sleep(2)
                 if element.xpath('//*[@id="searchResultList"]') or element.xpath('//*[@id="ulList"]'):
                     return response
                 else:
                     '''no result list found on the search page'''
                     return None
+            else:
+                self.proxy = get_proxy()
+                retries += 1
+
 
         return None
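The reworked loop makes at most two attempts: a network error rotates the proxy and backs off 20 seconds, a 403 rotates the proxy and refreshes the login cookies for the new exit, a 200 is returned only when the result list is actually present, and any other status also rotates the proxy before retrying. A standalone sketch of the same pattern, with get_proxy() as sketched above, get_cookies() standing in for the project's login helper, and fromstring assumed to be lxml.html's:

    # Minimal sketch of the retry pattern introduced above; fetch_list is a
    # hypothetical stand-in for ListSpider's request method, not project code.
    import time
    import requests
    from lxml.html import fromstring

    def fetch_list(session, url, data, proxy, max_retries=2):
        for _ in range(max_retries):
            try:
                resp = session.post(url, data=data, proxies=proxy, timeout=120)
            except requests.RequestException:
                proxy = get_proxy()   # network failure: rotate the exit
                time.sleep(20)
                continue
            if resp.status_code == 403:
                proxy = get_proxy()   # blocked: rotate proxy, log in again
                get_cookies(session, proxy)
            elif resp.status_code == 200:
                element = fromstring(resp.text)
                if element.xpath('//*[@id="searchResultList"]') or element.xpath('//*[@id="ulList"]'):
                    return resp       # result list present
                return None           # page loaded but no result list
            else:
                proxy = get_proxy()   # any other status: rotate and retry
        return None                   # retries exhausted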
@@ -221,8 +229,7 @@ if __name__ == '__main__':
 
     ListSpider(
         db='py_spider',
         crawl_tab='bdzbw_list',
-        crawl_max_page=50,
-        enable_proxy=True,
+        crawl_max_page=1,
     ).start()
 