"""Cookie pool whose cookies are produced by evaluating page-embedded JavaScript.

``DTCookiePool.create_cookie`` requests the configured page, evaluates the
JavaScript found in the responses through ``execjs`` and returns the cookies
collected in the ``requests`` session.
"""
import json
import re
import sys

import execjs
import requests

# The project package lives outside the default import path.
sys.path.append('/app/spiders/sword_feapder/FworkSpider')
from untils.cookie_pool import PageCookiePool  # noqa: E402


# JavaScript prepended to the page script so that ``window`` and ``document``
# exist when the script is evaluated outside a browser (requires ``jsdom``).
_JSDOM_PRELUDE = (
    'const jsdom = require("jsdom"); const {JSDOM} = jsdom; const dom = new JSDOM(`\n'
    'Hello world\n'
    '`); window = dom.window; document = window.document;'
)


def _split_cookie_pairs(cookie_string):
    """Split a ``name=value;name=value`` string into ``(name, value)`` tuples.

    Segments without ``=`` are skipped. Only the first ``=`` of a segment
    separates name from value, so values that themselves contain ``=`` are
    kept intact. Names and values are not stripped of whitespace.

    Args:
        cookie_string: Text in the format produced by ``document.cookie``.

    Returns:
        A list of ``(name, value)`` tuples in the order they appear.
    """
    pairs = []
    for segment in cookie_string.split(";"):
        name, separator, value = segment.partition("=")
        if separator:
            pairs.append((name, value))
    return pairs


class DTCookiePool(PageCookiePool):
    """Cookie pool that builds cookies by running the page's own JavaScript.

    Attributes:
        node_modules_dir: Working directory handed to ``execjs.compile``.
            Override it on a subclass or instance when the Node.js modules
            are installed somewhere else.
        headers: HTTP headers sent with every request.
        page_url: URL requested to obtain the scripts and the final cookies.
    """

    # Working directory for execjs; the default is a machine-specific path.
    node_modules_dir = "C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules"

    def __init__(self, redis_key, header, page_url=None, min_cookies=10000,
                 must_contained_keys=(), keep_alive=False, **kwargs):
        """Initialise the pool.

        Args:
            redis_key: Forwarded to ``PageCookiePool``.
            header: HTTP headers used by :meth:`create_cookie`.
            page_url: URL requested by :meth:`create_cookie`.
            min_cookies: Forwarded to ``PageCookiePool``.
            must_contained_keys: Forwarded to ``PageCookiePool``.
            keep_alive: Forwarded to ``PageCookiePool``.
            **kwargs: Forwarded to ``PageCookiePool``.
        """
        # Forward the caller's settings; previously the literal defaults were
        # passed here, so custom values were silently ignored.
        # The base class still receives page_url=None, exactly as before; the
        # URL is kept on this subclass instead.
        super().__init__(
            redis_key,
            page_url=None,
            min_cookies=min_cookies,
            must_contained_keys=must_contained_keys,
            keep_alive=keep_alive,
            **kwargs,
        )
        self.headers = header
        self.page_url = page_url

    def create_cookie(self):
        """Request ``page_url`` three times and return the session's cookies.

        The first response's ``document.cookie=...`` expression is evaluated
        to get an initial cookie; the second response's script is rewritten so
        that its ``go`` function returns ``document['cookie']`` and is then
        called with the argument found in the page. The generated script is
        also written to ``ex_js.js`` in the current directory.

        Returns:
            dict: Cookie names mapped to values, taken from the session's
            cookie jar after the final request.

        Raises:
            IndexError: If an expected script fragment is not found in a
                response.
        """
        session = requests.Session()
        start_url = self.page_url
        print(self.headers)

        # Step 1: evaluate the expression the page assigns to document.cookie.
        # NOTE(review): only this request disables TLS verification; the two
        # later requests verify certificates - confirm that is intended.
        res = session.get(start_url, headers=self.headers, verify=False)
        cookie_expr = re.findall("document.cookie=(.*?)location.href", res.text)[0]
        js_func = 'function sd() { return ' + cookie_expr + "}"
        ctx = execjs.compile(js_func, cwd=self.node_modules_dir)
        cookies = dict(_split_cookie_pairs(ctx.call("sd")))

        # Step 2: fetch the second script and the argument passed to go(...).
        res = session.get(start_url, cookies=cookies, headers=self.headers)
        js_do_data = re.findall(r'};go\((.*?)\)', res.text)[0]

        # Strip the <script> tags and the location/setTimeout parts, and make
        # go() return the cookie instead of being invoked by the page itself.
        js_func = re.sub("<(/*?)script>", "", res.text)
        js_func = re.sub('location(.*?)}else', "}else", js_func)
        js_func = re.sub('_(.{37})setTimeout(.*?)document', " document", js_func)
        js_func = re.sub('setTimeout(.*?)document', " document", js_func)
        js_func = re.sub(r'};go(.*?)\)', " return document['cookie']\n};", js_func)
        js_func = _JSDOM_PRELUDE + js_func

        ctx = execjs.compile(js_func, cwd=self.node_modules_dir)
        # Keep a copy of the generated script on disk for inspection.
        with open('ex_js.js', 'w+', encoding='utf-8') as f:
            f.write(js_func)
        print(js_do_data)

        generated = ctx.call("go", json.loads(js_do_data))
        for name, value in _split_cookie_pairs(generated):
            cookies[name] = value
            # Do not overwrite cookies the server already set on the session.
            session.cookies.setdefault(name, value)

        # Step 3: final request with all cookies; the response body is not
        # used, only the cookies the session ends up holding.
        session.get(start_url, headers=self.headers, cookies=cookies)
        return requests.utils.dict_from_cookiejar(session.cookies)


if __name__ == '__main__':
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.hefei.gov.cn",
        "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }

    cookie_pool = DTCookiePool(
        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
        header=headers,
        redis_key="dongtaices")
    cookie = cookie_pool.get_cookie()
    print(cookie)
    # cookie_pool.del_cookie(cookie)