12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- import json
- import re
- import sys
- import execjs
- sys.path.append('/app/spiders/sword_feapder/FworkSpider')
- from untils.cookie_pool import PageCookiePool
- import requests
class DTCookiePool(PageCookiePool):
    """Cookie pool whose cookies come from evaluating the page's own JavaScript.

    ``create_cookie`` requests ``page_url`` three times with ``requests``:

    1. The first response contains an expression assigned to
       ``document.cookie``; it is wrapped in a function and evaluated with
       execjs to obtain the first cookie string.
    2. The second response (sent with those cookies) contains a ``go(...)``
       script; the script is rewritten to return ``document['cookie']``,
       run under jsdom, and called with the JSON argument found in the page.
    3. The third request is sent with the merged cookies, after which the
       session's cookie jar is returned as a plain dict.
    """

    # Directory handed to execjs as ``cwd`` when compiling the scripts.
    # Machine-specific; override on a subclass/instance for other hosts.
    NODE_MODULES_DIR = "C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules"

    # File the generated stage-two script is written to on every call.
    # Set to None (or "") to skip writing the file.
    DEBUG_SCRIPT_PATH = 'ex_js.js'

    def __init__(self, redis_key, header, page_url=None,
                 min_cookies=10000, must_contained_keys=(), keep_alive=False,
                 **kwargs):
        """Store the request headers/URL and initialise the base pool.

        Args:
            redis_key: Forwarded to ``PageCookiePool``.
            header: Dict of HTTP headers sent with every request made by
                ``create_cookie``.
            page_url: URL fetched by ``create_cookie``.
            min_cookies: Forwarded to ``PageCookiePool``.
            must_contained_keys: Forwarded to ``PageCookiePool``.
            keep_alive: Forwarded to ``PageCookiePool``.
            **kwargs: Extra keyword arguments forwarded to ``PageCookiePool``.
        """
        # Forward the caller's values; previously literal defaults were
        # passed here, so these arguments were silently ignored.
        super(DTCookiePool, self).__init__(
            redis_key,
            page_url=page_url,
            min_cookies=min_cookies,
            must_contained_keys=must_contained_keys,
            keep_alive=keep_alive,
            **kwargs
        )
        self.headers = header
        self.page_url = page_url

    @staticmethod
    def _parse_cookie_string(cookie_string):
        """Turn a ``"a=1; b=2"`` style string into a ``{name: value}`` dict.

        Each ``;``-separated item is split on its FIRST ``=`` so values that
        themselves contain ``=`` are kept whole, and surrounding whitespace
        is stripped from names and values. Items without ``=`` or with an
        empty name are skipped.
        """
        parsed = {}
        for item in cookie_string.split(";"):
            name, sep, value = item.partition("=")
            name = name.strip()
            if sep and name:
                parsed[name] = value.strip()
        return parsed

    def create_cookie(self):
        """Fetch ``self.page_url`` and return the resulting cookies as a dict.

        Returns:
            dict: Contents of the session cookie jar after the third request.

        Raises:
            IndexError: If the expected ``document.cookie=...location.href``
                or ``};go(...)`` fragment is not present in a response.
        """
        session = requests.Session()
        start_url = self.page_url
        print(self.headers)

        # --- Stage one: evaluate the expression assigned to document.cookie.
        # NOTE(review): only this first request disables TLS verification;
        # the two later requests use the default. Confirm which is intended.
        res = session.get(start_url, headers=self.headers, verify=False)
        js_func = re.findall(r"document.cookie=(.*?)location.href", res.text)[0]
        js_func = 'function sd() { return ' + js_func + "}"
        ctx = execjs.compile(js_func, cwd=self.NODE_MODULES_DIR)
        cookies = self._parse_cookie_string(ctx.call("sd"))

        # --- Stage two: rewrite the page script so go() returns the cookie.
        res = session.get(start_url, cookies=cookies, headers=self.headers)
        # JSON argument the page passes to go(); re-used for our own call.
        js_do_data = re.findall(r'};go\((.*?)\)', res.text)[0]
        js_func = re.sub(r"<(/*?)script>", "", res.text)
        # Drop the redirect branch and the setTimeout wrappers, then replace
        # the trailing go(...) invocation with a return of the cookie string.
        js_func = re.sub(r'location(.*?)}else', "}else", js_func)
        js_func = re.sub(r'_(.{37})setTimeout(.*?)document', " document", js_func)
        js_func = re.sub(r'setTimeout(.*?)document', " document", js_func)
        js_func = re.sub(r'};go(.*?)\)', " return document['cookie']\n};", js_func)
        # Provide window/document globals via jsdom for the page script.
        js_func = (
            'const jsdom = require("jsdom");\n'
            'const {JSDOM} = jsdom;\n'
            'const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);\n'
            'window = dom.window;\n'
            'document = window.document;'
        ) + js_func
        ctx = execjs.compile(js_func, cwd=self.NODE_MODULES_DIR)
        if self.DEBUG_SCRIPT_PATH:
            with open(self.DEBUG_SCRIPT_PATH, 'w+', encoding='utf-8') as f:
                f.write(js_func)
        print(js_do_data)
        stage_two_cookie = ctx.call("go", json.loads(js_do_data))
        for name, value in self._parse_cookie_string(stage_two_cookie).items():
            cookies[name] = value
            session.cookies.setdefault(name, value)

        # --- Stage three: request again with the merged cookies; only the
        # effect on the session's cookie jar matters, not the response body.
        session.get(start_url, headers=self.headers, cookies=cookies)
        return requests.utils.dict_from_cookiejar(session.cookies)
if __name__ == '__main__':
    # Manual run: build a pool for one listing page and print one cookie.
    target_url = 'https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2'

    # Header set sent with every request made by the pool.
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.hefei.gov.cn",
        "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
    }

    pool = DTCookiePool(
        redis_key="dongtaices",
        header=browser_headers,
        page_url=target_url,
    )
    fetched_cookie = pool.get_cookie()
    print(fetched_cookie)
    # pool.del_cookie(fetched_cookie)
|