# dtcookie_pool.py (3.8 KB)

  1. import json
  2. import re
  3. import sys
  4. import execjs
  5. sys.path.append('/app/spiders/sword_feapder/FworkSpider')
  6. from untils.cookie_pool import PageCookiePool
  7. import requests
  8. class DTCookiePool(PageCookiePool):
  9. def __init__(self,redis_key,header,page_url=None,
  10. min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
  11. super(DTCookiePool, self).__init__(redis_key,page_url=None,
  12. min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
  13. self.headers=header
  14. self.page_url = page_url
  15. def create_cookie(self,):
  16. session = requests.Session()
  17. start_url = self.page_url
  18. print(self.headers)
  19. res = session.get(start_url, headers=self.headers,verify=False)
  20. js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
  21. js_func = 'function sd() { return ' + js_func + "}"
  22. ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
  23. ss = ctx.call("sd")
  24. cookies = {}
  25. for item in ss.split(";"):
  26. if '=' in item:
  27. cookies[item.split("=")[0]] = item.split("=")[-1]
  28. res = session.get(start_url, cookies=cookies, headers=self.headers)
  29. js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
  30. js_func = re.sub("<(/*?)script>", "", res.text)
  31. location = re.compile('location(.*?)}else')
  32. setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
  33. setTimeout2 = re.compile('setTimeout(.*?)document')
  34. gox = re.compile('};go(.*?)\)')
  35. js_func = re.sub(location, "}else", js_func)
  36. js_func = re.sub(setTimeout, " document", js_func)
  37. js_func = re.sub(setTimeout2, " document", js_func)
  38. js_func = re.sub(gox, " return document['cookie']\n};", js_func)
  39. js_func = '''const jsdom = require("jsdom");
  40. const {JSDOM} = jsdom;
  41. const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
  42. window = dom.window;
  43. document = window.document;''' + js_func
  44. ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
  45. with open('ex_js.js', 'w+', encoding='utf-8') as f:
  46. f.write(js_func)
  47. print(js_do_data)
  48. ss = ctx.call("go", json.loads(js_do_data))
  49. for item in ss.split(";"):
  50. if '=' in item:
  51. cookies[item.split("=")[0]] = item.split("=")[-1]
  52. session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
  53. res = session.get(start_url, headers=self.headers, cookies=cookies)
  54. cookies = requests.utils.dict_from_cookiejar(session.cookies)
  55. return cookies
  56. if __name__ == '__main__':
  57. headers = {
  58. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
  59. "Accept-Encoding": "gzip, deflate, br",
  60. "Accept-Language": "zh-CN,zh;q=0.9",
  61. "Cache-Control": "max-age=0",
  62. "Connection": "keep-alive",
  63. "Host": "www.hefei.gov.cn",
  64. "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
  65. "sec-ch-ua-mobile": "?0",
  66. "sec-ch-ua-platform": "\"Windows\"",
  67. "Sec-Fetch-Dest": "document",
  68. "Sec-Fetch-Mode": "navigate",
  69. "Sec-Fetch-Site": "none",
  70. "Sec-Fetch-User": "?1",
  71. "Upgrade-Insecure-Requests": "1",
  72. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
  73. }
  74. cookie_pool = DTCookiePool(
  75. page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
  76. header=headers, redis_key="dongtaices")
  77. cookie = cookie_pool.get_cookie()
  78. print(cookie)
  79. # cookie_pool.del_cookie(cookie)