detail_normol.py

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary:
---------
@author: 马国鹏
"""
import re
import sys
import time

from lxml import etree

from encode_info import encode_info

# Make the shared FworkSpider framework importable on both deploy targets.
sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')

from typing import Optional

import execjs
import feapder
import requests
from feapder.db.mongodb import MongoDB
from feapder.utils.log import log
from lxml.html import HtmlElement
from lxml.html.clean import Cleaner

from cookie_pool import WebCookiePool
from untils.tools import int2long, substitute, text_search
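# get_proxy() below asks an internal service for a rotating SOCKS5 proxy.
# Assumption, based on how self.proxy.get("http") is used later: the "data"
# payload looks roughly like
#     {"http": "socks5://host:port", "https": "socks5://host:port"}
# so the returned dict can be assigned to request.proxies as-is.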
def get_proxy():
    """Fetch a fresh SOCKS5 proxy from the internal proxy service."""
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        headers=headers,
    ).json()
    log.info(f"Switching proxy: {proxy.get('data')}")
    return proxy.get("data")
class ParseElement:
    def __init__(self):
        self.__element: Optional[HtmlElement] = None

    @property
    def html(self) -> str:
        return etree.tostring(self.elem, method="html", encoding="utf-8").decode()

    @property
    def clean_html(self) -> str:
        # Keep only href/src attributes; leave script handling to the caller.
        cleaner = Cleaner()
        cleaner.javascript = False
        cleaner.remove_unknown_tags = False
        cleaner.safe_attrs = ['href', 'src']
        return cleaner.clean_html(self.html)

    @property
    def elem(self):
        return self.__element

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__element = element
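# splicing() delegates HTML assembly to the bundled splicing.js via execjs.
# A minimal usage sketch, assuming the JS file exposes a top-level
# `splicing(detail)` function that returns an HTML string:
#     html = splicing(response_json["object"]["tenderProject"])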
def splicing(response):
    """Render a detail JSON payload to HTML via the bundled splicing.js."""
    path = 'splicing.js'
    with open(path, encoding='utf-8') as rp:
        js_script = rp.read()
    ctx = execjs.compile(js_script)
    html = ctx.call('splicing', response)
    return html
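# Details is a feapder AirSpider. Flow: start_requests() pulls pending rows
# -> detail_get() handles WAF challenges and parses the JSON detail ->
# download_midware() attaches the proxy and headers to every outgoing
# request -> exception_request() marks rows whose requests failed outright.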
class Details(feapder.AirSpider):
    cookie_pool = WebCookiePool(
        redis_key='zgztbcookie',
        page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
    _to_db = None
    db_name = 'zgzb_list'
    send_list = []
    proxy = get_proxy()

    @property
    def to_db(self):
        # Lazily created MongoDB handle shared by all callbacks.
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db
    def start_requests(self):
        while True:
            # Pull up to 300 unprocessed list rows (type 0, no timeout mark yet).
            data_list = self.to_db.find(
                self.db_name, {"type": "0", "timeout": None},
                sort={"publishtime": -1}, limit=300)
            for item in data_list:
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = item.get("businessKeyWord")
                businessobjectname = item.get("title")
                schemaversion = item.pop("schemaVersion")
                rowguid = item.pop("rowGuid")
                data = {
                    "schemaVersion": schemaversion,
                    "businessKeyWord": businesskeyword,
                    "tenderProjectCode": encode_info(tenderprojectcode),
                    "businessObjectName": businessobjectname,
                    "businessId": encode_info(businessid),
                }
                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do'
                yield feapder.Request(url=detail_url, item=item, method="POST", data=data,
                                      callback=self.detail_get, timeout=5, use_session=True, count=0)
            break
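    # The acw_sc__v2 cookie is an anti-bot token: the challenge page embeds an
    # `arg1` hex string, and client-side JS transforms it into the cookie
    # value. Here the transform lives in acw_sc_v2.js and is invoked through
    # execjs. Hypothetical round trip, with a made-up arg1 for illustration:
    #     arg1 = "3CFD1DD4..."                      # scraped from the page
    #     request.session.cookies.update(
    #         {"acw_sc__v2": self.get_acw_sc_v2(arg1)})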
    def get_acw_sc_v2(self, arg1):
        """Compute the acw_sc__v2 anti-bot cookie from the page's arg1 token."""
        path = 'acw_sc_v2.js'
        with open(path, encoding='utf-8') as rp:
            js_script = rp.read()
        ctx = execjs.compile(js_script)
        arg2 = ctx.call('l', arg1)
        log.info(f'acw_sc_v2 >>> {arg2}')
        return arg2
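    # detail_get() branches on what the server sent back:
    #   1. a page containing `arg1`  -> solve the acw_sc__v2 challenge, retry
    #   2. a slider-captcha page     -> pull fresh cookies from the pool, retry
    #   3. anything non-JSON         -> rotate proxy and cookies, retry
    #   4. valid JSON                -> extract the detail payload and persist it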
    def detail_get(self, request, response):
        if "arg1" in response.text:
            # JS challenge page: derive acw_sc__v2 from arg1 and retry.
            arg1 = "".join(re.findall(r"arg1='(\w+)';", response.text)).strip()
            if arg1 != '':
                acw_sc_v2 = self.get_acw_sc_v2(arg1)
                request.session.cookies.update({'acw_sc__v2': acw_sc_v2})
            time.sleep(1)
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=0)
        elif '滑动验证页面' in response.text:
            log.info('Bypassing slider captcha')
            count = request.count
            if count > 4:
                # Give up after five slider attempts.
                return
            cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
            if cookies is None or len(cookies) <= 1:
                # Cookie creation failed; rotate the proxy before retrying.
                self.proxy = get_proxy()
            else:
                request.session.cookies.update(cookies)
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=count + 1)
        else:
            try:
                response.json
            except Exception as e:
                # Response is not JSON: rotate proxy, refresh cookies, retry.
                log.info(e)
                self.proxy = get_proxy()
                cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
                if cookies:
                    request.session.cookies.update(cookies)
                yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                      callback=self.detail_get, timeout=5, use_session=True, cookies=cookies,
                                      count=0)
            else:
                item = request.item
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = item.pop("businessKeyWord", "businessKeyWord")
                detail_info = response.json.get("object").get(businesskeyword)
                if not detail_info:
                    # Key mismatch: fall back to whichever key the response carries.
                    for businesskeyword in response.json.get("object").keys():
                        detail_info = response.json.get("object").get(businesskeyword)
                        if detail_info:
                            break
                area = item.get("area")
                if area is None:
                    item["area"] = "全国"
                    item["city"] = ""
                elif " " in area:
                    item["area"] = area.split(" ")[0]
                    item["city"] = area.split(" ")[-1]
                else:
                    item["area"] = "全国"
                    item["city"] = ""
                if detail_info is None or detail_info == []:
                    # Still no detail payload: store the bare item and mark the row done.
                    item["href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                    item["sendflag"] = "true"
                    item["comeintime"] = int2long(int(time.time()))
                    self.to_db.add("data_bak", item)
                    self.to_db.update(self.db_name, {"timeout": 3}, {"_id": item["_id"]})
                    log.info(f"mongo add _id:{item.get('title')} <empty result>")
                    return
                if businesskeyword == "tenderProject":
                    # Tender projects carry structured data that must be rendered to HTML.
                    item["contenthtml"] = splicing(detail_info)
                else:
                    # Other business types return a list of bulletins; take the first.
                    detail_info = detail_info[0]
                    item["contenthtml"] = detail_info.get("bulletinContent")
                if item["contenthtml"] is None:
                    item["detail"] = None
                    item["sendflag"] = "true"
                else:
                    item["detail"] = substitute(item["contenthtml"])
                    if text_search(item["detail"]).total == 0:
                        # No extractable text: flag so downstream skips sending.
                        item["sendflag"] = "true"
                item["href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                item["comeintime"] = int2long(int(time.time()))
                self.to_db.add("data_bak", item)
                self.to_db.update(self.db_name, {"timeout": 2}, {"_id": item["_id"]})
                log.info(f"mongo add _id:{item.get('title')}")
                time.sleep(0.5)
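    # download_midware is feapder's per-request hook: it runs before each
    # download, so every request leaves with the current proxy and a
    # browser-like header set pinned to www.cebpubservice.com.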
    def download_midware(self, request):
        request.proxies = self.proxy
        log.info(request.item.get("title"))
        request.headers = {
            "Host": "www.cebpubservice.com",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "http://www.cebpubservice.com",
            "Referer": "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
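    # exception_request is feapder's failure hook: response is None when the
    # request itself failed (e.g. hit the 5s timeout), which is treated here
    # as a cue to mark the row and rotate the proxy.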
    def exception_request(self, request, response):
        if response is None:
            item = request.item
            self.to_db.update(self.db_name, {"timeout": 1}, {"_id": item["_id"]})
            self.proxy = get_proxy()
            log.info("Marked item as timed out")
        else:
            log.info("error: generic request error")
if __name__ == "__main__":
    spider = Details(thread_count=1)
    spider.start()
    spider.join()
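# Progress flags written back to zgzb_list (as used above):
#   timeout=1  request failed or timed out
#   timeout=2  detail fetched and stored in data_bak
#   timeout=3  detail endpoint returned an empty payload
# start_requests() only picks rows with timeout=None, so each row is
# attempted once per run.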