ppp合作伙伴采购信息.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-05-08
  4. ---------
  5. @summary: 云南省政府采购网 采购信息
  6. ---------
  7. @author: lzz
  8. """
  9. import json
  10. import execjs
  11. import base64
  12. import feapder
  13. from items.spider_item import MgpListItem
  14. from get_yn_cookies import get_ck
  15. from collections import namedtuple
  16. from untils.get_imgcode import get_code_det
  17. import time
  18. from untils.tools import get_proxy
  19. from feapder.utils.tools import get_today_of_day
  20. def pass_code(session):
  21. js_script = '''
  22. uuid = function(){
  23. var s = [];
  24. var hexDigits = "0123456789abcdef";
  25. for (var i = 0; i < 36; i++) {
  26. s[i] = hexDigits.substr(Math.floor(Math.random() * 0x10), 1);
  27. }
  28. s[14] = "4";
  29. s[19] = hexDigits.substr((s[19] & 0x3) | 0x8, 1);
  30. s[8] = s[13] = s[18] = s[23] = "-";
  31. return 'point' + '-' + s.join("");
  32. }
  33. '''
  34. ctx = execjs.compile(js_script)
  35. client_uid = ctx.call('uuid')
  36. url = "http://www.yngp.com/api/captcha/captcha.get.svc"
  37. data = {
  38. "captchaType": "clickWord",
  39. "clientUid": client_uid,
  40. "ts": round(time.time() * 1000)
  41. }
  42. data = json.dumps(data)
  43. headers = {
  44. "Accept": "*/*",
  45. "Accept-Language": "zh-CN,zh;q=0.9",
  46. "Cache-Control": "no-cache",
  47. "Connection": "keep-alive",
  48. "Content-Type": "application/json;charset=UTF-8",
  49. "Origin": "http://www.yngp.com",
  50. "Pragma": "no-cache",
  51. "Referer": "http://www.yngp.com/page/procurement/procurementList.html",
  52. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
  53. }
  54. res = session.post(url, headers=headers, data=data,timeout=60)
  55. image_content = res.json().get("data").get("repData").get("originalImageBase64")
  56. code_list = res.json().get("data").get("repData").get("wordList")
  57. keyWord = res.json().get("data").get("repData").get("secretKey")
  58. token = res.json().get("data").get("repData").get("token")
  59. image_code = get_code_det(base64.b64decode(image_content))
  60. image_code = image_code.get("r").get("code")
  61. word = []
  62. for i in code_list:
  63. if image_code.get(i):
  64. word.append(image_code.get(i))
  65. else:
  66. word.append(image_code.get(""))
  67. try:
  68. word = [{"x": (i[0] + i[2]) // 2, "y": (i[1] + i[3]) // 2} for i in word]
  69. except:
  70. word = [{"x": (i[0] + i[2]) // 2, "y": (i[1] + i[3]) // 2} for i in list(image_code.values())[:3]]
  71. js_func = '''
  72. var CryptoJS = require("crypto-js")
  73. function aesEncrypt(word,keyWord){
  74. word = JSON.stringify(word)
  75. var key = CryptoJS.enc.Utf8.parse(keyWord);
  76. var srcs = CryptoJS.enc.Utf8.parse(word);
  77. var encrypted = CryptoJS.AES.encrypt(srcs, key, {mode:CryptoJS.mode.ECB,padding: CryptoJS.pad.Pkcs7});
  78. return encrypted.toString();
  79. }
  80. '''
  81. ctx = execjs.compile(js_func)
  82. pointJson = ctx.call("aesEncrypt", word, keyWord)
  83. data = {
  84. "captchaType": "clickWord",
  85. "pointJson": pointJson,
  86. "token": token,
  87. "clientUid": client_uid,
  88. "ts": round(time.time() * 1000)
  89. }
  90. # with open('ynszfcgw.png', 'wb+') as f:
  91. # f.write(base64.b64decode(image_content))
  92. url = "http://www.yngp.com/api/captcha/captcha.check.svc"
  93. data = json.dumps(data)
  94. session.post(url, headers=headers, data=data, timeout=60)
  95. time.sleep(5)
  96. js_func = '''
  97. var CryptoJS = require("crypto-js")
  98. function aesEncrypt(token,word,keyWord){
  99. word = token + "---" +JSON.stringify(word)
  100. var key = CryptoJS.enc.Utf8.parse(keyWord);
  101. var srcs = CryptoJS.enc.Utf8.parse(word);
  102. var encrypted = CryptoJS.AES.encrypt(srcs, key, {mode:CryptoJS.mode.ECB,padding: CryptoJS.pad.Pkcs7});
  103. return encrypted.toString();
  104. }
  105. '''
  106. ctx = execjs.compile(js_func)
  107. captchaVerification = ctx.call("aesEncrypt", token, word, keyWord)
  108. return captchaVerification
  109. class Ynszfcgw_New(feapder.BiddingListSpider):
  110. def start_callback(self):
  111. Menu = namedtuple('Menu', ['channel', 'code', 'noticeType', 'crawl_page'])
  112. self.site = "云南省政府采购网"
  113. self.menus = [
  114. Menu('ppp合作伙伴采购信息', 'yn_ynszfcgw_new_ppphzgbcgxx', '4', 1),
  115. ]
  116. self.headers = {
  117. "Accept": "*/*",
  118. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  119. "Cache-Control": "no-cache",
  120. "Connection": "keep-alive",
  121. "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
  122. "Origin": "http://www.yngp.com",
  123. "Pragma": "no-cache",
  124. "Referer": "http://www.yngp.com/page/procurement/procurementList.html",
  125. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
  126. "X-Requested-With": "XMLHttpRequest"
  127. }
  128. self.retry = 0
  129. self.proxy = get_proxy()
  130. self.cookies = {}
  131. self.vcode = ""
  132. def start_requests(self):
  133. for menu in self.menus:
  134. start_url = "http://www.yngp.com/api/procurement/Procurement.gghtMoreList.svc"
  135. yield feapder.Request(url=start_url,item=menu._asdict(),proxies=False, page=1)
  136. def download_midware(self, request):
  137. for _ in range(5):
  138. if not self.cookies:
  139. self.cookies = get_ck(proxies=self.proxy)
  140. if self.cookies:
  141. break
  142. else:
  143. self.proxy = get_proxy()
  144. else:
  145. break
  146. menu = request.item
  147. page = request.page
  148. if page != 1:
  149. url = f"http://www.yngp.com/api/procurement/Procurement.gghtMoreList.svc?captchaCheckFlag={self.vcode}&p={page}"
  150. else:
  151. url = f"http://www.yngp.com/api/procurement/Procurement.gghtMoreList.svc?captchaCheckFlag=0&p=1"
  152. if menu.get('code') == "yn_ynszfcgw_new_zfcghtgg":
  153. tm = get_today_of_day()
  154. else:
  155. tm = ""
  156. data = {
  157. "current": f"{page}",
  158. "rowCount": "10",
  159. "searchPhrase": "",
  160. "query_bulletintitle": "",
  161. "query_startTime": tm,
  162. "query_endTime": tm,
  163. "query_type": menu.get('noticeType'),
  164. "query_code": "",
  165. "query_gglxdm": "",
  166. "query_purchaser": "",
  167. "query_projectid": ""
  168. }
  169. request.url = url
  170. request.data = data
  171. request.cookies = self.cookies
  172. request.proxies = self.proxy
  173. request.headers = self.headers
  174. def exception_request(self, request, response):
  175. self.proxy = get_proxy()
  176. self.cookies = {}
  177. yield request
  178. def parse(self, request, response):
  179. if self.retry > 5:
  180. return
  181. if "系统异常,请稍后再试" in response.text or "磐云" in response.text:
  182. self.retry += 1
  183. self.cookies = {}
  184. self.proxy = get_proxy()
  185. yield request
  186. else:
  187. menu = request.item
  188. response=response.json
  189. info_list = response.get('data').get('rows')
  190. for info in info_list:
  191. title = info.get('bulletintitle').split(":")[-1].strip()
  192. href_id = info.get('bulletin_id')
  193. bulletinclassname = info.get('bulletinclassname')
  194. href = f'http://www.yngp.com/showBulletinInfo.html?bulletin_id={href_id}'
  195. if bulletinclassname == "采购合同公告":
  196. if "单位采购合同公告" in title:
  197. continue
  198. href = f'http://www.yngp.com/ggmxinfo.html?bulletinid={href_id}'
  199. elif "公共服务项目验收结果公告" in bulletinclassname:
  200. href = f'http://www.yngp.com/showAcceptanceResultsNoticeInfo.html?bulletinid={href_id}'
  201. elif bulletinclassname in "单一来源审核前公示":
  202. href = f'http://www.yngp.com/dylyggInfo.html?type=3&bulletin_id={href_id}'
  203. elif "成交公告" in bulletinclassname:
  204. href = f'http://www.yngp.com/showZCYBulletinInfo.html?bulletin_id={href_id}'
  205. create_time = info.get("finishday")
  206. area = "云南" # 省份
  207. city = ""
  208. if info.get('districtname') !="省级":
  209. city = info.get('districtname') # 城市
  210. list_item = MgpListItem() # 存储数据的管道
  211. list_item.href = href # 标书链接
  212. list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
  213. list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
  214. list_item.title = title # 标题
  215. list_item.publishtime = create_time # 标书发布时间
  216. list_item.site = self.site
  217. list_item.area = area # 城市默认:全国
  218. list_item.city = city # 城市 默认为空
  219. list_item.unique_key = ('href','title',create_time)
  220. list_item.parse = "self.detail_get"
  221. list_item.render_time = 1
  222. list_item.proxies = False
  223. list_item.parse_url = href
  224. yield list_item
  225. request = self.infinite_pages(request, response)
  226. yield request
  227. if __name__ == "__main__":
  228. Ynszfcgw_New(redis_key="lzz:Ynszfcgw_Jdgl").start()