# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: detail spider for www.cebpubservice.com (posts to findDetails.do for pending
          list records and stores the results in MongoDB)
---------
@author: 马国鹏
"""
import json
import re
import sys
import time

from lxml import etree

from encode_info import encode_info

sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')

import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.log import log
from typing import Optional
from lxml.html import HtmlElement
from lxml.html.clean import Cleaner
from untils.tools import int2long, substitute, text_search
from cookie_pool import WebCookiePool
from untils.proxy_pool import ProxyPool
from pathlib import Path
import execjs
import requests


def get_proxy():
    # Fetch a fresh proxy from the internal proxy service.
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
                         headers=headers).json()
    log.info(f"switching proxy: {proxy.get('data')}")
    return proxy.get("data")


class ParseElement:
    # Thin wrapper around an lxml HtmlElement exposing raw and cleaned HTML.

    def __init__(self):
        self.__element: Optional[HtmlElement] = None

    @property
    def html(self) -> str:
        return etree.tostring(self.elem, method="html", encoding="utf-8").decode()

    @property
    def clean_html(self) -> str:
        cleaner = Cleaner()
        cleaner.javascript = False
        cleaner.remove_unknown_tags = False
        cleaner.safe_attrs = ['href', 'src']
        return cleaner.clean_html(self.html)

    @property
    def elem(self):
        return self.__element

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__element = element


def splicing(response):
    # Build the detail page HTML from the JSON payload via the bundled splicing.js.
    path = 'splicing.js'
    with open(path, encoding='utf-8') as rp:
        js_script = rp.read()
    ctx = execjs.compile(js_script)
    html = ctx.call('splicing', response)
    return html


class Details(feapder.AirSpider):
    cookie_pool = WebCookiePool(
        redis_key='zgztbcookie',
        page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
    _to_db = None
    db_name = 'zgzb_list'
    send_list = []
    proxy = get_proxy()

    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        # Pull up to 300 unprocessed list records from MongoDB and request their detail data.
        while True:
            data_list = self.to_db.find(self.db_name, {"type": "0", "timeout": None},
                                        sort={"publishtime": -1}, limit=300)
            for item in data_list:
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = item.get("businessKeyWord")
                businessobjectname = item.get("title")
                schemaversion = item.pop("schemaVersion")
                rowguid = item.pop("rowGuid")  # popped so it is not written back with the item
                data = {
                    "schemaVersion": schemaversion,
                    "businessKeyWord": businesskeyword,
                    "tenderProjectCode": encode_info(tenderprojectcode),
                    "businessObjectName": businessobjectname,
                    "businessId": encode_info(businessid),
                }
                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do'
                yield feapder.Request(url=detail_url, item=item, method="POST", data=data,
                                      callback=self.detail_get, timeout=5, use_session=True, count=0)
            break  # single batch per run

    def get_acw_sc_v2(self, arg1):
        # Compute the acw_sc__v2 anti-bot cookie from the page's arg1 via acw_sc_v2.js.
        path = 'acw_sc_v2.js'
        with open(path, encoding='utf-8') as rp:
            js_script = rp.read()
        ctx = execjs.compile(js_script)
        arg2 = ctx.call('l', arg1)
        log.info(f'acw_sc_v2 >>> {arg2}')
        return arg2

    def detail_get(self, request, response):
        if "arg1" in response.text:
            # WAF challenge page: derive the acw_sc__v2 cookie from arg1 and retry.
            arg1 = "".join(re.findall(r"arg1='(\w+)';", response.text)).strip()
            if arg1 != '':
                acw_sc_v2 = self.get_acw_sc_v2(arg1)
                request.session.cookies.update({'acw_sc__v2': acw_sc_v2})
            time.sleep(1)
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=0)
        elif '滑动验证页面' in response.text:
            # Slider-captcha page (the marker text stays in Chinese because it is matched
            # against the page content): rebuild cookies, rotate the proxy if needed, retry.
            log.info('start solving the slider captcha')
            cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
            count = request.count
            if count > 4:
                return
            if cookies is None or len(cookies) <= 1:
                # Cookie creation failed or returned nothing useful: switch proxy.
                self.proxy = get_proxy()
            if cookies:
                request.session.cookies.update(cookies)
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=count + 1)
        else:
            try:
                response.json
            except Exception as e:
                # Body is not valid JSON: rotate the proxy, rebuild cookies and retry.
                log.info(e)
                self.proxy = get_proxy()
                cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
                request.session.cookies.update(cookies)
                yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                      callback=self.detail_get, timeout=5, use_session=True,
                                      cookies=cookies, count=0)
            else:
                item = request.item
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = "businessKeyWord"
                if "businessKeyWord" in item:
                    businesskeyword = item.pop("businessKeyWord")
                detail_info = response.json.get("object").get(businesskeyword)
                if not detail_info:
                    # Fall back to whichever key the "object" payload actually contains.
                    for businesskeyword in response.json.get("object").keys():
                        detail_info = response.json.get("object").get(businesskeyword)

                area = item.get("area")
                if area is None:
                    item["area"] = "全国"  # "全国" = nationwide, the default region
                    item["city"] = ""
                elif " " in area:
                    item["area"] = area.split(" ")[0]
                    item["city"] = area.split(" ")[-1]
                else:
                    item["area"] = "全国"
                    item["city"] = ""

                if detail_info is None or detail_info == []:
                    # No detail data at all: store an empty-result marker, flag the list
                    # record as processed (timeout=3) and stop here.
                    item["href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                    item["sendflag"] = "true"
                    item["comeintime"] = int2long(int(time.time()))
                    self.to_db.add("data_bak", item)
                    self.to_db.update(self.db_name, {"timeout": 3}, {"_id": item["_id"]})
                    log.info(f"mongo add _id:{item.get('title')} <empty result>")
                    return

                if businesskeyword == "tenderProject":
                    item["contenthtml"] = splicing(detail_info)
                else:
                    detail_info = detail_info[0]
                    item["contenthtml"] = detail_info.get("bulletinContent")
                if item["contenthtml"] is None:
                    item["detail"] = None
                    item["sendflag"] = "true"
                else:
                    item["detail"] = substitute(item["contenthtml"])
                    if text_search(item["detail"]).total == 0:
                        # No meaningful text extracted: do not push downstream.
                        item["sendflag"] = "true"
                item["href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                item["comeintime"] = int2long(int(time.time()))
                self.to_db.add("data_bak", item)
                # Mark the list record as processed (timeout=2) instead of deleting it.
                self.to_db.update(self.db_name, {"timeout": 2}, {"_id": item["_id"]})
                log.info(f"mongo add _id:{item.get('title')}")
        time.sleep(0.5)

    def download_midware(self, request):
        # Attach the current proxy and the headers expected by cebpubservice.com.
        request.proxies = self.proxy
        log.info(request.item.get("title"))
        request.headers = {
            "Host": "www.cebpubservice.com",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "http://www.cebpubservice.com",
            "Referer": "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        return request

    def exception_request(self, request, response):
        if response is None:
            # Request timed out: flag the list record (timeout=1) and rotate the proxy.
            item = request.item
            self.to_db.update(self.db_name, {"timeout": 1}, {"_id": item["_id"]})
            self.proxy = get_proxy()
            log.info("timeout flag added")
        else:
            log.info("error: generic request failure")


if __name__ == "__main__":
    spider = Details(thread_count=1)
    spider.start()
    spider.join()