123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259 |
- # -*- coding: utf-8 -*-
- """
- Created on 2021-12-13 13:25:15
- ---------
- @summary:
- ---------
- @author: 马国鹏
- """
- import json
- import re
- import sys
- import time
- from lxml import etree
- from encode_info import encode_info
- sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
- sys.path.append('C:/Users/topnet/Desktop/crawlab_feader/FworkSpider')
- import feapder
- from feapder.db.mongodb import MongoDB
- from feapder.utils.log import log
- from typing import Optional
- from lxml.html import HtmlElement
- from lxml.html.clean import Cleaner
- from untils.tools import int2long, substitute, text_search
- from cookie_pool import WebCookiePool
- from untils.proxy_pool import ProxyPool
- from pathlib import Path
- import execjs
- import requests
def get_proxy():
    """Fetch a fresh SOCKS5 proxy from the internal proxy-rotation service.

    Returns:
        The ``data`` payload of the service's JSON response (presumably a
        ``{"http": ..., "https": ...}`` mapping — TODO confirm against the
        proxy service), or ``None`` when the key is absent.
    """
    headers = {
        # Static basic-auth credential for the internal proxy endpoint.
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    # timeout= added so a hung proxy service cannot stall the spider forever.
    proxy = requests.get(
        "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch",
        headers=headers,
        timeout=10,
    ).json()
    # Bug fix: log.info was called with two positional args but the message
    # has no format placeholder, so a %-style logger would drop the proxy
    # value (and log an internal formatting error). Emit a single message.
    log.info(f"切换代理:{proxy.get('data')}")
    return proxy.get("data")
class ParseElement:
    """Wrap a single lxml ``HtmlElement`` and render it as (cleaned) HTML."""

    def __init__(self):
        # Backing storage for the wrapped element; assigned via the
        # ``elem`` setter, ``None`` until then.
        self.__node: Optional[HtmlElement] = None

    @property
    def elem(self):
        """The wrapped ``HtmlElement``, or ``None`` if not set yet."""
        return self.__node

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__node = element

    @property
    def html(self) -> str:
        """Serialize the wrapped element back into an HTML string."""
        raw = etree.tostring(self.elem, method="html", encoding="utf-8")
        return raw.decode()

    @property
    def clean_html(self) -> str:
        """HTML after a pass through lxml's ``Cleaner``.

        JavaScript removal is disabled and unknown tags are kept; only the
        ``href`` and ``src`` attributes survive the cleaning pass.
        """
        scrubber = Cleaner()
        scrubber.javascript = False
        scrubber.remove_unknown_tags = False
        scrubber.safe_attrs = ['href', 'src']
        return scrubber.clean_html(self.html)
def splicing(response):
    """Build detail HTML by delegating to the ``splicing`` function in splicing.js.

    ``response`` is the raw detail payload; the JS helper assembles it into a
    displayable HTML fragment.
    """
    with open('splicing.js', encoding='utf-8') as source:
        runtime = execjs.compile(source.read())
    return runtime.call('splicing', response)
class Details(feapder.AirSpider):
    """Detail-page spider for cebpubservice.com tender announcements.

    Pulls pending records from the ``zgzb_list`` Mongo collection, POSTs the
    encoded business identifiers to ``findDetails.do``, works through the
    site's anti-bot defences (acw_sc__v2 cookie challenge, slider captcha),
    and writes the assembled detail document into ``data_bak``.
    """

    # Pool that produces fresh anti-bot cookies by driving the slider captcha.
    cookie_pool = WebCookiePool(redis_key='zgztbcookie',
                                page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
    # Lazily-built MongoDB handle; always go through the ``to_db`` property.
    _to_db = None
    # Source collection holding list-page records awaiting detail crawling.
    db_name = 'zgzb_list'
    send_list = []
    # Current outbound proxy mapping; swapped whenever the site blocks us.
    proxy = get_proxy()

    @property
    def to_db(self):
        """MongoDB connection, created on first use and cached."""
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        """Yield one ``findDetails.do`` POST per pending list record.

        Takes up to 300 records with no ``timeout`` mark, newest
        ``publishtime`` first, and builds the form data — encrypting
        ``businessId``/``tenderProjectCode`` via ``encode_info``.
        """
        base_url = 'http://www.ccgp-jilin.gov.cn/ext/search/gotoHelpFrontList.action'  # NOTE(review): unused here
        while True:
            data_lsit = self.to_db.find(self.db_name, {"type": "0","timeout":None}, sort={"publishtime": -1}, limit=300)
            for item in data_lsit:
                # ``href`` packs "<businessId>&<tenderProjectCode>".
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = item.get("businessKeyWord")
                businessoObjectname = item.get("title")
                schemaversion = item.pop("schemaVersion")
                rowguid = item.pop("rowGuid")
                data = {
                    "schemaVersion": schemaversion,
                    "businessKeyWord": businesskeyword,
                    "tenderProjectCode": encode_info(tenderprojectcode),
                    "businessObjectName": businessoObjectname,
                    "businessId": encode_info(businessid),
                }
                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do'
                yield feapder.Request(url=detail_url, item=item, method="POST", data=data,
                                      callback=self.detail_get, timeout=5, use_session=True, count=0)
            # Single batch per run: leave the loop after the first query.
            break

    def get_acw_sc_v2(self, arg1):
        """Compute the ``acw_sc__v2`` anti-bot cookie value.

        Runs the site's obfuscated JS (function ``l`` in acw_sc_v2.js) over
        the ``arg1`` challenge token scraped from the blocking page.
        """
        path = 'acw_sc_v2.js'
        with open(path, encoding='utf-8') as rp:
            js_script = rp.read()
        ctx = execjs.compile(js_script)
        arg2 = ctx.call('l', arg1)
        log.info(f'acw_sc_v2 >>> {arg2}')
        return arg2

    def detail_get(self, request, response):
        """Parse a ``findDetails.do`` response, handling anti-bot challenges.

        Three outcomes:
          * acw challenge page  -> compute acw_sc__v2 cookie and retry;
          * slider-captcha page -> regenerate cookies (max 5 tries) and retry;
          * JSON payload        -> extract the detail content, save to Mongo.
        """
        # return
        if "arg1" in response.text:
            # Blocking page embedding the acw challenge token.
            arg1 = "".join(re.findall("arg1='(\w+)';", response.text)).strip()
            if arg1 != '':
                acw_sc_v2 = self.get_acw_sc_v2(arg1)
                request.session.cookies.update({'acw_sc__v2': acw_sc_v2})
            time.sleep(1)
            # Replay the same request now that the session carries the cookie.
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=0)
        elif '滑动验证页面' in response.text:
            # The string literal means "slider-verification page".
            log.info('开始过滑块验证')
            cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
            count = request.count
            if count > 4:
                # Give up on this record after 5 slider attempts.
                return
            if cookies is None:
                self.proxy = get_proxy()
            elif len(cookies) <= 1:
                # Cookie generation looks broken; a proxy swap may help next round.
                self.proxy = get_proxy()
            # NOTE(review): this line is reached even when ``cookies`` is
            # None — confirm ``session.cookies.update(None)`` cannot raise.
            request.session.cookies.update(cookies)
            yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                  callback=self.detail_get, timeout=5, use_session=True, count=count + 1)
        else:
            try:
                # Probe only: raises when the body is not valid JSON
                # (i.e. we are still being served a block page).
                response.json
            except Exception as e:
                log.info(e)
                self.proxy = get_proxy()
                cookies = self.cookie_pool.create_cookies(proxies=self.proxy.get("http"))
                request.session.cookies.update(cookies)
                yield feapder.Request(url=request.url, item=request.item, method="POST", data=request.data,
                                      callback=self.detail_get, timeout=5, use_session=True, cookies=cookies, count=0)
            else:
                item = request.item
                # ``href`` packs "<businessId>&<tenderProjectCode>".
                tenderprojectcode = item.get("href").split("&")[1]
                businessid = item.get("href").split("&")[0]
                businesskeyword = "businessKeyWord"
                if item.__contains__("businessKeyWord"):
                    businesskeyword = item.pop("businessKeyWord")
                detail_info = response.json.get("object").get(businesskeyword)
                if not detail_info:
                    # Fallback: scan every key of the payload; ends up with
                    # the LAST key's value, whatever it holds.
                    businesskeywords = response.json.get("object").keys()
                    for businesskeyword in businesskeywords:
                        businesskeyword = businesskeyword
                        detail_info = response.json.get("object").get(businesskeyword)
                # Split "province city" area strings; default to 全国 (nationwide).
                area = item.get("area")
                if area is None:
                    item["area"] = "全国"
                    item["city"] = ""
                elif " " in area:
                    item["area"] = area.split(" ")[0]
                    item["city"] = area.split(" ")[-1]
                else:
                    item["area"] = "全国"
                    item["city"] = ""
                if detail_info is None or detail_info == []:
                    # Second fallback scan over all payload keys (same
                    # last-key-wins behaviour as above).
                    businessKeyWords = response.json.get("object").keys()
                    for key in businessKeyWords:
                        businesskeyword = key
                        detail_info = response.json.get("object").get(businesskeyword)
                    if detail_info is None or detail_info == []:
                        # Genuinely empty result: store a stub flagged as sent
                        # and mark the source record with timeout=3.
                        item[
                            "href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                        item["sendflag"] = "true"
                        item["comeintime"] = int(time.time())
                        result = self.to_db.add("data_bak", item)
                        # self.to_db.delete(self.db_name, {"_id": item["_id"]})
                        self.to_db.update(self.db_name, {"timeout": 3}, {"_id": item["_id"]})
                        log.info(f"mongo add _id:{item.get('title')}<空结果")
                        # log.info(f"mongo add _id:{item.get('title')}")
                    else:
                        pass
                # NOTE(review): execution continues here even after the
                # empty-result stub above was saved; ``detail_info[0]`` below
                # would raise on an empty list — confirm the intended flow.
                if businesskeyword == "tenderProject":
                    # Tender projects need the JS template to build the HTML.
                    item["contenthtml"] = splicing(detail_info)
                    pass
                else:
                    detail_info = detail_info[0]
                    item["contenthtml"] = detail_info.get("bulletinContent")
                if item["contenthtml"] is None:
                    item["detail"] = None
                    item["sendflag"] = "true"
                else:
                    item["detail"] = substitute(item["contenthtml"])
                item[
                    "href"] = f"http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do#uuid={businessid + tenderprojectcode}"
                # NOTE(review): item["detail"] may be None here (see branch
                # above) — confirm text_search tolerates None input.
                if text_search(item["detail"]).total == 0:
                    # No meaningful text extracted: flag so it is not pushed on.
                    item["sendflag"] = "true"
                item["comeintime"] = int2long(int(time.time()))
                ss = {"": ""}
                result = self.to_db.add("data_bak", item)
                # self.to_db.delete(self.db_name, {"_id": item["_id"]})
                self.to_db.update(self.db_name, {"timeout": 2}, {"_id": item["_id"]})
                # print(f"mongo add _id:{item.get('title')}")
                log.info(f"mongo add _id:{item.get('title')}")
                time.sleep(0.5)

    def download_midware(self, request):
        """Attach the current proxy and browser-like headers to ``request``."""
        request.proxies = self.proxy
        log.info(request.item.get("title"))
        request.headers = {
            "Host": "www.cebpubservice.com",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "http://www.cebpubservice.com",
            "Referer": "http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }

    def exception_request(self, request, response):
        """Handle download failures: mark timeouts in Mongo and rotate proxy."""
        if response is None:
            # Request never completed (timeout / connection error):
            # flag the source record with timeout=1 and switch proxy.
            item = request.item
            self.to_db.update(self.db_name, {"timeout": 1}, {"_id": item["_id"]})
            self.proxy = get_proxy()
            # Log message means "timeout flag recorded".
            log.info("添加到超时标记")
        else:
            # Log message means "error: this is a generic error".
            log.info("error:这是一个一般错误")
if __name__ == "__main__":
    # Run the detail spider single-threaded and block until it finishes.
    crawler = Details(thread_count=1)
    crawler.start()
    crawler.join()
|