# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: fetch cebpubservice.com detail pages via Splash and save them to MongoDB
---------
@author: 马国鹏
"""
import sys
import time
from typing import Optional

from lxml import etree
from lxml.html import HtmlElement
from lxml.html.clean import Cleaner

sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
sys.path.append('C:/Users/topnet/Desktop/FworkSpider')

import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.log import log
from untils.tools import int2long, substitute, text_search


class ParseElement:
    """Wraps an lxml HtmlElement and exposes its raw and sanitized HTML."""

    def __init__(self):
        self.__element: Optional[HtmlElement] = None

    @property
    def html(self) -> str:
        return etree.tostring(self.elem, method="html", encoding="utf-8").decode()

    @property
    def clean_html(self) -> str:
        # Keep scripts and unknown tags; strip every attribute except href/src.
        cleaner = Cleaner()
        cleaner.javascript = False
        cleaner.remove_unknown_tags = False
        cleaner.safe_attrs = ['href', 'src']
        return cleaner.clean_html(self.html)

    @property
    def elem(self):
        return self.__element

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__element = element


class Details(feapder.Spider):
    _to_db = None
    db_name = 'zgzb_list'
    send_list = []

    # MongoDB connection, created lazily on first use
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        # Take one batch of 100 pending list records (type=1, no timeout
        # flag yet), oldest publishtime first; the outer loop in
        # __main__ restarts the spider to process the next batch.
        data_list = self.to_db.find(self.db_name, {"type": "1", "timeout": None},
                                    sort={"publishtime": 1}, limit=100)
        for item in data_list:
            rowguid = item.pop("rowGuid")
            detail_url = f'http://connect.cebpubservice.com/PSPFrame/infobasemis/socialpublic/publicyewu/Frame_yewuDetail?rowguid={rowguid}'
            # Detail pages are rendered with Splash; iframe contents come
            # back separately in the childFrames field of the response.
            yield feapder.Request(url=detail_url, item=item, callback=self.detail_get,
                                  splash=True, render_time=3, iframes=1)

    def detail_get(self, request, response):
        item = request.item
        html = ''
        # Merge the main document with the HTML of every child iframe.
        res_html = response.json.get("html") or ''
        for frame in response.json.get("childFrames") or []:
            res_html += frame.get("html") or ''

        html_etree = etree.HTML(res_html)
        # The announcement body lives in one of two containers; use the
        # first xpath that matches.
        xpath_list = ['//div[@class="fui-accordions"]',
                      '//div[@class="mini-panel-viewport mini-grid-viewport"]']
        for xpath in xpath_list:
            nodes = html_etree.xpath(xpath)
            if len(nodes) > 0:
                html = '\n'.join(etree.tounicode(node) for node in nodes)
                break

        # "area" holds "province city" separated by a space.
        area = (item.get("area") or '').strip()
        if " " in area:
            item["area"] = area.split(" ")[0]
            item["city"] = area.split(" ")[-1]

        item["contenthtml"] = html
        item["detail"] = substitute(item["contenthtml"])
        item["href"] = request.url
        if text_search(item["detail"]).total == 0:
            # No body text: flag the record so it is excluded from the stats.
            item["sendflag"] = "true"

        # The spidersendnew sender reads records by time, so comeintime
        # must be the current insert time.
        item["comeintime"] = int2long(int(time.time()))
        item.pop("businessKeyWord", None)

        taskid = item.pop("_id")
        try:
            self.to_db.add("data_bak", item)
            log.info(f"_id:{str(item['_id'])}")
            # timeout=2: detail page fetched and saved successfully.
            self.to_db.update(self.db_name, {"timeout": 2}, {"_id": taskid})
            log.info(f"mongo add: {item.get('title')},{item.get('publishtime')}")
            print("fetch succeeded")
        except Exception:
            item["_id"] = taskid
            # timeout=5: insert failed; mark the source record for review.
            self.to_db.update(self.db_name, {"timeout": 5}, {"_id": taskid})
            print("insert failed, timeout flag updated")

    def end_callback(self):
        print("spider finished")


if __name__ == "__main__":
    # Each start()/join() cycle drains one batch of 100 records;
    # loop forever to keep working through the queue.
    while True:
        spider = Details(redis_key="splish:zgzb:detail3")
        spider.start()
        spider.join()
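
# A minimal usage sketch for ParseElement, which is defined above but not
# exercised in this module. The fragment is an illustrative example, not
# spider data; left as comments so importing the module stays side-effect free:
#
#   from lxml.html import fromstring
#   pe = ParseElement()
#   pe.elem = fromstring('<div style="color:red"><a href="/d">title</a></div>')
#   pe.html        # serialized fragment, attributes intact
#   pe.clean_html  # same fragment with every attribute except href/src removed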