# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary: Detail spider for cebpubservice.com tender notices; renders each
          detail page through Splash and stores the result in MongoDB.
---------
@author: 马国鹏
"""
import sys
import time
from typing import Optional

from lxml import etree
from lxml.html import HtmlElement
from lxml.html.clean import Cleaner

# Make the FworkSpider project importable from either deployment path.
sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
sys.path.append('C:/Users/topnet/Desktop/FworkSpider')

import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.log import log
from untils.tools import int2long, substitute, text_search


class ParseElement:
    """Wraps an lxml HtmlElement and exposes raw and sanitised HTML."""

    def __init__(self):
        self.__element: Optional[HtmlElement] = None

    @property
    def html(self) -> str:
        return etree.tostring(self.elem, method="html", encoding="utf-8").decode()

    @property
    def clean_html(self) -> str:
        # Keep scripts and unknown tags; strip all attributes except href/src.
        html_cleaner = Cleaner()
        html_cleaner.javascript = False
        html_cleaner.remove_unknown_tags = False
        html_cleaner.safe_attrs = ['href', 'src']
        return html_cleaner.clean_html(self.html)

    @property
    def elem(self):
        return self.__element

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__element = element
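

# NOTE: ParseElement is not referenced anywhere in this module; it appears to
# be a shared helper for sanitising detail HTML elsewhere in the project.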


class Details(feapder.Spider):
    _to_db = None
    db_name = 'zgzb_list'
    send_list = []

    @property
    def to_db(self):
        # Lazily create the MongoDB connection on first use.
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db

    def start_requests(self):
        # Pull one batch of up to 100 pending rows (type=1, timeout unset),
        # oldest publishtime first, and render each detail page via Splash.
        data_list = self.to_db.find(self.db_name, {"type": "1", "timeout": None},
                                    sort={"publishtime": 1}, limit=100)
        for item in data_list:
            rowguid = item.pop("rowGuid")
            detail_url = f'http://connect.cebpubservice.com/PSPFrame/infobasemis/socialpublic/publicyewu/Frame_yewuDetail?rowguid={rowguid}'
            yield feapder.Request(url=detail_url, item=item,
                                  callback=self.detail_get, splash=True,
                                  render_time=3, iframes=1)
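
    # Splash's render.json response (requested via splash=True, iframes=1
    # above) is consumed below as {"html": <main frame>, "childFrames":
    # [{"html": ...}, ...]}; the exact shape is inferred from how detail_get
    # reads it rather than stated anywhere in this project.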

    def detail_get(self, request, response):
        item = request.item

        # Merge the main frame's HTML with every rendered iframe.
        res_html = response.json.get("html")
        child_frames = response.json.get("childFrames")
        for frame in child_frames:
            res_html += frame.get("html")

        # The notice body sits in one of two known containers; keep the first
        # xpath that matches anything.
        html = ''
        html_etree = etree.HTML(res_html)
        xpath_list = ['//div[@class="fui-accordions"]',
                      '//div[@class="mini-panel-viewport mini-grid-viewport"]']
        for xpath in xpath_list:
            nodes = html_etree.xpath(xpath)
            if len(nodes) > 0:
                html = '\n'.join(etree.tounicode(node) for node in nodes)
                break

        # "area" may hold "province city" separated by a space.
        area = (item.get("area") or '').strip()
        if " " in area:
            item["area"] = area.split(" ")[0]
            item["city"] = area.split(" ")[-1]

        item["contenthtml"] = html
        item["detail"] = substitute(item["contenthtml"])
        item["href"] = request.url
        if text_search(item["detail"]).total == 0:
            # No body text: flag the record so downstream stats skip it.
            item["sendflag"] = "true"
        # The spidersendnew sender reads records by time, so comeintime must
        # be the moment the record is inserted.
        item["comeintime"] = int2long(int(time.time()))
        item.pop("businessKeyWord", None)

        taskid = item.pop("_id")
        try:
            self.to_db.add("data_bak", item)
            # The insert is expected to write the generated _id back onto item.
            log.info(f"_id:{str(item['_id'])}")
            # timeout=2 marks the source row as successfully collected.
            self.to_db.update(self.db_name, {"timeout": 2}, {"_id": taskid})
            log.info(
                f"mongo add _id:{item.get('title')},{item.get('publishtime')}")
            print("scrape succeeded")
        except Exception:
            # Put _id back and mark the row for retry.
            item["_id"] = taskid
            self.to_db.update(self.db_name, {"timeout": 5}, {"_id": taskid})
            print("update succeeded, row marked for retry")

    def end_callback(self):
        print("spider finished")


if __name__ == "__main__":
    # Restart the spider in a loop so new pending rows keep being drained.
    while True:
        spider = Details(redis_key="splish:zgzb:detail3")
        spider.start()
        spider.join()
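
# Running this module assumes the FworkSpider project exists on one of the
# sys.path entries above, plus a reachable Splash service and the MongoDB /
# Redis connections that feapder is configured with elsewhere.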