detail_firefox.py

# -*- coding: utf-8 -*-
"""
Created on 2021-12-13 13:25:15
---------
@summary:
---------
@author: 马国鹏
"""
import json
import sys
import time

from lxml import etree

# Make the FworkSpider framework importable on the Linux server and the Windows dev box
sys.path.append('/mnt/topic_spider/zgztb_cookie/FworkSpider')
sys.path.append('C:/Users/topnet/Desktop/FworkSpider')

from untils.cleaner import cleaner
import feapder
from feapder.utils.tools import wechat_warning
from feapder.utils.log import log
from items.spider_item import DataBakItem, MgpListItem
from feapder.db.mongodb import MongoDB
from typing import Optional
from lxml.html import HtmlElement
from lxml.html.clean import Cleaner
from untils.tools import int2long, substitute, text_search
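

# ParseElement wraps a single lxml HtmlElement: html serialises it back to a
# string, and clean_html returns a sanitised copy that keeps only the
# href/src attributes.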
class ParseElement:

    def __init__(self):
        self.__element: Optional[HtmlElement] = None

    @property
    def html(self) -> str:
        return etree.tostring(self.elem, method="html", encoding="utf-8").decode()

    @property
    def clean_html(self) -> str:
        cleaner = Cleaner()
        cleaner.javascript = False
        cleaner.remove_unknown_tags = False
        cleaner.safe_attrs = ['href', 'src']
        return cleaner.clean_html(self.html)

    @property
    def elem(self):
        return self.__element

    @elem.setter
    def elem(self, element: HtmlElement):
        self.__element = element
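

# Details reads pending rows from the zgzb_list Mongo collection, renders each
# cebpubservice.com detail page through Splash (iframes included), extracts the
# announcement content and writes the result to the data_bak collection.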
class Details(feapder.Spider):
    _to_db = None
    db_name = 'zgzb_list'
    send_list = []

    # MongoDB connection, created lazily on first use
    @property
    def to_db(self):
        if not self._to_db:
            self._to_db = MongoDB()
        return self._to_db
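
    # Pull up to 100 unprocessed rows (type == "1", no timeout flag) ordered by
    # publishtime and turn each one into a Splash-rendered detail request.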
    def start_requests(self):
        base_url = 'http://www.ccgp-jilin.gov.cn/ext/search/gotoHelpFrontList.action'
        while True:
            data_list = self.to_db.find(self.db_name, {"type": "1", "timeout": None},
                                        sort={"publishtime": 1}, limit=100)
            for item in data_list:
                rowguid = item.pop("rowGuid")
                detail_url = f'http://connect.cebpubservice.com/PSPFrame/infobasemis/socialpublic/publicyewu/Frame_yewuDetail?rowguid={rowguid}'
                yield feapder.Request(url=detail_url, item=item,
                                      callback=self.detail_get, splash=True,
                                      render_time=3, iframes=1)
            break
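
    # Merge the main-frame and child-frame HTML returned by Splash, extract the
    # content blocks and normalise the item fields before saving.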
    def detail_get(self, request, response):
        item = request.item
        html = ''
        # Splash returns the main document plus each iframe as separate HTML strings
        res_html = response.json.get("html")
        childFrames = response.json.get("childFrames")
        for frame in childFrames:
            res_html += frame.get("html")
        html_etree = etree.HTML(res_html)
        xpath_list = ['//div[@class="fui-accordions"]',
                      '//div[@class="mini-panel-viewport mini-grid-viewport"]']
        for xpath in xpath_list:
            htmls = html_etree.xpath(xpath)
            if len(htmls) > 0:
                html = '\n'.join(etree.tounicode(node) for node in htmls)
                break
        # "area" may hold "province city"; split it into separate fields
        area = item.get("area")
        if area is None:
            area = ''
        area = area.strip()
        if " " in area:
            item["area"] = area.split(" ")[0]
            item["city"] = area.split(" ")[-1]
        item["contenthtml"] = html
        item["detail"] = substitute(item["contenthtml"])
        item["href"] = request.url
        if text_search(item["detail"]).total == 0:
            # No body text: flag the record as sent so it is not counted again
            item["sendflag"] = "true"
        # The spidersendnew sender reads records by time, so comeintime must be the
        # time the record is written to the database
        item["comeintime"] = int2long(int(time.time()))
        # businessKeyWord is not needed in the stored record; drop it if present
        item.pop("businessKeyWord", None)
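        # Store the detail record in data_bak; on success mark the source row
        # with timeout=2, on failure restore its _id and mark it with timeout=5.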
        taskid = item.pop("_id")
        try:
            self.to_db.add("data_bak", item)
            log.info(f"_id:{str(item['_id'])}")
            self.to_db.update(self.db_name, {"timeout": 2}, {"_id": taskid})
            log.info(
                f"mongo add _id:{item.get('title')},{item.get('publishtime')}")
            print("crawl succeeded")
        except Exception:
            item["_id"] = taskid
            self.to_db.update(self.db_name, {"timeout": 5}, {"_id": taskid})
            print("update succeeded")
    def end_callback(self):
        print("spider finished")


if __name__ == "__main__":
    while True:
        spider = Details(redis_key="splish:zgzb:detail3")
        spider.start()
        spider.join()