|
@@ -6,39 +6,34 @@ Created on 2025-04-15
|
|
|
---------
|
|
|
@author: lzz
|
|
|
"""
|
|
|
-import feapder
|
|
|
-from items.njpc_item import NjpcListItem
|
|
|
from collections import namedtuple
|
|
|
|
|
|
+import feapder
|
|
|
+from items.njpc_item import NjpcListItem
|
|
|
|
|
|
|
|
|
-
|
|
|
-class Njpc_Feapder(feapder.PlanToBuildListSpider):
|
|
|
+class Spider(feapder.PlanToBuildListSpider):
|
|
|
|
|
|
def start_callback(self):
|
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
|
|
|
-
|
|
|
self.site = "浙江省投资项目在线审批监管平台"
|
|
|
-
|
|
|
self.menus = [
|
|
|
- Menu('审批和监管事项办理结果公示', 'zj_zjstzxmzxspjgpt_pppzdtjxm_njpc', 60),
|
|
|
+ Menu('审批和监管事项办理结果公示', 'zj_zjstzxmzxspjgpt_pppzdtjxm_njpc', 50),
|
|
|
]
|
|
|
-
|
|
|
self.headers = {
|
|
|
"Accept": "application/json, text/javascript, */*; q=0.01",
|
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
|
- "Connection": "keep-alive",
|
|
|
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
|
|
|
"Origin": "https://tzxm.zjzwfw.gov.cn",
|
|
|
"Referer": "https://tzxm.zjzwfw.gov.cn/tzxmweb/zwtpages/resultsPublicity/notice_of_publicity_new.html?page=1",
|
|
|
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
|
|
|
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
|
|
"X-Requested-With": "XMLHttpRequest",
|
|
|
}
|
|
|
|
|
|
def start_requests(self):
|
|
|
- for menu in self.menus:
|
|
|
- start_url = "https://tzxm.zjzwfw.gov.cn/publicannouncement.do?method=itemList"
|
|
|
- yield feapder.Request(url=start_url, item=menu._asdict(), page=1)
|
|
|
+ url = "https://tzxm.zjzwfw.gov.cn/publicannouncement.do?method=itemList"
|
|
|
+ for menu, page in self.product():
|
|
|
+ yield feapder.Request(url, item=menu._asdict(), page=page)
|
|
|
|
|
|
def download_midware(self, request):
|
|
|
page = request.page
|
|
@@ -54,7 +49,6 @@ class Njpc_Feapder(feapder.PlanToBuildListSpider):
|
|
|
request.headers = self.headers
|
|
|
|
|
|
def parse(self, request, response):
|
|
|
-
|
|
|
menu = request.item
|
|
|
info_list = response.json[0].get('itemList')
|
|
|
for info in info_list:
|
|
@@ -69,33 +63,29 @@ class Njpc_Feapder(feapder.PlanToBuildListSpider):
|
|
|
city = "" # 城市
|
|
|
district = ""
|
|
|
|
|
|
- data_item = NjpcListItem() # 存储数据的管道
|
|
|
- data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
- data_item.unique_key = ('href', publish_time)
|
|
|
- data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
- data_item.projectname = projectname # 项目名称
|
|
|
- data_item.publishtime = publish_time # 发布时间
|
|
|
- data_item.approvecode = deal_code
|
|
|
- data_item.approvecontent = info.get('ITEM_NAME')
|
|
|
-
|
|
|
- data_item.site = self.site
|
|
|
- data_item.area = area or "全国" # 城市默认:全国
|
|
|
- data_item.city = city # 城市 默认为空
|
|
|
- data_item.district = district # 城市 默认为空
|
|
|
- data_item.parser_url = "https://tzxm.zjzwfw.gov.cn/publicannouncement.do?method=projectDetail" # 详情页数据链接
|
|
|
- data_item.href = detail_href # 详情链接
|
|
|
- data_item.parser = "detail_get"
|
|
|
+ list_item = NjpcListItem() # 存储数据的管道
|
|
|
+ list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
+ list_item.unique_key = ('href', publish_time)
|
|
|
+ list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
+ list_item.projectname = projectname # 项目名称
|
|
|
+ list_item.publishtime = publish_time # 发布时间
|
|
|
+ list_item.approvecode = deal_code
|
|
|
+ list_item.approvecontent = info.get('ITEM_NAME')
|
|
|
+
|
|
|
+ list_item.site = self.site
|
|
|
+ list_item.area = area or "全国" # 城市默认:全国
|
|
|
+ list_item.city = city # 城市 默认为空
|
|
|
+ list_item.district = district # 城市 默认为空
|
|
|
+ list_item.parser_url = "https://tzxm.zjzwfw.gov.cn/publicannouncement.do?method=projectDetail" # 详情页数据链接
|
|
|
+ list_item.href = detail_href # 详情链接
|
|
|
+ list_item.parser = "detail_get"
|
|
|
data = {
|
|
|
"projectuuid": pUid
|
|
|
}
|
|
|
- data_item.request_params = {"data":data}
|
|
|
- data_item.is_check_spider = False
|
|
|
-
|
|
|
- yield data_item
|
|
|
+ list_item.request_params = {"data": data}
|
|
|
+ list_item.is_check_spider = False
|
|
|
+ yield list_item
|
|
|
|
|
|
- # 翻页
|
|
|
- request = self.infinite_pages(request, response)
|
|
|
- yield request
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- Njpc_Feapder(redis_key="lzz:zjstzxmzxspjgpt_pppzdtjxm").start()
|
|
|
+ Spider(redis_key="lzz:zjstzxmzxspjgpt_pppzdtjxm").start()
|