|
@@ -22,11 +22,11 @@ class ${spider_name}(feapder.BiddingListSpider):
|
|
|
|
|
|
self.site = ""
|
|
|
|
|
|
- # --- --- crawl_page 必须存在,且为纯数字(int) --- ---
|
|
|
- Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
|
|
|
+ # --- --- crawl_page 必须存在,且为纯数字(int) --- ---
|
|
|
+ Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
|
|
|
|
|
|
self.menus = [
|
|
|
- Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
|
|
|
+ Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', 1),
|
|
|
]
|
|
|
|
|
|
self.headers = {}
|
|
@@ -44,7 +44,7 @@ class ${spider_name}(feapder.BiddingListSpider):
|
|
|
def parse(self, request, response):
|
|
|
driver = response.browser
|
|
|
menu = request.item
|
|
|
- info_list = response.xpath('') # 数据结构为html
|
|
|
+ info_list = response.xpath('')
|
|
|
for info in info_list:
|
|
|
# href = info.xpath('').extract_first().strip()
|
|
|
title = info.xpath('').extract_first().strip()
|
|
@@ -61,6 +61,7 @@ class ${spider_name}(feapder.BiddingListSpider):
|
|
|
|
|
|
data_item = DataBakItem() # 存储数据的管道
|
|
|
data_item.href = href # 标书链接
|
|
|
+ data_item.unique_key = ('title','href') # 去重
|
|
|
data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
data_item.title = title # 标题
|
|
@@ -70,7 +71,7 @@ class ${spider_name}(feapder.BiddingListSpider):
|
|
|
data_item.city = city # 城市 默认 为空
|
|
|
data_item.district = district # 区县 默认 为空
|
|
|
|
|
|
- detail_html = Selector(driver.page_source)
|
|
|
+ detail_html = Selector(text=driver.page_source)
|
|
|
html = ""
|
|
|
dx_list = ['//div[@class="***"]',]
|
|
|
for dx in dx_list:
|