# -*- coding: utf-8 -*-
"""
Created on 2021-12-27 10:54:26
---------
@summary: Spider for procurement notices on mtrmart.com (城轨采购网).
---------
@author: topnet
"""
- import feapder
- from items.spider_item import DataBakItem,MgpListItem
- from untils.proxy_pool import ProxyPool
- from feapder.dedup import Dedup
- from collections import namedtuple
def gotoPage(types, fid):  # mirrors the site's JS onclick helper: builds the detail URL
    """Rebuild the detail-page URL that the site's ``onclick`` handler opens.

    Args:
        types: Notice-type code as a string, "1" through "5".
        fid:   Record id interpolated into the URL.

    Returns:
        The relative or absolute detail URL, or ``None`` for an unknown
        type code (made explicit here; the original fell through silently).
    """
    if types in ("1", "2"):  # price-comparison notice (比价公告)
        return "/Purchase/Notice/NewDetail?Id=" + fid
    if types == "3":  # online inquiry (在线询价)
        return "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID=" + fid + "&AddNew=0"
    if types == "4":  # tender project (招标项目)
        return "/Bids/BidsNotice/NewDetail?Id=" + fid
    if types == "5":  # single-source publicity (单一来源公示)
        return "/SingleSourceNotice/Notice/NewDetail?Id=" + fid
    return None  # unknown type code — explicit for readers and linters
class Cgcgw(feapder.Spider):
    """Spider for 城轨采购网 (mtrmart.com): crawls the notice list page and
    emits one ``MgpListItem`` per not-yet-seen notice for detail crawling."""

    def start_callback(self):
        # Counter of list pages parsed (diagnostic only).
        self.count = 0
        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
        # NOTE(review): both entries are identical, so the same list page is
        # requested twice; confirm whether a second, distinct menu was intended.
        self.menus = [
            Menu('Cgcgw', 'Cgcgw', "Notice", 1),
            Menu('Cgcgw', 'Cgcgw', "Notice", 1),
        ]

    def start_requests(self):
        """Yield one list-page request per configured menu."""
        for menu in self.menus:
            # FIX: the query string had been mangled by HTML-entity decoding
            # ("&not" -> "¬"); restored "&noticeType=&noticeTypeStr=".
            # NOTE(review): page=2 is hard-coded and crawl_page is unused —
            # presumably pagination was meant to honor menu.crawl_page; verify.
            start_url = (
                'https://www.mtrmart.com/Purchase/Notice/SearchNewList'
                '?title=&category=&noticeType=&noticeTypeStr='
                '&NoSinglesource=&companyValue=&isInProgress=n'
                '&isOneYear=y&page=2&pageSize=10'
            )
            yield feapder.Request(url=start_url, item=menu._asdict())

    def parse(self, request, response):
        """Parse the notice list page and yield one MgpListItem per new href."""
        menu = request.item  # dict form of the Menu namedtuple (see start_requests)
        self.count += 1
        dedup = Dedup(Dedup.BloomFilter)
        href_list = []
        for info in response.xpath('//ul[@class="base-list"]/li'):
            # SECURITY: eval() executes the scraped onclick attribute as Python;
            # it only works because gotoPage() above mirrors the site's JS
            # helper. Prefer parsing the arguments with a regex — flagged, not
            # changed, to keep behavior identical for existing pages.
            href = "https://www.mtrmart.com/" + eval(info.xpath('./h6/span/@onclick').extract_first().strip(";"))
            title = info.xpath('./h6/@title').extract_first()
            create_time = info.xpath('./p/span[2]/text()').extract_first()

            data_item = DataBakItem()      # storage pipeline item
            data_item.href = href          # notice detail URL
            data_item.channel = menu.get("channel")  # crawl channel (configured above)
            data_item.spidercode = menu.get("code")  # spider code (configured above)
            data_item.title = title        # notice title
            data_item.publishtime = create_time      # publish time from the list row
            data_item.site = "城轨采购网"
            data_item.area = "全国"        # default area
            data_item.city = ""            # city unknown on the list page

            # Bloom-filter dedup: an empty result means the href was already seen.
            if dedup.filter_exist_data([href]) == []:
                continue

            list_item = MgpListItem()
            list_item.parse = "self.detail_get"
            list_item.parser_name = "details"
            list_item.item = data_item.to_dict
            list_item.deal_detail = ['//****', "*****"]
            list_item.author = "****"
            list_item.parse_url = href
            href_list.append(href)
            # FIX: the item was built but never yielded (the yield and the
            # dedup.add below were commented out), so the spider produced no
            # output and never recorded seen hrefs; restored both.
            yield list_item
        dedup.add(href_list)

    def end_callback(self):
        # Runs once when the spider finishes.
        print("爬虫结束")
- if __name__ == "__main__":
- Cgcgw(redis_key="fwork:Cgcgw").start()