# File: 城轨采购网.py (~3.1 KB) — code-viewer line-number gutter removed during cleanup.
# -*- coding: utf-8 -*-
"""
Created on 2021-12-27 10:54:26
---------
@summary: Spider for 城轨采购网 (www.mtrmart.com) procurement notices.
---------
@author: topnet
"""
import feapder
from items.spider_item import DataBakItem, MgpListItem
from untils.proxy_pool import ProxyPool
from feapder.dedup import Dedup
from collections import namedtuple
  14. def gotoPage(types,fid): #onclick 的函数,生成url
  15. if types == "1" or types == "2": # 比价公告
  16. return "/Purchase/Notice/NewDetail?Id="+fid
  17. elif types == "3": # 在线询价
  18. return "https://work.mtrmart.com/Modules/SpareParts/SparePartsDispatch.ashx?ID=" + fid + "&AddNew=0"
  19. elif types == "4": # 招标项目
  20. return "/Bids/BidsNotice/NewDetail?Id="+fid
  21. elif types == "5": #单一来源公示
  22. return "/SingleSourceNotice/Notice/NewDetail?Id=" + fid
  23. class Cgcgw(feapder.Spider):
  24. def start_callback(self):
  25. self.count = 0
  26. Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
  27. self.menus = [
  28. Menu('Cgcgw', 'Cgcgw', "Notice", 1),
  29. Menu('Cgcgw', 'Cgcgw', "Notice", 1),
  30. ]
  31. def start_requests(self):
  32. for menu in self.menus:
  33. start_url = f'https://www.mtrmart.com/Purchase/Notice/SearchNewList?title=&category=&noticeType=&noticeTypeStr=&NoSinglesource=&companyValue=&isInProgress=n&isOneYear=y&page=2&pageSize=10'
  34. yield feapder.Request(url=start_url, item=menu._asdict())
  35. def parse(self, request, response):
  36. print(response.text)
  37. menu = request.item
  38. self.count += 1 # 一个计数器
  39. dedup = Dedup(Dedup.BloomFilter)
  40. href_list = []
  41. info_list = response.xpath('//ul[@class="base-list"]/li')
  42. for info in info_list:
  43. href = "https://www.mtrmart.com/" + eval(info.xpath('./h6/span/@onclick').extract_first().strip(";"))
  44. title = info.xpath('./h6/@title').extract_first()
  45. create_time = info.xpath('./p/span[2]/text()').extract_first()
  46. data_item = DataBakItem() # 存储数据的管道
  47. data_item.href = href # 标书链接
  48. data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
  49. data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
  50. data_item.title = title # 标题
  51. data_item.publishtime = create_time # 标书发布时间
  52. data_item.site = "城轨采购网"
  53. data_item.area = "全国" # 城市默认:全国
  54. data_item.city = "" # 城市 默认为空
  55. ss = dedup.filter_exist_data([href])
  56. if ss == []:
  57. continue
  58. list_item = MgpListItem()
  59. list_item.parse = "self.detail_get"
  60. list_item.parser_name = "details"
  61. list_item.item = data_item.to_dict
  62. list_item.deal_detail = ['//****',"*****"]
  63. list_item.author = "****"
  64. list_item.parse_url = href
  65. href_list.append(href)
  66. # yield list_item
  67. # dedup.add(href_list)
  68. def end_callback(self):
  69. print("爬虫结束")
if __name__ == "__main__":
    # Entry point for standalone runs; redis_key namespaces feapder's
    # Redis-backed task/dedup queues for this spider.
    Cgcgw(redis_key="fwork:Cgcgw").start()