
Merge branch 'master' of http://192.168.3.207:8080/data_processing/crawlab_feader

dongzhaorui committed 2 years ago
parent commit 52b4e2b575
2 changed files with 20 additions and 11 deletions
  1. FworkSpider/feapder/templates/spider_template.tmpl (+6 -5)
  2. FworkSpider/untils/tools.py (+14 -6)

+ 6 - 5
FworkSpider/feapder/templates/spider_template.tmpl

@@ -22,11 +22,11 @@ class ${spider_name}(feapder.BiddingListSpider):
 
          self.site = ""
 
-         #               --- --- crawl_page must exist and be a plain number (int) --- ---
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
+         #   --- --- crawl_page must exist and be a plain number (int) --- ---
+         Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
 
          self.menus = [
-             Menu('${spider_name} crawl channel', '${spider_name} spider code', "custom parameter", 1),
+             Menu('${spider_name} crawl channel', '${spider_name} spider code', 1),
          ]
 
          self.headers = {}
@@ -44,7 +44,7 @@ class ${spider_name}(feapder.BiddingListSpider):
     def parse(self, request, response):
         driver = response.browser
         menu = request.item
-        info_list = response.xpath('')       # the data structure is html
+        info_list = response.xpath('')
         for info in info_list:
             # href = info.xpath('').extract_first().strip()
             title = info.xpath('').extract_first().strip()
@@ -61,6 +61,7 @@ class ${spider_name}(feapder.BiddingListSpider):
 
             data_item = DataBakItem()                # pipeline item for storing the data
             data_item.href = href                    # link to the bid notice
+            data_item.unique_key = ('title','href')  # dedupe key
             data_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
             data_item.spidercode = menu.get("code")  # spider code defined at the top (set by the editor)
             data_item.title = title                  # title
@@ -70,7 +71,7 @@ class ${spider_name}(feapder.BiddingListSpider):
             data_item.city = city                    # city, empty by default
             data_item.district = district            # district/county, empty by default
 
-            detail_html = Selector(driver.page_source)
+            detail_html = Selector(text=driver.page_source)
             html = ""
             dx_list = ['//div[@class="***"]',]
             for dx in dx_list:
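
Why the Selector change above matters: with Scrapy's Selector, the first positional parameter is the response object, not the markup, so passing a raw HTML string positionally mis-binds it, while the text= keyword parses the string directly. A minimal sketch of the corrected usage (assuming Scrapy, or its underlying parsel library, is installed; the sample markup is hypothetical):

    from scrapy.selector import Selector

    page_source = '<html><body><div class="notice">hello</div></body></html>'

    # Selector(page_source) would bind the string to the `response`
    # parameter and fail; text= parses the raw HTML string directly.
    detail_html = Selector(text=page_source)
    print(detail_html.xpath('//div[@class="notice"]/text()').get())  # hello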

+ 14 - 6
FworkSpider/untils/tools.py

@@ -335,12 +335,11 @@ def get_construction_area(project_scale):
     return construction_area
 
 
-# Filter invalid data from the detail page
 def remove_htmldata(remove_info_list:list, html:str, response):
     """
-
+        Filter invalid data from the detail page
     Args:
-        remove_info_list: xpath or text of the content to remove -> list
+        remove_info_list: xpath, regex, or plain text of the content to remove -> list [xpath, re, str]
         html: the text to be cleaned
         response: the original response body
 
@@ -350,9 +349,18 @@ def remove_htmldata(remove_info_list:list, html:str, response):
     if html and remove_info_list:
         for extra_item in remove_info_list:
             if re.search('^//.*', extra_item):
-                extra_html = response.xpath(extra_item).extract_first()
+                extra_html_list = response.xpath(extra_item).extract()
+                for extra_html in extra_html_list:
+                    if extra_html:
+                        html = html.replace(extra_html, '')
+            elif re.search('^<re>.*', extra_item):
+                extra_item = extra_item.replace('<re>','')
+                extra_html_list = re.findall(f'{extra_item}',html,re.S|re.I|re.M)
+                if extra_html_list:
+                    for exhtml in extra_html_list:
+                        html = html.replace(exhtml, '')
             else:
                 extra_html = extra_item
-            if extra_html:
-                html = html.replace(extra_html, '')
+                if extra_html:
+                    html = html.replace(extra_html, '')
     return html
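
After this change a single removal list can mix three kinds of rules: an XPath expression (every match is now extracted from response and stripped from html, instead of only the first as before), a regular expression flagged with a <re> prefix, and a plain substring. A hypothetical usage sketch, assuming a parsel-style response object and that remove_htmldata is imported from FworkSpider/untils/tools.py:

    from parsel import Selector
    # assumption: from untils.tools import remove_htmldata

    raw = '<div>keep</div><div class="ad">banner</div><!-- footer -->tail'
    response = Selector(text=raw)

    rules = [
        '//div[@class="ad"]',  # XPath rule: each extracted match is removed
        '<re><!--.*?-->',      # regex rule, flagged by the <re> prefix
        'tail',                # plain-substring rule
    ]
    print(remove_htmldata(rules, raw, response))  # -> '<div>keep</div>'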