
Spider type maintenance

lizongze committed 1 year ago
parent commit 495f3f35e2

+ 12 - 16
FworkSpider/feapder/templates/detail_template.tmpl

@@ -6,42 +6,39 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 from urllib.parse import urljoin
 import feapder
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
-from untils.tools import remove_htmldata,extract_file_type
+from untils.tools import remove_htmldata, extract_file_type
 from feapder.utils.log import log
 import time
 import json
 import re
 
 
-
-
 class Details(feapder.BiddingDetailSpider):
 
     def start_requests(self):
         while True:
             data_list = self.get_tasks_by_rabbitmq(limit=20)
             for item in data_list:
-                log.debug(item)
+                # log.debug(item)
                 request_params = item.get("request_params")
-                timeout = request_params.pop('timeout',10)
+                timeout = request_params.get('timeout', 10)
+                request_params.pop('timeout', None)
                 if item.get("js"):
                     eval(item.get("js"))
                 if item.get("ex_python"):
                     exec(item.get("ex_python"))
                 if item.get("proxies"):
-                    yield feapder.Request(url=item.get("parse_url"), item=item,files_info=item.get("files"),
+                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
                                           deal_detail=item.get("deal_detail"), callback=eval(item.get("parse")),
-                                          **request_params,timeout=timeout)
+                                          **request_params, timeout=timeout)
                 else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item,files_info=item.get("files"),
+                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
                                           deal_detail=item.get("deal_detail"), timeout=timeout,
-                                          callback=eval(item.get("parse")), proxies=False,**request_params)
+                                          callback=eval(item.get("parse")), proxies=False, **request_params)
 
             break
 
@@ -56,9 +53,9 @@ class Details(feapder.BiddingDetailSpider):
             if html is not None:
                 break
 
-        if request.to_dict.get('rm_list',None) and html:
+        if request.to_dict.get('rm_list', None) and html:
             rm_list = request.rm_list
-            html = remove_htmldata(rm_list,html,response)
+            html = remove_htmldata(rm_list, html, response)
 
         list_item.contenthtml = html
 
@@ -77,7 +74,7 @@ class Details(feapder.BiddingDetailSpider):
                         if files_info.get("host"):
                             file_url = urljoin(files_info.get("host"), file_url)
                         if not files_info.get("file_type"):
-                            file_type = extract_file_type(file_name,file_url)
+                            file_type = extract_file_type(file_name, file_url)
                         else:
                             file_type = files_info.get("file_type")
 
@@ -89,7 +86,7 @@ class Details(feapder.BiddingDetailSpider):
                         if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                             attachment = AttachmentDownloader().fetch_attachment(
                                 file_name=file_name, file_type=file_type, download_url=file_url,
-                                enable_proxy=False,proxies=fpx)
+                                enable_proxy=False, proxies=fpx)
                             attachments[str(len(attachments) + 1)] = attachment
                 if attachments:
                     list_item.projectinfo = {"attachments": attachments}
@@ -113,6 +110,5 @@ class Details(feapder.BiddingDetailSpider):
         yield list_item
 
 
-
 if __name__ == "__main__":
     Details(redis_key="detail:normal_details").start()
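Note on the timeout change above: in both the old pop() form and the new get()+pop() form, 'timeout' has to be removed from request_params before the **request_params expansion, otherwise feapder.Request would receive the keyword twice. A minimal sketch of the collision being avoided (dict contents are hypothetical):

    request_params = {"headers": {}, "timeout": 30}
    timeout = request_params.get("timeout", 10)  # read, falling back to 10
    request_params.pop("timeout", None)          # drop before ** expansion
    # feapder.Request(url, timeout=timeout, **request_params)  # safe
    # leaving it in would raise:
    # TypeError: ... got multiple values for keyword argument 'timeout'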

+ 19 - 24
FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -6,26 +6,22 @@ Created on {DATE}
 ---------
 @author: njpc_feapder
 """
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 import re
 import json
-import time,random
+import time, random
 from items.njpc_item import DataNjpcItem
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from lxml.html import fromstring
-from untils.tools import remove_htmldata,extract_file_type
+from untils.tools import remove_htmldata, extract_file_type
 from feapder.utils.log import log
 
-
 redis_key = "njpc_details"
 
 
 # Attachment download for njpc (planned-construction) spiders
-def njpc_get_files(html,file_type="",s_key="http",proxies=False):
-
+def njpc_get_files(html, file_type="", s_key="http", proxies=False):
     def parse_filetype(response, filetypes):
         val = response.headers.get("content-disposition")
         filetype = val.split('.')[-1].replace('"', '').replace("'", "")
@@ -54,7 +50,7 @@ def njpc_get_files(html,file_type="",s_key="http",proxies=False):
                 if file_type.lower() in file_types:
                     file_tp = file_type
                 else:
-                    file_tp = extract_file_type(file_name,file_url,[file_type])
+                    file_tp = extract_file_type(file_name, file_url, [file_type])
 
                 if file_tp and s_key in file_url and file_name:
                     file_name = file_name.strip()
@@ -70,26 +66,27 @@ class Details(feapder.PlanToBuildDetailSpider):
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=100)
         for item in data_list:
-            log.debug(item)
+            # log.debug(item)
             request_params = item.get("request_params")
-            timeout = request_params.pop('timeout',10)
-            is_join_html = item.get("is_join_html")      # whether to join body html from multiple xpaths
-            extra_html = item.get("extra_html")          # content to strip as invalid
-            title_xpath = item.get("title_xpath")        # third-level page title
+            timeout = request_params.get('timeout', 10)
+            request_params.pop('timeout', None)
+            is_join_html = item.get("is_join_html")  # whether to join body html from multiple xpaths
+            extra_html = item.get("extra_html")  # content to strip as invalid
+            title_xpath = item.get("title_xpath")  # third-level page title
             extra_activity = item.get("extra_activity")  # extra custom actions
-            file_params = item.get("file_params")        # attachment download config
+            file_params = item.get("file_params")  # attachment download config
             if item.get("proxies"):
                 yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
+                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
                                       callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, timeout=timeout, **request_params)
             else:
-                yield feapder.Request(url=item.get("parser_url"), item=item,deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
+                yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
+                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
                                       callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
 
-    def detail_get(self,request,response):
+    def detail_get(self, request, response):
         items = request.item
         data_item = DataNjpcItem(**items)
 
@@ -106,7 +103,7 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         if request.title_xpath:
             for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first() # third-level page title
+                title = response.xpath(sxpath).extract_first()  # third-level page title
                 if title:
                     data_item.title = title.strip()
                     if "..." in data_item.projectname or "…" in data_item.projectname:
@@ -115,12 +112,12 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         try:
             if request.extra_activity:
-                from untils.tools import njpc_fields_extract,njpc_fields_extract_special
+                from untils.tools import njpc_fields_extract, njpc_fields_extract_special
                 exec(request.extra_activity)
         except:
             pass
 
-        data_item.contenthtml = remove_htmldata(request.extra_html,html,response)
+        data_item.contenthtml = remove_htmldata(request.extra_html, html, response)
 
         fp = request.file_params or {}
         attachments = njpc_get_files(
@@ -134,8 +131,7 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         yield data_item
 
-
-    def detail_json(self,request,response):
+    def detail_json(self, request, response):
         items = request.item
         data_item = DataNjpcItem(**items)
 
@@ -146,4 +142,3 @@ class Details(feapder.PlanToBuildDetailSpider):
 
 if __name__ == '__main__':
     Details(redis_key="detail:njpc_details").start()
-
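The last hunk truncates the njpc_get_files call; a sketch of the call shape, derived from the function's own signature above (the exact file_params keyword mapping is an assumption):

    fp = request.file_params or {}
    attachments = njpc_get_files(
        html,
        file_type=fp.get("file_type", ""),  # default per the signature
        s_key=fp.get("s_key", "http"),      # keyword a real attachment url must contain
        proxies=fp.get("proxies", False),
    )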

+ 31 - 33
FworkSpider/feapder/templates/njpc_list_template.tmpl

@@ -6,34 +6,32 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 from items.njpc_item import NjpcListItem
 from collections import namedtuple
-import time,random
+import time, random
 
 
 class Njpc_Feapder(feapder.PlanToBuildListSpider):
 
     def start_callback(self):
 
-         self.site = ""
+        self.site = ""
 
-         #               --- --- crawl_page is required and must be a plain int --- ---
-         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
+        #               --- --- crawl_page is required and must be a plain int --- ---
+        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
-         self.menus = [
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-         ]
+        self.menus = [
+            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
+            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
+        ]
 
-         self.headers = {}
+        self.headers = {}
 
     def start_requests(self):
-         start_url = ''
-         for menu in self.menus:
-             yield feapder.Request(url=start_url,item=menu._asdict(),page=1,proxies=False)
+        start_url = ''
+        for menu in self.menus:
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
 
     def download_midware(self, request):
         page = request.page
@@ -41,31 +39,31 @@ class Njpc_Feapder(feapder.PlanToBuildListSpider):
 
     def parse(self, request, response):
         menu = request.item
-        info_list = response.xpath('')       # the payload is html
+        info_list = response.xpath('')  # the payload is html
         for info in info_list:
             detail_href = info.xpath('').extract_first().strip()
             projectname = info.xpath('').extract_first().strip()
             publish_time = info.xpath('').extract_first().strip()
 
-            area = ""      # province
-            city = ""      # city
+            area = ""  # province
+            city = ""  # city
             district = ""  # district/county
 
-            data_item = NjpcListItem()  # item pipeline that stores the data
-            data_item.unique_key = ("href", "publishtime") # dedup key
-            data_item.channel = menu.get("channel")    # crawl channel defined at the top (set by the editor)
-            data_item.spidercode = menu.get("code")    # spider code defined at the top (set by the editor)
-            data_item.projectname = projectname        # project name
-            data_item.publishtime = publish_time       # publish time
+            data_item = NjpcListItem()          # item pipeline that stores the data
+            data_item.unique_key = ("href", publish_time)  # dedup key
+            data_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
+            data_item.spidercode = menu.get("code")  # spider code defined at the top (set by the editor)
+            data_item.projectname = projectname      # project name
+            data_item.publishtime = publish_time     # publish time

             data_item.site = self.site
-            data_item.area = area or "全国"             # province, defaults to 全国 (nationwide)
-            data_item.city = city                      # city, empty by default
-            data_item.district = district              # district/county, empty by default
-            data_item.parser_url = detail_href         # detail-page data url
-            data_item.href = detail_href               # detail url
-            data_item.request_params = {"headers":self.headers}
-            data_item.parser = "detail_get"            # method the snapshot (detail) spider calls
+            data_item.area = area or "全国"                   # province, defaults to 全国 (nationwide)
+            data_item.city = city                            # city, empty by default
+            data_item.district = district                    # district/county, empty by default
+            data_item.parser_url = detail_href               # detail-page data url
+            data_item.href = detail_href                     # detail url
+            data_item.request_params = {"headers": self.headers}
+            data_item.parser = "detail_get"                  # method the snapshot (detail) spider calls
             data_item.deal_detail = ['//div[@class="***"]']  # body extraction rule (xpath)
 
             # data_item.proxies = True               # enable proxy on the snapshot (detail) page
@@ -81,10 +79,10 @@ class Njpc_Feapder(feapder.PlanToBuildListSpider):
             yield data_item
 
         # pagination
-        time.sleep(random.randint(2,5))
-        request = self.infinite_pages(request,response)
+        time.sleep(random.randint(2, 5))
+        request = self.infinite_pages(request, response)
         yield request
 
 
 if __name__ == "__main__":
-    Njpc_Feapder(redis_key="detail:njpc_details).start()
+    Njpc_Feapder(redis_key="detail:njpc_details").start()
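download_midware is cut off above right after it reads request.page; a minimal sketch of a typical body, assuming the start URL carries a {page} placeholder (the paging scheme is hypothetical):

    def download_midware(self, request):
        page = request.page
        request.url = request.url.format(page=page)  # hypothetical page slot
        request.headers = self.headers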

+ 23 - 26
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -6,14 +6,11 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
-from items.spider_item import MgpListItem
+from items.spider_item import BidingListItem
 from collections import namedtuple
 
 
-
 class ${spider_name}(feapder.BiddingListSpider):
 
     def start_callback(self):
@@ -24,8 +21,8 @@ class ${spider_name}(feapder.BiddingListSpider):
         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
         self.menus = [
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
+            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
+            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
         ]
 
         self.headers = {}
@@ -33,7 +30,7 @@ class ${spider_name}(feapder.BiddingListSpider):
     def start_requests(self):
         for menu in self.menus:
             start_url = ''
-            yield feapder.Request(url=start_url,item=menu._asdict(),page=1,proxies=False)
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
 
     def download_midware(self, request):
         page = request.page
@@ -42,48 +39,48 @@ class ${spider_name}(feapder.BiddingListSpider):
     def parse(self, request, response):
 
         menu = request.item
-        info_list = response.xpath('')       # the payload is html
+        info_list = response.xpath('')  # the payload is html
         for info in info_list:
             href = info.xpath('').extract_first().strip()
             title = info.xpath('').extract_first().strip()
             publish_time = info.xpath('').extract_first().strip()
 
-            area = ""      # province
-            city = ""      # city
+            area = ""  # province
+            city = ""  # city
             district = ""  # district/county
 
-            list_item = MgpListItem()                # item pipeline that stores the data
-            list_item.href = href                    # bid notice url
+            list_item = BidingListItem()     # item pipeline that stores the data
+            list_item.href = href            # bid notice url
             list_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
             list_item.spidercode = menu.get("code")  # spider code defined at the top (set by the editor)
             list_item.title = title                  # title
             list_item.publishtime = publish_time     # bid publish time
             list_item.site = self.site
-            list_item.area = area or "全国"           # province, defaults to 全国 (nationwide)
-            list_item.city = city                    # city, empty by default
-            list_item.district = district            # district/county, empty by default
+            list_item.area = area or "全国"  # province, defaults to 全国 (nationwide)
+            list_item.city = city           # city, empty by default
+            list_item.district = district   # district/county, empty by default
 
             list_item.unique_key = ('href',)
-            list_item.parse = "self.detail_get"      # detail-page callback method
-            list_item.deal_detail = ['//div[@class="****"]']   # body extraction xpath
+            list_item.parse = "self.detail_get"        # detail-page callback method
+            list_item.deal_detail = ['//div[@class="****"]']  # body extraction xpath
             list_item.proxies = False
-            list_item.parse_url = href               # detail-page request url
+            list_item.parse_url = href                 # detail-page request url
             # list_item.is_delay = 1                   # delayed-push flag
             # list_item.if_es = 1                      # ES lookup flag
 
-            list_item.files={                        # attachment collection rules
-                "list_xpath":'//div[@class="***"]//a[@href]',
-                "url_xpath":'./@href',
-                "name_xpath":'./text()',
-                #"file_type":'pdf',                  # default file type, for urls that carry none
-                "url_key":'http',                    # url keyword marking a real attachment link; required, use http if none
-                "host":'',                           # host to join relative urls against
+            list_item.files = {                       # attachment collection rules
+                "list_xpath": '//div[@class="***"]//a[@href]',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                # "file_type":'pdf',                  # default file type, for urls that carry none
+                "url_key": 'http',    # url keyword marking a real attachment link; required, use http if none
+                "host": '',           # host to join relative urls against
             }
 
             yield list_item
 
         # pagination
-        request = self.infinite_pages(request,response)
+        request = self.infinite_pages(request, response)
         yield request
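How the detail spider consumes this files config can be read off detail_template.tmpl in the first diff; condensed here (names taken from that diff):

    for a in response.xpath(files_info["list_xpath"]):
        file_url = a.xpath(files_info["url_xpath"]).extract_first()
        file_name = a.xpath(files_info["name_xpath"]).extract_first()
        if files_info.get("host"):
            file_url = urljoin(files_info["host"], file_url)  # join relative urls
        file_type = files_info.get("file_type") or extract_file_type(file_name, file_url)
        # a link is downloaded only if file_type matches and url_key is in file_url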
 
 

+ 28 - 31
FworkSpider/feapder/templates/spider_template.tmpl

@@ -6,8 +6,6 @@ Created on {DATE}
 ---------
 @author: {USER}
 """
-import sys
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 from items.spider_item import DataBakItem
 from feapder.network.selector import Selector
@@ -15,27 +13,26 @@ from collections import namedtuple
 import time
 
 
-
 class ${spider_name}(feapder.BiddingListSpider):
 
     def start_callback(self):
 
-         self.site = ""
+        self.site = ""
 
-         #   --- --- crawl_page is required and must be a plain int --- ---
-         Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
+        #   --- --- crawl_page is required and must be a plain int --- ---
+        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
 
-         self.menus = [
-             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', 1),
-         ]
+        self.menus = [
+            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', 1),
+        ]
 
-         self.headers = {}
+        self.headers = {}
 
     def start_requests(self):
-         for menu in self.menus:
-             start_url = ''
-             yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
-                                   render=True, render_time=3, proxies=False)
+        for menu in self.menus:
+            start_url = ''
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
+                                  render=True, render_time=3, proxies=False)
 
     def download_midware(self, request):
         page = request.page
@@ -50,19 +47,19 @@ class ${spider_name}(feapder.BiddingListSpider):
             title = info.xpath('').extract_first().strip()
             publish_time = info.xpath('').extract_first().strip()
 
-            area = ""      # province
-            city = ""      # city
+            area = ""  # province
+            city = ""  # city
             district = ""  # district/county
 
             try:
-                next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title}")]')[0]
+                next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title}")]')
             except:
                 try:
-                    next_page = driver.find_elements_by_xpath(f'//a[contains(text(),"{title[:10]}")]')[0] # title too long
+                    next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title[:10]}")]')  # title too long
                 except:
                     continue
 
-            driver.execute_script("arguments[0].click();",next_page) # click via js
+            driver.execute_script("arguments[0].click();", next_page)  # click via js
             time.sleep(3)
 
             # clicking the third-level page title opens a new window
@@ -71,21 +68,21 @@ class ${spider_name}(feapder.BiddingListSpider):
 
             href = driver.current_url
 
-            data_item = DataBakItem()                # item pipeline that stores the data
-            data_item.href = href                    # bid notice url
-            data_item.unique_key = ('title','href')  # dedup key
-            data_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
-            data_item.spidercode = menu.get("code")  # spider code defined at the top (set by the editor)
-            data_item.title = title                  # title
-            data_item.publishtime = publish_time     # bid publish time
+            data_item = DataBakItem()         # item pipeline that stores the data
+            data_item.href = href             # bid notice url
+            data_item.unique_key = ('title', 'href')  # dedup key
+            data_item.channel = menu.get("channel")   # crawl channel defined at the top (set by the editor)
+            data_item.spidercode = menu.get("code")   # spider code defined at the top (set by the editor)
+            data_item.title = title                   # title
+            data_item.publishtime = publish_time      # bid publish time
             data_item.site = self.site
-            data_item.area = area or "全国"           # province, defaults to 全国 (nationwide)
-            data_item.city = city                    # city, empty by default
-            data_item.district = district            # district/county, empty by default
+            data_item.area = area or "全国"  # province, defaults to 全国 (nationwide)
+            data_item.city = city           # city, empty by default
+            data_item.district = district   # district/county, empty by default
 
             detail_html = Selector(text=driver.page_source)
             html = ""
-            dx_list = ['//div[@class="***"]',]
+            dx_list = ['//div[@class="***"]', ]
             for dx in dx_list:
                 html = detail_html.xpath(dx).extract_first()
                 if html:
@@ -103,7 +100,7 @@ class ${spider_name}(feapder.BiddingListSpider):
             yield data_item
 
         # pagination
-        request = self.infinite_pages(request,response)
+        request = self.infinite_pages(request, response)
         yield request
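The new-window handling sits between the two hunks and is elided here; a minimal sketch of the usual Selenium sequence the surrounding code implies (hypothetical reconstruction, not the template's exact lines):

    driver.switch_to.window(driver.window_handles[-1])  # enter the new tab
    href = driver.current_url
    # ... read driver.page_source, then go back to the list tab:
    # driver.close()
    # driver.switch_to.window(driver.window_handles[0])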
 
 

+ 73 - 4
FworkSpider/items/njpc_item.py

@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-
 import feapder.utils.tools as tools
 from feapder import BaseListItem, BaseDetailItem
 from feapder.utils.log import log
@@ -25,9 +24,71 @@ class DataNjpcItem(BaseDetailItem):
         'total', 'freshair', 'air', 'projectperiod', 'elevator',
         'funds', 'pace', 'owner', 'projectaddr', 'system', 'exterior',
         'method', 'passive', 'conditioner', 'approvedept', 'project',
-        'prefabricated'
+        'prefabricated', 'approvetime', 'total_investment', 'project_startdate',
+        'project_completedate', 'project_person', 'project_phone', 'project_scale_info',
+        'project_scale', 'construction_area', 'floor_area', 'building_floors', 'steel_structure',
+        'exterior_wall_materials', 'parking_pace', 'air_conditioner', 'freshair_system', 'heating_method',
+        'prefabricated_building', 'passive_house', 'other_project_scale', 'owner_info', 'designunit_info',
+        'constructionunit_info',
     }
 
+    # The fields below are tier-2 fields: if absent they are not stored and are not part of the storage schema
+    # Attachments default to Null; the correct format is projectinfo.attachments = [{
+    #                       "fid": "attachment id"
+    #                       "filename": "attachment name"
+    #                       "ftype": "file type"
+    #                       "org_url": "original attachment url"
+    #                       "size": "attachment size"
+    #                       "url": "attachment url"}]
+    # approval item name                 approvecontent
+    # project (approval) code            approvecode
+    # approval document number           approvenumber
+    # total investment                   total_investment
+    # funding source                     funds
+    # owner unit                         owner
+    # declaration method (project type)  projecttype
+    # construction site                  projectaddr
+    # construction period                projectperiod
+    # start date                         project_startdate
+    # completion date                    project_completedate
+    # approval department                approvedept
+    # approval result                    approvestatus
+    # project contact person             project_person
+    # project contact phone              project_phone
+
+    # construction scale and main content  project_scale_info
+    # construction scale                 project_scale
+    # building floor area                construction_area
+    # land (site) area                   floor_area
+    # number of floors                   building_floors
+    # steel structure                    steel_structure
+    # exterior wall materials            exterior_wall_materials
+    # garage parking spaces              parking_pace
+    # elevators                          elevator
+    # air conditioning                   air_conditioner
+    # fresh-air system                   freshair_system
+    # heating method                     heating_method
+    # prefabricated building             prefabricated_building
+    # passive house                      passive_house
+    # other construction content         other_project_scale
+
+    # Tier-3 fields: may be left unprocessed when they are hard to extract
+    # owner and contact info             owner_info
+    # owner / construction unit          owner
+    # owner contact person               ownerperson
+    # owner contact phone                ownertel
+    # owner address                      owneraddr
+    # design institute and contact info  designunit_info
+    # design unit                        designunit
+    # design unit contact person         designunitperson
+    # design unit contact phone          designunittel
+    # design unit address                designunitaddr
+    # construction unit and contact info constructionunit_info
+    # construction unit                  constructionunit
+    # construction unit contact person   constructionunitperson
+    # construction unit contact phone    constructionunittel
+    # construction unit address          constructionunitaddr
+
     def __init__(self, projectname='', publishtime='', **kwargs):
         """
 
@@ -45,6 +106,7 @@ class DataNjpcItem(BaseDetailItem):
         # 默认设置
         self.T = "bidding"
         self.infoformat = 2
+        self.is_check_text = True
 
     def handle_publish_time(self):
         # 时间格式处理
@@ -82,7 +144,7 @@ class DataNjpcItem(BaseDetailItem):
             if not self.detail:
                 self.detail = substitute(self.contenthtml)
 
-            if text_search(self.detail).total == 0:
+            if self.is_check_text and text_search(self.detail).total == 0:
                 self.sendflag = "true"
 
     def check_data_validity(self):
@@ -92,6 +154,9 @@ class DataNjpcItem(BaseDetailItem):
                 self.dont_save = True
 
     def cleanup(self):
+        # remove the transient flag (whether to check the body for Chinese text)
+        del self.is_check_text
+
         if not self.projectinfo:
             del self.projectinfo
 
@@ -118,10 +183,14 @@ class NjpcListItem(BaseListItem):
 
         self.projectname = ""  # 项目名称
         self.publishtime = ""  # 文章发布时间
+        self.is_check_spider = True
 
     def pre_to_db(self):
-        if CheckData.channel(self.channel, self.site, group="njpc"):
+        if self.is_check_spider and CheckData.channel(self.channel, self.site, group="njpc"):
             code, reason = CheckData.title(self.projectname, group="njpc")
             if code == 10106:
                 log.warning(f"{self.projectname}--不可入库,原因:{reason}")
                 self.dont_save = True
+
+        # remove the transient flag (whether to validate this spider's titles against the rules)
+        del self.is_check_spider
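Both new flags follow one pattern: default to True in __init__, gate a single validation, then get deleted before the item is persisted so they never reach storage. A schematic sketch of that lifecycle (not the real base class):

    class Item:
        def __init__(self):
            self.is_check_text = True  # transient control flag

        def check_data(self):
            if self.is_check_text and text_search(self.detail).total == 0:
                self.sendflag = "true"  # empty body: do not save

        def cleanup(self):
            del self.is_check_text  # dropped before storage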

+ 14 - 6
FworkSpider/items/spider_item.py

@@ -16,7 +16,7 @@ class DataBakItem(BaseDetailItem):
     __attr__ = {
         'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area',
         'city', 'district', 'href', 'title', 'contenthtml', 'detail',
-        'sendflag', 'projectinfo'
+        'sendflag', 'projectinfo', 'infoformat'
     }
 
     def __init__(self, s_title='', publishtime='', **kwargs):
@@ -35,15 +35,19 @@ class DataBakItem(BaseDetailItem):
         self.competehref = None  # competitor-site detail-page url
 
         self.T = "bidding"
-        self.infoformat = 1
+        self.infoformat = kwargs.get('infoformat', 1)
 
         '''Default bidding attributes'''
         self.iscompete = True  # new-spider flag
         self._d = "comeintime"
         self.publishdept = ""
         self.type = ""
+        self.is_check_text = True
 
     def cleanup(self):
+        # remove the transient flag (whether to check the body for Chinese text)
+        del self.is_check_text
+
         # competitor-site detail-page url marker field
         if not self.competehref:
             del self.competehref
@@ -77,8 +81,8 @@ class DataBakItem(BaseDetailItem):
             if not self.detail:
                 self.detail = substitute(self.contenthtml)
 
-            if text_search(self.detail).total == 0:
-                self.sendflag = "true"   # no content; do not push to the save service
+            if self.is_check_text and text_search(self.detail).total == 0:
+                self.sendflag = "true"  # no content; do not push to the save service
 
     def check_data_validity(self):
         if not self.dont_save:
@@ -133,20 +137,24 @@ class BidingListItem(BaseListItem):
         self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (spidercode naming recommended)
         self.parse = ""  # name of the detail spider's parse-callback method
 
-        self.proxies = False  # enable proxy; mostly for generic detail crawling. False = off, True = on
+        self.proxies = False  # proxy
 
         self.deal_detail = []  # xpath list for extracting the detail-page body
         self.ex_js = ""  # js code to execute, including but not limited to script text, file paths, etc.
         self.ex_python = None  # python code to execute to build params/data; for tricky headers/cookies prefer the dedicated definition style
 
         self.files = False  # attachment collection config
+        self.is_check_spider = True
 
     def pre_to_db(self):
-        if CheckData.channel(self.channel, self.site):
+        if self.is_check_spider and CheckData.channel(self.channel, self.site):
             code, reason = CheckData.title(self.title)
             if code == 10106:
                 log.warning(f"{self.title}--不可入库,原因:{reason}")
                 self.dont_save = True
 
+        # remove the transient flag (whether to validate this spider's titles against the rules)
+        del self.is_check_spider
+
 
 MgpListItem = BidingListItem
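With this change DataBakItem accepts an infoformat override via kwargs while keeping the bidding default of 1 (DataNjpcItem above hard-codes 2). A minimal usage sketch; no field semantics beyond the values visible in this diff are asserted:

    item = DataBakItem(s_title="notice")               # infoformat stays 1
    other = DataBakItem(s_title="plan", infoformat=2)  # caller override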