
2023/5/11---Utility method updates

lizongze 2 years ago
parent
commit
6a9d229e28

+ 1 - 1
FworkSpider/feapder/core/collector.py

@@ -123,7 +123,7 @@ class Collector(threading.Thread):
 
         # 当任务Queue存在任务且其它节点再次启动爬虫,但爬虫无任务可执行
         # 原因是爬虫调用间隔时间小于 REQUEST_LOST_TIMEOUT
-        log.debug("领取新任务完毕,共{}条".format(len(requests_list)))
+        # log.debug("领取新任务完毕,共{}条".format(len(requests_list)))
 
         if requests_list:
             self._is_collector_task = True

+ 3 - 3
FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -13,7 +13,7 @@ import feapder
 import re
 import json
 import time,random
-from items.njpc_item import DataNjpcItem,NjpcListItem
+from items.njpc_item import DataNjpcItem
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from lxml.html import fromstring
@@ -79,7 +79,7 @@ def remover_htmldata(request,response,html):
 class Details(feapder.PlanToBuildDetailSpider):
 
     def start_requests(self):
-        data_lsit = self.to_db.find(self.db_name,{"parser_name":f"{redis_key}","failed":{"$eq":0}},limit=50)
+        data_lsit = self.to_db.find(self.db_name,{"parser_name":f"{redis_key}"},limit=50)
         for item in data_lsit:
             log.debug(item)
             request_params = item.get("request_params")
@@ -136,7 +136,7 @@ class Details(feapder.PlanToBuildDetailSpider):
 
         try:
             if request.extra_activity:
-                from untils.tools import njpc_fields_extract
+                from untils.tools import njpc_fields_extract,njpc_fields_extract_special
                 exec(request.extra_activity)
         except:
             pass

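The detail template drives optional per-site post-processing through request.extra_activity: a code string carried on the request is exec()'d with helpers such as njpc_fields_extract and the newly imported njpc_fields_extract_special in scope. A minimal sketch of that pattern, with the helper stubbed out and every name below purely illustrative:

    from types import SimpleNamespace

    def njpc_fields_extract_special(html, data_item):
        # stand-in for the real helper in untils.tools
        data_item.projectname = "示例项目"
        return data_item

    data_item = SimpleNamespace(projectname=None)
    response = SimpleNamespace(text="项目名称:示例项目;")
    request = SimpleNamespace(
        extra_activity="njpc_fields_extract_special(response.text, data_item)"
    )

    try:
        if request.extra_activity:
            exec(request.extra_activity)  # runs the snippet with the helpers visible
    except Exception:
        pass

    print(data_item.projectname)  # 示例项目
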
+ 1 - 0
FworkSpider/feapder/templates/njpc_list_template.tmpl

@@ -78,6 +78,7 @@ class Njpc_Feapder(feapder.PlanToBuildListSpider):
             # data_item.render = True                # 是否开启开启浏览器
             # data_item.render_time = 3              # 渲染时间
             # data_item.extra_activity = '''***'''   # 额外的需求动作(三引号内顶左边框写执行语句)
+            super().increment_extract_count()
             yield data_item
 
         # 无限翻页

+ 2 - 2
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -10,7 +10,6 @@ import sys
 sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 from items.spider_item import DataBakItem,MgpListItem
-from feapder.dedup import Dedup
 from collections import namedtuple
 
 
@@ -41,7 +40,7 @@ class ${spider_name}(feapder.BiddingListSpider):
         request.headers = self.headers
 
     def parse(self, request, response):
-        real_count = 0
+
         menu = request.item
         dedup = Dedup(Dedup.BloomFilter)
         info_list = response.xpath('')       # 数据结构为html
@@ -85,6 +84,7 @@ class ${spider_name}(feapder.BiddingListSpider):
                 "host":'',                           # 需要拼接url的host
             }
 
+            super().increment_extract_count()
             yield list_item
 
         # 无限翻页

+ 68 - 40
FworkSpider/feapder/templates/spider_template.tmpl

@@ -2,66 +2,94 @@
 """
 Created on {DATE}
 ---------
-@summary:
+@summary: selenium抓取列表页无法获取href的信息
 ---------
 @author: {USER}
 """
-
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
 import feapder
 from items.spider_item import DataBakItem
-from untils.proxy_pool import ProxyPool
-from feapder.dedup import Dedup
+from feapder.network.selector import Selector
 from collections import namedtuple
+import time
+
+
 
+class ${spider_name}(feapder.BiddingListSpider):
 
-class ${spider_name}(feapder.Spider):
-    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
     def start_callback(self):
-         self.count = 0
-         self.prox_pool = ProxyPool()
+
+         self.site = ""
+
+         #               --- --- crawl_page 必须存在,且为纯数字(int) --- ---
          Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
 
          self.menus = [
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
-             Menu('${spider_name}', '${spider_name}', "Notice", 1),
+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
          ]
+
+         self.headers = {}
+
     def start_requests(self):
          for menu in self.menus:
-            start_url = f''
-            yield feapder.Request(url=start_url, item=menu._asdict())
+             start_url = ''
+             yield feapder.Request(url=start_url, item=menu._asdict(), page=1, real_page=0,
+                                   render=True, render_time=3, proxies=False)
+
+    def download_midware(self, request):
+        page = request.page
+        request.headers = self.headers
 
     def parse(self, request, response):
+        driver = response.browser
         menu = request.item
-        self.count += 1   # 一个计数器
+        info_list = response.xpath('')       # 数据结构为html
         for info in info_list:
-            list_item = DataBakItem()  # 存储数据的管道
-            list_item.href = href  # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title  # 标题
-            list_item.publishtime = create_time  # 标书发布时间
-
-            list_item.site = "#######记得编辑平台名称"
-            list_item.area = "全国"  # 城市默认:全国
-            list_item.city = ""  # 城市 默认为空
-            dedup = Dedup(Dedup.BloomFilter)
-            ss = dedup.filter_exist_data([href])
-            if ss == []:
-                continue
-            yield feapder.Request(href, callback=self.detail, item=list_item)
-    def detail(self,request,response):
-        list_item = request.item
-        html = response.xpath("//div[@class='']").extract_first()  # 标书详细内容
-        list_item.contenthtml = html
-        yield list_item
-
-    def end_callback(self):
-        print("爬虫结束")
-        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
+            # href = info.xpath('').extract_first().strip()
+            title = info.xpath('').extract_first().strip()
+            publish_time = info.xpath('').extract_first().strip()
+
+            area = ""      # 省份
+            city = ""      # 城市
+            district = ""  # 区县
+
+            next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title}")]')
+            next_page.click()
+            time.sleep(3)
+            href = driver.current_url
+
+            data_item = DataBakItem()                # 存储数据的管道
+            data_item.href = href                    # 标书链接
+            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
+            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
+            data_item.title = title                  # 标题
+            data_item.publishtime = publish_time     # 标书发布时间
+            data_item.site = self.site
+            data_item.area = area or "全国"           # 省份 默认:全国
+            data_item.city = city                    # 城市 默认 为空
+            data_item.district = district            # 区县 默认 为空
+
+            detail_html = Selector(driver.page_source)
+            html = ""
+            dx_list = ['//div[@class="***"]',]
+            for dx in dx_list:
+                html = detail_html.xpath(dx).extract_first()
+                if html:
+                    break
+
+            data_item.contenthtml = html
+
+            driver.back()
+            time.sleep(3)
+
+            super().increment_extract_count()
+            yield data_item
+
+        # 无限翻页
+        request = self.infinite_pages(request,response)
+        yield request
 
-    def download_midware(self, request):
-        request.proxies = self.prox_pool.get()
-        return request
 
 if __name__ == "__main__":
     ${spider_name}(redis_key="{USER}:${spider_name}").start()

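The new template targets list pages whose rows expose no usable href: it clicks each entry in the rendered browser, reads driver.current_url as the detail link and driver.page_source as the detail body, then navigates back. A stripped-down sketch of that click-and-return loop, using the same selenium 3 style find_element_by_xpath API as the template (URL and xpaths are placeholders):

    import time
    from selenium import webdriver

    driver = webdriver.Chrome()
    driver.get("https://www.example.com/notice/list")  # placeholder list page

    titles = [a.text.strip() for a in driver.find_elements_by_xpath('//ul[@class="list"]//a')]
    for title in titles:
        link = driver.find_element_by_xpath(f'//a[contains(text(),"{title}")]')
        link.click()                       # the href only becomes known after the click
        time.sleep(3)                      # crude wait for the detail page to render
        href = driver.current_url          # the real detail URL
        detail_html = driver.page_source   # raw html for field extraction
        driver.back()                      # return to the list page for the next row
        time.sleep(3)

    driver.quit()
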
+ 4 - 0
FworkSpider/items/njpc_item.py

@@ -106,6 +106,10 @@ class DataNjpcItem(SwordFishProjectItem):
         else:
             raise ValueError("发布时间格式不正确 -> %r " %(self.publishtime))
 
+        if isinstance(self.publishtime,type(self.comeintime)) and self.publishtime > self.comeintime:
+            log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
+            self.publishtime = int2long(tools.get_current_timestamp())
+
         if not self.projectname or not self.publishtime or not self.href:
             self.save = False
             log.warning(f"基础数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")

+ 5 - 0
FworkSpider/items/spider_item.py

@@ -56,6 +56,11 @@ class DataBakItem(SwordFishProjectItem):
         else:
             self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime, "%Y-%m-%d"))
 
+        if self.l_np_publishtime and self.l_np_publishtime > self.comeintime:
+            log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
+            self.publishtime = tools.get_current_date()
+            self.l_np_publishtime = int2long(tools.date_to_timestamp(self.publishtime))
+
         # html处理正文
         if not self.contenthtml:
             log.warning(f"正文数据为空:\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")

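Both item classes now guard against a publish time that lies in the future: when publishtime is later than comeintime, it is reset to the crawl time. A self-contained sketch of the same clamp, with epoch-second values standing in for the real fields:

    import time
    from bson.int64 import Int64

    def int2long(value: int) -> Int64:
        # minimal stand-in for untils.tools.int2long
        return Int64(value)

    comeintime = int2long(int(time.time()))           # moment the item enters the pipeline
    publishtime = int2long(int(time.time()) + 86400)  # a publish time one day in the future

    if publishtime > comeintime:
        # same fallback as DataNjpcItem / DataBakItem
        publishtime = int2long(int(time.time()))
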
+ 3 - 1
FworkSpider/script_tools/create.py

@@ -10,6 +10,7 @@ def create_spider(spider_name, spider_type):
     5 招投标爬虫详情页模板(T_details)
     6 拟建爬虫列表页模板(njpc_feapder)
     7 拟建爬虫详情页模板(njpc_details)
+    2 selenium抓取列表页无法获取href的信息模板(selenium_feapder)
 
     :param spider_name: 类名
     :param spider_type: 爬虫模版类型
@@ -21,6 +22,7 @@ def create_spider(spider_name, spider_type):
 
 if __name__ == '__main__':
     # fire.Fire(create_spider('ztbpc_feapder', 4))
-    fire.Fire(create_spider('ztbpc_feapder', 5))
+    # fire.Fire(create_spider('ztbpc_feapder', 5))
     # fire.Fire(create_spider('njpc_list', 6))
     # fire.Fire(create_spider('njpc_detail', 7))
+    fire.Fire(create_spider('selenium_feapder', 2))

+ 1 - 1
FworkSpider/untils/check_data.py

@@ -13,7 +13,7 @@ class CheckData:
                            '违规', '评判', '监理', '竞价', '答疑', '终止',
                            '系统'}
 
-    __bidding_channel_set = {"通知公告", "公告公示"}
+    __bidding_channel_set = {"通知公告", "公告公示", "公示公告"}
 
     __plan_to_build_title_set = {'项目', '工程', '验收', '评价', '设计', '调查',
                                  '审核', '审批', '批复', '批后', '批前', '核准',

+ 1 - 0
FworkSpider/untils/clean_html.py

@@ -22,6 +22,7 @@ INLINE_TAGS = {
     '<span>|<span [^>]*>|</span>': '',  # span
     '<label>|<label [^>]*>|</label>': '<br>',  # label
     '<font>|<font [^>]*>|</font>': '',  # font
+    'data:image(.*?) ': '',            # 图片base64
 }
 # 块级元素
 BLOCK_TAGS = {

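The new INLINE_TAGS entry drops inline base64 image payloads. Assuming the mapping is applied with re.sub like the other rules, its effect looks roughly like this (note the pattern only consumes up to the next space, so a src value without a trailing space is left untouched):

    import re

    sample = '<img src="data:image/png;base64,iVBORw0KGgoAAANSUhEUg== " alt="seal">'
    cleaned = re.sub(r'data:image(.*?) ', '', sample)
    print(cleaned)  # <img src="" alt="seal">
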
+ 77 - 0
FworkSpider/untils/jsl_clearance_s.py

@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-04-24 
+---------
+@summary: jsl通用模板
+---------
+@author: jsl
+"""
+import re
+import sys
+import json
+import execjs
+import requests
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+from untils.cookie_pool import PageCookiePool
+
+
+
+class DTCookiePool(PageCookiePool):
+    def __init__(self,redis_key,header,page_url=None,**kwargs):
+        super(DTCookiePool, self).__init__(redis_key,page_url=None,
+        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
+        self.headers=header
+        self.page_url = page_url
+        self.proxies = kwargs.get('proxies') or False
+
+    def create_cookie(self):
+
+        proxies = self.proxies
+
+        session = requests.Session()
+        session.proxies = proxies
+        start_url = self.page_url
+        res = session.get(start_url, headers=self.headers,timeout=120, verify=False)
+        js_func = "".join(re.findall("document.cookie=(.*?)location.href", res.text))
+        js_func = 'function sd() { return ' + js_func + "}"
+        ctx = execjs.compile(js_func)
+        sss = ctx.call("sd")
+        cookie = {}
+        for temp, index in res.cookies.get_dict().items():
+            cookie[temp] = index
+
+        for item in sss.split(";"):
+            if '=' in item:
+                cookie[item.split("=")[0]] = item.split("=")[-1]
+
+        res = session.get(start_url, cookies=cookie,headers=self.headers,timeout=120,verify=False)
+        html_str = res.content.decode()
+        js_do_data = "".join(re.findall('};go\((.*?)\)', html_str))
+        js_func = re.sub("<(/*?)script>", "", html_str)
+        location = re.compile('location(.*?)}}else')
+        location2 = re.compile('location(.*?)}else')
+        setTimeout = re.compile('setTimeout(.*?)document')
+        gox = re.compile('};go(.*?)\)')
+        js_func = re.sub(location, "}}else", js_func)
+        js_func = re.sub(location2, "}else", js_func)
+        js_func = re.sub(setTimeout, "document", js_func)
+        js_func = re.sub('0x5dc;}(.*?)\(document', "0x5dc;}document", js_func)
+        js_func = re.sub(gox, "return document['cookie']\n};", js_func)
+        js_func = '''const jsdom = require("jsdom");
+                    const {JSDOM} = jsdom;
+                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+                    window = dom.window;
+                    document = window.document;''' + js_func
+        ctx = execjs.compile(js_func)
+        # with open('ex_js.js', 'w+', encoding='utf-8') as f:
+        #     f.write(js_func)
+        try:
+            ss = ctx.call("go", json.loads(js_do_data))
+            for item in ss.split(";"):
+                if '=' in item:
+                    session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
+            session.get(start_url,headers=self.headers,timeout=120,verify=False)
+            cookies = requests.utils.dict_from_cookiejar(session.cookies)
+            return cookies
+        except Exception as e:
+            pass

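The new jsl_clearance_s module wraps the two-step 加速乐 (jsl) cookie challenge: it fetches the page, evaluates the first document.cookie script with execjs, replays the request, then runs the second go(...) challenge inside a jsdom-backed Node context and returns the resulting cookies. A hedged usage sketch (it needs a Node runtime for execjs plus the jsdom package; the key, URL and headers below are examples):

    from untils.jsl_clearance_s import DTCookiePool

    headers = {"User-Agent": "Mozilla/5.0"}
    pool = DTCookiePool(
        redis_key="detail:example_site",         # example redis key
        header=headers,
        page_url="http://www.example.com/list",  # a jsl-protected page
        proxies=False,
    )

    cookies = pool.create_cookie()  # solve the challenge, return a cookie dict for requests
    print(cookies)
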
+ 100 - 3
FworkSpider/untils/tools.py

@@ -4,7 +4,7 @@ import hashlib
 import re
 from collections import namedtuple
 from string import whitespace
-
+from bs4 import BeautifulSoup
 import bson
 import requests
 
@@ -124,6 +124,7 @@ def text_search(content: str) -> SearchText:
     return SearchText(len(results))
 
 
+
 def int2long(param: int):
     """int 转换成 long """
     return bson.int64.Int64(param)
@@ -157,9 +158,9 @@ def njpc_fields_extract(html, data_item, is_clean=False):
 
     data_item.title = data_item.projectname
     projectname = re.findall('项目名称(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
-    approvecode = re.findall('项目代码(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    approvecode = re.findall('项目(?:代码|编码)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
     approvecontent = re.findall('(?:事项名称|审批事项)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
-    owner = re.findall('建设(?:单位|单位名称)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
+    owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
     projectaddr = re.findall('建设(?:地点|地址)(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
     total_investment = re.findall('总投资(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
     project_person = re.findall('联系人(?:<[^>]+>|)[:|:](?:<[^>]+>|)(.*?)<', html, re.S)
@@ -216,6 +217,79 @@ def njpc_fields_extract(html, data_item, is_clean=False):
     return data_item
 
 
+# 拟建爬虫字段正则抽取(抽取信息结尾必须含有[,,.。;;一二三四五六七八九十、::]等标识符)
+def njpc_fields_extract_special(html, data_item):
+    """
+        抽取信息结尾必须含有[,,.。;;一二三四五六七八九十、::]等标识符
+    :param str html: 页面源码
+    :param Items data_item: 详情页item
+    :return: 抽取完成字段表
+    """
+    # 清洗掉所有标签
+    soup = BeautifulSoup(html, 'html.parser')
+    html = "".join(soup.get_text().split()).strip()
+    # 抽取字段
+    data_item.title = data_item.projectname
+    projectname = re.findall('项目名称(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    approvecode = re.findall('项目(?:代码|编码)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    approvecontent = re.findall('(?:事项名称|审批事项)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    owner = re.findall('[建设|项目](?:单位|单位名称|业主)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    projectaddr = re.findall('建设(?:地点|地址)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    total_investment = re.findall('总投资(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[。;;、::]', html, re.S)
+    project_person = re.findall('联系人(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    project_phone = re.findall('联系(?:电话|方式)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    approvedept = re.findall('审批部门(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    approvenumber = re.findall('(?:审批|批准)文号(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,.。;;、::]', html, re.S)
+    approvetime = re.findall('审批时间(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)
+    project_scale = "".join(re.findall('建设(?:内容|内容[及|与|和]规模|规模|规模[及|与|和]内容)(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]+[、::]', html, re.S))
+    project_completedate = re.findall('竣工日期(?:[\u4e00-\u9fa5]+|)[:|:](.*?)[一二三四五六七八九十\d]?[,,。;;、]', html, re.S)
+
+    if project_scale:
+        construction_area = search('[总]*\S*建筑[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        floor_area = search('[总]*\S*占地[面|面积]*[约|为]*(.*?)[。|,|,|;]', project_scale)
+        if not construction_area:
+            construction_area = ""
+        else:
+            construction_area = re.sub(":|:", "", construction_area)
+
+        if not floor_area:
+            floor_area = ""
+        else:
+            floor_area = re.sub(":|:", "", floor_area)
+
+        data_item.project_scale = project_scale
+        data_item.project_scale_info = {
+            "construction_area": construction_area,
+            "floor_area": floor_area,
+        }  # 建设规模及主要内容
+
+    fields_dict = {
+        "projectname": projectname,
+        "owner": owner,
+        "total_investment": total_investment,
+        "project_person": project_person,
+        "project_phone": project_phone,
+        "approvedept": approvedept,
+        "approvetime": approvetime,
+        "project_completedate": project_completedate,
+        "projectaddr": projectaddr,
+        "approvecode": approvecode,
+        "approvecontent": approvecontent,
+        "approvenumber": approvenumber
+    }
+    for fields_k, fields_v in fields_dict.items():
+        if fields_v:
+            fields_v[0] = clean_chars(fields_v[0])
+            if not fields_v[0]:
+                continue
+
+            data_item[fields_k] = re.sub(
+                r'([,|.|。|)|)|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$',
+                "", fields_v[0])
+
+    return data_item
+
+
 def get_proxy():
     headers = {
         "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
@@ -259,3 +333,26 @@ def get_construction_area(project_scale):
     else:
         construction_area = construction_area.replace(':', '').replace(':', '')
     return construction_area
+
+
+# 过滤详情页无效数据
+def remove_htmldata(remove_info_list:list, html:str, response):
+    """
+
+    Args:
+        remove_info_list: 需删除内容的xpath或文本 -> list
+        html: 待清洗文本
+        response: 原文响应体
+
+    Returns: 清洗后的文本
+
+    """
+    if html and remove_info_list:
+        for extra_item in remove_info_list:
+            if re.search('^//.*', extra_item):
+                extra_html = response.xpath(extra_item).extract_first()
+            else:
+                extra_html = extra_item
+            if extra_html:
+                html = html.replace(extra_html, '')
+    return html
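
remove_htmldata strips known boilerplate from a detail page before it is stored: entries that start with // are resolved as xpaths against the response and removed by their serialized html, anything else is removed as a literal string. A small sketch using parsel.Selector as a stand-in for the crawler's response object (it exposes the same xpath(...).extract_first() interface):

    from parsel import Selector
    from untils.tools import remove_htmldata   # the helper added in this commit

    page = '<div class="content"><p>正文内容</p><div class="ad"><a href="#">扫码关注</a></div></div>'
    response = Selector(text=page)
    html = response.xpath('//div[@class="content"]').extract_first()

    remove_list = ['//div[@class="ad"]', '扫码关注']  # one xpath, one literal string
    cleaned = remove_htmldata(remove_list, html, response)
    # cleaned now keeps only <div class="content"><p>正文内容</p></div>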