dongzhaorui 3 years ago
parent
commit
e6abdb807f
1 changed files with 130 additions and 0 deletions

+ 130 - 0
find_source/t_channel.py

@@ -0,0 +1,130 @@
+import copy
+
+import pandas as pd
+
+from common.databases import mongo_table
+from common.tools import get_current_date
+from crawler.download import Downloader, RenderDownloader
+from crawler.services.channel import bfs
+
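+# Handle to the MongoDB collection used to store mined channels. Assumes the
+# in-house common.databases.mongo_table(db, collection) helper returns a
+# pymongo-style Collection ('shujuziyuan' is the database name, 'channel'
+# the collection).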
+shujuziyuan = mongo_table('shujuziyuan', 'channel')
+
+
+def is_duplicate(seed, href):
+    """Return True when a discovered channel URL is identical to the seed."""
+    return href == seed
+
+
+def read_local_data(file):
+    data_lst = []
+    df = pd.read_excel(io=file, sheet_name=0)
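+    # Assumed sheet layout, inferred from the unpacking below: each row is
+    # (site, spidercode, channel name, seed URL, spider status), so the seed
+    # URL is column 3 overall, i.e. item[2] once the site is split off.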
+    for site, *item in df.values:
+        href = item.pop(2)  # seed URL; item keeps (spidercode, channel name, status)
+        data = (site, href, *item)
+        if data not in data_lst:  # drop duplicate rows, preserving order
+            data_lst.append(data)
+    return data_lst
+
+
+def excavate_data(tasks):
+    static = Downloader()
+    render = RenderDownloader()
+    download_tools = [static, render]
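+    # Strategy: try the plain HTTP downloader first and fall back to the
+    # rendering downloader only when the static fetch yields no channels;
+    # the loop below breaks as soon as one strategy finds results.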
+    for site, href, *other in tasks:
+        print("开始 >>> ", site, href, *other)
+
+        # Try multiple fetch strategies for the seed page
+        insert_lst = []
+        item = {
+            'site': site,  # site name
+            'seed': href,  # seed URL
+            'seed_spidercode': other[0],  # seed spider code
+            'seed_channel': other[1],  # seed channel name
+            'seed_spiderstatus': other[2],  # seed spider status
+            'remark': '',
+        }
+        for dl in download_tools:
+            resp = dl.get(href, timeout=3)
+            results = bfs(resp, href)
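+            # bfs appears to return {label: [(channel name, url), ...]};
+            # this shape is inferred from the unpacking below, not from docs.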
+            for key, items in results.items():
+                print(f"found >>> {key} && {len(items)}")
+                for val in items:
+                    channel, url = val
+                    copy_data = copy.deepcopy(item)
+                    copy_data.update({
+                        'channel': channel,  # channel name
+                        'href': url,  # channel URL
+                        'is_duplicate': is_duplicate(href, url),  # does the new channel URL equal the seed?
+                    })
+                    insert_lst.append(copy_data)
+            if len(results) > 0:
+                break
+
+        if len(insert_lst) > 0:
+            shujuziyuan.insert_many(insert_lst)
+        else:
+            # nothing found by any downloader; keep the bare seed record
+            shujuziyuan.insert_one(item)
+        print('done >>>\n')
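+# Shape of the documents written above (sketch, from the fields set in
+# excavate_data):
+# {'site': ..., 'seed': ..., 'seed_spidercode': ..., 'seed_channel': ...,
+#  'seed_spiderstatus': ..., 'remark': '', 'channel': ..., 'href': ...,
+#  'is_duplicate': ...}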
+
+
+def to_excel():
+    date = get_current_date(fmt="%Y%m%d")
+    file = f'{date}_栏目挖掘.xlsx'
+    # Explicit Mongo-field -> report-header mapping. Assigning df.columns
+    # positionally depends on whatever field order MongoDB returns and
+    # mislabels remark/channel/href, so the names are mapped explicitly.
+    headers = {
+        'site': '网站名称',
+        'seed': '种子栏目地址',
+        'seed_spidercode': '种子爬虫代码',
+        'seed_channel': '种子栏目名称',
+        'seed_spiderstatus': '种子爬虫状态',
+        'channel': '栏目名称',
+        'href': '栏目地址',
+        'remark': '备注',
+    }
+    # The context manager saves the workbook on exit, so no writer.save();
+    # modern pandas also no longer accepts to_excel(encoding=...).
+    with pd.ExcelWriter(file) as writer:
+        # Sheet 1: seeds that yielded new, non-duplicate channels
+        q = {'channel': {'$exists': 1}, 'is_duplicate': False}
+        projection = dict.fromkeys(headers, 1)
+        projection['_id'] = 0
+        cursor = shujuziyuan.find(q, projection=projection)
+        df = pd.DataFrame(list(cursor), columns=list(headers))
+        df.rename(columns=headers, inplace=True)
+        df.to_excel(writer, sheet_name='栏目挖掘(有结果)', index=False)
+
+        # Sheet 2: seeds for which no channel was discovered; only the five
+        # seed fields are exported here
+        q = {'channel': {'$exists': 0}}
+        seed_cols = ['site', 'seed', 'seed_spidercode',
+                     'seed_channel', 'seed_spiderstatus']
+        projection = dict.fromkeys(seed_cols, 1)
+        projection['_id'] = 0
+        cursor = shujuziyuan.find(q, projection=projection)
+        df = pd.DataFrame(list(cursor), columns=seed_cols)
+        df.rename(columns=headers, inplace=True)
+        df.to_excel(writer, sheet_name='栏目挖掘(无结果)', index=False)
+    print(f"{file} export complete")
+
+
+if __name__ == '__main__':
+    # crawl_tasks = read_local_data('seed.xlsx')
+    # excavate_data(crawl_tasks)
+    to_excel()
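+    # Typical run order (sketch): first uncomment the two lines above to mine
+    # channels from seed.xlsx into Mongo, then re-run with just to_excel() to
+    # export the 有结果 / 无结果 sheets.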