dongzhaorui 3 年之前
父节点
当前提交
b09896c74f
共有 1 个文件被更改,包括 19 次插入和 2 次删除
  1. 19 2
      find_source/t_channel.py

+ 19 - 2
find_source/t_channel.py

@@ -18,12 +18,27 @@ def is_duplicate(seed, href):
 
 def read_local_data(file):
     # Read the first sheet of the Excel workbook `file` and return a list of
     # de-duplicated row tuples shaped (site, href, *remaining columns, nums).
     # `nums` is the 1-based order in which a channel was first seen within
     # its site, so the first channel encountered for each site gets nums == 1.
     data_lst = []
+    browsing_history = {}  # site -> {channel: 1-based first-seen order}
     df = pd.read_excel(io=file, sheet_name=0)
     for site, *item in df.values:  # first column is the site; the rest land in the list `item`
         # print(site, item)
         href = item.pop(2)  # fourth column of the row; removed from `item`
         # print(site, href, *item)
-        data = (site, href, *item)
+
+        if site not in browsing_history:
+            browsing_history.setdefault(site, {})  # NOTE(review): the `not in` check already guards this; setdefault alone would suffice
+
+        channel = item[1]  # third column of the row (index 1 now that href is popped)
+        if channel not in browsing_history[site]:
+            channel_nums = len(browsing_history[site])  # channels already numbered for this site
+            if channel_nums == 0:  # NOTE(review): both branches store channel_nums + 1
+                browsing_history[site][channel] = 1
+            else:
+                channel_nums += 1
+                browsing_history[site][channel] = channel_nums
+        nums = browsing_history[site][channel]  # same number for every row of this (site, channel)
+        # print(f"{site}_{channel}", nums)
+        data = (site, href, *item, nums)
         if data not in data_lst:  # skip rows identical to one already collected; keeps first-seen order
             data_lst.append(data)
     return data_lst
@@ -44,6 +59,7 @@ def excavate_data(tasks):
             'seed_spidercode': other[0],  # 种子爬虫代码
             'seed_channel': other[1],  # 种子栏目名称
             'seed_spiderstatus': other[2],  # 种子爬虫状态
+            'nums': other[-1],  # 栏目的序号
             'remark': '',
         }
         for dl in download_tools:
@@ -74,7 +90,8 @@ def to_excel():
     date = get_current_date(fmt="%Y%m%d")
     file = f'{date}_栏目挖掘.xlsx'
     writer = pd.ExcelWriter(file)
-    q = {'channel': {'$exists': 1}, 'is_duplicate': False}
+    # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
+    q = {'channel': {'$exists': 1}, 'nums': 1}
     projection = {
         'site': 1,
         'seed_spidercode': 1,