3 年之前 · b09896c74f
--- a/find_source/t_channel.py
+++ b/find_source/t_channel.py
@@ -18,12 +18,27 @@ def is_duplicate(seed, href):
 
				 
			
 
				 def read_local_data(file):
			
 
				     data_lst = []
			
 
				+    browsing_history = {}
			
 
				     df = pd.read_excel(io=file, sheet_name=0)
			
 
				     for site, *item in df.values:
			
 
				         # print(site, item)
			
 
				         href = item.pop(2)
			
 
				         # print(site, href, *item)
			
 
				-        data = (site, href, *item)
			
 
				+
			
 
				+        if site not in browsing_history:
			
 
				+            browsing_history.setdefault(site, {})
			
 
				+
			
 
				+        channel = item[1]
			
 
				+        if channel not in browsing_history[site]:
			
 
				+            channel_nums = len(browsing_history[site])
			
 
				+            if channel_nums == 0:
			
 
				+                browsing_history[site][channel] = 1
			
 
				+            else:
			
 
				+                channel_nums += 1
			
 
				+                browsing_history[site][channel] = channel_nums
			
 
				+        nums = browsing_history[site][channel]
			
 
				+        # print(f"{site}_{channel}", nums)
			
 
				+        data = (site, href, *item, nums)
			
 
				         if data not in data_lst:
			
 
				             data_lst.append(data)
			
 
				     return data_lst
			
@@ -44,6 +59,7 @@ def excavate_data(tasks):
 
				             'seed_spidercode': other[0],  # 种子爬虫代码
			
 
				             'seed_channel': other[1],  # 种子栏目名称
			
 
				             'seed_spiderstatus': other[2],  # 种子爬虫状态
			
 
				+            'nums': other[-1],  # 栏目的序号
			
 
				             'remark': '',
			
 
				         }
			
 
				         for dl in download_tools:
			
@@ -74,7 +90,8 @@ def to_excel():
 
				     date = get_current_date(fmt="%Y%m%d")
			
 
				     file = f'{date}_栏目挖掘.xlsx'
			
 
				     writer = pd.ExcelWriter(file)
			
 
				-    q = {'channel': {'$exists': 1}, 'is_duplicate': False}
			
 
				+    # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
			
 
				+    q = {'channel': {'$exists': 1}, 'nums': 1}
			
 
				     projection = {
			
 
				         'site': 1,
			
 
				         'seed_spidercode': 1,