dongzhaorui 3 năm trước
mục cha
commit
17253f7512
1 tập tin đã thay đổi với 25 bổ sung và 20 xóa
  1. 25 20
      find_source/t_channel.py

+ 25 - 20
find_source/t_channel.py

@@ -21,26 +21,31 @@ def read_local_data(file):
     browsing_history = {}
     df = pd.read_excel(io=file, sheet_name=0)
     for site, *item in df.values:
-        # print(site, item)
-        href = item.pop(2)
-        # print(site, href, *item)
-
-        if site not in browsing_history:
-            browsing_history.setdefault(site, {})
-
-        channel = item[1]
-        if channel not in browsing_history[site]:
-            channel_nums = len(browsing_history[site])
-            if channel_nums == 0:
-                browsing_history[site][channel] = 1
-            else:
-                channel_nums += 1
-                browsing_history[site][channel] = channel_nums
-        nums = browsing_history[site][channel]
-        # print(f"{site}_{channel}", nums)
-        data = (site, href, *item, nums)
-        if data not in data_lst:
-            data_lst.append(data)
+        duplicate_spider_state_items = {
+            4: '已作废',
+            6: '已下架',
+            10: '已删除',
+        }
+        if item[3] not in duplicate_spider_state_items:
+            print(site, item)
+            href = item.pop(2)
+            # print(site, href, *item)
+            if site not in browsing_history:
+                browsing_history.setdefault(site, {})
+
+            channel = item[1]
+            if channel not in browsing_history[site]:
+                channel_nums = len(browsing_history[site])
+                if channel_nums == 0:
+                    browsing_history[site][channel] = 1
+                else:
+                    channel_nums += 1
+                    browsing_history[site][channel] = channel_nums
+            nums = browsing_history[site][channel]
+            # print(f"{site}_{channel}", nums)
+            data = (site, href, *item, nums)
+            if data not in data_lst:
+                data_lst.append(data)
     return data_lst