|
@@ -21,26 +21,31 @@ def read_local_data(file):
|
|
|
browsing_history = {}
|
|
|
df = pd.read_excel(io=file, sheet_name=0)
|
|
|
for site, *item in df.values:
|
|
|
- # print(site, item)
|
|
|
- href = item.pop(2)
|
|
|
- # print(site, href, *item)
|
|
|
-
|
|
|
- if site not in browsing_history:
|
|
|
- browsing_history.setdefault(site, {})
|
|
|
-
|
|
|
- channel = item[1]
|
|
|
- if channel not in browsing_history[site]:
|
|
|
- channel_nums = len(browsing_history[site])
|
|
|
- if channel_nums == 0:
|
|
|
- browsing_history[site][channel] = 1
|
|
|
- else:
|
|
|
- channel_nums += 1
|
|
|
- browsing_history[site][channel] = channel_nums
|
|
|
- nums = browsing_history[site][channel]
|
|
|
- # print(f"{site}_{channel}", nums)
|
|
|
- data = (site, href, *item, nums)
|
|
|
- if data not in data_lst:
|
|
|
- data_lst.append(data)
|
|
|
+ duplicate_spider_state_items = {
|
|
|
+ 4: '已作废',
|
|
|
+ 6: '已下架',
|
|
|
+ 10: '已删除',
|
|
|
+ }
|
|
|
+ if item[3] not in duplicate_spider_state_items:
|
|
|
+ print(site, item)
|
|
|
+ href = item.pop(2)
|
|
|
+ # print(site, href, *item)
|
|
|
+ if site not in browsing_history:
|
|
|
+ browsing_history.setdefault(site, {})
|
|
|
+
|
|
|
+ channel = item[1]
|
|
|
+ if channel not in browsing_history[site]:
|
|
|
+ channel_nums = len(browsing_history[site])
|
|
|
+ if channel_nums == 0:
|
|
|
+ browsing_history[site][channel] = 1
|
|
|
+ else:
|
|
|
+ channel_nums += 1
|
|
|
+ browsing_history[site][channel] = channel_nums
|
|
|
+ nums = browsing_history[site][channel]
|
|
|
+ # print(f"{site}_{channel}", nums)
|
|
|
+ data = (site, href, *item, nums)
|
|
|
+ if data not in data_lst:
|
|
|
+ data_lst.append(data)
|
|
|
return data_lst
|
|
|
|
|
|
|