|
@@ -18,12 +18,27 @@ def is_duplicate(seed, href):
|
|
|
|
|
|
def read_local_data(file):
    """Load seed rows from the first sheet of an Excel file and number channels.

    Every row is unpacked as ``site, *rest``. The value at index 2 of ``rest``
    is removed and used as the href; the value at index 1 of what remains is
    treated as the channel. Each channel receives a 1-based ordinal within
    its site, assigned in order of first appearance.

    :param file: anything ``pandas.read_excel`` accepts as its ``io`` argument.
    :return: list of unique tuples ``(site, href, *remaining_columns, nums)``
        in first-seen order, where ``nums`` is the channel's ordinal.
    """
    data_lst = []
    # Mirrors data_lst for O(1) duplicate checks; the list keeps the order.
    seen = set()
    # site -> {channel: ordinal of the channel within that site}
    browsing_history = {}
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        href = item.pop(2)
        channel = item[1]
        site_channels = browsing_history.setdefault(site, {})
        # The default is evaluated before insertion, so a new channel gets
        # "number of channels already known for this site" + 1.
        nums = site_channels.setdefault(channel, len(site_channels) + 1)
        data = (site, href, *item, nums)
        if data not in seen:
            seen.add(data)
            data_lst.append(data)
    return data_lst
|
|
@@ -44,6 +59,7 @@ def excavate_data(tasks):
|
|
'seed_spidercode': other[0], # 种子爬虫代码
|
|
'seed_spidercode': other[0], # 种子爬虫代码
|
|
'seed_channel': other[1], # 种子栏目名称
|
|
'seed_channel': other[1], # 种子栏目名称
|
|
'seed_spiderstatus': other[2], # 种子爬虫状态
|
|
'seed_spiderstatus': other[2], # 种子爬虫状态
|
|
|
|
+ 'nums': other[-1], # 栏目的序号
|
|
'remark': '',
|
|
'remark': '',
|
|
}
|
|
}
|
|
for dl in download_tools:
|
|
for dl in download_tools:
|
|
@@ -74,7 +90,8 @@ def to_excel():
|
|
date = get_current_date(fmt="%Y%m%d")
|
|
date = get_current_date(fmt="%Y%m%d")
|
|
file = f'{date}_栏目挖掘.xlsx'
|
|
file = f'{date}_栏目挖掘.xlsx'
|
|
writer = pd.ExcelWriter(file)
|
|
writer = pd.ExcelWriter(file)
|
|
- q = {'channel': {'$exists': 1}, 'is_duplicate': False}
|
|
|
|
|
|
+ # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
|
|
|
|
+ q = {'channel': {'$exists': 1}, 'nums': 1}
|
|
projection = {
|
|
projection = {
|
|
'site': 1,
|
|
'site': 1,
|
|
'seed_spidercode': 1,
|
|
'seed_spidercode': 1,
|