t_channel.py

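"""Channel mining tool: fetch each seed page, extract candidate channel links
via `bfs`, store the records in the `shujuziyuan.channel` MongoDB collection,
and export the results to a dated Excel report."""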
import copy

import pandas as pd

from common.databases import mongo_table
from common.tools import get_current_date
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs

# MongoDB collection that stores the mined channel records.
shujuziyuan = mongo_table('shujuziyuan', 'channel')
def is_duplicate(seed, href):
    """Return True when the discovered URL is identical to the seed URL."""
    return href == seed
def read_local_data(file):
    """Read seed rows from the first sheet of an Excel file.

    Returns de-duplicated tuples of (site, href, *other_fields, nums), where
    `nums` numbers each site's channels in order of first appearance.
    """
    data_lst = []
    browsing_history = {}
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        href = item.pop(2)  # the seed URL sits in the third remaining column
        browsing_history.setdefault(site, {})
        channel = item[1]
        if channel not in browsing_history[site]:
            # 1-based index of this channel within its site
            browsing_history[site][channel] = len(browsing_history[site]) + 1
        nums = browsing_history[site][channel]
        data = (site, href, *item, nums)
        if data not in data_lst:
            data_lst.append(data)
    return data_lst
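# Assumed seed sheet layout, inferred from the unpacking above: column 0 is
# the site name, followed by (spidercode, channel, href, spiderstatus, ...);
# `item.pop(2)` pulls the href forward so each task becomes
# (site, href, spidercode, channel, spiderstatus, ..., nums).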
def excavate_data(tasks):
    """Fetch every seed page, mine channel links from it, and store them."""
    static = Downloader()
    render = RenderDownloader()
    # Try the plain HTTP downloader first; fall back to the rendering
    # downloader when the static fetch yields no results.
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("start >>> ", site, href, *other)
        insert_lst = []
        item = {
            'site': site,                   # site name
            'seed': href,                   # seed URL
            'seed_spidercode': other[0],    # seed spider code
            'seed_channel': other[1],       # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'nums': other[-1],              # channel index within the site
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"search >>> {key} && {len(items)}")
                for channel, url in items:
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # mined channel name
                        'href': url,         # mined channel URL
                        'is_duplicate': is_duplicate(href, url),  # same as seed?
                    })
                    insert_lst.append(copy_data)
            if results:
                break
        if insert_lst:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing found: keep the bare seed record for the "no result" sheet.
            shujuziyuan.insert_one(item)
        print('done >>>\n')
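# `bfs` (from crawler.services.channel) is assumed to return a mapping of
# strategy/selector key -> list of (channel_name, channel_url) pairs; that is
# the only shape the loop above depends on.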
def to_excel():
    """Export the mined records to a two-sheet Excel report."""
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'  # "<date>_channel-mining.xlsx"
    writer = pd.ExcelWriter(file)

    # Sheet 1: seeds that produced channels (first channel per site only).
    # Alternative filter: {'channel': {'$exists': 1}, 'is_duplicate': False}
    q = {'channel': {'$exists': 1}, 'nums': 1}
    projection = {
        'site': 1,
        'seed': 1,
        'seed_spidercode': 1,
        'seed_channel': 1,
        'seed_spiderstatus': 1,
        'channel': 1,
        'href': 1,
        'remark': 1,
        '_id': 0,
    }
    cursor = shujuziyuan.find(q, projection=projection)
    df = pd.DataFrame(list(cursor))
    # Pin the column order before renaming: MongoDB field order is not
    # guaranteed to match the projection, and positional renaming would
    # silently mislabel columns.
    df = df.reindex(columns=['site', 'seed', 'seed_spidercode', 'seed_channel',
                             'seed_spiderstatus', 'channel', 'href', 'remark'])
    df.columns = [
        '网站名称',      # site name
        '种子栏目地址',  # seed channel URL
        '种子爬虫代码',  # seed spider code
        '种子栏目名称',  # seed channel name
        '种子爬虫状态',  # seed spider status
        '栏目名称',      # mined channel name
        '栏目地址',      # mined channel URL
        '备注',          # remark
    ]
    df.to_excel(writer, sheet_name='栏目挖掘(有结果)', index=False)

    # Sheet 2: seeds for which no channel was found.
    q = {'channel': {'$exists': 0}}
    projection = {
        'site': 1,
        'seed': 1,
        'seed_spidercode': 1,
        'seed_channel': 1,
        'seed_spiderstatus': 1,
        '_id': 0,
    }
    cursor = shujuziyuan.find(q, projection=projection)
    df = pd.DataFrame(list(cursor))
    df = df.reindex(columns=['site', 'seed', 'seed_spidercode',
                             'seed_channel', 'seed_spiderstatus'])
    df.columns = [
        '网站名称',
        '种子栏目地址',
        '种子爬虫代码',
        '种子栏目名称',
        '种子爬虫状态',
    ]
    df.to_excel(writer, sheet_name='栏目挖掘(无结果)', index=False)
    writer.close()  # ExcelWriter.save() was removed in pandas 2.0; close() writes the file
    print(f"{file} written")
if __name__ == '__main__':
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    to_excel()
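# End-to-end run: uncomment read_local_data / excavate_data above to mine
# channels from 'seed.xlsx' into MongoDB before exporting the report.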