# t_channel.py
import copy

import pandas as pd

from common.databases import mongo_table
from common.tools import get_current_date
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs

shujuziyuan = mongo_table('shujuziyuan', 'channel')


def is_duplicate(seed, href):
    """Return True when the discovered channel URL is identical to the seed URL."""
    return href == seed


def read_local_data(file):
    """Read seed rows from a local Excel workbook and build the crawl task list."""
    data_lst = []
    browsing_history = {}
    # Spider states that mark a retired seed: 4=已作废 (voided), 6=已下架 (taken down), 10=已删除 (deleted)
    duplicate_spider_state_items = {
        4: '已作废',
        6: '已下架',
        10: '已删除',
    }
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        if item[3] not in duplicate_spider_state_items:
            print(site, item)
        href = item.pop(2)  # seed channel URL
        # print(site, href, *item)
        browsing_history.setdefault(site, {})
        channel = item[1]
        if channel not in browsing_history[site]:
            # Number each site's channels in first-seen order (1-based)
            browsing_history[site][channel] = len(browsing_history[site]) + 1
        nums = browsing_history[site][channel]
        # print(f"{site}_{channel}", nums)
        data = (site, href, *item, nums)
        if data not in data_lst:
            data_lst.append(data)
    return data_lst
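
# A sketch of the task tuples read_local_data yields, assuming (inferred from the
# unpacking above, not stated anywhere in this file) that each seed.xlsx row holds
# site | spidercode | channel | href | spiderstatus in that order; the example values
# below are purely illustrative:
#
#     tasks = read_local_data('seed.xlsx')
#     # -> [(site, href, spidercode, channel, spiderstatus, nums), ...]
#     # e.g. ('某某网站', 'https://example.com/list/', 'a_xxx_tzgg', '通知公告', 11, 1)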


def excavate_data(tasks):
    """Fetch each seed page, mine candidate channels with bfs(), and store them in MongoDB."""
    static = Downloader()
    render = RenderDownloader()
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("开始 >>> ", site, href, *other)
        # Try the plain downloader first, then fall back to the rendering downloader
        insert_lst = []
        item = {
            'site': site,                   # site name
            'seed': href,                   # seed URL
            'seed_spidercode': other[0],    # seed spider code
            'seed_channel': other[1],       # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'nums': other[-1],              # channel sequence number
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"搜索 >>> {key} && {len(items)}")
                for channel, url in items:
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,  # discovered channel name
                        'href': url,         # discovered channel URL
                        'is_duplicate': is_duplicate(href, url),  # seed URL equals the new channel URL
                    })
                    insert_lst.append(copy_data)
            if len(results) > 0:
                break
        if len(insert_lst) > 0:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing found: keep the bare seed record so it shows up in the "no result" sheet
            shujuziyuan.insert_one(item)
        print('结束 >>>\n')
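
# Shape assumed for the bfs() result, inferred only from how it is consumed above
# (the actual crawler.services.channel.bfs contract may differ): a dict whose values
# are lists of (channel_name, channel_url) pairs, e.g.
#
#     results = {
#         'nav': [('通知公告', 'https://example.com/tzgg/'),
#                 ('政策文件', 'https://example.com/zcwj/')],
#     }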


def to_excel():
    """Export the mined channels from MongoDB into a two-sheet Excel report."""
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'
    writer = pd.ExcelWriter(file)

    # Sheet 1: seeds for which at least one channel was discovered
    # q = {'channel': {'$exists': 1}, 'is_duplicate': False}
    q = {'channel': {'$exists': 1}, 'nums': 1}
    columns = {
        'site': '网站名称',
        'seed': '种子栏目地址',
        'seed_spidercode': '种子爬虫代码',
        'seed_channel': '种子栏目名称',
        'seed_spiderstatus': '种子爬虫状态',
        'channel': '栏目名称',
        'href': '栏目地址',
        'remark': '备注',
    }
    projection = {field: 1 for field in columns}
    projection['_id'] = 0
    cursor = shujuziyuan.find(q, projection=projection)
    # Select the fields explicitly before renaming so the Chinese headers always line up
    # with their values, whatever key order the documents come back in
    df = pd.DataFrame(list(cursor))[list(columns)].rename(columns=columns)
    df.to_excel(writer, sheet_name='栏目挖掘(有结果)', index=False)

    # Sheet 2: seeds for which nothing was found
    q = {'channel': {'$exists': 0}}
    columns = {
        'site': '网站名称',
        'seed': '种子栏目地址',
        'seed_spidercode': '种子爬虫代码',
        'seed_channel': '种子栏目名称',
        'seed_spiderstatus': '种子爬虫状态',
    }
    projection = {field: 1 for field in columns}
    projection['_id'] = 0
    cursor = shujuziyuan.find(q, projection=projection)
    df = pd.DataFrame(list(cursor))[list(columns)].rename(columns=columns)
    df.to_excel(writer, sheet_name='栏目挖掘(无结果)', index=False)

    writer.close()
    print(f"{file} 录入完成")
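
# Optional sanity check before exporting, assuming mongo_table() hands back a standard
# pymongo Collection (count_documents is a pymongo call, not something defined here):
#
#     print(shujuziyuan.count_documents({'channel': {'$exists': 1}}))  # seeds with results
#     print(shujuziyuan.count_documents({'channel': {'$exists': 0}}))  # seeds without results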


if __name__ == '__main__':
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    to_excel()