t_channel.py

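"""
Channel (栏目) mining helper.

Reads seed channel rows from a local Excel file, fetches each seed page with a
static downloader and, if that yields nothing, a rendering downloader, extracts
candidate channel links with crawler.services.channel.bfs, stores the results in
the MongoDB collection shujuziyuan.channel, and exports them to an Excel report.
"""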
import copy

import pandas as pd

from common.databases import mongo_table
from common.tools import get_current_date
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs

shujuziyuan = mongo_table('shujuziyuan', 'channel')


def is_duplicate(seed, href):
    """Return True when the discovered URL is identical to the seed URL."""
    return href == seed


def read_local_data(file):
    """Read seed tasks from the first sheet of an Excel file, dropping duplicate rows."""
    data_lst = []
    df = pd.read_excel(io=file, sheet_name=0)
    for site, *item in df.values:
        # print(site, item)
        href = item.pop(2)  # the seed URL sits in the fourth column of the sheet
        # print(site, href, *item)
        data = (site, href, *item)
        if data not in data_lst:
            data_lst.append(data)
    return data_lst
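
# Inferred input layout for the seed sheet (based on how excavate_data unpacks each row):
#   col 0: site name | col 1: spider code | col 2: channel name | col 3: seed URL | col 4: spider status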


def excavate_data(tasks):
    """Fetch every seed page, mine candidate channel links with bfs() and store them in MongoDB."""
    static = Downloader()
    render = RenderDownloader()
    # Try the plain HTTP downloader first, then fall back to the rendering downloader.
    download_tools = [static, render]
    for site, href, *other in tasks:
        print("开始 >>> ", site, href, *other)
        insert_lst = []
        item = {
            'site': site,                   # site name
            'seed': href,                   # seed URL
            'seed_spidercode': other[0],    # seed spider code
            'seed_channel': other[1],       # seed channel name
            'seed_spiderstatus': other[2],  # seed spider status
            'remark': '',
        }
        for dl in download_tools:
            resp = dl.get(href, timeout=3)
            results = bfs(resp, href)
            for key, items in results.items():
                print(f"搜索 >>> {key} && {len(items)}")
                for channel, url in items:
                    copy_data = copy.deepcopy(item)
                    copy_data.update({
                        'channel': channel,                       # channel name
                        'href': url,                              # channel URL
                        'is_duplicate': is_duplicate(href, url),  # True if the new URL equals the seed
                    })
                    insert_lst.append(copy_data)
            if len(results) > 0:
                # The first downloader that yields results wins; skip the remaining ones.
                break
        if len(insert_lst) > 0:
            shujuziyuan.insert_many(insert_lst)
        else:
            # Nothing mined: store the bare seed record so it still shows up in the report.
            shujuziyuan.insert_one(item)
        print('结束 >>>\n')
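
# Documents written by excavate_data() contain the seed fields plus, when any channel was
# mined, 'channel', 'href' and 'is_duplicate'; to_excel() below splits the export on that.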


def to_excel():
    """Export the mined channels to an Excel report: one sheet for hits, one for misses."""
    date = get_current_date(fmt="%Y%m%d")
    file = f'{date}_栏目挖掘.xlsx'
    writer = pd.ExcelWriter(file)

    # Sheet 1: seeds for which new (non-duplicate) channel links were found.
    q = {'channel': {'$exists': 1}, 'is_duplicate': False}
    projection = {
        'site': 1,
        'seed_spidercode': 1,
        'seed': 1,
        'seed_channel': 1,
        'href': 1,
        'channel': 1,
        'seed_spiderstatus': 1,
        'remark': 1,
        '_id': 0
    }
    cursor = shujuziyuan.find(q, projection=projection)
    df = pd.DataFrame(list(cursor))
    # Select and rename by field name rather than by position, so the labels
    # cannot drift out of sync with the stored document key order.
    columns = {
        'site': '网站名称',
        'seed': '种子栏目地址',
        'seed_spidercode': '种子爬虫代码',
        'seed_channel': '种子栏目名称',
        'seed_spiderstatus': '种子爬虫状态',
        'channel': '栏目名称',
        'href': '栏目地址',
        'remark': '备注',
    }
    df = df[list(columns)].rename(columns=columns)
    df.to_excel(writer, sheet_name='栏目挖掘(有结果)', index=False)
    # Sheet 2: seeds for which no channel link could be mined.
    q = {'channel': {'$exists': 0}}
    projection = {
        'site': 1,
        'seed_spidercode': 1,
        'seed': 1,
        'seed_channel': 1,
        'seed_spiderstatus': 1,
        'remark': 1,
        '_id': 0
    }
    cursor = shujuziyuan.find(q, projection=projection)
    df = pd.DataFrame(list(cursor))
    columns = {
        'site': '网站名称',
        'seed': '种子栏目地址',
        'seed_spidercode': '种子爬虫代码',
        'seed_channel': '种子栏目名称',
        'seed_spiderstatus': '种子爬虫状态',
        'remark': '备注',
    }
    df = df[list(columns)].rename(columns=columns)
    df.to_excel(writer, sheet_name='栏目挖掘(无结果)', index=False)
    # The `encoding=` keyword of to_excel and ExcelWriter.save() were removed in pandas 2.0;
    # close() also persists the file on older versions.
    writer.close()
    print(f"{file} 录入完成")


if __name__ == '__main__':
    # crawl_tasks = read_local_data('seed.xlsx')
    # excavate_data(crawl_tasks)
    to_excel()