add_task.py

import time

import urllib3
import pandas as pd

from crawler.Task import Task
from crawler.q import RedisQueue
from crawler.utils import extract_host, extract_domain, is_url
from settings import (
    REDIS_QUERY_KEYWORD,
    REDIS_EXCAVATE,
    FILTER_WORDS,
    MGO_KEYWORDS,
    MGO_ORGS,
    MGO_URLS,
    MGO_COMPETING_GOODS,
    MGO_LUA_SPIDERS,
)
from common.databases import int2long, mongo_table

def push_task(file):
    """Read seed URLs from the first sheet of an Excel file and queue them."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    mrq = RedisQueue()
    _file = str(file)
    df = pd.read_excel(path + _file, sheet_name=0)
    # The first column of each row is expected to hold a URL
    lst = [Task(url=nd[0], groups='seed_url') for nd in df.values]
    print(mrq.push_task(REDIS_EXCAVATE, lst, level=9))

def make_seed_words_table(file):
    """Build the organization-name and keyword tables from the workbook."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
    df = pd.read_excel(path + file, sheet_name=1)
    # Organization names ('企业名称' is the source spreadsheet's column header)
    df_dict = df.to_dict()
    for val in df_dict['企业名称'].values():
        MGO_ORGS.insert_one({'name': val})
    print('organization-name table created')
    # Keywords
    for val in FILTER_WORDS:
        MGO_KEYWORDS.insert_one({'name': val})
    print('keyword table created')

def make_seed_urls_table(file):
    """Insert seed hosts (with optional site names) from the first sheet."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    _file = str(file)
    df = pd.read_excel(path + _file, sheet_name=0)
    for nd in df.values:
        # A row is either (site name, url) or just (url,)
        if len(nd) > 1:
            name, url = nd[0], nd[1]
        else:
            name, url = '', nd[0]
        if is_url(url) and extract_domain(url) != '':
            try:
                MGO_URLS.insert_one({'name': extract_host(url), 'site_name': name})
            except urllib3.exceptions.LocationParseError:
                continue
    print('seed-URL table created')

def make_competing_goods_table(file):
    """Insert competing-goods URLs from the first sheet."""
    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
    _file = str(file)
    df = pd.read_excel(path + _file, sheet_name=0)
    for nd in df.values:
        url = nd[0]
        MGO_COMPETING_GOODS.insert_one({'name': url})
    print('competing-goods URL table created')

def make_garbage_tab():
    """Copy each distinct spider domain into the local data_garbage table."""
    q = {"param_common.11": {"$exists": True}}
    projection = {'param_common': 1, '_id': 0}
    data_garbage = mongo_table('shujuziyuan', 'data_garbage', host='127.0.0.1', port=27017)
    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
    history = []
    for item in cursor:
        # param_common[11] holds the spider's target URL
        href = item['param_common'][11]
        domain = extract_domain(href)
        if len(domain) > 0 and domain not in history:
            data_garbage.insert_one({
                'domain': domain,
                'create_at': int2long(int(time.time())),
            })
            history.append(domain)
            print(f'href >> {href}; domain >> {domain}')

def make_domain():
    """Same extraction as make_garbage_tab, writing to the remote web_domain table."""
    q = {"param_common.11": {"$exists": True}}
    projection = {'param_common': 1, '_id': 0}
    web_domain = mongo_table('dzr', 'web_domain', host='baibai.ink', port=28082)
    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
    history = []
    for item in cursor:
        href = item['param_common'][11]
        domain = extract_domain(href)
        if len(domain) > 0 and domain not in history:
            web_domain.insert_one({
                'domain': domain,
                'create_at': int2long(int(time.time())),
            })
            history.append(domain)
            print(f'href >> {href}; domain >> {domain}')
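
# make_garbage_tab() and make_domain() are identical except for the target
# collection. A minimal sketch of a shared helper they could both delegate to
# (`_sync_domains` is hypothetical, not part of the original module; `target`
# is any collection handle returned by mongo_table):
def _sync_domains(target):
    """Insert every distinct spider domain into `target` exactly once."""
    q = {"param_common.11": {"$exists": True}}
    projection = {'param_common': 1, '_id': 0}
    seen = set()
    for item in MGO_LUA_SPIDERS.find(q, projection=projection):
        domain = extract_domain(item['param_common'][11])
        if domain and domain not in seen:
            target.insert_one({
                'domain': domain,
                'create_at': int2long(int(time.time())),
            })
            seen.add(domain)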

if __name__ == '__main__':
    # make_garbage_tab()
    make_seed_urls_table('元博网正文数据寻源网址类2022.8.12.xlsx')
    # make_seed_urls_table('剑鱼正文数据寻源网址类2022.8.25.xlsx')
    # make_competing_goods_table('元博网正文数据寻源网址类2022.2.23.xlsx')
    # make_seed_words_table('自动寻源程序种子提供2022.xlsx')