build_tools.py

from flask import Flask, request, jsonify, abort
from flask_httpauth import HTTPBasicAuth
from werkzeug.security import generate_password_hash, check_password_hash

from common.databases import mongo_table
from common.log import logger
from services import (
    accountManagePool,
    get_base_url,
    socks5ProxyPool,
    httpProxyPool,
)

'''The modules below are loaded into the global namespace dynamically; do not remove them.'''
try:
    from services import zbytb
    from services import ybw
    # from services import nmpa
    from services import site_monitor
except ImportError as e:
    print(f"Missing global module, reason: {e.args}")

app = Flask(__name__)

'''Authentication'''
auth = HTTPBasicAuth()

'''Chrome proxy state, keyed by client id'''
ChromeUser: dict = {}

'''User table'''
Users = mongo_table('py_spider', 'spider_scheduler_auth')
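
# A minimal sketch of seeding an auth user. This assumes the collection
# stores plain-text passwords (verify_password below hashes them on the
# fly); 'admin'/'secret' are placeholder credentials, not real ones:
#
#   Users.insert_one({'username': 'admin', 'password': 'secret'})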


@auth.verify_password
def verify_password(username, password):
    item = Users.find_one({'username': username})
    if item is not None:
        # The stored password is hashed on the fly so the comparison can
        # go through check_password_hash.
        user = {
            item['username']: generate_password_hash(item['password'])
        }
        if username in user and check_password_hash(user.get(username), password):
            return username
    return None
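
# Example of calling a protected endpoint with HTTP Basic auth (a sketch;
# assumes the server runs on the commented-out defaults at the bottom of
# this file, i.e. port 1405, and the placeholder credentials seeded above):
#
#   import requests
#   resp = requests.get('http://127.0.0.1:1405/', auth=('admin', 'secret'))
#   print(resp.status_code, resp.text)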


@app.route('/')
@auth.login_required
def index():
    return ' Hello, {}!<br><br> <a href="{}">Proxy pool usage</a>'.format(
        auth.username(),
        get_base_url() + '/crawl/proxy/query'
    )


@app.route('/proxy', methods=['GET'])
def chrome_proxy_plugin():
    global ChromeUser
    client = request.args.get('clientid')
    ip = request.remote_addr
    if client is None:
        return jsonify(data={})
    if client not in ChromeUser:
        ChromeUser.setdefault(client, {'chrome_use_proxy': True, 'ip': ip})
    else:
        config: dict = ChromeUser.get(client)
        config.update({'chrome_use_proxy': True})
        ChromeUser.update({client: config})
    logger.info(f"ChromeUser: {ChromeUser}")
    return jsonify(data=ChromeUser.get(client))


@app.route('/proxy/test', methods=['GET'])
def chrome_proxy_plugin_check():
    global ChromeUser
    client = request.args.get('clientid')
    if client is None or client not in ChromeUser:
        return 'false'
    else:
        config: dict = ChromeUser.get(client)
        if config.get('chrome_use_proxy'):
            config.update({'chrome_use_proxy': False})
            ChromeUser.update({client: config})
            return 'true'
        else:
            return 'false'
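
# The /proxy and /proxy/test pair act as a one-shot handshake: /proxy arms
# the 'chrome_use_proxy' flag for a clientid, and the first /proxy/test
# call consumes it ('true' once, then 'false' until re-armed). A sketch of
# the exchange, with 'c1' as a placeholder client id:
#
#   base = 'http://127.0.0.1:1405'
#   requests.get(base + '/proxy', params={'clientid': 'c1'})
#   requests.get(base + '/proxy/test', params={'clientid': 'c1'}).text  # 'true'
#   requests.get(base + '/proxy/test', params={'clientid': 'c1'}).text  # 'false'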


@app.route('/proxy/user/show', methods=['GET'])
@auth.login_required
def show_chrome_proxy_plugin_user():
    return jsonify(data=ChromeUser)


@app.route('/crawl/proxy/<scheme>/fetch', methods=['GET'])
@auth.login_required
def get_proxy(scheme):
    # logger.info(f'[client ip]{request.remote_addr}, class:{scheduler_class_name}')
    result = {}
    try:
        proxies = None
        if scheme == 'http':
            proxies = httpProxyPool.proxies()
        elif scheme == 'socks5':
            proxies = socks5ProxyPool.proxies()
        else:
            abort(404)
        logger.info(f'[{scheme} proxy served]{proxies}')
        if proxies is not None:
            result.update(proxies)
    except KeyError:
        pass
    return jsonify(data=result)
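
# Fetching a proxy from either pool (sketch; any scheme other than 'http'
# or 'socks5' is rejected with 404, and credentials are placeholders):
#
#   requests.get(base + '/crawl/proxy/socks5/fetch',
#                auth=('admin', 'secret')).json()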


@app.route('/crawl/proxy/query', methods=['GET'])
@auth.login_required
def show_proxy():
    socks_pool = socks5ProxyPool.get_proxy_pool()
    http_pool = httpProxyPool.get_proxy_pool()
    pool = [*socks_pool, *http_pool]
    return jsonify(data=pool)


@app.route('/upload/data/<scheduler_class_name>/<table>', methods=['POST'])
@auth.login_required
def upload_data(scheduler_class_name, table):
    data_json = request.json
    logger.info(f"[data received]{data_json}")
    try:
        scheduler_class = globals()[scheduler_class_name]
        scheduler_class.save_data(table, data_json)
        return 'success'
    except KeyError:
        return 'failure'
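
# Example upload (sketch; 'zbytb' is one of the dynamically loaded service
# modules above, while 'some_table' and the payload shape are placeholders
# for whatever that module's save_data expects):
#
#   requests.post(base + '/upload/data/zbytb/some_table',
#                 json={'field': 'value'}, auth=('admin', 'secret'))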


@app.route('/crawl/<scheduler_class_name>/task/fetch', methods=['GET'])
def get_crawl_task(scheduler_class_name):
    task = {}
    try:
        scheduler_class = globals()[scheduler_class_name]
        result = scheduler_class.get_crawl_task()
        if result is not None:
            task = result
    except KeyError:
        pass
    return jsonify(data=task)
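
# Pulling a task (sketch; an unknown scheduler name falls through the
# KeyError handler and yields an empty task rather than an error):
#
#   requests.get(base + '/crawl/zbytb/task/fetch').json()  # {'data': {...}}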


@app.route('/crawl/<scheduler_class_name>/task/total', methods=['GET'])
def get_crawl_task_total(scheduler_class_name):
    total = {'total': 0}
    try:
        scheduler_class = globals()[scheduler_class_name]
        total.update({'total': scheduler_class.task_total})
    except KeyError:
        pass
    return jsonify(data=total)


@app.route('/competing_goods/account/fetch', methods=['GET'])
@auth.login_required
def competing_goods_account_lock():
    req_ip = request.remote_addr
    site = request.args.get('site')
    crawl_type = request.args.get('crawl_type')
    result = accountManagePool.lock_account(site, crawl_type, req_ip)
    return jsonify(data=result)


@app.route('/competing_goods/account/release', methods=['GET'])
@auth.login_required
def competing_goods_account_release():
    req_ip = request.remote_addr
    uid = request.args.get('uid')
    crawl_type = request.args.get('crawl_type')
    if uid in [None, '']:
        abort(401)  # Unauthorized: uid is required
    res = accountManagePool.release_account(uid, crawl_type, req_ip)
    return jsonify(data=res)
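
# Typical account lifecycle (sketch; parameter values are placeholders,
# and the uid passed to release is assumed to come from the lock response):
#
#   r = requests.get(base + '/competing_goods/account/fetch',
#                    params={'site': 'ybw', 'crawl_type': 'list'},
#                    auth=('admin', 'secret')).json()
#   requests.get(base + '/competing_goods/account/release',
#                params={'uid': r['data'].get('uid'), 'crawl_type': 'list'},
#                auth=('admin', 'secret'))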


# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=1405, debug=True, use_reloader=False)