
Delete old code

dongzhaorui committed 2 months ago
commit d82db9ad37
100 changed files with 2695 additions and 10988 deletions
  1. .DS_Store (BIN)
  2. .idea/.gitignore (+0 -0)
  3. .idea/encodings.xml (+4 -0)
  4. .idea/inspectionProfiles/profiles_settings.xml (+6 -0)
  5. .idea/misc.xml (+4 -0)
  6. .idea/modules.xml (+8 -0)
  7. .idea/topic_spider.iml (+12 -0)
  8. .idea/vcs.xml (+6 -0)
  9. .idea/workspace.xml (+436 -0)
  10. find_source/.idea/.gitignore (+0 -0)
  11. find_source/.idea/encodings.xml (+4 -0)
  12. find_source/.idea/find_source.iml (+15 -0)
  13. find_source/.idea/inspectionProfiles/Project_Default.xml (+37 -0)
  14. find_source/.idea/inspectionProfiles/profiles_settings.xml (+6 -0)
  15. find_source/.idea/misc.xml (+4 -0)
  16. find_source/.idea/modules.xml (+8 -0)
  17. find_source/.idea/vcs.xml (+6 -0)
  18. find_source/.idea/workspace.xml (+865 -0)
  19. find_source/spiders/government.py (+200 -0)
  20. find_source/spiders/hospital.py (+166 -0)
  21. find_source/tests/add_task.py (+119 -0)
  22. find_source/tests/exa_channel.py (+50 -0)
  23. find_source/tests/predict_bidding_model.py (+13 -0)
  24. find_source/tests/test_predict_v2.py (+29 -0)
  25. geocode/.idea/.gitignore (+0 -0)
  26. geocode/.idea/geocode.iml (+12 -0)
  27. geocode/.idea/inspectionProfiles/Project_Default.xml (+25 -0)
  28. geocode/.idea/inspectionProfiles/profiles_settings.xml (+6 -0)
  29. geocode/.idea/misc.xml (+4 -0)
  30. geocode/.idea/modules.xml (+8 -0)
  31. geocode/.idea/vcs.xml (+6 -0)
  32. geocode/.idea/workspace.xml (+56 -0)
  33. jzsc/chaojiying.py (+0 -55)
  34. jzsc/config/conf.yaml (+0 -32)
  35. jzsc/config/constants.yaml (+0 -13)
  36. jzsc/config/load.py (+0 -35)
  37. jzsc/spider.py (+97 -85)
  38. jzsc/utils/databases.py (+0 -109)
  39. jzsc/utils/execptions.py (+0 -49)
  40. jzsc/utils/log.py (+0 -14)
  41. jzsc/utils/socks5.py (+0 -153)
  42. jzsc/utils/tools.py (+0 -24)
  43. zgzb/.idea/.gitignore (+0 -0)
  44. zgzb/.idea/encodings.xml (+4 -0)
  45. zgzb/.idea/inspectionProfiles/Project_Default.xml (+36 -0)
  46. zgzb/.idea/inspectionProfiles/profiles_settings.xml (+6 -0)
  47. zgzb/.idea/misc.xml (+4 -0)
  48. zgzb/.idea/modules.xml (+8 -0)
  49. zgzb/.idea/vcs.xml (+6 -0)
  50. zgzb/.idea/workspace.xml (+407 -0)
  51. zgzb/.idea/zgzb.iml (+12 -0)
  52. zgztb_cookie/FworkSpider/Dockerfile (+0 -64)
  53. zgztb_cookie/FworkSpider/MANIFEST.in (+0 -11)
  54. zgztb_cookie/FworkSpider/README.md (+0 -34)
  55. zgztb_cookie/FworkSpider/docker-compose.yml (+0 -45)
  56. zgztb_cookie/FworkSpider/feapder/VERSION (+0 -1)
  57. zgztb_cookie/FworkSpider/feapder/__init__.py (+0 -33)
  58. zgztb_cookie/FworkSpider/feapder/buffer/__init__.py (+0 -9)
  59. zgztb_cookie/FworkSpider/feapder/buffer/item_buffer.py (+0 -426)
  60. zgztb_cookie/FworkSpider/feapder/buffer/request_buffer.py (+0 -151)
  61. zgztb_cookie/FworkSpider/feapder/commands/cmdline.py (+0 -45)
  62. zgztb_cookie/FworkSpider/feapder/commands/create/__init__.py (+0 -21)
  63. zgztb_cookie/FworkSpider/feapder/commands/create/create_cookies.py (+0 -48)
  64. zgztb_cookie/FworkSpider/feapder/commands/create/create_init.py (+0 -30)
  65. zgztb_cookie/FworkSpider/feapder/commands/create/create_item.py (+0 -165)
  66. zgztb_cookie/FworkSpider/feapder/commands/create/create_json.py (+0 -52)
  67. zgztb_cookie/FworkSpider/feapder/commands/create/create_params.py (+0 -51)
  68. zgztb_cookie/FworkSpider/feapder/commands/create/create_project.py (+0 -52)
  69. zgztb_cookie/FworkSpider/feapder/commands/create/create_setting.py (+0 -27)
  70. zgztb_cookie/FworkSpider/feapder/commands/create/create_spider.py (+0 -102)
  71. zgztb_cookie/FworkSpider/feapder/commands/create/create_table.py (+0 -135)
  72. zgztb_cookie/FworkSpider/feapder/commands/create_builder.py (+0 -118)
  73. zgztb_cookie/FworkSpider/feapder/commands/shell.py (+0 -93)
  74. zgztb_cookie/FworkSpider/feapder/core/__init__.py (+0 -9)
  75. zgztb_cookie/FworkSpider/feapder/core/base_parser.py (+0 -216)
  76. zgztb_cookie/FworkSpider/feapder/core/collector.py (+0 -176)
  77. zgztb_cookie/FworkSpider/feapder/core/handle_failed_requests.py (+0 -56)
  78. zgztb_cookie/FworkSpider/feapder/core/parser_control.py (+0 -721)
  79. zgztb_cookie/FworkSpider/feapder/core/scheduler.py (+0 -579)
  80. zgztb_cookie/FworkSpider/feapder/core/spiders/__init__.py (+0 -15)
  81. zgztb_cookie/FworkSpider/feapder/core/spiders/air_spider.py (+0 -125)
  82. zgztb_cookie/FworkSpider/feapder/core/spiders/batch_spider.py (+0 -1273)
  83. zgztb_cookie/FworkSpider/feapder/core/spiders/spider.py (+0 -437)
  84. zgztb_cookie/FworkSpider/feapder/db/__init__.py (+0 -9)
  85. zgztb_cookie/FworkSpider/feapder/db/memory_db.py (+0 -37)
  86. zgztb_cookie/FworkSpider/feapder/db/mongodb.py (+0 -422)
  87. zgztb_cookie/FworkSpider/feapder/db/mysqldb.py (+0 -381)
  88. zgztb_cookie/FworkSpider/feapder/db/redisdb.py (+0 -848)
  89. zgztb_cookie/FworkSpider/feapder/dedup/README.md (+0 -156)
  90. zgztb_cookie/FworkSpider/feapder/dedup/__init__.py (+0 -177)
  91. zgztb_cookie/FworkSpider/feapder/dedup/basefilter.py (+0 -41)
  92. zgztb_cookie/FworkSpider/feapder/dedup/bitarray.py (+0 -143)
  93. zgztb_cookie/FworkSpider/feapder/dedup/bloomfilter.py (+0 -379)
  94. zgztb_cookie/FworkSpider/feapder/dedup/expirefilter.py (+0 -81)
  95. zgztb_cookie/FworkSpider/feapder/dedup/litefilter.py (+0 -70)
  96. zgztb_cookie/FworkSpider/feapder/dedup/redisfilter.py (+0 -131)
  97. zgztb_cookie/FworkSpider/feapder/network/cookie_pool.py (+0 -799)
  98. zgztb_cookie/FworkSpider/feapder/network/item.py (+0 -145)
  99. zgztb_cookie/FworkSpider/feapder/network/proxy_pool.py (+0 -753)
  100. zgztb_cookie/FworkSpider/feapder/network/request.py (+0 -527)

BIN
.DS_Store


+ 0 - 0
jzsc/config/__init__.py → .idea/.gitignore


+ 4 - 0
.idea/encodings.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>

+ 6 - 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (python38)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/topic_spider.iml" filepath="$PROJECT_DIR$/.idea/topic_spider.iml" />
+    </modules>
+  </component>
+</project>

+ 12 - 0
.idea/topic_spider.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.8 (python38)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 436 - 0
.idea/workspace.xml

@@ -0,0 +1,436 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="8c2f903f-55ef-4ced-987d-6ac010bf606d" name="Changes" comment="add project - 中国招标投标公共服务平台(未按规范数据)">
+      <change beforePath="$PROJECT_DIR$/jzsc/chaojiying.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/config/__init__.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/config/conf.yaml" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/config/constants.yaml" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/config/load.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/spider.py" beforeDir="false" afterPath="$PROJECT_DIR$/jzsc/spider.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/__init__.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/databases.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/execptions.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/log.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/socks5.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/jzsc/utils/tools.py" beforeDir="false" />
+    </list>
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="JavaScript File" />
+        <option value="HTML File" />
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="FlaskConsoleOptions" custom-start-script="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
+    <envs>
+      <env key="FLASK_APP" value="app" />
+    </envs>
+    <option name="myCustomStartScript" value="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
+    <option name="myEnvs">
+      <map>
+        <entry key="FLASK_APP" value="app" />
+      </map>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="GitSEFilterConfiguration">
+    <file-type-list>
+      <filtered-out-file-type name="LOCAL_BRANCH" />
+      <filtered-out-file-type name="REMOTE_BRANCH" />
+      <filtered-out-file-type name="TAG" />
+      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
+    </file-type-list>
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProblemsViewState">
+    <option name="selectedTabId" value="CurrentFile" />
+  </component>
+  <component name="ProjectColorInfo"><![CDATA[{
+  "associatedIndex": 1
+}]]></component>
+  <component name="ProjectId" id="26cDQQPzrwjA2Sg1Lq9nLzcrVU6" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "git-widget-placeholder": "master",
+    "node.js.detected.package.eslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$" />
+      <recent name="$PROJECT_DIR$/codes_hospital" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/codes_hospital" />
+    </key>
+  </component>
+  <component name="RunManager" selected="Python.crawl_hospital">
+    <configuration name="aaa (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="topic_spider" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/aaa.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="b" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="topic_spider" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/b.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="c" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="topic_spider" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/c.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="crawl_hospital" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="topic_spider" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/crawl_hospital.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="defaults" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="topic_spider" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/defaults.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.crawl_hospital" />
+        <item itemvalue="Python.c" />
+        <item itemvalue="Python.b" />
+        <item itemvalue="Python.defaults" />
+        <item itemvalue="Python.aaa (1)" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-js-predefined-1d06a55b98c1-0b3e54e931b4-JavaScript-PY-241.18034.82" />
+        <option value="bundled-python-sdk-975db3bf15a3-2767605e8bc2-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.18034.82" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="8c2f903f-55ef-4ced-987d-6ac010bf606d" name="Changes" comment="" />
+      <created>1647713963501</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1647713963501</updated>
+      <workItem from="1647713965332" duration="763000" />
+      <workItem from="1647910768281" duration="540000" />
+      <workItem from="1648631632649" duration="336000" />
+      <workItem from="1648694544136" duration="742000" />
+      <workItem from="1649240699733" duration="190000" />
+      <workItem from="1649661973115" duration="99000" />
+      <workItem from="1649668394783" duration="10000" />
+      <workItem from="1650511827592" duration="9000" />
+      <workItem from="1651908647592" duration="13000" />
+      <workItem from="1653448397924" duration="786000" />
+      <workItem from="1653622679966" duration="1000" />
+      <workItem from="1653900611887" duration="199000" />
+      <workItem from="1655950837952" duration="610000" />
+      <workItem from="1656551153853" duration="524000" />
+      <workItem from="1658974433813" duration="14000" />
+      <workItem from="1659510721458" duration="62787000" />
+      <workItem from="1659931805307" duration="2748000" />
+      <workItem from="1660016111330" duration="184000" />
+      <workItem from="1660020728622" duration="1379000" />
+      <workItem from="1660032595629" duration="9306000" />
+      <workItem from="1660108524756" duration="3319000" />
+      <workItem from="1660117699024" duration="24296000" />
+      <workItem from="1660203314924" duration="6999000" />
+      <workItem from="1660266196873" duration="16000" />
+      <workItem from="1660290776459" duration="3956000" />
+      <workItem from="1660525419402" duration="7502000" />
+      <workItem from="1660611424284" duration="4765000" />
+      <workItem from="1660640960007" duration="468000" />
+      <workItem from="1660697369474" duration="24146000" />
+      <workItem from="1660789633638" duration="4806000" />
+      <workItem from="1661216042374" duration="2194000" />
+      <workItem from="1661308591662" duration="149000" />
+      <workItem from="1661330414839" duration="415000" />
+      <workItem from="1661390503915" duration="498000" />
+      <workItem from="1661739076245" duration="78000" />
+      <workItem from="1661820771088" duration="10000" />
+      <workItem from="1661849635521" duration="90000" />
+      <workItem from="1662100724285" duration="28000" />
+      <workItem from="1662367616316" duration="715000" />
+      <workItem from="1662705496621" duration="566000" />
+      <workItem from="1734574047690" duration="179000" />
+    </task>
+    <task id="LOCAL-00001" summary="新增 - 全国统一组织查询">
+      <created>1660201252821</created>
+      <option name="number" value="00001" />
+      <option name="presentableId" value="LOCAL-00001" />
+      <option name="project" value="LOCAL" />
+      <updated>1660201252821</updated>
+    </task>
+    <task id="LOCAL-00002" summary="fixbug">
+      <created>1660207650214</created>
+      <option name="number" value="00002" />
+      <option name="presentableId" value="LOCAL-00002" />
+      <option name="project" value="LOCAL" />
+      <updated>1660207650214</updated>
+    </task>
+    <task id="LOCAL-00003" summary="fixbug">
+      <created>1660211038526</created>
+      <option name="number" value="00003" />
+      <option name="presentableId" value="LOCAL-00003" />
+      <option name="project" value="LOCAL" />
+      <updated>1660211038526</updated>
+    </task>
+    <task id="LOCAL-00004" summary="fixbug">
+      <created>1660302204646</created>
+      <option name="number" value="00004" />
+      <option name="presentableId" value="LOCAL-00004" />
+      <option name="project" value="LOCAL" />
+      <updated>1660302204646</updated>
+    </task>
+    <task id="LOCAL-00005" summary="fixbug">
+      <created>1660542942569</created>
+      <option name="number" value="00005" />
+      <option name="presentableId" value="LOCAL-00005" />
+      <option name="project" value="LOCAL" />
+      <updated>1660542942569</updated>
+    </task>
+    <task id="LOCAL-00006" summary="fixbug">
+      <created>1660616796886</created>
+      <option name="number" value="00006" />
+      <option name="presentableId" value="LOCAL-00006" />
+      <option name="project" value="LOCAL" />
+      <updated>1660616796886</updated>
+    </task>
+    <task id="LOCAL-00007" summary="update">
+      <created>1660707257585</created>
+      <option name="number" value="00007" />
+      <option name="presentableId" value="LOCAL-00007" />
+      <option name="project" value="LOCAL" />
+      <updated>1660707257585</updated>
+    </task>
+    <task id="LOCAL-00008" summary="fixbug">
+      <created>1660707264013</created>
+      <option name="number" value="00008" />
+      <option name="presentableId" value="LOCAL-00008" />
+      <option name="project" value="LOCAL" />
+      <updated>1660707264013</updated>
+    </task>
+    <task id="LOCAL-00009" summary="update">
+      <created>1660718674713</created>
+      <option name="number" value="00009" />
+      <option name="presentableId" value="LOCAL-00009" />
+      <option name="project" value="LOCAL" />
+      <updated>1660718674713</updated>
+    </task>
+    <task id="LOCAL-00010" summary="fixbug">
+      <created>1660718680064</created>
+      <option name="number" value="00010" />
+      <option name="presentableId" value="LOCAL-00010" />
+      <option name="project" value="LOCAL" />
+      <updated>1660718680064</updated>
+    </task>
+    <task id="LOCAL-00011" summary="fixbug">
+      <created>1660725321099</created>
+      <option name="number" value="00011" />
+      <option name="presentableId" value="LOCAL-00011" />
+      <option name="project" value="LOCAL" />
+      <updated>1660725321099</updated>
+    </task>
+    <task id="LOCAL-00012" summary="fixbug">
+      <created>1660734122824</created>
+      <option name="number" value="00012" />
+      <option name="presentableId" value="LOCAL-00012" />
+      <option name="project" value="LOCAL" />
+      <updated>1660734122824</updated>
+    </task>
+    <task id="LOCAL-00013" summary="update">
+      <created>1660872544902</created>
+      <option name="number" value="00013" />
+      <option name="presentableId" value="LOCAL-00013" />
+      <option name="project" value="LOCAL" />
+      <updated>1660872544903</updated>
+    </task>
+    <task id="LOCAL-00014" summary="update">
+      <created>1660872629395</created>
+      <option name="number" value="00014" />
+      <option name="presentableId" value="LOCAL-00014" />
+      <option name="project" value="LOCAL" />
+      <updated>1660872629395</updated>
+    </task>
+    <task id="LOCAL-00015" summary="update">
+      <created>1660896035987</created>
+      <option name="number" value="00015" />
+      <option name="presentableId" value="LOCAL-00015" />
+      <option name="project" value="LOCAL" />
+      <updated>1660896035987</updated>
+    </task>
+    <task id="LOCAL-00016" summary="add project - 中国招标投标公共服务平台(未按规范数据)">
+      <created>1661330637516</created>
+      <option name="number" value="00016" />
+      <option name="presentableId" value="LOCAL-00016" />
+      <option name="project" value="LOCAL" />
+      <updated>1661330637517</updated>
+    </task>
+    <option name="localTasksCounter" value="17" />
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="VcsManagerConfiguration">
+    <MESSAGE value="新增 - 全国统一组织查询" />
+    <MESSAGE value="fixbug" />
+    <MESSAGE value="update" />
+    <MESSAGE value="add project - 中国招标投标公共服务平台(未按规范数据)" />
+    <option name="LAST_COMMIT_MESSAGE" value="add project - 中国招标投标公共服务平台(未按规范数据)" />
+  </component>
+  <component name="XDebuggerManager">
+    <breakpoint-manager>
+      <breakpoints>
+        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
+          <url>file://$PROJECT_DIR$/codes_hospital/sp1.py</url>
+          <line>421</line>
+          <option name="timeStamp" value="13" />
+        </line-breakpoint>
+      </breakpoints>
+    </breakpoint-manager>
+  </component>
+  <component name="com.intellij.coverage.CoverageDataManagerImpl">
+    <SUITE FILE_PATH="coverage/topic_spider$query_hospital.coverage" NAME="query_spider Coverage Results" MODIFIED="1659847227682" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$aaa.coverage" NAME="aaa Coverage Results" MODIFIED="1660111607631" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/topic_spider$aaaa.coverage" NAME="aaaa Coverage Results" MODIFIED="1660124559170" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$aaa__1_.coverage" NAME="aaa (1) Coverage Results" MODIFIED="1660706922075" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$c.coverage" NAME="c Coverage Results" MODIFIED="1660731422968" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$crawl_hospital.coverage" NAME="crawl_hospital Coverage Results" MODIFIED="1660733510071" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$search.coverage" NAME="retrieval Coverage Results" MODIFIED="1659587342978" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$crawl_spider.coverage" NAME="crawl_spider Coverage Results" MODIFIED="1659587372586" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$b.coverage" NAME="b Coverage Results" MODIFIED="1660731393321" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$a3.coverage" NAME="a3 Coverage Results" MODIFIED="1659786080163" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/topic_spider$crawl_hospital_3.coverage" NAME="crawl_hospital_3 Coverage Results" MODIFIED="1660704358659" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$a2.coverage" NAME="t1 Coverage Results" MODIFIED="1660103182145" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$defaults.coverage" NAME="defaults Coverage Results" MODIFIED="1660730508043" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$query_spider.coverage" NAME="query_spider Coverage Results" MODIFIED="1660099299022" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+    <SUITE FILE_PATH="coverage/topic_spider$step1.coverage" NAME="step1 Coverage Results" MODIFIED="1660112070606" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/topic_spider$t1.coverage" NAME="t1 Coverage Results" MODIFIED="1660117720005" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
+  </component>
+</project>

+ 0 - 0
jzsc/utils/__init__.py → find_source/.idea/.gitignore


+ 4 - 0
find_source/.idea/encodings.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>

+ 15 - 0
find_source/.idea/find_source.iml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.8 (py38)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PackageRequirementsSettings">
+    <option name="requirementsPath" value="$MODULE_DIR$/requirements.txt" />
+    <option name="removeUnused" value="true" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>

+ 37 - 0
find_source/.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,37 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="24">
+            <item index="0" class="java.lang.String" itemvalue="rsa" />
+            <item index="1" class="java.lang.String" itemvalue="greenlet" />
+            <item index="2" class="java.lang.String" itemvalue="ws4py" />
+            <item index="3" class="java.lang.String" itemvalue="mysql-connector" />
+            <item index="4" class="java.lang.String" itemvalue="cffi" />
+            <item index="5" class="java.lang.String" itemvalue="asgiref" />
+            <item index="6" class="java.lang.String" itemvalue="zope.interface" />
+            <item index="7" class="java.lang.String" itemvalue="et-xmlfile" />
+            <item index="8" class="java.lang.String" itemvalue="pyasn1" />
+            <item index="9" class="java.lang.String" itemvalue="pycparser" />
+            <item index="10" class="java.lang.String" itemvalue="sqlparse" />
+            <item index="11" class="java.lang.String" itemvalue="jdcal" />
+            <item index="12" class="java.lang.String" itemvalue="ddt" />
+            <item index="13" class="java.lang.String" itemvalue="websocket" />
+            <item index="14" class="java.lang.String" itemvalue="gevent" />
+            <item index="15" class="java.lang.String" itemvalue="PyMySQL" />
+            <item index="16" class="java.lang.String" itemvalue="zope.event" />
+            <item index="17" class="java.lang.String" itemvalue="openpyxl" />
+            <item index="18" class="java.lang.String" itemvalue="Flask-Script" />
+            <item index="19" class="java.lang.String" itemvalue="Flask" />
+            <item index="20" class="java.lang.String" itemvalue="pymongo" />
+            <item index="21" class="java.lang.String" itemvalue="requests" />
+            <item index="22" class="java.lang.String" itemvalue="redis" />
+            <item index="23" class="java.lang.String" itemvalue="numpy" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
find_source/.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
find_source/.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (py38)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
find_source/.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/find_source.iml" filepath="$PROJECT_DIR$/.idea/find_source.iml" />
+    </modules>
+  </component>
+</project>

+ 6 - 0
find_source/.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+  </component>
+</project>

+ 865 - 0
find_source/.idea/workspace.xml

@@ -0,0 +1,865 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="638282b5-fbaf-4e70-b78d-3ba2d271ee5b" name="Changes" comment="更新下载器访问失败时返回的文本流">
+      <change beforePath="$PROJECT_DIR$/../jzsc/chaojiying.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/config/__init__.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/config/conf.yaml" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/config/constants.yaml" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/config/load.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/spider.py" beforeDir="false" afterPath="$PROJECT_DIR$/../jzsc/spider.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/__init__.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/databases.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/execptions.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/log.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/socks5.py" beforeDir="false" />
+      <change beforePath="$PROJECT_DIR$/../jzsc/utils/tools.py" beforeDir="false" />
+    </list>
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="HTML File" />
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="FlaskConsoleOptions" custom-start-script="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
+    <envs>
+      <env key="FLASK_APP" value="app" />
+    </envs>
+    <option name="myCustomStartScript" value="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
+    <option name="myEnvs">
+      <map>
+        <entry key="FLASK_APP" value="app" />
+      </map>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+  </component>
+  <component name="GitSEFilterConfiguration">
+    <file-type-list>
+      <filtered-out-file-type name="LOCAL_BRANCH" />
+      <filtered-out-file-type name="REMOTE_BRANCH" />
+      <filtered-out-file-type name="TAG" />
+      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
+    </file-type-list>
+  </component>
+  <component name="HighlightingSettingsPerFile">
+    <setting file="file://$PROJECT_DIR$/common/log.py" root0="FORCE_HIGHLIGHTING" />
+    <setting file="file://$PROJECT_DIR$/crawler/defaults.py" root0="FORCE_HIGHLIGHTING" />
+    <setting file="file://$PROJECT_DIR$/crawler/q.py" root0="FORCE_HIGHLIGHTING" />
+    <setting file="file://$PROJECT_DIR$/crawler/utils.py" root0="FORCE_HIGHLIGHTING" />
+    <setting file="file://$PROJECT_DIR$/t_channel.py" root0="FORCE_HIGHLIGHTING" />
+    <setting file="file://$PROJECT_DIR$/tests/exa_channel.py" root0="FORCE_HIGHLIGHTING" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProblemsViewState">
+    <option name="selectedTabId" value="CurrentFile" />
+  </component>
+  <component name="ProjectColorInfo"><![CDATA[{
+  "customColor": "",
+  "associatedIndex": 8
+}]]></component>
+  <component name="ProjectId" id="27dtw01v0eDvTSP1lWouR4OZww9" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true">
+    <ConfirmationsSetting value="1" id="Add" />
+  </component>
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "DefaultHtmlFileTemplate": "HTML File",
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "WebServerToolWindowFactoryState": "false",
+    "git-widget-placeholder": "master",
+    "last_opened_file_path": "/Users/dongzhaorui/Desktop/swordfish/work/topic_spider/find_source/predict_bidding_model_v2/lib",
+    "node.js.detected.package.eslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyIntegratedToolsModulesConfigurable",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/predict_bidding_model_v2/lib" />
+      <recent name="$PROJECT_DIR$" />
+      <recent name="$PROJECT_DIR$/predict_bidding_model" />
+      <recent name="$PROJECT_DIR$/exists_bidding/models" />
+      <recent name="$PROJECT_DIR$/exists_bidding/utils" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/tests" />
+      <recent name="$PROJECT_DIR$/spiders" />
+      <recent name="$PROJECT_DIR$" />
+      <recent name="$PROJECT_DIR$/crawler" />
+      <recent name="$PROJECT_DIR$/cj_analyse" />
+    </key>
+  </component>
+  <component name="RunManager" selected="Python.government">
+    <configuration name="aa" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="find_source" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/spiders" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/spiders/aa.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="build_excavate" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="find_source" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/build_excavate.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="download" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="find_source" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/download.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="government" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="find_source" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/spiders" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/spiders/government.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="test_predict_v2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="find_source" />
+      <option name="ENV_FILES" value="" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/tests" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/tests/test_predict_v2.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <list>
+      <item itemvalue="Python.aa" />
+      <item itemvalue="Python.build_excavate" />
+      <item itemvalue="Python.download" />
+      <item itemvalue="Python.government" />
+      <item itemvalue="Python.test_predict_v2" />
+    </list>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.government" />
+        <item itemvalue="Python.download" />
+        <item itemvalue="Python.build_excavate" />
+        <item itemvalue="Python.test_predict_v2" />
+        <item itemvalue="Python.aa" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-js-predefined-1d06a55b98c1-0b3e54e931b4-JavaScript-PY-241.18034.82" />
+        <option value="bundled-python-sdk-975db3bf15a3-2767605e8bc2-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.18034.82" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="638282b5-fbaf-4e70-b78d-3ba2d271ee5b" name="Changes" comment="" />
+      <created>1649662024755</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1649662024755</updated>
+      <workItem from="1649662025905" duration="4622000" />
+      <workItem from="1649724850569" duration="26193000" />
+      <workItem from="1649822456194" duration="96927000" />
+      <workItem from="1650242926194" duration="221701000" />
+      <workItem from="1651112646963" duration="150000" />
+      <workItem from="1651112800769" duration="43193000" />
+      <workItem from="1651833699291" duration="5000" />
+      <workItem from="1651907537951" duration="1236000" />
+      <workItem from="1652060693262" duration="6287000" />
+      <workItem from="1652164281005" duration="1710000" />
+      <workItem from="1652173230553" duration="15898000" />
+      <workItem from="1652324120730" duration="156568000" />
+      <workItem from="1653526630817" duration="1806000" />
+      <workItem from="1653976628855" duration="88000" />
+      <workItem from="1654146982188" duration="2573000" />
+      <workItem from="1654484709618" duration="478000" />
+      <workItem from="1654580999117" duration="736000" />
+      <workItem from="1654769985745" duration="16111000" />
+      <workItem from="1654914173207" duration="14985000" />
+      <workItem from="1655080389725" duration="57425000" />
+      <workItem from="1656038784480" duration="10566000" />
+      <workItem from="1656383487994" duration="5000" />
+      <workItem from="1656383510966" duration="51357000" />
+      <workItem from="1656927550295" duration="17701000" />
+      <workItem from="1657004311017" duration="135671000" />
+      <workItem from="1657769510827" duration="3791000" />
+      <workItem from="1658109366969" duration="3839000" />
+      <workItem from="1658385543224" duration="2029000" />
+      <workItem from="1658457395250" duration="2409000" />
+      <workItem from="1658714558796" duration="27222000" />
+      <workItem from="1658825422658" duration="76400000" />
+      <workItem from="1660201354808" duration="1992000" />
+      <workItem from="1660550260265" duration="41147000" />
+      <workItem from="1661160815568" duration="226000" />
+      <workItem from="1661408561553" duration="190000" />
+      <workItem from="1661599967710" duration="451000" />
+      <workItem from="1662366609913" duration="15811000" />
+      <workItem from="1663035676698" duration="26688000" />
+      <workItem from="1663131002840" duration="35248000" />
+      <workItem from="1663901537554" duration="1854000" />
+      <workItem from="1664163211226" duration="1842000" />
+      <workItem from="1666251386764" duration="50000" />
+      <workItem from="1668649331530" duration="7618000" />
+      <workItem from="1669172288130" duration="20000" />
+      <workItem from="1669858287683" duration="12593000" />
+      <workItem from="1669954914968" duration="9551000" />
+      <workItem from="1670036270856" duration="1056000" />
+      <workItem from="1670054172931" duration="6000" />
+      <workItem from="1670061826512" duration="564000" />
+      <workItem from="1670062647826" duration="284000" />
+      <workItem from="1670087602979" duration="424000" />
+      <workItem from="1670201381130" duration="41693000" />
+      <workItem from="1670391187659" duration="13272000" />
+      <workItem from="1670547139259" duration="1378000" />
+      <workItem from="1672376221165" duration="399000" />
+      <workItem from="1673318267288" duration="222000" />
+      <workItem from="1675992042987" duration="119000" />
+      <workItem from="1677296362508" duration="136000" />
+      <workItem from="1677313939119" duration="9000" />
+      <workItem from="1678690923574" duration="292000" />
+      <workItem from="1681088700061" duration="51000" />
+      <workItem from="1681364217330" duration="17000" />
+      <workItem from="1683618805864" duration="1780000" />
+      <workItem from="1684113778407" duration="43000" />
+      <workItem from="1684113989302" duration="314000" />
+      <workItem from="1684114343499" duration="223000" />
+      <workItem from="1684114780227" duration="633000" />
+      <workItem from="1688103861695" duration="72000" />
+      <workItem from="1688530619253" duration="723000" />
+      <workItem from="1689298951057" duration="151000" />
+      <workItem from="1689313400721" duration="11000" />
+      <workItem from="1689580286563" duration="12942000" />
+      <workItem from="1689661092117" duration="4650000" />
+      <workItem from="1690166160230" duration="940000" />
+      <workItem from="1690334527269" duration="46000" />
+      <workItem from="1690336910207" duration="27000" />
+      <workItem from="1690428828546" duration="1000" />
+      <workItem from="1691118375151" duration="4000" />
+      <workItem from="1697598708258" duration="444000" />
+      <workItem from="1698119322453" duration="208000" />
+      <workItem from="1698124393195" duration="900000" />
+      <workItem from="1698132365473" duration="1326000" />
+      <workItem from="1698391894333" duration="1596000" />
+      <workItem from="1698815501602" duration="3127000" />
+      <workItem from="1698825313802" duration="1796000" />
+      <workItem from="1698911627371" duration="102000" />
+      <workItem from="1698911737282" duration="2000" />
+      <workItem from="1698913319970" duration="578000" />
+      <workItem from="1699861315716" duration="1586000" />
+      <workItem from="1699930633781" duration="88000" />
+      <workItem from="1700538019401" duration="733000" />
+      <workItem from="1700554283512" duration="4923000" />
+      <workItem from="1701159686230" duration="1388000" />
+      <workItem from="1701676230469" duration="4683000" />
+      <workItem from="1701681043837" duration="308000" />
+      <workItem from="1701754048518" duration="65000" />
+      <workItem from="1701765819740" duration="1517000" />
+      <workItem from="1701853814873" duration="335000" />
+      <workItem from="1701855396314" duration="221000" />
+      <workItem from="1701855633632" duration="42027000" />
+      <workItem from="1702175067813" duration="10918000" />
+      <workItem from="1711524574163" duration="8000" />
+      <workItem from="1740538877768" duration="13000" />
+    </task>
+    <task id="LOCAL-00228" summary="fixbug">
+      <created>1660552580496</created>
+      <option name="number" value="00228" />
+      <option name="presentableId" value="LOCAL-00228" />
+      <option name="project" value="LOCAL" />
+      <updated>1660552580496</updated>
+    </task>
+    <task id="LOCAL-00229" summary="update">
+      <created>1660628064516</created>
+      <option name="number" value="00229" />
+      <option name="presentableId" value="LOCAL-00229" />
+      <option name="project" value="LOCAL" />
+      <updated>1660628064516</updated>
+    </task>
+    <task id="LOCAL-00230" summary="fixbug - 布隆过滤器的redis分布式锁问题修复">
+      <created>1663052588413</created>
+      <option name="number" value="00230" />
+      <option name="presentableId" value="LOCAL-00230" />
+      <option name="project" value="LOCAL" />
+      <updated>1663052588414</updated>
+    </task>
+    <task id="LOCAL-00231" summary="fixbug - 布隆过滤器的redis分布式锁问题修复">
+      <created>1663123418522</created>
+      <option name="number" value="00231" />
+      <option name="presentableId" value="LOCAL-00231" />
+      <option name="project" value="LOCAL" />
+      <updated>1663123418522</updated>
+    </task>
+    <task id="LOCAL-00232" summary="new add - 数据查询">
+      <created>1663141313606</created>
+      <option name="number" value="00232" />
+      <option name="presentableId" value="LOCAL-00232" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141313606</updated>
+    </task>
+    <task id="LOCAL-00233" summary="new add - 数据同步">
+      <created>1663141321022</created>
+      <option name="number" value="00233" />
+      <option name="presentableId" value="LOCAL-00233" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141321022</updated>
+    </task>
+    <task id="LOCAL-00234" summary="new add - 招投标预测模型">
+      <created>1663141459996</created>
+      <option name="number" value="00234" />
+      <option name="presentableId" value="LOCAL-00234" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141459996</updated>
+    </task>
+    <task id="LOCAL-00235" summary="update - 数据同步">
+      <created>1663141486390</created>
+      <option name="number" value="00235" />
+      <option name="presentableId" value="LOCAL-00235" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141486390</updated>
+    </task>
+    <task id="LOCAL-00236" summary="update - 初始化引入路径">
+      <created>1663141506482</created>
+      <option name="number" value="00236" />
+      <option name="presentableId" value="LOCAL-00236" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141506482</updated>
+    </task>
+    <task id="LOCAL-00237" summary="fixbug - 布隆过滤器的redis分布式锁问题修复">
+      <created>1663141537906</created>
+      <option name="number" value="00237" />
+      <option name="presentableId" value="LOCAL-00237" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141537906</updated>
+    </task>
+    <task id="LOCAL-00238" summary="update - 请求头添加资源压缩">
+      <created>1663141582041</created>
+      <option name="number" value="00238" />
+      <option name="presentableId" value="LOCAL-00238" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141582041</updated>
+    </task>
+    <task id="LOCAL-00239" summary="update - 数据库配置">
+      <created>1663141617950</created>
+      <option name="number" value="00239" />
+      <option name="presentableId" value="LOCAL-00239" />
+      <option name="project" value="LOCAL" />
+      <updated>1663141617950</updated>
+    </task>
+    <task id="LOCAL-00240" summary="fixbug - 下载器(session)会话异常时阻塞问题修复">
+      <created>1663222875035</created>
+      <option name="number" value="00240" />
+      <option name="presentableId" value="LOCAL-00240" />
+      <option name="project" value="LOCAL" />
+      <updated>1663222875036</updated>
+    </task>
+    <task id="LOCAL-00241" summary="update - 新增文本压缩方法">
+      <created>1663226654239</created>
+      <option name="number" value="00241" />
+      <option name="presentableId" value="LOCAL-00241" />
+      <option name="project" value="LOCAL" />
+      <updated>1663226654239</updated>
+    </task>
+    <task id="LOCAL-00242" summary="update - 添加文本特征检查方法">
+      <created>1663228021498</created>
+      <option name="number" value="00242" />
+      <option name="presentableId" value="LOCAL-00242" />
+      <option name="project" value="LOCAL" />
+      <updated>1663228021498</updated>
+    </task>
+    <task id="LOCAL-00243" summary="fixbug">
+      <created>1663293850525</created>
+      <option name="number" value="00243" />
+      <option name="presentableId" value="LOCAL-00243" />
+      <option name="project" value="LOCAL" />
+      <updated>1663293850525</updated>
+    </task>
+    <task id="LOCAL-00244" summary="fixbug - 完善清洗页面标签与属性导致的lxml.etree.ParserError问题">
+      <created>1663297691245</created>
+      <option name="number" value="00244" />
+      <option name="presentableId" value="LOCAL-00244" />
+      <option name="project" value="LOCAL" />
+      <updated>1663297691245</updated>
+    </task>
+    <task id="LOCAL-00245" summary="fixbug - 时间抽取规则修正">
+      <created>1663911890163</created>
+      <option name="number" value="00245" />
+      <option name="presentableId" value="LOCAL-00245" />
+      <option name="project" value="LOCAL" />
+      <updated>1663911890164</updated>
+    </task>
+    <task id="LOCAL-00246" summary="update - 数据挖掘文件更名">
+      <created>1668651616602</created>
+      <option name="number" value="00246" />
+      <option name="presentableId" value="LOCAL-00246" />
+      <option name="project" value="LOCAL" />
+      <updated>1668651616602</updated>
+    </task>
+    <task id="LOCAL-00247" summary="update">
+      <created>1669950265629</created>
+      <option name="number" value="00247" />
+      <option name="presentableId" value="LOCAL-00247" />
+      <option name="project" value="LOCAL" />
+      <updated>1669950265629</updated>
+    </task>
+    <task id="LOCAL-00248" summary="update">
+      <created>1669950418770</created>
+      <option name="number" value="00248" />
+      <option name="presentableId" value="LOCAL-00248" />
+      <option name="project" value="LOCAL" />
+      <updated>1669950418770</updated>
+    </task>
+    <task id="LOCAL-00249" summary="update">
+      <created>1669960314120</created>
+      <option name="number" value="00249" />
+      <option name="presentableId" value="LOCAL-00249" />
+      <option name="project" value="LOCAL" />
+      <updated>1669960314121</updated>
+    </task>
+    <task id="LOCAL-00250" summary="update:使用招投标预测模型检测站点是否招投标网站类型">
+      <created>1669962502993</created>
+      <option name="number" value="00250" />
+      <option name="presentableId" value="LOCAL-00250" />
+      <option name="project" value="LOCAL" />
+      <updated>1669962502993</updated>
+    </task>
+    <task id="LOCAL-00251" summary="添加项目依赖">
+      <created>1669962519569</created>
+      <option name="number" value="00251" />
+      <option name="presentableId" value="LOCAL-00251" />
+      <option name="project" value="LOCAL" />
+      <updated>1669962519569</updated>
+    </task>
+    <task id="LOCAL-00252" summary="添加Dockerfile">
+      <created>1669964391626</created>
+      <option name="number" value="00252" />
+      <option name="presentableId" value="LOCAL-00252" />
+      <option name="project" value="LOCAL" />
+      <updated>1669964391626</updated>
+    </task>
+    <task id="LOCAL-00253" summary="fixbug: 修复工作者参数类型错误">
+      <created>1669964423269</created>
+      <option name="number" value="00253" />
+      <option name="presentableId" value="LOCAL-00253" />
+      <option name="project" value="LOCAL" />
+      <updated>1669964423270</updated>
+    </task>
+    <task id="LOCAL-00254" summary="新增数据同步构建配置">
+      <created>1670316196521</created>
+      <option name="number" value="00254" />
+      <option name="presentableId" value="LOCAL-00254" />
+      <option name="project" value="LOCAL" />
+      <updated>1670316196521</updated>
+    </task>
+    <task id="LOCAL-00255" summary="更新lua表代理访问地址">
+      <created>1670316218248</created>
+      <option name="number" value="00255" />
+      <option name="presentableId" value="LOCAL-00255" />
+      <option name="project" value="LOCAL" />
+      <updated>1670316218248</updated>
+    </task>
+    <task id="LOCAL-00256" summary="更新配置">
+      <created>1670316252460</created>
+      <option name="number" value="00256" />
+      <option name="presentableId" value="LOCAL-00256" />
+      <option name="project" value="LOCAL" />
+      <updated>1670316252460</updated>
+    </task>
+    <task id="LOCAL-00257" summary="更新Dockerfile">
+      <created>1670392600651</created>
+      <option name="number" value="00257" />
+      <option name="presentableId" value="LOCAL-00257" />
+      <option name="project" value="LOCAL" />
+      <updated>1670392600651</updated>
+    </task>
+    <task id="LOCAL-00258" summary="fixbug:修正Task不正确的属性">
+      <created>1670462199980</created>
+      <option name="number" value="00258" />
+      <option name="presentableId" value="LOCAL-00258" />
+      <option name="project" value="LOCAL" />
+      <updated>1670462199980</updated>
+    </task>
+    <task id="LOCAL-00259" summary="update">
+      <created>1670462318491</created>
+      <option name="number" value="00259" />
+      <option name="presentableId" value="LOCAL-00259" />
+      <option name="project" value="LOCAL" />
+      <updated>1670462318492</updated>
+    </task>
+    <task id="LOCAL-00260" summary="update">
+      <created>1670548210057</created>
+      <option name="number" value="00260" />
+      <option name="presentableId" value="LOCAL-00260" />
+      <option name="project" value="LOCAL" />
+      <updated>1670548210057</updated>
+    </task>
+    <task id="LOCAL-00261" summary="update">
+      <created>1689591233131</created>
+      <option name="number" value="00261" />
+      <option name="presentableId" value="LOCAL-00261" />
+      <option name="project" value="LOCAL" />
+      <updated>1689591233132</updated>
+    </task>
+    <task id="LOCAL-00262" summary="添加docker镜像与编排脚本">
+      <created>1689591291835</created>
+      <option name="number" value="00262" />
+      <option name="presentableId" value="LOCAL-00262" />
+      <option name="project" value="LOCAL" />
+      <updated>1689591291836</updated>
+    </task>
+    <task id="LOCAL-00263" summary="redis数据类型校验异常修复">
+      <created>1689592034661</created>
+      <option name="number" value="00263" />
+      <option name="presentableId" value="LOCAL-00263" />
+      <option name="project" value="LOCAL" />
+      <updated>1689592034661</updated>
+    </task>
+    <task id="LOCAL-00264" summary="删除失效配置">
+      <created>1689666120193</created>
+      <option name="number" value="00264" />
+      <option name="presentableId" value="LOCAL-00264" />
+      <option name="project" value="LOCAL" />
+      <updated>1689666120193</updated>
+    </task>
+    <task id="LOCAL-00265" summary="修正redis读取文本内容类型错误">
+      <created>1689666152711</created>
+      <option name="number" value="00265" />
+      <option name="presentableId" value="LOCAL-00265" />
+      <option name="project" value="LOCAL" />
+      <updated>1689666152711</updated>
+    </task>
+    <task id="LOCAL-00266" summary="更新es测试环境配置">
+      <created>1698394553080</created>
+      <option name="number" value="00266" />
+      <option name="presentableId" value="LOCAL-00266" />
+      <option name="project" value="LOCAL" />
+      <updated>1698394553080</updated>
+    </task>
+    <task id="LOCAL-00267" summary="更新 docker 配置">
+      <created>1700555337531</created>
+      <option name="number" value="00267" />
+      <option name="presentableId" value="LOCAL-00267" />
+      <option name="project" value="LOCAL" />
+      <updated>1700555337531</updated>
+    </task>
+    <task id="LOCAL-00268" summary="更新去重入库的分拣条件">
+      <created>1700558514305</created>
+      <option name="number" value="00268" />
+      <option name="presentableId" value="LOCAL-00268" />
+      <option name="project" value="LOCAL" />
+      <updated>1700558514305</updated>
+    </task>
+    <task id="LOCAL-00269" summary="处理预测模型文本为空异常">
+      <created>1701680694150</created>
+      <option name="number" value="00269" />
+      <option name="presentableId" value="LOCAL-00269" />
+      <option name="project" value="LOCAL" />
+      <updated>1701680694150</updated>
+    </task>
+    <task id="LOCAL-00270" summary="1、种子不与已收录判重&#10;2、域名判重时使用全称&#10;3、关键词计数&#10;关键词计数为5以上网站   标记为1&#10;关键词计数为3-5网站    标记为2">
+      <created>1701681084123</created>
+      <option name="number" value="00270" />
+      <option name="presentableId" value="LOCAL-00270" />
+      <option name="project" value="LOCAL" />
+      <updated>1701681084123</updated>
+    </task>
+    <task id="LOCAL-00271" summary="添加新 &quot;信息是否归属招投标预测&quot; 模型">
+      <created>1701919315872</created>
+      <option name="number" value="00271" />
+      <option name="presentableId" value="LOCAL-00271" />
+      <option name="project" value="LOCAL" />
+      <updated>1701919315872</updated>
+    </task>
+    <task id="LOCAL-00272" summary="添加全局禁用告警">
+      <created>1702006598460</created>
+      <option name="number" value="00272" />
+      <option name="presentableId" value="LOCAL-00272" />
+      <option name="project" value="LOCAL" />
+      <updated>1702006598461</updated>
+    </task>
+    <task id="LOCAL-00273" summary="添加新模型">
+      <created>1702017058885</created>
+      <option name="number" value="00273" />
+      <option name="presentableId" value="LOCAL-00273" />
+      <option name="project" value="LOCAL" />
+      <updated>1702017058885</updated>
+    </task>
+    <task id="LOCAL-00274" summary="更新下载器访问失败时返回的文本流和编码">
+      <created>1702187415348</created>
+      <option name="number" value="00274" />
+      <option name="presentableId" value="LOCAL-00274" />
+      <option name="project" value="LOCAL" />
+      <updated>1702187415348</updated>
+    </task>
+    <task id="LOCAL-00275" summary="更新下载器访问失败时返回的文本流和编码">
+      <created>1702191136729</created>
+      <option name="number" value="00275" />
+      <option name="presentableId" value="LOCAL-00275" />
+      <option name="project" value="LOCAL" />
+      <updated>1702191136729</updated>
+    </task>
+    <task id="LOCAL-00276" summary="更新下载器访问失败时返回的文本流">
+      <created>1702191502539</created>
+      <option name="number" value="00276" />
+      <option name="presentableId" value="LOCAL-00276" />
+      <option name="project" value="LOCAL" />
+      <updated>1702191502539</updated>
+    </task>
+    <option name="localTasksCounter" value="277" />
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="VcsManagerConfiguration">
+    <MESSAGE value="update - 数据挖掘文件更名" />
+    <MESSAGE value="update:使用招投标预测模型检测站点是否招投标网站类型" />
+    <MESSAGE value="添加项目依赖" />
+    <MESSAGE value="添加Dockerfile" />
+    <MESSAGE value="fixbug: 修复工作者参数类型错误" />
+    <MESSAGE value="新增数据同步构建配置" />
+    <MESSAGE value="更新lua表代理访问地址" />
+    <MESSAGE value="更新配置" />
+    <MESSAGE value="更新Dockerfile" />
+    <MESSAGE value="fixbug:修正Task不正确的属性" />
+    <MESSAGE value="update" />
+    <MESSAGE value="添加docker镜像与编排脚本" />
+    <MESSAGE value="redis数据类型校验异常修复" />
+    <MESSAGE value="删除失效配置" />
+    <MESSAGE value="修正redis读取文本内容类型错误" />
+    <MESSAGE value="更新es测试环境配置" />
+    <MESSAGE value="更新 docker 配置" />
+    <MESSAGE value="更新去重入库的分拣条件" />
+    <MESSAGE value="处理预测模型文本为空异常" />
+    <MESSAGE value="1、种子不与已收录判重&#10;2、域名判重时使用全称&#10;3、关键词计数&#10;关键词计数为5以上网站   标记为1&#10;关键词计数为3-5网站    标记为2" />
+    <MESSAGE value="添加新 &quot;信息是否归属招投标预测&quot; 模型" />
+    <MESSAGE value="添加全局禁用告警" />
+    <MESSAGE value="添加新模型" />
+    <MESSAGE value="更新下载器访问失败时返回的文本流和编码" />
+    <MESSAGE value="更新下载器访问失败时返回的文本流" />
+    <option name="LAST_COMMIT_MESSAGE" value="更新下载器访问失败时返回的文本流" />
+  </component>
+  <component name="XDebuggerManager">
+    <breakpoint-manager>
+      <breakpoints>
+        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
+          <url>file:///usr/local/anaconda3/envs/python38/lib/python3.8/site-packages/lxml/html/clean.py</url>
+          <line>559</line>
+          <option name="timeStamp" value="137" />
+        </line-breakpoint>
+        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
+          <url>file:///usr/local/anaconda3/envs/python38/lib/python3.8/site-packages/lxml/html/__init__.py</url>
+          <line>186</line>
+          <option name="timeStamp" value="331" />
+        </line-breakpoint>
+      </breakpoints>
+    </breakpoint-manager>
+  </component>
+  <component name="com.intellij.coverage.CoverageDataManagerImpl">
+    <SUITE FILE_PATH="coverage/find_source$ztb_info__1_.coverage" NAME="ztb_info (1) Coverage Results" MODIFIED="1663124416301" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/exists_bidding" />
+    <SUITE FILE_PATH="coverage/find_source$Queue.coverage" NAME="q Coverage Results" MODIFIED="1649989558117" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$spiders.coverage" NAME="spiders Coverage Results" MODIFIED="1650604961056" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$cut_word__1_.coverage" NAME="cut_word (1) Coverage Results" MODIFIED="1663133048004" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/predict_bidding_model/utils" />
+    <SUITE FILE_PATH="coverage/find_source$pre_test.coverage" NAME="pre_test Coverage Results" MODIFIED="1663296795875" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$add_task.coverage" NAME="add_task Coverage Results" MODIFIED="1668650254038" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$sync_data.coverage" NAME="sync_data Coverage Results" MODIFIED="1700556423544" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_extract_base_url__1_.coverage" NAME="Doctest extract_base_url (1) Coverage Results" MODIFIED="1650007965506" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_search_site__1_.coverage" NAME="Doctest search_site (1) Coverage Results" MODIFIED="1650346064987" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$example.coverage" NAME="example Coverage Results" MODIFIED="1650854219173" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$qcc.coverage" NAME="qcc Coverage Results" MODIFIED="1650972649843" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/retrieve" />
+    <SUITE FILE_PATH="coverage/find_source$download.coverage" NAME="download Coverage Results" MODIFIED="1702191074819" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$q.coverage" NAME="q Coverage Results" MODIFIED="1652411753509" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$asdda.coverage" NAME="asdda Coverage Results" MODIFIED="1651048217426" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$aaa__1_.coverage" NAME="aaa (1) Coverage Results" MODIFIED="1669970064242" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/../../../.." />
+    <SUITE FILE_PATH="coverage/find_source$extract_text.coverage" NAME="extract_text Coverage Results" MODIFIED="1660639984906" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/cj_analyse" />
+    <SUITE FILE_PATH="coverage/find_source$.coverage" NAME=" Coverage Results" MODIFIED="1701918477712" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
+    <SUITE FILE_PATH="coverage/find_source$aa.coverage" NAME="aa Coverage Results" MODIFIED="1702004063503" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/spiders" />
+    <SUITE FILE_PATH="coverage/find_source$t_channel.coverage" NAME="t_channel Coverage Results" MODIFIED="1659772567488" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_RedisFilter_add.coverage" NAME="Doctest RedisFilter.add Coverage Results" MODIFIED="1649752155802" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/BloomFilter" />
+    <SUITE FILE_PATH="coverage/find_source$a.coverage" NAME="a Coverage Results" MODIFIED="1649841675077" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$cut_word.coverage" NAME="cut_word Coverage Results" MODIFIED="1663129487670" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/exists_bidding/utils" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_extract_page_title__2_.coverage" NAME="Doctest extract_page_title (2) Coverage Results" MODIFIED="1649930136960" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$ad.coverage" NAME="ad Coverage Results" MODIFIED="1652256975545" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$build_spider.coverage" NAME="build_spider Coverage Results" MODIFIED="1663138143354" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$__init__.coverage" NAME="__init__ Coverage Results" MODIFIED="1701918396308" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/predict_bidding_model_v2" />
+    <SUITE FILE_PATH="coverage/find_source$basics.coverage" NAME="basics Coverage Results" MODIFIED="1662699147734" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_RedisFilter__1_.coverage" NAME="Doctests in RedisFilter (1) Coverage Results" MODIFIED="1649753412832" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/BloomFilter" />
+    <SUITE FILE_PATH="coverage/find_source$ztb_info.coverage" NAME="ztb_info Coverage Results" MODIFIED="1663123789963" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/exists_bidding" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_Task__1_.coverage" NAME="Doctests in Task (1) Coverage Results" MODIFIED="1650000799583" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$Task.coverage" NAME="Task Coverage Results" MODIFIED="1670398210605" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$bfs_channel.coverage" NAME="channel Coverage Results" MODIFIED="1657508705479" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+    <SUITE FILE_PATH="coverage/find_source$build_spider2.coverage" NAME="build_spider2 Coverage Results" MODIFIED="1657606209526" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$aaaa.coverage" NAME="aaaa Coverage Results" MODIFIED="1701934926442" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$TimeExtractor.coverage" NAME="TimeExtractor Coverage Results" MODIFIED="1657195670237" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/analysis" />
+    <SUITE FILE_PATH="coverage/find_source$tett.coverage" NAME="tett Coverage Results" MODIFIED="1663135225716" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_search_site.coverage" NAME="Doctest search_site Coverage Results" MODIFIED="1650778091302" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$search_engine.coverage" NAME="search_engine Coverage Results" MODIFIED="1650000952990" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$utils__1_.coverage" NAME="utils (1) Coverage Results" MODIFIED="1689591552370" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/bloom_filter" />
+    <SUITE FILE_PATH="coverage/find_source$utils.coverage" NAME="utils Coverage Results" MODIFIED="1689664748304" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$aaa.coverage" NAME="aaa Coverage Results" MODIFIED="1670208583790" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$cjk_scripts.coverage" NAME="cjk_scripts Coverage Results" MODIFIED="1670297953210" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$verify.coverage" NAME="verify Coverage Results" MODIFIED="1651027599962" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/retrieve" />
+    <SUITE FILE_PATH="coverage/find_source$engines.coverage" NAME="engines Coverage Results" MODIFIED="1662606017221" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$comparison.coverage" NAME="comparison Coverage Results" MODIFIED="1660635227869" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$build_sync.coverage" NAME="build_sync Coverage Results" MODIFIED="1663139471668" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$ad1.coverage" NAME="ad1 Coverage Results" MODIFIED="1663037197000" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_extract_base_url.coverage" NAME="Doctest extract_base_url Coverage Results" MODIFIED="1650007951401" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$aasda.coverage" NAME="aasda Coverage Results" MODIFIED="1663134152555" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/predict_bidding_model" />
+    <SUITE FILE_PATH="coverage/find_source$dddd.coverage" NAME="dddd Coverage Results" MODIFIED="1701677856476" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$government.coverage" NAME="government Coverage Results" MODIFIED="1702191467194" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/spiders" />
+    <SUITE FILE_PATH="coverage/find_source$bing.coverage" NAME="bing Coverage Results" MODIFIED="1649733119879" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/search" />
+    <SUITE FILE_PATH="coverage/find_source$tclaen.coverage" NAME="tclaen Coverage Results" MODIFIED="1663295420902" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$RedisBloomFilter.coverage" NAME="RedisBloomFilter Coverage Results" MODIFIED="1663046605839" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/bloom_filter" />
+    <SUITE FILE_PATH="coverage/find_source$exa_channel.coverage" NAME="exa_channel Coverage Results" MODIFIED="1689590627962" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$test_predict_v2.coverage" NAME="test_predict_v2 Coverage Results" MODIFIED="1702006504029" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/tests" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_extract_page_title.coverage" NAME="Doctest extract_page_title Coverage Results" MODIFIED="1649930157727" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$dada.coverage" NAME="dada Coverage Results" MODIFIED="1649819373150" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$aaa__2_.coverage" NAME="aaa (2) Coverage Results" MODIFIED="1670220449109" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/predict_bidding_model" />
+    <SUITE FILE_PATH="coverage/find_source$hospital.coverage" NAME="hospital Coverage Results" MODIFIED="1657602964415" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$BloomFilter.coverage" NAME="BloomFilter Coverage Results" MODIFIED="1650521288694" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/bloom_filter" />
+    <SUITE FILE_PATH="coverage/find_source$search.coverage" NAME="search_engines Coverage Results" MODIFIED="1649821877392" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/search" />
+    <SUITE FILE_PATH="coverage/find_source$tools.coverage" NAME="tools Coverage Results" MODIFIED="1660627908243" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common" />
+    <SUITE FILE_PATH="coverage/find_source$build_excavate.coverage" NAME="build_excavate Coverage Results" MODIFIED="1702190312888" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$RedisBloomFilter__1_.coverage" NAME="RedisBloomFilter (1) Coverage Results" MODIFIED="1649842081762" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/bloom_filter" />
+    <SUITE FILE_PATH="coverage/find_source$ztb_info__2_.coverage" NAME="ztb_info Coverage Results" MODIFIED="1663133657795" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$asd.coverage" NAME="t_splash Coverage Results" MODIFIED="1657705133273" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$FilterUrl.coverage" NAME="FilterUrl Coverage Results" MODIFIED="1655081722374" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/analysis" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_Task.coverage" NAME="Doctests in Task Coverage Results" MODIFIED="1651056813223" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$bidding_cq_spider.coverage" NAME="bidding_cq_spider Coverage Results" MODIFIED="1652152157846" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$aaa1.coverage" NAME="aaa1 Coverage Results" MODIFIED="1656917609789" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$excavate.coverage" NAME="excavate Coverage Results" MODIFIED="1701678989850" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+    <SUITE FILE_PATH="coverage/find_source$data_fl.coverage" NAME="data_fl Coverage Results" MODIFIED="1658719938457" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$defaults.coverage" NAME="defaults Coverage Results" MODIFIED="1659081808751" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$search_engines.coverage" NAME="search_engines Coverage Results" MODIFIED="1649847967006" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_search_site__2_.coverage" NAME="Doctest search_site (2) Coverage Results" MODIFIED="1650346081610" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$settings.coverage" NAME="settings Coverage Results" MODIFIED="1652433969377" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$tw.coverage" NAME="tw Coverage Results" MODIFIED="1663147731248" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$data_query.coverage" NAME="data_query Coverage Results" MODIFIED="1651037197271" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+    <SUITE FILE_PATH="coverage/find_source$a1.coverage" NAME="hospital Coverage Results" MODIFIED="1654942158577" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$build_sync_data.coverage" NAME="build_sync_data Coverage Results" MODIFIED="1701680097886" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$DomAnalysis.coverage" NAME="DomAnalysis Coverage Results" MODIFIED="1655089321364" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/analysis" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_RedisBloomFilter.coverage" NAME="Doctests in RedisBloomFilter Coverage Results" MODIFIED="1649753561488" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/BloomFilter" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_RedisFilter.coverage" NAME="Doctests in RedisFilter Coverage Results" MODIFIED="1649752240274" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common/BloomFilter" />
+    <SUITE FILE_PATH="coverage/find_source$load.coverage" NAME="load Coverage Results" MODIFIED="1649667235571" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/config" />
+    <SUITE FILE_PATH="coverage/find_source$ads.coverage" NAME="ads Coverage Results" MODIFIED="1649988089714" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctests_in_example.coverage" NAME="Doctests in example Coverage Results" MODIFIED="1650854209833" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_extract_page_title__1_.coverage" NAME="Doctest extract_page_title (1) Coverage Results" MODIFIED="1649930126261" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$review_search.coverage" NAME="review_search Coverage Results" MODIFIED="1660634191881" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_test_downloader.coverage" NAME="Doctest test_downloader Coverage Results" MODIFIED="1650761779471" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$log.coverage" NAME="log Coverage Results" MODIFIED="1652322401023" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common" />
+    <SUITE FILE_PATH="coverage/find_source$constants.coverage" NAME="constants Coverage Results" MODIFIED="1649741426585" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/find_source$search_web.coverage" NAME="search_web Coverage Results" MODIFIED="1660561469330" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$Doctest_test_downloader__1_.coverage" NAME="Doctest test_downloader (1) Coverage Results" MODIFIED="1650780062585" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/find_source$execptions.coverage" NAME="execptions Coverage Results" MODIFIED="1655276839199" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common" />
+    <SUITE FILE_PATH="coverage/find_source$channel.coverage" NAME="channel Coverage Results" MODIFIED="1657625717177" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler/services" />
+  </component>
+</project>

+ 200 - 0
find_source/spiders/government.py

@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-12-07 
+---------
+@summary:  政府部门 数据挖掘
+---------
+@author: Dzr
+"""
+import datetime
+import re
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import bson
+
+import crawler.utils as tools
+from common.databases import mongo_table, redis_client
+from common.log import logger
+from common.tools import sha1
+from crawler.analysis import parser_items
+from crawler.download import Downloader
+
+gov_lst = mongo_table('dzr', 'BasicDataList')
+gov_task_lst = mongo_table('dzr', 'GovDataList')
+r = redis_client()
+r_key = 'gov_2023'
+
+downloader = Downloader(max_retries=0, disable_debug_log=False)
+# 数据挖掘层级深度
+max_excavate_depth = 3
+
+
+def to_mongodb(host, title, href, depth, **kwargs):
+    gov_task_lst.insert_one({
+        'host': host,
+        'href': href,
+        'title': title,
+        'depth': depth,
+        'is_crawl': False,
+        'create_at': bson.Int64(int(datetime.datetime.now().timestamp())),
+        **kwargs
+    })
+    r.hset(r_key, sha1(href), '')  # 添加数据指纹
+
+
+def deduplicate(href):
+    if not r.hexists(r_key, sha1(href)):
+        return False
+    return True
+
+
+def deduplicate_task_add_to_mongodb(host, title, href, depth, **kwargs):
+    """
+
+    :param str host:
+    :param str title:
+    :param str href:
+    :param int depth:
+    :param kwargs:
+    """
+    if not deduplicate(href):
+        to_mongodb(host, title, href, depth, **kwargs)
+
+
+def production_data_excavate_tasks():
+    data_lst = []
+
+    query = {"collect": "否"}
+    with gov_lst.find(query, projection={"site": 1, "href": 1}) as cursor:
+        for doc in cursor:
+            site = str(doc["site"]).strip()
+            href = str(doc["href"]).strip()
+            if not tools.is_url(href):
+                continue
+
+            args = list(filter(lambda x: x is not None, tools.get_host(href)))
+            if len(args) > 2:
+                host = "{0}://{1}:{2}".format(*args)
+            else:
+                host = "{0}://{1}".format(*args)
+
+            if not re.search(r"^https?", href):
+                href = host
+
+            task = {
+                'href': href,
+                'origin': href,
+                'host': host,
+                'title': site,
+                'site': site,
+                'depth': 1,
+                'datalist_id': str(doc['_id'])  # 数据源主键
+            }
+            data_lst.append(task)
+
+    return data_lst
+
+
+def get_tasks(query, projection=None, limit=100):
+    with gov_task_lst.find(query, projection=projection, limit=limit) as cursor:
+        data_lst = [item for item in cursor.sort([('depth', 1)])]
+    return data_lst
+
+
+def get_response_by_request(url, host):
+    response = downloader.get(url, timeout=10)
+    # Decode unicode from given encoding.
+    try:
+        content = str(response.content, response.encoding, errors="replace")
+    except (LookupError, TypeError):
+        content = str(response.content, errors="replace")
+
+    if response.status_code == 200 and content not in ["", None]:
+        items = parser_items(content, url=host, mode=1)  # 同源抽取
+        text_lst = tools.extract_text(content, parser="bs4").split()
+        # 去除所有不包含中文的文本
+        text_lst = list(filter(lambda x: re.search('[\u4e00-\u9fa5]', x) is not None, text_lst))
+        # 过滤短语(长度小于10)
+        text_lst = list(filter(lambda x: len(x) > 10, text_lst))
+        # 招投标文本预测命中数量
+        hits = tools.predict_bidding_model_v2(text_lst) if text_lst else 0
+
+        result = {
+            'href': url,
+            'host': host,
+            'total': len(text_lst),  # 招投标预测文档总量
+            'hits': hits,   # 有效量
+            'items': items
+        }
+        return result
+
+
+def spider(task):
+    success, dedup = 0, 0
+    update = {
+        'is_crawl': True,
+        'fetch': 0,  # 页面访问是否成功;0=失败 1=成功
+        'depth': task['depth'],
+    }
+    try:
+        response = get_response_by_request(task['href'], task['host'])
+        if response:
+            update['docs'] = response['total']  # 数据挖掘的文本量
+            update['hits'] = response['hits']  # 招投标预测命中的文本量
+            update['fetch'] = 1  # 访问成功的标识
+            for ret in response['items']:
+                if deduplicate(ret['href']):
+                    dedup += 1
+                    continue
+
+                excavate = {
+                    'title': ret['title'],
+                    'href': ret['href'],
+                    'depth': update['depth'] + 1,
+                    'host': task['host'],
+                    'origin': task['origin'],
+                    'datalist_id': task['datalist_id']
+                }
+                if excavate['depth'] > max_excavate_depth:
+                    continue
+
+                to_mongodb(**excavate, site=task['site'])
+                success += 1
+    except Exception as e:
+        logger.exception(e)
+
+    # 更新任务详情
+    update['update_at'] = bson.Int64(int(datetime.datetime.now().timestamp()))
+    gov_task_lst.update_one({'_id': task['_id']}, {'$set': update})
+    return success, dedup
+
+
+def start(query, workers=1, init_excavate=False):
+    if init_excavate:
+        logger.info("创建数据挖掘任务...")
+        task_lst = production_data_excavate_tasks()
+        for task in task_lst:
+            deduplicate_task_add_to_mongodb(**task)
+
+    while True:
+        tasks = get_tasks(query, limit=1000)
+        if not tasks:
+            logger.info("数据挖掘结束")
+            break
+
+        logger.info(f"数据挖掘任务加载 {len(tasks)} 条")
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            fs = [executor.submit(spider, task) for task in tasks]
+            logger.info(f"待处理数据挖掘任务 {len(fs)} 条")
+            for f in as_completed(fs):
+                pubs, dupl = f.result()
+                tips = [f"发布任务 {pubs} 条", f"重复任务 {dupl} 条"]
+                logger.info(" ".join(tips))
+
+        logger.info("等待加载新任务...")
+
+
+if __name__ == '__main__':
+    start(
+        init_excavate=True,
+        query={"is_crawl": False},  # 数据挖掘条件
+        workers=20,
+    )
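
Side note: `to_mongodb` / `deduplicate` above pair the MongoDB task collection with a Redis hash of SHA-1 URL fingerprints, so each link is turned into an excavation task at most once. Below is a minimal standalone sketch of that fingerprint pattern, assuming a local Redis instance and using `hashlib` in place of the project's `common.tools.sha1` helper; the URL and function names here are illustrative only, not part of the repository.

import hashlib

import redis

r = redis.Redis()   # assumption: local Redis on the default port
r_key = "gov_2023"  # same fingerprint hash the spider above writes to


def fingerprint(href):
    # SHA-1 hex digest of the URL, standing in for common.tools.sha1
    return hashlib.sha1(href.encode("utf-8")).hexdigest()


def seen(href):
    # True if the URL fingerprint was already recorded
    return r.hexists(r_key, fingerprint(href))


def mark(href):
    # record the fingerprint so later batches skip this URL
    r.hset(r_key, fingerprint(href), "")


if __name__ == "__main__":
    url = "http://www.example.gov.cn/notice/1.html"  # hypothetical URL
    if not seen(url):
        mark(url)         # first pass: insert the MongoDB task, then write the fingerprint
    print(seen(url))      # True on any later pass

Keeping the fingerprint write next to the insert (as `to_mongodb` does) is what makes the check-then-create in `deduplicate_task_add_to_mongodb` safe to re-run over the same seed list.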

+ 166 - 0
find_source/spiders/hospital.py

@@ -0,0 +1,166 @@
+import datetime
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor, wait
+
+from pymongo.errors import DocumentTooLarge
+
+from common.databases import mongo_table, redis_client
+from common.log import logger
+from common.tools import sha1
+from crawler.analysis import parser_items
+from crawler.download import Downloader
+from crawler.utils import (
+    extract_page_title,
+    extract_host,
+    err_details,
+    extract_text
+)
+
+hospital = mongo_table('tmp_crawl', 'hospital_info')
+r = redis_client()
+r_key = 'hospital_2022'
+
+down_loader = Downloader()
+seed_tasks = [
+    ('http://www.hnsrmyy.net', '河南省人民医院'),
+    ('https://www.zzsetyy.cn/index.html', '河南省儿童医院'),
+    ('https://www.pumch.cn/index.html', '北京协和医院'),
+]
+
+
+def create_task(host: str, title: str, href: str, depth: int, **kwargs):
+    sid = sha1(href)
+    if not r.hexists(r_key, sid):
+        hospital.insert_one({
+            'host': host,
+            'href': href,
+            'title': title,
+            'depth': depth,
+            'is_crawl': False,
+            'create_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            **kwargs
+        })
+        r.hset(r_key, sid, '')
+        return True
+    return False
+
+
+def push_tasks():
+    for url, title in seed_tasks:
+        item = {
+            'host': extract_host(url),
+            'href': url,
+            'title': title,
+            'site': title,
+            'depth': 1,
+        }
+        create_task(**item)
+
+
+def get_tasks(**kwargs):
+    _results = []
+    _projection = kwargs.pop('projection', {})
+    projection = {
+        'host': 1,
+        'site': 1,
+        'href': 1,
+        'depth': 1,
+        'is_crawl': 1,
+        **_projection
+    }
+    cursor = hospital.find({'is_crawl': False}, projection=projection)
+    for item in cursor.sort([('depth', 1)]).limit(100):
+        _results.append(item)
+    return _results
+
+
+def update_data(mongo_id, source, source_text):
+    item = {
+        'source': source,
+        'source_text': source_text,
+        'is_crawl': True,
+        'update_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+    }
+    try:
+        hospital.update_one({'_id': mongo_id}, {'$set': item})
+    except DocumentTooLarge:
+        item['source'] = ''
+        hospital.update_one({'_id': mongo_id}, {'$set': item})
+
+
+def crawl_request(url, host):
+    suffix = re.search('([.][a-zA-Z]{3,5})$', url)
+    if suffix is not None and suffix.group().find('.htm') == -1:  # 带扩展名但不是 .htm/.html 的链接按文件处理,不做挖掘
+        raise ValueError(f'无法挖掘的url:{url}')
+
+    response = down_loader.get(url, timeout=10, max_retries=1, disable_debug_log=False)
+    if response.status_code == 200 and response.text not in [None, '']:
+        items = parser_items(response.text, url=host, mode=1)
+        title = extract_page_title(response.text)
+        source_text = "&_&".join(extract_text(response.text).split())
+    else:
+        title = f'请求异常-{response.status_code}-{response.reason}'
+        source_text = ''
+        items = []
+    results = {
+        'host': host,
+        'href': url,
+        'source': response.text,
+        'title': title,
+        'source_text': source_text,
+        'items': items,
+    }
+    return results
+
+
+def crawl_spider(task):
+    _id = task['_id']
+    _total, _success, _err = 0, 0, 0
+    try:
+        dic_data = crawl_request(task['href'], task['host'])
+        # 创建挖掘任务
+        for item in dic_data['items']:
+            href = item['href']
+            title = item['title']
+            sub_item = {
+                'host': task['host'],
+                'title': title,
+                'href': href,
+                'depth': task['depth'] + 1,
+            }
+            success = create_task(**sub_item, site=task['site'])
+            if success:
+                _success += 1
+            else:
+                _err += 1
+            _total += 1
+    except ValueError:
+        dic_data = {}
+
+    # 更新挖掘结果
+    update_data(**dict(
+        mongo_id=_id,
+        source=dic_data.get('source', None),
+        source_text=dic_data.get('source_text', None)))
+    logger.info(f"[{str(_id)}]采集成功{_total}条,上传成功{_success}条,删除重复{_err}条")
+
+
+def start():
+    push_tasks()
+    while True:
+        tasks = get_tasks()
+        logger.info(f"加载采集任务{len(tasks)}条")
+        with ThreadPoolExecutor(max_workers=4, thread_name_prefix='hospital') as executor:
+            futures = []
+            for task in tasks:
+                future = executor.submit(crawl_spider, task)
+                future.add_done_callback(err_details)
+                futures.append(future)
+            wait(futures)
+        logger.info(f"完成采集任务{len(tasks)}条,等待加载")
+        time.sleep(10)
+
+
+if __name__ == '__main__':
+    start()

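A minimal usage sketch for crawl_request (illustrative only; it relies on the seed_tasks list and the extract_host import from this file):

    url, site = seed_tasks[1]                    # 'https://www.zzsetyy.cn/index.html' ends in .html, so it passes the suffix filter
    data = crawl_request(url, extract_host(url))
    print(data['title'], len(data['items']))     # page title and number of extracted links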
+ 119 - 0
find_source/tests/add_task.py

@@ -0,0 +1,119 @@
+import time
+import urllib3
+
+import pandas as pd
+
+from crawler.Task import Task
+from crawler.q import RedisQueue
+from crawler.utils import extract_host, extract_domain, is_url
+from settings import (
+    REDIS_QUERY_KEYWORD,
+    REDIS_EXCAVATE,
+    FILTER_WORDS,
+    MGO_KEYWORDS,
+    MGO_ORGS,
+    MGO_URLS,
+    MGO_COMPETING_GOODS,
+    MGO_LUA_SPIDERS,
+)
+from common.databases import int2long, mongo_table
+
+
+def push_task(file):
+    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
+    mrq = RedisQueue()
+    _file = str(file)
+    df = pd.read_excel(path + _file, sheet_name=0)
+    lst = [Task(url=nd[0], groups='seed_url') for nd in df.values]
+    print(mrq.push_task(REDIS_EXCAVATE, lst, level=9))
+
+
+def make_seed_words_table(file):
+    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/'
+    df = pd.read_excel(path + file, sheet_name=1)
+    # print(df.to_dict())
+
+    # 企业名称
+    df_dict = df.to_dict()
+    for key, val in df_dict['企业名称'].items():
+        # print(key, val)
+        MGO_ORGS.insert_one({'name': val})
+    print('企业名称表成功创建')
+
+    # 关键词
+    for val in FILTER_WORDS:
+        # print(val)
+        MGO_KEYWORDS.insert_one({'name': val})
+    print('关键词表成功创建')
+
+
+def make_seed_urls_table(file):
+    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
+    _file = str(file)
+    df = pd.read_excel(path + _file, sheet_name=0)
+    for nd in df.values:
+        if len(nd) > 1:
+            name, url = nd[0], nd[1]
+        else:
+            name, url = '', nd[0]
+        # print(name, url)
+        if is_url(url) and extract_domain(url) != '':
+            try:
+                MGO_URLS.insert_one({'name': extract_host(url), 'site_name': name})
+            except urllib3.exceptions.LocationParseError:
+                continue
+    print('种子urls表成功创建')
+
+
+def make_competing_goods_table(file):
+    path = '/Users/dongzhaorui/Desktop/swordfish/数据寻源/陈佳康/'
+    _file = str(file)
+    df = pd.read_excel(path + _file, sheet_name=0)
+    for nd in df.values:
+        url = nd[0]
+        MGO_COMPETING_GOODS.insert_one({'name': url})
+    print('竞品urls表成功创建')
+
+
+def make_garbage_tab():
+    q = {"param_common.11": {"$exists": True}}
+    projection = {'param_common': 1, '_id': 0}
+    data_garbage = mongo_table('shujuziyuan', 'data_garbage', host='127.0.0.1', port=27017)
+    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
+    history = []
+    for item in cursor:
+        href = item['param_common'][11]
+        domain = extract_domain(href)
+        if len(domain) > 0 and domain not in history:
+            data_garbage.insert_one({
+                'domain': domain,
+                'create_at': int2long(int(time.time())),
+            })
+            history.append(domain)
+            print(f'href >> {href}; domain >> {domain}')
+
+
+def make_domain():
+    q = {"param_common.11": {"$exists": True}}
+    projection = {'param_common': 1, '_id': 0}
+    web_domain = mongo_table('dzr', 'web_domain', host='baibai.ink', port=28082)
+    cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
+    history = []
+    for item in cursor:
+        href = item['param_common'][11]
+        domain = extract_domain(href)
+        if len(domain) > 0 and domain not in history:
+            web_domain.insert_one({
+                'domain': domain,
+                'create_at': int2long(int(time.time())),
+            })
+            history.append(domain)
+            print(f'href >> {href}; domain >> {domain}')
+
+
+if __name__ == '__main__':
+    # make_garbage_tab()
+    make_seed_urls_table('元博网正文数据寻源网址类2022.8.12.xlsx')
+    # make_seed_urls_table('剑鱼正文数据寻源网址类2022.8.25.xlsx')
+    # make_competing_goods_table('元博网正文数据寻源网址类2022.2.23.xlsx')
+    # make_seed_words_table('自动寻源程序种子提供2022.xlsx')

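For reference, a hedged sketch of the spreadsheet layout make_seed_urls_table reads: sheet 0 is consumed positionally, first column as site name and second as URL (single-column rows are treated as URL-only). The column names and file name below are placeholders:

    import pandas as pd

    df = pd.DataFrame(
        [['河南省人民医院', 'http://www.hnsrmyy.net']],
        columns=['site_name', 'url'],            # names are illustrative; values are read by position
    )
    df.to_excel('seed_urls_demo.xlsx', index=False)  # the function prepends its hard-coded path to the file name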
+ 50 - 0
find_source/tests/exa_channel.py

@@ -0,0 +1,50 @@
+from crawler.download import Downloader, RenderDownloader
+from crawler.services.channel import bfs
+
+if __name__ == '__main__':
+    d = Downloader()
+    r = RenderDownloader()
+
+    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
+    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
+    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
+    # url = 'http://113.200.193.24:8009/Main/Projects#'
+    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
+    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
+    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
+    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
+    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
+    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
+    # url = 'https://www.elongbiao.com/List/NoticeP/9'
+    # url = 'https://www.elongbiao.com/List/Notice/12'  # 多时间文本 算法优化一次
+    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
+    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # 时间文本 算法优化一次
+    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html' # 算法优化一次
+    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
+    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
+
+    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # 多个时间文本窗口栏目抽取,完成优化
+    # url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html' # 优化时间文本块数量多与先辈节点个数,导致无法全部删除,残余时间文本块干扰问题
+    # url = 'http://www.shanghang.gov.cn/zwgk/zwgkzdgz/gczb/sphzbaxx/'
+    # url = 'http://www.qlebid.com/cms/channel/1ywgg4qb/index.htm'
+    # url = ' http://zhaobiao.elongcheng.com:82/'
+    # url = 'http://www.gdgpo.gov.cn/queryPlanList.do'
+    # url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
+    # url = 'http://www.xtsrmyy.com.cn/newlist.asp?bigclassid=4&smallclassid=5'
+    # url = 'http://jsj.yima.gov.cn/col/col109/index.html'
+    # url = 'http://zw.hainan.gov.cn/wssc/ra/projects/rp_list.html?num=3'
+    # url = 'http://www.hlbeggzyjy.org.cn/jygk/021001/trade_public.html'
+    # url = 'http://jsj.yima.gov.cn/col/col109/index.html?uid=8327&pageNum=4'
+    # url = 'http://oldzfcg.scsczt.cn/CmsNewsController.do?method=recommendBulletinList&moreType=provincebuyBulletinMore&channelCode=cggg&rp=25&page=1'
+    # url = 'http://www.ccgp-xizang.gov.cn/freecms/site/xizang/index.html'
+    # url = 'http://ggzy.yn.gov.cn/#/tradeHall/tradeList'
+    # url = 'http://www.gdgpo.gov.cn/queryPlanList.do'
+    # url = 'http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0'
+    # javascript 渲染页面
+    # url = 'http://zhaobiao.elongcheng.com:82/'  # 详情所在 onclick
+    url = 'https://ebid.espic.com.cn/newgdtcms//category/purchaseListNew.html?dates=300&categoryId=2&tenderMethod=00&tabName=%E9%87%87%E8%B4%AD%E4%BF%A1%E6%81%AF&page=1'
+
+    resp = r.get(url, timeout=3)
+    # resp = d.get(url, timeout=3)  # 普通(非渲染)下载器
+    print(resp)
+    # bfs(resp, url)

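A hedged sketch of the flow this script exercises (assuming the .get signatures of Downloader/RenderDownloader used above and the bfs(resp, url) call that is commented out); the plain downloader handles static pages, with the render downloader as a fallback for JavaScript-heavy sites:

    resp = d.get(url, timeout=3)                 # plain downloader for static pages
    if resp is None or not getattr(resp, 'text', ''):
        resp = r.get(url, timeout=3)             # render downloader for javascript pages
    bfs(resp, url)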
File diff suppressed because it is too large
+ 13 - 0
find_source/tests/predict_bidding_model.py


+ 29 - 0
find_source/tests/test_predict_v2.py

@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-12-07 
+---------
+@summary: predict_bidding_model_v2 预测接口测试
+---------
+@author: Dzr
+"""
+from predict_bidding_model_v2 import predict
+
+
+if __name__ == '__main__':
+    result = predict([
+        "罗山县财政局:做好“加减乘除”法,优化营商环境出实招",
+        "上蔡县财政局对基层财政部门开展政府采购基础信息检查",
+        "河南财经政法大学2024年1至2月政府采购意向",
+        "河南交通技师学院汽车技术服务与营销、休闲体育服务、物联网应用技术三个专业校企合作共建服务项目单一来源公示",
+        "河南交通技师学院汽车技术服务与营销专业校企合作共建服务项目单一来源公示",
+        "河南交通技师学院物联网应用技术专业校企合作共建服务项目单一来源公示",
+        "河南交通技师学院休闲体育服务专业校企合作共建服务项目单一来源公示",
+        "河南省工业和信息化厅中小企业统计运行监测工作服务项目单一来源采购公示",
+        "河南交通技师学院计算机应用与维修专业校企合作共建服务项目单一来源公示",
+        "河南交通技师学院工业机器人应用与维护、航空服务两个专业校企合作共建服务项目单一来源公示",
+        "使用sklearn和tf-idf变换的针对20Newsgroup数据集做文本分类"
+    ])
+    print(result)
+
+    for item in result:
+        print(item)

+ 0 - 0
zgztb_cookie/FworkSpider/feapder/commands/__init__.py → geocode/.idea/.gitignore


+ 12 - 0
geocode/.idea/geocode.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.8 (py38)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

+ 25 - 0
geocode/.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,25 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ourVersions">
+        <value>
+          <list size="3">
+            <item index="0" class="java.lang.String" itemvalue="3.8" />
+            <item index="1" class="java.lang.String" itemvalue="3.10" />
+            <item index="2" class="java.lang.String" itemvalue="3.11" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="1">
+            <item index="0" class="java.lang.String" itemvalue="requests" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
geocode/.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
geocode/.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (py38)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
geocode/.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/geocode.iml" filepath="$PROJECT_DIR$/.idea/geocode.iml" />
+    </modules>
+  </component>
+</project>

+ 6 - 0
geocode/.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+  </component>
+</project>

+ 56 - 0
geocode/.idea/workspace.xml

@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="0ab7e610-75d4-4b34-b1d8-222e2a2577af" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="2KDjKHaZKcgp6fGZurPLmjzaRIR" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent">{
+  &quot;keyToString&quot;: {
+    &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
+    &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
+    &quot;WebServerToolWindowFactoryState&quot;: &quot;false&quot;
+  }
+}</component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="0ab7e610-75d4-4b34-b1d8-222e2a2577af" name="Changes" comment="" />
+      <created>1673515984165</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1673515984165</updated>
+      <workItem from="1673515986136" duration="111000" />
+      <workItem from="1673570751066" duration="120000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+</project>

+ 0 - 55
jzsc/chaojiying.py

@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-# coding:utf-8
-
-import requests
-from hashlib import md5
-
-
-class Chaojiying_Client(object):
-
-    def __init__(self, username, password, soft_id):
-        self.username = username
-        password = password.encode('utf8')
-        self.password = md5(password).hexdigest()
-        self.soft_id = soft_id
-        self.base_params = {
-            'user': self.username,
-            'pass2': self.password,
-            'softid': self.soft_id,
-        }
-        self.headers = {
-            'Connection': 'Keep-Alive',
-            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
-        }
-
-    def PostPic(self, im, codetype):
-        """
-        im: 图片字节
-        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
-        """
-        params = {
-            'codetype': codetype,
-        }
-        params.update(self.base_params)
-        files = {'userfile': ('ccc.jpg', im)}
-        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
-        return r.json()
-
-    def ReportError(self, im_id):
-        """
-        im_id:报错题目的图片ID
-        """
-        params = {
-            'id': im_id,
-        }
-        params.update(self.base_params)
-        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
-        return r.json()
-
-
-# if __name__ == '__main__':
-#     chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')  # 用户中心>>软件ID 生成一个替换 96001
-    # im = open('img.png', 'rb').read()  # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
-    # print(chaojiying.PostPic(im, 9008))  #1902 验证码类型  官方网站>>价格体系 3.4+版 print 后要加()
-    # result = chaojiying.ReportError('1170412067373400583')
-    # print(result)

+ 0 - 32
jzsc/config/conf.yaml

@@ -1,32 +0,0 @@
-# mongo
-mongo:
-#  host: 172.17.4.87
-#  port: !!int 27080
-  host: 127.0.0.1
-  port: !!int 27017
-
-
-# redis
-redis:
-  host: 127.0.0.1
-  port: !!int 6379
-  pwd: ""
-  db: !!int 10
-
-
-# es
-es:
-  host: 172.17.145.170
-#  host: 127.0.0.1
-#  host: 192.168.3.206
-  port: !!int 9800
-  db: biddingall
-
-
-# 阿里oss
-ali_oss:
-  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
-  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
-#  endpoint: oss-cn-beijing.aliyuncs.com    # 公网使用
-  endpoint: oss-cn-beijing-internal.aliyuncs.com    # 内网使用
-  bucket_name: jy-datafile

+ 0 - 13
jzsc/config/constants.yaml

@@ -1,13 +0,0 @@
-headers:
-  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
-  Accept: '*/*'
-
-proxy:
-  socks5:
-    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
-    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
-
-
-node_module:
-  windows: C:\Users\dell\AppData\Roaming\npm\node_modules
-  linux: /usr/lib/node_modules

+ 0 - 35
jzsc/config/load.py

@@ -1,35 +0,0 @@
-import sys
-from pathlib import Path
-
-import yaml
-
-__all__ = [
-    'mongo_conf', 'redis_conf', 'oss_conf', 'es_conf',
-    'constants',
-    'headers', 'jy_proxy', 'node_module',
-    'analyze_url', 'node_module_path'
-]
-
-base_path = Path(__file__).parent
-yaml_conf = (base_path / 'conf.yaml').resolve()
-yaml_constants = (base_path / 'constants.yaml').resolve()
-
-with open(yaml_conf, encoding="utf-8") as f:
-    conf = yaml.safe_load(f)
-    mongo_conf = conf['mongo']
-    redis_conf = conf['redis']
-    es_conf: dict = conf['es']
-    oss_conf: dict = conf['ali_oss']
-
-with open(yaml_constants, encoding="utf-8") as fp:
-    constants = yaml.safe_load(fp)
-    headers: dict = constants['headers']
-    jy_proxy: dict = constants['proxy']
-    node_module: dict = constants['node_module']
-    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
-
-
-if sys.platform == 'linux':
-    node_module_path = node_module['linux']
-else:
-    node_module_path = node_module['windows']

+ 97 - 85
jzsc/spider.py

@@ -1,31 +1,44 @@
-import io
+import hashlib
 import random
 import time
+from pathlib import Path
 
 import pandas as pd
-from PIL import Image
+import redis
+import requests
+from loguru import logger
 from lxml.html import fromstring, tostring
+from pymongo import MongoClient
 from selenium import webdriver
-from selenium.webdriver import ActionChains
 from selenium.webdriver import Chrome
 from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.wait import WebDriverWait
-
-from chaojiying import Chaojiying_Client
-from utils.databases import mongo_table, redis_client
-from utils.log import logger
-from utils.tools import sha1
 
 '''MongoDB'''
-company_tab = mongo_table('national', 'company')
+client = MongoClient('192.168.3.182', 27017)
+company_tab = client['national']['company']
 
 '''redis服务'''
-r = redis_client()
+r = redis.Redis(
+    connection_pool=redis.ConnectionPool(
+        host='192.168.3.182',
+        port=6379,
+        password='jianyu@python',
+        db=10
+    ),
+    decode_responses=True
+)
 redis_key = 'jzsc_2022'
 
-'''验证码服务'''
-chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')
+'''日志'''
+log_path = (Path(__file__).absolute().parent / 'logs/log_{time:YYYYMMDD}.log').resolve()
+logger.add(
+    log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)
 
 '''企业资质'''
 COMPANY_QUALITY_MAPS = {
@@ -63,6 +76,45 @@ PUNISH_MAPS = {
 CRAWL_SITE = 'http://jzsc.mohurd.gov.cn/data/company'
 
 
+def sha1(*args):
+    """
+    十六进制数字字符串形式摘要值
+
+    @param args: 字符串
+    @return: 摘要值
+    """
+    hash_sha1 = hashlib.sha1()
+    for arg in args:
+        hash_sha1.update(arg.encode('utf-8'))
+    return hash_sha1.hexdigest()
+
+
+def get_proxy(scheme=None, default=None, socks5h=False):
+    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
+    headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
+
+    try:
+        proxy = requests.get(url, headers=headers, timeout=15).json()
+    except requests.RequestException:
+        return default
+
+    if not proxy:
+        logger.debug('暂无代理...')
+        return default
+
+    proxies = proxy.get('data')
+    if not proxies:
+        return default
+    if socks5h:
+        proxy_items = proxies.get('http')
+        proxies = {
+            'http': proxy_items.replace('socks5', 'socks5h'),
+            'https': proxy_items.replace('socks5', 'socks5h')
+        }
+
+    return proxies if not scheme else proxies.get(scheme, default)
+
+
 def html2element(html):
     return fromstring(html)
 
@@ -113,97 +165,48 @@ def prompt_popup(driver: Chrome, wait_time=None):
     time.sleep(_wait_time)
 
 
-def geetest_panel(driver: Chrome, save_img_to_local=False, wait_time=None):
-    pic_id = None
+def geetest_panel(driver: Chrome, wait_time=None, save_img_to_local=False):
     while True:
         if not display_geetest_panel(driver.page_source):
             break
         logger.info(">>> 验证码检测")
-        if pic_id is not None:
-            '''打码平台失败'''
-            captcha_result = chaojiying.ReportError(pic_id)
-            pic_id = None
-            logger.info(captcha_result)
-
-        '''获取验证图片对象'''
-        wait = WebDriverWait(driver, 60, 0.5)
-        locator = (By.CLASS_NAME, 'geetest_panel_next')
-        touclick_element = wait.until(EC.presence_of_element_located(locator))
-
-        '''获取网页截图'''
-        element_png = touclick_element.screenshot_as_png
-        screenshot = Image.open(io.BytesIO(element_png))
-
-        '''修改截图尺寸;超级鹰:推荐宽不超过460px,高不超过310px'''
-        # reim = screenshot.resize((306, 310))
-        # reim = screenshot.resize((307, 300))
-        reim = screenshot.resize((310, 300))
-
-        '''获取验证码图片'''
-        bytes_array = io.BytesIO()
-        reim.save(bytes_array, format='PNG')
-
-        '''保存验证码到本地'''
-        if save_img_to_local:
-            touclick_element.screenshot('captcha.png')
-            with open('ele.png', 'wb') as wp:
-                wp.write(bytes_array.getvalue())
-
-        '''识别验证码'''
-        captcha_result = chaojiying.PostPic(bytes_array.getvalue(), 9004)
-        logger.info(f'[识别结果]{captcha_result}')
-        pic_id = captcha_result['pic_id']
-
-        '''解析识别结果'''
-        groups = captcha_result.get('pic_str').split('|')
-        locations = [[int(number) for number in group.split(',')] for group in groups]
-
-        '''点击验证图片'''
-        for index, location in enumerate(locations):
-            ActionChains(driver).move_to_element_with_offset(
-                touclick_element,
-                location[0] + 10,
-                location[1] + 53,
-            ).click().perform()
-            time.sleep(1)
-
-        '''保存点击之后的图片'''
-        if save_img_to_local:
-            touclick_element.screenshot('touclick_img.png')
-
-        '''提交验证码'''
-        locator = (By.CLASS_NAME, 'geetest_commit')
-        commit_element = wait.until(EC.presence_of_element_located(locator))
-        ActionChains(driver).click(commit_element).perform()
-        time.sleep(5)
+
+        text = input("通过验证后,结束等待。请输入:y")
+        if text == 'y':
+            continue
+
     _wait_time = (wait_time or 1)
+    time.sleep(_wait_time)
 
 
 def check_page(driver: Chrome, wait_time=None, **kwargs):
     """检查页面"""
-    _wait_time = (wait_time or 1)
-    prompt_popup(driver, wait_time=_wait_time)
+    wait_time = (wait_time or 1)
+    prompt_popup(driver, wait_time=wait_time)
     geetest_panel(
         driver,
-        wait_time=_wait_time,
+        wait_time=wait_time,
         save_img_to_local=kwargs.get('save_img_to_local'),
     )
 
 
-def click(driver: Chrome, button, wait_time=None, allow_check_page=False):
-    driver.execute_script("arguments[0].click();", button)
-    _wait_time = (wait_time or 1)
-    time.sleep(_wait_time)
+def click(driver: Chrome, button, wait_time=None, allow_check_page=False, run_js=True):
+    if run_js:
+        driver.execute_script("arguments[0].click();", button)
+    else:
+        button.click()
+
+    wait_time = (wait_time or 1)
+    time.sleep(wait_time)
     if allow_check_page:
-        check_page(driver, wait_time=_wait_time)
+        check_page(driver, wait_time=wait_time)
 
 
 def click_query(driver: Chrome, wait_time=None):
     """查询按钮"""
     button = driver.find_element_by_class_name("ssButton")
-    _wait_time = (wait_time or 1)
-    click(driver, button, wait_time=_wait_time)
+    wait_time = (wait_time or 1)
+    click(driver, button, wait_time=wait_time)
 
 
 def next_page(driver: Chrome):
@@ -269,7 +272,7 @@ def crawl_spider(driver: Chrome, handler):
             logger.info(f"[重复数据]{title} - 丢弃")
             continue
         button = td_element.find_element_by_class_name("link")
-        click(driver, button, wait_time=random.randint(3, 10))
+        click(driver, button, wait_time=random.randint(3, 10), run_js=False)
         for current_handler in driver.window_handles:
             if current_handler == handler:
                 continue
@@ -456,11 +459,20 @@ def select_categories(driver: Chrome, records):
 
 
 def start(enable_remote_driver=False):
+    '''
+    enable_remote_driver=True 时,需先以远程调试模式启动本地 Chrome,示例命令:
+
+    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"  --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir="./data"
+    '''
     options = webdriver.ChromeOptions()
     if enable_remote_driver:
         options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
     options.add_argument("--disable-gpu")
-    chrome_driver = webdriver.Chrome(options=options)
+
+    chrome_driver = webdriver.Chrome(
+        executable_path="/Users/dongzhaorui/Downloads/chromedriver-mac-x64/chromedriver",
+        options=options
+    )
     main_handler = chrome_driver.current_window_handle  # 获取句柄
     '''清除其余窗口'''
     for handler in chrome_driver.window_handles:

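A minimal usage sketch for the new get_proxy helper (assuming the internal proxy service is reachable and requests has SOCKS support installed, i.e. requests[socks]):

    proxies = get_proxy(socks5h=True)            # e.g. {'http': 'socks5h://ip:port', 'https': 'socks5h://ip:port'}
    if proxies:
        requests.get('https://myip.ipip.net', proxies=proxies, timeout=10)
    single = get_proxy(scheme='http')            # returns just the http entry, or None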
+ 0 - 109
jzsc/utils/databases.py

@@ -1,109 +0,0 @@
-import bson
-import pymongo
-import redis
-import requests
-from elasticsearch import Elasticsearch
-
-from config.load import mongo_conf, redis_conf, es_conf, analyze_url
-
-
-# ---------------------------------- mongo ----------------------------------
-def mongo_client(cfg=None):
-    if cfg is None:
-        cfg = mongo_conf
-    return pymongo.MongoClient(host=cfg['host'], port=cfg['port'])
-
-
-def mongo_database(db: str):
-    client = mongo_client()
-    return client[db]
-
-
-def mongo_table(db: str, coll: str):
-    client = mongo_client()
-    return client[db][coll]
-
-
-def int2long(param: int):
-    """int 转换成 long """
-    return bson.int64.Int64(param)
-
-
-def object_id(_id: str):
-    return bson.objectid.ObjectId(_id)
-
-
-# ---------------------------------- es ----------------------------------
-def es_client(cfg=None):
-    if cfg is None:
-        cfg = es_conf
-    return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
-
-
-def es_participles_service(text: str):
-    """
-    获取文本的分词列表
-
-    :param text: 需要分词的文本
-    :return: 分词列表
-    """
-    result = []
-    params = {"text": text, "analyzer": "ik_smart"}
-    res = requests.get(analyze_url, params=params, timeout=60)
-    if res.status_code == 200:
-        tokens = res.json().get('tokens', [])
-        for x in tokens:
-            if x["token"].encode('utf-8').isalpha():
-                continue
-            result.append(x["token"])
-    return result
-
-
-def es_query(title: str, publish_time: int):
-    """
-    查询es
-
-    :param title: 标题
-    :param publish_time: 发布时间
-    :return:
-    """
-    client = es_client()
-    stime = publish_time - 432000  # 往前推5天
-    etime = publish_time + 432000
-    conditions = []
-    participles = es_participles_service(title)
-    for word in participles:
-        conditions.append({
-            "multi_match": {
-                "query": word,
-                "type": "phrase",
-                "fields": ["title"]
-            }
-        })
-    conditions.append({
-        "range": {"publishtime": {"from": stime, "to": etime}}
-    })
-    query = {
-        "query": {
-            "bool": {
-                "must": conditions,
-                "minimum_should_match": 1
-            }
-        }
-    }
-    result = client.search(index='bidding', body=query, request_timeout=100)
-    count = len(result['hits']['hits'])
-    return count
-
-
-# ---------------------------------- redis ----------------------------------
-def redis_client(cfg=None):
-    if cfg is None:
-        cfg = redis_conf
-    pool = redis.ConnectionPool(
-        host=cfg['host'],
-        port=cfg['port'],
-        password=cfg['pwd'],
-        db=cfg['db']
-    )
-    return redis.Redis(connection_pool=pool, decode_responses=True)

+ 0 - 49
jzsc/utils/execptions.py

@@ -1,49 +0,0 @@
-
-class JyBasicException(Exception):
-
-    def __init__(self, code: int, reason: str, **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class CustomAccountPrivilegeError(JyBasicException):
-
-    def __init__(self, code: int = 10001, reason: str = '账号权限登录异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class CustomCheckError(JyBasicException):
-
-    def __init__(self, code: int = 10002, reason: str = '特征条件检查异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class VoidCrawlError(JyBasicException):
-
-    def __init__(self, code: int = 10003, reason: str = '空页面采集错误', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)
-
-
-class AttachmentNullError(JyBasicException):
-
-    def __init__(self, code: int = 10004, reason: str = '附件下载异常', **kwargs):
-        self.code = code
-        self.reason = reason
-        self.err_details = kwargs
-        for key, val in kwargs.items():
-            setattr(self, key, val)

+ 0 - 14
jzsc/utils/log.py

@@ -1,14 +0,0 @@
-from pathlib import Path
-
-from loguru import logger
-
-_absolute = Path(__file__).absolute().parent.parent
-_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
-logger.add(
-    _log_path,
-    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
-    level='INFO',
-    rotation='00:00',
-    retention='1 week',
-    encoding='utf-8',
-)

+ 0 - 153
jzsc/utils/socks5.py

@@ -1,153 +0,0 @@
-import threading
-import time
-from collections import deque
-from urllib.parse import urlparse
-
-import requests
-
-from config.load import jy_proxy, headers
-from utils.log import logger
-
-__all__ = ['Proxy']
-
-
-def decrypt(input_str: str) -> str:
-    """
-    定义base64解密函数
-
-    :param input_str:
-    :return:
-    """
-    # 对前面不是“=”的字节取索引,然后转换为2进制
-    key = jy_proxy['socks5']['decrypt']
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # 补齐“=”的个数
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # 转换成2进制字符串
-        temp_str = ''.join(temp_list)
-        # 对没有8位2进制的字符串补够8位2进制
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # 4个6字节的二进制  转换  为三个8字节的二进制
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # 二进制转为10进制
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # 连接成字符串
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-
-class Socks5Proxy:
-
-    __instance = None
-
-    def __new__(cls, *args, **kwargs):
-        if cls.__instance is None:
-            cls.__instance = super().__new__(cls)
-        return cls.__instance
-
-    def __init__(self):
-        self.seconds = 60
-        self._lock = threading.RLock()
-        self._url = jy_proxy['socks5']['url']
-        self._dq = deque([])
-        self._proxies = {}
-        self._pool = []
-        self._counter = {}
-
-    def _init(self):
-        while not self._proxies:
-            if len(self._dq) > 0:
-                '''队列左边取值'''
-                self._proxies = self._dq.popleft()
-                '''添加到队尾'''
-                self._dq.append(self._proxies)
-            else:
-                self.__request_service()
-                self.__check_proxies()
-
-    @property
-    def proxies(self):
-        with self._lock:
-            return self._proxies if len(self._proxies) > 0 else None
-
-    def switch(self, reset=False):
-        with self._lock:
-            if reset is True:
-                self.__flush_proxy_pool()
-            elif len(self._counter) > 0:
-                end_time = self._counter[self.get_netloc(self._proxies)]
-                current_time = int(time.time())
-                if end_time - current_time < self.seconds:
-                    logger.info(f"[移除socks5代理]{self.get_netloc(self._proxies)}")
-                    self._dq.remove(self._proxies)
-                    del self._counter[self.get_netloc(self._proxies)]
-                    logger.info(f"[socks5代理]剩余 {len(self._dq)} 个")
-
-            self._proxies = {}  # 重置代理
-            while len(self._proxies) == 0:
-                if len(self._dq) > 0:
-                    self._proxies = self._dq.popleft()
-                    self._dq.append(self._proxies)
-                else:
-                    self.__flush_proxy_pool()
-
-    @staticmethod
-    def get_netloc(item: dict):
-        parser = urlparse(item.get('http'))
-        return parser.netloc
-
-    def __request_service(self):
-        try:
-            response = requests.get(self._url, timeout=10)
-            self.__extract_ip(response)
-        except requests.RequestException:
-            pass
-
-    def __extract_ip(self, response):
-        for proxy in response.json():
-            host = decrypt(proxy['host'])
-            port = int(proxy['port'])
-            end_time = proxy['EndTime']
-            items = {
-                'http': 'socks5://{}:{}'.format(host, port),
-                'https': 'socks5://{}:{}'.format(host, port)
-            }
-            self._pool.append(items)
-            self._counter.setdefault(self.get_netloc(items), end_time)
-
-    def __check_proxies(self):
-        check_ip = 'https://myip.ipip.net'
-        logger.info(f"[socks5代理检验]访问地址-{check_ip}")
-        for proxies in self._pool:
-            try:
-                requests_param = {
-                    "headers": headers,
-                    "proxies": proxies,
-                    "timeout": 2
-                }
-                requests.get(check_ip, **requests_param)
-                self._dq.append(proxies)
-            except requests.RequestException:
-                del self._counter[self.get_netloc(proxies)]
-
-    def __flush_proxy_pool(self):
-        logger.info(f"[socks5代理]刷新代理池")
-        self._pool.clear()
-        self._dq.clear()
-        self._counter.clear()
-        self.__request_service()
-        self.__check_proxies()
-
-    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
-        if enable_proxy:
-            logger.info("[加载socks5代理]")
-            self._init()
-        return self
-
-
-Proxy = Socks5Proxy()

+ 0 - 24
jzsc/utils/tools.py

@@ -1,24 +0,0 @@
-import socket
-import hashlib
-
-
-def get_host_ip():
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    try:
-        s.connect(('8.8.8.8', 80))
-        ip = s.getsockname()[0]
-    finally:
-        s.close()
-    return ip
-
-
-def sha1(text: str):
-    """
-    十六进制数字字符串形式摘要值
-
-    @param text: 字符串文本
-    @return: 摘要值
-    """
-    _sha1 = hashlib.sha1()
-    _sha1.update(text.encode("utf-8"))
-    return _sha1.hexdigest()

+ 0 - 0
zgztb_cookie/FworkSpider/feapder/network/__init__.py → zgzb/.idea/.gitignore


+ 4 - 0
zgzb/.idea/encodings.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>

+ 36 - 0
zgzb/.idea/inspectionProfiles/Project_Default.xml

@@ -0,0 +1,36 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="23">
+            <item index="0" class="java.lang.String" itemvalue="rsa" />
+            <item index="1" class="java.lang.String" itemvalue="greenlet" />
+            <item index="2" class="java.lang.String" itemvalue="ws4py" />
+            <item index="3" class="java.lang.String" itemvalue="mysql-connector" />
+            <item index="4" class="java.lang.String" itemvalue="cffi" />
+            <item index="5" class="java.lang.String" itemvalue="asgiref" />
+            <item index="6" class="java.lang.String" itemvalue="zope.interface" />
+            <item index="7" class="java.lang.String" itemvalue="et-xmlfile" />
+            <item index="8" class="java.lang.String" itemvalue="pyasn1" />
+            <item index="9" class="java.lang.String" itemvalue="pycparser" />
+            <item index="10" class="java.lang.String" itemvalue="sqlparse" />
+            <item index="11" class="java.lang.String" itemvalue="jdcal" />
+            <item index="12" class="java.lang.String" itemvalue="ddt" />
+            <item index="13" class="java.lang.String" itemvalue="websocket" />
+            <item index="14" class="java.lang.String" itemvalue="gevent" />
+            <item index="15" class="java.lang.String" itemvalue="PyMySQL" />
+            <item index="16" class="java.lang.String" itemvalue="zope.event" />
+            <item index="17" class="java.lang.String" itemvalue="openpyxl" />
+            <item index="18" class="java.lang.String" itemvalue="Flask-Script" />
+            <item index="19" class="java.lang.String" itemvalue="Flask" />
+            <item index="20" class="java.lang.String" itemvalue="pymongo" />
+            <item index="21" class="java.lang.String" itemvalue="requests" />
+            <item index="22" class="java.lang.String" itemvalue="redis" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

+ 6 - 0
zgzb/.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

+ 4 - 0
zgzb/.idea/misc.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (python38)" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
zgzb/.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/zgzb.iml" filepath="$PROJECT_DIR$/.idea/zgzb.iml" />
+    </modules>
+  </component>
+</project>

+ 6 - 0
zgzb/.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$/.." vcs="Git" />
+  </component>
+</project>

+ 407 - 0
zgzb/.idea/workspace.xml

@@ -0,0 +1,407 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="aa35f1aa-aa7a-484d-85b7-8c3958190ef5" name="Changes" comment="update" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="HTML File" />
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$/.." />
+    <option name="SET_USER_NAME_GLOBALLY" value="false" />
+  </component>
+  <component name="GitSEFilterConfiguration">
+    <file-type-list>
+      <filtered-out-file-type name="LOCAL_BRANCH" />
+      <filtered-out-file-type name="REMOTE_BRANCH" />
+      <filtered-out-file-type name="TAG" />
+      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
+    </file-type-list>
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProblemsViewState">
+    <option name="selectedTabId" value="CurrentFile" />
+  </component>
+  <component name="ProjectId" id="26ifQQsXWVWiQV0Cvbah37V73vJ" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true">
+    <ConfirmationsSetting value="1" id="Add" />
+  </component>
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent">
+    <property name="DefaultHtmlFileTemplate" value="HTML File" />
+    <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
+    <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/common" />
+    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+    <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
+  </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/common" />
+      <recent name="$PROJECT_DIR$" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/crawler" />
+    </key>
+  </component>
+  <component name="RunManager" selected="Python.kbjl">
+    <configuration name="asd" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="zgzb" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/asd.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="crawl_spider" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="zgzb" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/crawl_spider.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="defaults" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="zgzb" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/crawler" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/crawler/defaults.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="kbjl" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="zgzb" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/kbjl.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <configuration name="zbxm" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="zgzb" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/zbxm.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.kbjl" />
+        <item itemvalue="Python.zbxm" />
+        <item itemvalue="Python.asd" />
+        <item itemvalue="Python.defaults" />
+        <item itemvalue="Python.crawl_spider" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="aa35f1aa-aa7a-484d-85b7-8c3958190ef5" name="Changes" comment="" />
+      <created>1647911310505</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1647911310505</updated>
+      <workItem from="1647911311873" duration="27789000" />
+      <workItem from="1647996492407" duration="22988000" />
+      <workItem from="1648090481790" duration="22069000" />
+      <workItem from="1648170709383" duration="24970000" />
+      <workItem from="1648210850640" duration="4346000" />
+      <workItem from="1648225231398" duration="1945000" />
+      <workItem from="1648287732886" duration="714000" />
+      <workItem from="1648290552384" duration="592000" />
+      <workItem from="1648630196724" duration="485000" />
+      <workItem from="1649134181455" duration="352000" />
+      <workItem from="1649240712522" duration="413000" />
+      <workItem from="1649243693605" duration="217000" />
+      <workItem from="1649323935993" duration="184000" />
+      <workItem from="1649387422380" duration="278000" />
+    </task>
+    <task id="LOCAL-00001" summary="new add project">
+      <created>1648027823994</created>
+      <option name="number" value="00001" />
+      <option name="presentableId" value="LOCAL-00001" />
+      <option name="project" value="LOCAL" />
+      <updated>1648027823994</updated>
+    </task>
+    <task id="LOCAL-00002" summary="update">
+      <created>1648028673565</created>
+      <option name="number" value="00002" />
+      <option name="presentableId" value="LOCAL-00002" />
+      <option name="project" value="LOCAL" />
+      <updated>1648028673565</updated>
+    </task>
+    <task id="LOCAL-00003" summary="update">
+      <created>1648099680634</created>
+      <option name="number" value="00003" />
+      <option name="presentableId" value="LOCAL-00003" />
+      <option name="project" value="LOCAL" />
+      <updated>1648099680634</updated>
+    </task>
+    <task id="LOCAL-00004" summary="update">
+      <created>1648102046587</created>
+      <option name="number" value="00004" />
+      <option name="presentableId" value="LOCAL-00004" />
+      <option name="project" value="LOCAL" />
+      <updated>1648102046587</updated>
+    </task>
+    <task id="LOCAL-00005" summary="update">
+      <created>1648105017189</created>
+      <option name="number" value="00005" />
+      <option name="presentableId" value="LOCAL-00005" />
+      <option name="project" value="LOCAL" />
+      <updated>1648105017189</updated>
+    </task>
+    <task id="LOCAL-00006" summary="update">
+      <created>1648107441825</created>
+      <option name="number" value="00006" />
+      <option name="presentableId" value="LOCAL-00006" />
+      <option name="project" value="LOCAL" />
+      <updated>1648107441825</updated>
+    </task>
+    <task id="LOCAL-00007" summary="update">
+      <created>1648108194492</created>
+      <option name="number" value="00007" />
+      <option name="presentableId" value="LOCAL-00007" />
+      <option name="project" value="LOCAL" />
+      <updated>1648108194492</updated>
+    </task>
+    <task id="LOCAL-00008" summary="update">
+      <created>1648113119359</created>
+      <option name="number" value="00008" />
+      <option name="presentableId" value="LOCAL-00008" />
+      <option name="project" value="LOCAL" />
+      <updated>1648113119359</updated>
+    </task>
+    <task id="LOCAL-00009" summary="update">
+      <created>1648124536707</created>
+      <option name="number" value="00009" />
+      <option name="presentableId" value="LOCAL-00009" />
+      <option name="project" value="LOCAL" />
+      <updated>1648124536707</updated>
+    </task>
+    <task id="LOCAL-00010" summary="update">
+      <created>1648171363627</created>
+      <option name="number" value="00010" />
+      <option name="presentableId" value="LOCAL-00010" />
+      <option name="project" value="LOCAL" />
+      <updated>1648171363627</updated>
+    </task>
+    <task id="LOCAL-00011" summary="update">
+      <created>1648201258851</created>
+      <option name="number" value="00011" />
+      <option name="presentableId" value="LOCAL-00011" />
+      <option name="project" value="LOCAL" />
+      <updated>1648201258851</updated>
+    </task>
+    <task id="LOCAL-00012" summary="update">
+      <created>1648201495540</created>
+      <option name="number" value="00012" />
+      <option name="presentableId" value="LOCAL-00012" />
+      <option name="project" value="LOCAL" />
+      <updated>1648201495540</updated>
+    </task>
+    <task id="LOCAL-00013" summary="update">
+      <created>1648210880862</created>
+      <option name="number" value="00013" />
+      <option name="presentableId" value="LOCAL-00013" />
+      <option name="project" value="LOCAL" />
+      <updated>1648210880862</updated>
+    </task>
+    <task id="LOCAL-00014" summary="update">
+      <created>1648212922653</created>
+      <option name="number" value="00014" />
+      <option name="presentableId" value="LOCAL-00014" />
+      <option name="project" value="LOCAL" />
+      <updated>1648212922653</updated>
+    </task>
+    <task id="LOCAL-00015" summary="update">
+      <created>1648215642262</created>
+      <option name="number" value="00015" />
+      <option name="presentableId" value="LOCAL-00015" />
+      <option name="project" value="LOCAL" />
+      <updated>1648215642262</updated>
+    </task>
+    <task id="LOCAL-00016" summary="update">
+      <created>1649134536691</created>
+      <option name="number" value="00016" />
+      <option name="presentableId" value="LOCAL-00016" />
+      <option name="project" value="LOCAL" />
+      <updated>1649134536691</updated>
+    </task>
+    <task id="LOCAL-00017" summary="update">
+      <created>1649241066908</created>
+      <option name="number" value="00017" />
+      <option name="presentableId" value="LOCAL-00017" />
+      <option name="project" value="LOCAL" />
+      <updated>1649241066908</updated>
+    </task>
+    <task id="LOCAL-00018" summary="update">
+      <created>1649243797490</created>
+      <option name="number" value="00018" />
+      <option name="presentableId" value="LOCAL-00018" />
+      <option name="project" value="LOCAL" />
+      <updated>1649243797491</updated>
+    </task>
+    <task id="LOCAL-00019" summary="update">
+      <created>1649324103198</created>
+      <option name="number" value="00019" />
+      <option name="presentableId" value="LOCAL-00019" />
+      <option name="project" value="LOCAL" />
+      <updated>1649324103198</updated>
+    </task>
+    <task id="LOCAL-00020" summary="update">
+      <created>1649387585449</created>
+      <option name="number" value="00020" />
+      <option name="presentableId" value="LOCAL-00020" />
+      <option name="project" value="LOCAL" />
+      <updated>1649387585449</updated>
+    </task>
+    <task id="LOCAL-00021" summary="update">
+      <created>1649387682334</created>
+      <option name="number" value="00021" />
+      <option name="presentableId" value="LOCAL-00021" />
+      <option name="project" value="LOCAL" />
+      <updated>1649387682334</updated>
+    </task>
+    <option name="localTasksCounter" value="22" />
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+  <component name="VcsManagerConfiguration">
+    <MESSAGE value="new add project" />
+    <MESSAGE value="update" />
+    <option name="LAST_COMMIT_MESSAGE" value="update" />
+  </component>
+  <component name="XDebuggerManager">
+    <breakpoint-manager>
+      <breakpoints>
+        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
+          <url>file://$PROJECT_DIR$/asd.py</url>
+          <line>15</line>
+          <option name="timeStamp" value="2" />
+        </line-breakpoint>
+        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
+          <url>file://$PROJECT_DIR$/asd.py</url>
+          <line>19</line>
+          <option name="timeStamp" value="4" />
+        </line-breakpoint>
+      </breakpoints>
+    </breakpoint-manager>
+  </component>
+  <component name="com.intellij.coverage.CoverageDataManagerImpl">
+    <SUITE FILE_PATH="coverage/zgzb$pbjs.coverage" NAME="pbjs Coverage Results" MODIFIED="1648026973206" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$defaults.coverage" NAME="defaults Coverage Results" MODIFIED="1648196618278" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/zgzb$wagf_detail_spider1.coverage" NAME="crawl_spider Coverage Results" MODIFIED="1647941385311" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$zbgg.coverage" NAME="zbgg Coverage Results" MODIFIED="1648024602176" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$crawl_spider.coverage" NAME="crawl_spider Coverage Results" MODIFIED="1648195125743" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/crawler" />
+    <SUITE FILE_PATH="coverage/zgzb$kbjl.coverage" NAME="kbjl Coverage Results" MODIFIED="1649387492823" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$errorhandler.coverage" NAME="errorhandler Coverage Results" MODIFIED="1647918661592" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="/usr/local/anaconda3/envs/python38/lib/python3.8/site-packages/selenium/webdriver/remote" />
+    <SUITE FILE_PATH="coverage/zgzb$aaa.coverage" NAME="aaa Coverage Results" MODIFIED="1648122795472" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$asd.coverage" NAME="asd Coverage Results" MODIFIED="1648211602552" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$socks5.coverage" NAME="socks5 Coverage Results" MODIFIED="1648016207012" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common" />
+    <SUITE FILE_PATH="coverage/zgzb$zbxm.coverage" NAME="zbxm Coverage Results" MODIFIED="1648216441451" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/zgzb$load.coverage" NAME="load Coverage Results" MODIFIED="1648026937821" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/config" />
+    <SUITE FILE_PATH="coverage/zgzb$tools.coverage" NAME="tools Coverage Results" MODIFIED="1648187145157" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/common" />
+  </component>
+</project>

+ 12 - 0
zgzb/.idea/zgzb.iml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="Python 3.8 (python38)" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

+ 0 - 64
zgztb_cookie/FworkSpider/Dockerfile

@@ -1,64 +0,0 @@
-# 拉取镜像
-FROM centos:centos7.9.2009
-
-# 配置容器时间
-RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
-# 添加快捷命令
-RUN echo "alias ll='ls -hall'" >> ~/.bashrc && source ~/.bashrc
-
-# 更新yum源, 并生成缓存
-RUN curl -o /etc/yum.repos.d/CentOS7-Aliyun.repo http://mirrors.aliyun.com/repo/Centos-7.repo && curl -o /etc/yum.repos.d/epel-7-Aliyun.repo http://mirrors.aliyun.com/repo/epel-7.repo
-RUN yum clean all && yum makecache && yum -y update
-RUN yum install -y wget unzip dbus-x11 kde-l10n-Chinese
-
-# 设置系统编码
-ENV LANG=zh_CN.UTF-8
-# 设置vi编码(防止中文乱码)
-RUN grep -qxF 'set encoding=utf8' /etc/virc || echo 'set encoding=utf8' >> /etc/virc
-
-# Allow container to access D-Bus
-ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
-
-# 开发环境安装
-WORKDIR /opt
-# 安装node, 更换npm源
-RUN curl -fsSL https://rpm.nodesource.com/setup_14.x | bash && yum -y install nodejs && npm config set registry http://registry.npmmirror.com
-# 设置全局NODE_PATH
-ENV NODE_PATH="/usr/lib/node_modules"
-
-# 安装 python3.8.10 gcc相关配置
-RUN yum --exclude=kernel* update -y && yum groupinstall -y 'Development Tools' && yum install -y gcc openssl-devel bzip2-devel libffi-devel gtk3 libXt glibc-common sqlite-devel
-# python3.8.10下载与解压缩
-RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
-
-# firefox环境安装
-# 下载和解压缩火狐浏览器
-RUN wget https://download-installer.cdn.mozilla.net/pub/firefox/releases/78.0/linux-x86_64/zh-CN/firefox-78.0.tar.bz2 && tar -jxvf firefox-78.0.tar.bz2 && ln -s /opt/firefox/firefox /usr/bin/firefox
-# 下载驱动并添加可执行权限
-RUN wget http://172.17.162.28:8888/geckodriver && chmod +x geckodriver && ln -s /opt/geckodriver /usr/bin/geckodriver
-
-# 切换python工作目录
-WORKDIR /opt/Python-3.8.10
-# 创建和指定python环境存放路径
-RUN mkdir /usr/local/python38 && ./configure --prefix=/usr/local/python38
-# 编译和安装
-RUN make -j 8 && make altinstall
-# 添加python3的软连接
-RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3.8 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
-# 更换pip源&更新pip
-RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple && pip3 install --upgrade pip
-# 虚拟环境加入系统环境变量
-ENV PATH="/usr/local/python38/bin:$PATH"
-
-# 安装 node 项目依赖
-RUN npm i -g crypto-js@4.1.1 js-md5@0.7.3 jsdom@19.0.0 jsencrypt@3.2.1 node-bignumber@1.2.2 xhr2@0.2.1
-
-# 指定框架安装路径
-WORKDIR /app
-COPY . .
-# 安装 python 项目依赖
-RUN python3 setup.py install
-RUN pip3 install -r requirements.txt
-
-# 指定工作目录
-WORKDIR /mnt
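The Dockerfile above assembles a CentOS 7 image with Python 3.8.10, Node.js, Firefox 78 and geckodriver for headless crawling. A minimal smoke test of that browser setup could look like the sketch below; it assumes selenium is pulled in by requirements.txt (not shown in this diff) and uses a placeholder URL:

    # Headless-Firefox reachability check inside the built image (sketch).
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options

    options = Options()
    options.add_argument("--headless")

    driver = webdriver.Firefox(
        executable_path="/usr/bin/geckodriver",  # symlink created by the Dockerfile
        options=options,
    )
    try:
        driver.get("https://example.com")  # placeholder URL
        print(driver.title)
    finally:
        driver.quit()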

+ 0 - 11
zgztb_cookie/FworkSpider/MANIFEST.in

@@ -1,11 +0,0 @@
-include README.md
-include LICENSE
-
-include feapder/requirements.txt
-include feapder/VERSION
-
-recursive-include feapder/utils/js *
-recursive-include feapder/templates *
-recursive-include tests *
-
-global-exclude __pycache__ *.py[cod]

+ 0 - 34
zgztb_cookie/FworkSpider/README.md

@@ -1,34 +0,0 @@
-# 中国招标投标公共服务平台
-
-## feapder框架安装
-    $ cd ../FworkSpider
-
-#### docker镜像构建
-    $ docker build -t cebpubservice:1.0 .
-
-#### docker容器启动 (--compatibility 兼容模式)
-    $ docker-compose --compatibility up -d
-
-#### 安装feapder全部依赖
-    $ pip install feapder[all]
-
-#### cron服务
-    # 安装cron服务依赖
-    $ yum install -y vixie-cron crontabs
-
-    # 启动cron服务
-    $ systemctl start crond.service
-
-    # cron服务加入开机启动项
-    $ systemctl enable crond.service
-
-    # 查看cron服务状态
-    $ systemctl status crond.service
-
-#### 添加定时配置
-    # 列表页爬虫
-    10 * * * * flock -xn /mnt/zgzbtb_spider.py -c 'cd /mnt && nohup python3 zgzbtb_spider.py > /dev/null &'
-    */5 * * * * flock -xn /mnt/zgzbtb_spider_m.py -c 'cd /mnt && nohup python3 zgzbtb_spider_m.py > /dev/null &'
-    */10 * * * * flock -xn /mnt/zgzbtb_spider_d.py -c 'cd /mnt && nohup python3 zgzbtb_spider_d.py > /dev/null &'
-    # 详情页爬虫
-    */5 * * * * bash /mnt/start.sh
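The cron entries above rely on flock -xn so that a new run is skipped while the previous one is still executing. The same single-instance guard can be sketched in Python as follows; the lock-file path is illustrative:

    import fcntl
    import sys

    # Equivalent of `flock -xn <lockfile> -c <cmd>`: take a non-blocking exclusive
    # lock and exit quietly if another instance already holds it.
    lock_file = open("/tmp/zgzbtb_spider.lock", "w")  # illustrative path
    try:
        fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        sys.exit(0)  # a previous crawl is still running

    # ... run the crawl here; the lock is released when the process exits ...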

+ 0 - 45
zgztb_cookie/FworkSpider/docker-compose.yml

@@ -1,45 +0,0 @@
-version: '3'
-services:
-  worker1:
-    container_name: cebpubspider-list
-    image: cebpubservice:1.0
-    restart: always
-    privileged: true
-    network_mode: "host"
-    shm_size: '2gb'
-    volumes:
-      - /mnt/zgztb_cookie:/mnt
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "500M"
-        max-file: "1"
-    deploy:
-      resources:
-        limits:
-          memory: 4G
-        reservations:
-          memory: 200M
-    command: /sbin/init
-
-  worker2:
-    container_name: cebpubspider-detail
-    image: cebpubservice:1.0
-    restart: always
-    privileged: true
-    network_mode: "host"
-    shm_size: '2gb'
-    volumes:
-      - /mnt/zgztb_cookie:/mnt
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "500M"
-        max-file: "1"
-    deploy:
-      resources:
-        limits:
-          memory: 4G
-        reservations:
-          memory: 200M
-    command: /sbin/init

+ 0 - 1
zgztb_cookie/FworkSpider/feapder/VERSION

@@ -1 +0,0 @@
-1.6.9

+ 0 - 33
zgztb_cookie/FworkSpider/feapder/__init__.py

@@ -1,33 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/21 10:41 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import os, sys
-import re
-
-sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
-
-__all__ = [
-    "AirSpider",
-    "Spider",
-    "BatchSpider",
-    "BaseParser",
-    "BatchParser",
-    "Request",
-    "Response",
-    "Item",
-    "UpdateItem",
-    "ArgumentParser",
-]
-
-from feapder.core.spiders import Spider, BatchSpider, AirSpider
-from feapder.core.base_parser import BaseParser, BatchParser
-from feapder.network.request import Request
-from feapder.network.response import Response
-from feapder.network.item import Item, UpdateItem
-from feapder.utils.custom_argparse import ArgumentParser
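The deleted feapder/__init__.py defines the framework's public surface (AirSpider, Spider, Request, Item, ...). For orientation, a minimal crawler built on those exports looks roughly like this sketch; the class name and URL are illustrative:

    import feapder

    class DemoSpider(feapder.AirSpider):
        def start_requests(self):
            # Seed request; the URL is only an example.
            yield feapder.Request("https://example.com")

        def parse(self, request, response):
            # feapder wraps the HTTP response for parsing.
            print(response.text[:100])

    if __name__ == "__main__":
        DemoSpider().start()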

+ 0 - 9
zgztb_cookie/FworkSpider/feapder/buffer/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-'''

+ 0 - 426
zgztb_cookie/FworkSpider/feapder/buffer/item_buffer.py

@@ -1,426 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-06-19 17:17
----------
-@summary: item 管理器, 负责缓冲添加到数据库中的item, 由该manager统一添加。防止多线程同时访问数据库
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import importlib
-import threading
-from queue import Queue
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.dedup import Dedup
-from feapder.network.item import Item, UpdateItem
-from feapder.pipelines import BasePipeline
-from feapder.pipelines.mysql_pipeline import MysqlPipeline
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-MAX_ITEM_COUNT = 5000  # 缓存中最大item数
-UPLOAD_BATCH_MAX_SIZE = 1000
-
-MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
-
-
-class ItemBuffer(threading.Thread):
-    dedup = None
-    __redis_db = None
-
-    def __init__(self, redis_key, task_table=None):
-        if not hasattr(self, "_table_item"):
-            super(ItemBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-            self._redis_key = redis_key
-            self._task_table = task_table
-
-            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
-
-            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
-            self._table_failed_items = setting.TAB_FAILED_ITEMS.format(
-                redis_key=redis_key
-            )
-
-            self._item_tables = {
-                # 'item_name': 'table_name' # 缓存item名与表名对应关系
-            }
-
-            self._item_update_keys = {
-                # 'table_name': ['id', 'name'...] # 缓存table_name与__update_key__的关系
-            }
-
-            self._pipelines = self.load_pipelines()
-
-            self._have_mysql_pipeline = MYSQL_PIPELINE_PATH in setting.ITEM_PIPELINES
-            self._mysql_pipeline = None
-
-            if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup:
-                self.__class__.dedup = Dedup(
-                    to_md5=False, **setting.ITEM_FILTER_SETTING
-                )
-
-            # 导出重试的次数
-            self.export_retry_times = 0
-            # 导出失败的次数 TODO 非air爬虫使用redis统计
-            self.export_falied_times = 0
-
-    @property
-    def redis_db(self):
-        if self.__class__.__redis_db is None:
-            self.__class__.__redis_db = RedisDB()
-
-        return self.__class__.__redis_db
-
-    def load_pipelines(self):
-        pipelines = []
-        for pipeline_path in setting.ITEM_PIPELINES:
-            module, class_name = pipeline_path.rsplit(".", 1)
-            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
-            pipeline = pipeline_cls()
-            if not isinstance(pipeline, BasePipeline):
-                raise ValueError(f"{pipeline_path} 需继承 feapder.pipelines.BasePipeline")
-            pipelines.append(pipeline)
-
-        return pipelines
-
-    @property
-    def mysql_pipeline(self):
-        if not self._mysql_pipeline:
-            module, class_name = MYSQL_PIPELINE_PATH.rsplit(".", 1)
-            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
-            self._mysql_pipeline = pipeline_cls()
-
-        return self._mysql_pipeline
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            self.flush()
-            tools.delay_time(1)
-
-        self.close()
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def put_item(self, item):
-        if isinstance(item, Item):
-            # 入库前的回调
-            item.pre_to_db()
-
-        self._items_queue.put(item)
-
-    def flush(self):
-        try:
-            items = []
-            update_items = []
-            requests = []
-            callbacks = []
-            items_fingerprints = []
-            data_count = 0
-
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
-                data_count += 1
-
-                # data 分类
-                if callable(data):
-                    callbacks.append(data)
-
-                elif isinstance(data, UpdateItem):
-                    update_items.append(data)
-
-                elif isinstance(data, Item):
-                    items.append(data)
-                    if setting.ITEM_FILTER_ENABLE:
-                        items_fingerprints.append(data.fingerprint)
-
-                else:  # request-redis
-                    requests.append(data)
-
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
-                    self.__add_item_to_db(
-                        items, update_items, requests, callbacks, items_fingerprints
-                    )
-
-                    items = []
-                    update_items = []
-                    requests = []
-                    callbacks = []
-                    items_fingerprints = []
-                    data_count = 0
-
-            if data_count:
-                self.__add_item_to_db(
-                    items, update_items, requests, callbacks, items_fingerprints
-                )
-
-        except Exception as e:
-            log.exception(e)
-
-    def get_items_count(self):
-        return self._items_queue.qsize()
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __dedup_items(self, items, items_fingerprints):
-        """
-        去重
-        @param items:
-        @param items_fingerprints:
-        @return: 返回去重后的items, items_fingerprints
-        """
-        if not items:
-            return items, items_fingerprints
-
-        is_exists = self.__class__.dedup.get(items_fingerprints)
-        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]
-
-        dedup_items = []
-        dedup_items_fingerprints = []
-        items_count = dedup_items_count = dup_items_count = 0
-
-        while is_exists:
-            item = items.pop(0)
-            items_fingerprint = items_fingerprints.pop(0)
-            is_exist = is_exists.pop(0)
-
-            items_count += 1
-
-            if not is_exist:
-                dedup_items.append(item)
-                dedup_items_fingerprints.append(items_fingerprint)
-                dedup_items_count += 1
-            else:
-                dup_items_count += 1
-
-        log.info(
-            "待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format(
-                items_count, dup_items_count, dedup_items_count
-            )
-        )
-
-        return dedup_items, dedup_items_fingerprints
-
-    def __pick_items(self, items, is_update_item=False):
-        """
-        将每个表之间的数据分开 拆分后 原items为空
-        @param items:
-        @param is_update_item:
-        @return:
-        """
-        datas_dict = {
-            # 'table_name': [{}, {}]
-        }
-
-        while items:
-            item = items.pop(0)
-            # 取item下划线格式的名
-            # 下划线类的名先从dict中取,没有则现取,然后存入dict。加快下次取的速度
-            item_name = item.item_name
-            table_name = self._item_tables.get(item_name)
-            if not table_name:
-                table_name = item.table_name
-                self._item_tables[item_name] = table_name
-
-            if table_name not in datas_dict:
-                datas_dict[table_name] = []
-
-            datas_dict[table_name].append(item.to_dict)
-
-            if is_update_item and table_name not in self._item_update_keys:
-                self._item_update_keys[table_name] = item.update_key
-
-        return datas_dict
-
-    def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        # 打点 校验
-        self.check_datas(table=table, datas=datas)
-
-        for pipeline in self._pipelines:
-            if is_update:
-                if table == self._task_table and not isinstance(
-                    pipeline, MysqlPipeline
-                ):
-                    continue
-
-                if not pipeline.update_items(table, datas, update_keys=update_keys):
-                    log.error(
-                        f"{pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
-                    )
-                    return False
-
-            else:
-                if not pipeline.save_items(table, datas):
-                    log.error(
-                        f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
-                    )
-                    return False
-
-        # 若是任务表, 且上面的pipeline里没mysql,则需调用mysql更新任务
-        if not self._have_mysql_pipeline and is_update and table == self._task_table:
-            if not self.mysql_pipeline.update_items(
-                table, datas, update_keys=update_keys
-            ):
-                log.error(
-                    f"{self.mysql_pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
-                )
-                return False
-
-        return True
-
-    def __add_item_to_db(
-        self, items, update_items, requests, callbacks, items_fingerprints
-    ):
-        export_success = True
-        self._is_adding_to_db = True
-
-        # 去重
-        if setting.ITEM_FILTER_ENABLE:
-            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
-
-        # 分捡
-        items_dict = self.__pick_items(items)
-        update_items_dict = self.__pick_items(update_items, is_update_item=True)
-
-        # item批量入库
-        failed_items = {"add": [], "update": [], "requests": []}
-        while items_dict:
-            table, datas = items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- item 批量入库 --------------
-                表名: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            if not self.__export_to_db(table, datas):
-                export_success = False
-                failed_items["add"].append({"table": table, "datas": datas})
-
-        # 执行批量update
-        while update_items_dict:
-            table, datas = update_items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- item 批量更新 --------------
-                表名: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            update_keys = self._item_update_keys.get(table)
-            if not self.__export_to_db(
-                table, datas, is_update=True, update_keys=update_keys
-            ):
-                export_success = False
-                failed_items["update"].append({"table": table, "datas": datas})
-
-        if export_success:
-            # 执行回调
-            while callbacks:
-                try:
-                    callback = callbacks.pop(0)
-                    callback()
-                except Exception as e:
-                    log.exception(e)
-
-            # 删除做过的request
-            if requests:
-                self.redis_db.zrem(self._table_request, requests)
-
-            # 去重入库
-            if setting.ITEM_FILTER_ENABLE:
-                if items_fingerprints:
-                    self.__class__.dedup.add(items_fingerprints, skip_check=True)
-        else:
-            failed_items["requests"] = requests
-
-            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
-                if self._redis_key != "air_spider":
-                    # 失败的item记录到redis
-                    self.redis_db.sadd(self._table_failed_items, failed_items)
-
-                    # 删除做过的request
-                    if requests:
-                        self.redis_db.zrem(self._table_request, requests)
-
-                    log.error(
-                        "入库超过最大重试次数,不再重试,数据记录到redis,items:\n {}".format(
-                            tools.dumps_json(failed_items)
-                        )
-                    )
-                self.export_retry_times = 0
-
-            else:
-                tip = ["入库不成功"]
-                if callbacks:
-                    tip.append("不执行回调")
-                if requests:
-                    tip.append("不删除任务")
-                    exists = self.redis_db.zexists(self._table_request, requests)
-                    for exist, request in zip(exists, requests):
-                        if exist:
-                            self.redis_db.zadd(self._table_request, requests, 300)
-
-                if setting.ITEM_FILTER_ENABLE:
-                    tip.append("数据不入去重库")
-
-                if self._redis_key != "air_spider":
-                    tip.append("将自动重试")
-
-                tip.append("失败items:\n {}".format(tools.dumps_json(failed_items)))
-                log.error(",".join(tip))
-
-                self.export_falied_times += 1
-
-                if self._redis_key != "air_spider":
-                    self.export_retry_times += 1
-
-            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
-                # 报警
-                msg = "《{}》爬虫导出数据失败,失败次数:{},请检查爬虫是否正常".format(
-                    self._redis_key, self.export_falied_times
-                )
-                log.error(msg)
-                tools.send_msg(
-                    msg=msg,
-                    level="error",
-                    message_prefix="《%s》爬虫导出数据失败" % (self._redis_key),
-                )
-
-        self._is_adding_to_db = False
-
-    def check_datas(self, table, datas):
-        """
-        打点 记录总条数及每个key情况
-        @param table: 表名
-        @param datas: 数据 列表
-        @return:
-        """
-        metrics.emit_counter("total count", len(datas), classify=table)
-        for data in datas:
-            for k, v in data.items():
-                metrics.emit_counter(k, int(bool(v)), classify=table)
-
-    def close(self):
-        # 调用pipeline的close方法
-        for pipeline in self._pipelines:
-            try:
-                pipeline.close()
-            except:
-                pass
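load_pipelines() above instantiates every dotted path listed in setting.ITEM_PIPELINES and insists on a feapder.pipelines.BasePipeline subclass; whether a batch counts as exported is decided by the boolean returned from save_items / update_items. A minimal custom pipeline is therefore roughly the following sketch (the class name and print statements are illustrative):

    from feapder.pipelines import BasePipeline

    class ConsolePipeline(BasePipeline):
        """Illustrative pipeline; registered via its dotted path in ITEM_PIPELINES."""

        def save_items(self, table, items) -> bool:
            # ItemBuffer hands over one table name and a batch of item dicts.
            print(f"save {len(items)} item(s) into {table}")
            return True  # True: exported; False: ItemBuffer retries the batch

        def update_items(self, table, items, update_keys=()) -> bool:
            print(f"update {len(items)} item(s) in {table} by keys {update_keys}")
            return True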

+ 0 - 151
zgztb_cookie/FworkSpider/feapder/buffer/request_buffer.py

@@ -1,151 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-06-19 17:17
----------
-@summary: request 管理器, 负责缓冲添加到数据库中的request
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import collections
-import threading
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.dedup import Dedup
-from feapder.utils.log import log
-
-MAX_URL_COUNT = 1000  # 缓存中最大request数
-
-
-class RequestBuffer(threading.Thread):
-    dedup = None
-
-    def __init__(self, redis_key):
-        if not hasattr(self, "_requests_deque"):
-            super(RequestBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-
-            self._requests_deque = collections.deque()
-            self._del_requests_deque = collections.deque()
-            self._db = RedisDB()
-
-            self._table_request = setting.TAB_REQUESTS.format(redis_key=redis_key)
-            self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
-                redis_key=redis_key
-            )
-
-            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
-                self.__class__.dedup = Dedup(
-                    name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
-                )  # 默认过期时间为一个月
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                self.__add_request_to_db()
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def put_request(self, request):
-        self._requests_deque.append(request)
-
-        if self.get_requests_count() > MAX_URL_COUNT:  # 超过最大缓存,主动调用
-            self.flush()
-
-    def put_del_request(self, request):
-        self._del_requests_deque.append(request)
-
-    def put_failed_request(self, request, table=None):
-        try:
-            request_dict = request.to_dict
-            self._db.zadd(
-                table or self._table_failed_request, request_dict, request.priority
-            )
-        except Exception as e:
-            log.exception(e)
-
-    def flush(self):
-        try:
-            self.__add_request_to_db()
-        except Exception as e:
-            log.exception(e)
-
-    def get_requests_count(self):
-        return len(self._requests_deque)
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __add_request_to_db(self):
-        request_list = []
-        prioritys = []
-        callbacks = []
-
-        while self._requests_deque:
-            request = self._requests_deque.popleft()
-            self._is_adding_to_db = True
-
-            if callable(request):
-                # 函数
-                # 注意:应该考虑闭包情况。闭包情况可写成
-                # def test(xxx = xxx):
-                #     # TODO 业务逻辑 使用 xxx
-                # 这么写不会导致xxx为循环结束后的最后一个值
-                callbacks.append(request)
-                continue
-
-            priority = request.priority
-
-            # 如果需要去重并且库中已重复 则continue
-            if (
-                request.filter_repeat
-                and setting.REQUEST_FILTER_ENABLE
-                and not self.__class__.dedup.add(request.fingerprint)
-            ):
-                log.debug("request已存在  url = %s" % request.url)
-                continue
-            else:
-                request_list.append(str(request.to_dict))
-                prioritys.append(priority)
-
-            if len(request_list) > MAX_URL_COUNT:
-                self._db.zadd(self._table_request, request_list, prioritys)
-                request_list = []
-                prioritys = []
-
-        # 入库
-        if request_list:
-            self._db.zadd(self._table_request, request_list, prioritys)
-
-        # 执行回调
-        for callback in callbacks:
-            try:
-                callback()
-            except Exception as e:
-                log.exception(e)
-
-        # 删除已做任务
-        if self._del_requests_deque:
-            request_done_list = []
-            while self._del_requests_deque:
-                request_done_list.append(self._del_requests_deque.popleft())
-
-            # 去掉request_list中的requests, 否则可能会将刚添加的request删除
-            request_done_list = list(set(request_done_list) - set(request_list))
-
-            if request_done_list:
-                self._db.zrem(self._table_request, request_done_list)
-
-        self._is_adding_to_db = False
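The closure note inside __add_request_to_db ("闭包情况可写成 def test(xxx = xxx)") refers to Python's late-binding closures: callbacks created in a loop all see the loop variable's final value unless it is frozen as a default argument. A minimal illustration:

    # Late binding: every callback sees the final value of i.
    callbacks = [lambda: print(i) for i in range(3)]
    for cb in callbacks:
        cb()  # prints 2, 2, 2

    # Freezing the value as a default argument, as the comment suggests:
    callbacks = [lambda i=i: print(i) for i in range(3)]
    for cb in callbacks:
        cb()  # prints 0, 1, 2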

+ 0 - 45
zgztb_cookie/FworkSpider/feapder/commands/cmdline.py

@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/5/8 2:24 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import sys
-from os.path import dirname, join
-
-from feapder.commands import create_builder
-from feapder.commands import shell
-
-
-def _print_commands():
-    with open(join(dirname(dirname(__file__)), "VERSION"), "rb") as f:
-        version = f.read().decode("ascii").strip()
-
-    print("feapder {}".format(version))
-    print("\nUsage:")
-    print("  feapder <command> [options] [args]\n")
-    print("Available commands:")
-    cmds = {"create": "create project、spider、item and so on", "shell": "debug response"}
-    for cmdname, cmdclass in sorted(cmds.items()):
-        print("  %-13s %s" % (cmdname, cmdclass))
-
-    print('\nUse "feapder <command> -h" to see more info about a command')
-
-
-def execute():
-    args = sys.argv
-    if len(args) < 2:
-        _print_commands()
-        return
-
-    command = args.pop(1)
-    if command == "create":
-        create_builder.main()
-    elif command == "shell":
-        shell.main()
-    else:
-        _print_commands()

+ 0 - 21
zgztb_cookie/FworkSpider/feapder/commands/create/__init__.py

@@ -1,21 +0,0 @@
-__all__ = [
-    "CreateProject",
-    "CreateSpider",
-    "CreateItem",
-    "CreateInit",
-    "CreateJson",
-    "CreateTable",
-    "CreateCookies",
-    "CreateSetting",
-    "CreateParams",
-]
-
-from .create_table import CreateTable
-from .create_json import CreateJson
-from .create_spider import CreateSpider
-from .create_init import CreateInit
-from .create_item import CreateItem
-from .create_project import CreateProject
-from .create_cookies import CreateCookies
-from .create_setting import CreateSetting
-from .create_params import CreateParams

+ 0 - 48
zgztb_cookie/FworkSpider/feapder/commands/create/create_cookies.py

@@ -1,48 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/25 10:22 上午
----------
-@summary: 将浏览器的cookie转为request的cookie
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import json
-import sys
-
-from feapder.utils.tools import get_cookies_from_str, print_pretty
-
-
-class CreateCookies:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入浏览器cookie (列表或字符串格式)")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-
-            data.append(line)
-
-        return "".join(data)
-
-    def create(self):
-        data = self.get_data()
-        cookies = {}
-        try:
-            data_json = json.loads(data)
-
-            for data in data_json:
-                cookies[data.get("name")] = data.get("value")
-
-        except:
-            cookies = get_cookies_from_str(data)
-
-        print_pretty(cookies)
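CreateCookies accepts either the JSON list exported by browser devtools or a raw Cookie header string. A short sketch of both paths (all values are made up):

    import json
    from feapder.utils.tools import get_cookies_from_str

    # Devtools-style JSON export:
    raw = '[{"name": "SESSIONID", "value": "abc123"}, {"name": "uid", "value": "42"}]'
    print({c.get("name"): c.get("value") for c in json.loads(raw)})
    # -> {'SESSIONID': 'abc123', 'uid': '42'}

    # Raw Cookie header string:
    print(get_cookies_from_str("SESSIONID=abc123; uid=42"))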

+ 0 - 30
zgztb_cookie/FworkSpider/feapder/commands/create/create_init.py

@@ -1,30 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建__init__.py
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-from feapder.utils.tools import dumps_json
-
-
-class CreateInit:
-    def create(self):
-        __all__ = []
-
-        import os
-
-        path = os.getcwd()
-        for file in os.listdir(path):
-            if file.endswith(".py") and not file.startswith("__init__"):
-                model = file.split(".")[0]
-                __all__.append(model)
-
-        del os
-
-        with open("__init__.py", "w", encoding="utf-8") as file:
-            text = "__all__ = %s" % dumps_json(__all__)
-            file.write(text)

+ 0 - 165
zgztb_cookie/FworkSpider/feapder/commands/create/create_item.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建item
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.db.mysqldb import MysqlDB
-from .create_init import CreateInit
-
-
-def deal_file_info(file):
-    file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
-    return file
-
-
-class CreateItem:
-    def __init__(self):
-        self._db = MysqlDB()
-        self._create_init = CreateInit()
-
-    def select_columns(self, table_name):
-        # sql = 'SHOW COLUMNS FROM ' + table_name
-        sql = f"SELECT COLUMN_NAME, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, COLUMN_KEY, COLUMN_COMMENT FROM INFORMATION_SCHEMA.Columns WHERE table_name = '{table_name}' and table_schema = '{setting.MYSQL_DB}'"
-        columns = self._db.find(sql)
-
-        return columns
-
-    def select_tables_name(self, tables_name):
-        """
-        @summary:
-        ---------
-        @param tables_name: 一类tables 如 qidian*
-        ---------
-        @result:
-        """
-        sql = f"select table_name from information_schema.tables where table_name like '{tables_name}' and table_schema = '{setting.MYSQL_DB}'"
-        tables_name = self._db.find(sql)
-
-        return tables_name
-
-    def convert_table_name_to_hump(self, table_name):
-        """
-        @summary: 格式化表名为驼峰格式
-        ---------
-        @param table_name:
-        ---------
-        @result:
-        """
-        table_hump_format = ""
-
-        words = table_name.split("_")
-        for word in words:
-            table_hump_format += word.capitalize()  # 首字母大写
-
-        return table_hump_format
-
-    def get_item_template(self):
-        template_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates/item_template.tmpl")
-        )
-        with open(template_path, "r", encoding="utf-8") as file:
-            item_template = file.read()
-
-        return item_template
-
-    def create_item(self, item_template, columns, table_name, support_dict):
-        table_name_hump_format = self.convert_table_name_to_hump(table_name)
-        # 组装 类名
-        item_template = item_template.replace("${item_name}", table_name_hump_format)
-        if support_dict:
-            item_template = item_template.replace("${table_name}", table_name + " 1")
-        else:
-            item_template = item_template.replace("${table_name}", table_name)
-
-        # 组装 属性
-        propertys = ""
-        for column in columns:
-            column_name = column[0]
-            column_type = column[1]
-            is_nullable = column[2]
-            column_default = column[3]
-            column_extra = column[4]
-            column_key = column[5]
-            column_comment = column[6]
-
-            try:
-                value = (
-                    "kwargs.get('{column_name}')".format(column_name=column_name)
-                    if support_dict
-                    else (
-                        column_default != "CURRENT_TIMESTAMP" and column_default or None
-                    )
-                    and eval(column_default)
-                )
-            except:
-                value = (
-                    "kwargs.get('{column_name}')".format(column_name=column_name)
-                    if support_dict
-                    else (
-                        column_default != "CURRENT_TIMESTAMP" and column_default or None
-                    )
-                    and column_default
-                )
-
-            if column_extra == "auto_increment" or column_default is not None:
-                propertys += f"# self.{column_name} = {value}"
-
-            else:
-                if value is None or isinstance(value, (float, int)) or support_dict:
-                    propertys += f"self.{column_name} = {value}"
-                else:
-                    propertys += f"self.{column_name} = '{value}'"
-
-            if column_comment:
-                propertys += f"  # {column_comment}"
-            propertys += "\n" + " " * 8
-
-        item_template = item_template.replace("${propertys}", propertys.strip())
-        item_template = deal_file_info(item_template)
-
-        return item_template
-
-    def save_template_to_file(self, item_template, table_name):
-        item_file = table_name + "_item.py"
-        if os.path.exists(item_file):
-            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % item_file)
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        with open(item_file, "w", encoding="utf-8") as file:
-            file.write(item_template)
-            print("\n%s 生成成功" % item_file)
-
-        self._create_init.create()
-
-    def create(self, tables_name, support_dict):
-        input_tables_name = tables_name
-
-        tables_name = self.select_tables_name(tables_name)
-        if not tables_name:
-            print(tables_name)
-            tip = "mysql数据库中无 %s 表 " % input_tables_name
-            raise KeyError(tip)
-
-        for table_name in tables_name:
-            table_name = table_name[0]
-
-            columns = self.select_columns(table_name)
-            item_template = self.get_item_template()
-            item_template = self.create_item(
-                item_template, columns, table_name, support_dict
-            )
-            self.save_template_to_file(item_template, table_name)
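The generated class name comes from convert_table_name_to_hump, which capitalizes each underscore-separated part of the table name, and save_template_to_file writes the result to <table_name>_item.py. A standalone restatement with an invented table name:

    def convert_table_name_to_hump(table_name):
        # Same logic as the method above, shown in isolation.
        return "".join(word.capitalize() for word in table_name.split("_"))

    print(convert_table_name_to_hump("bidding_list"))  # BiddingList (file: bidding_list_item.py)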

+ 0 - 52
zgztb_cookie/FworkSpider/feapder/commands/create/create_json.py

@@ -1,52 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 字符串转json
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import sys
-
-import feapder.utils.tools as tools
-
-
-class CreateJson:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入需要转换的内容: (xxx:xxx格式,支持多行)")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip().replace("\t", " " * 4)
-            if not line:
-                break
-
-            data.append(line)
-
-        return data
-
-    def create(self, sort_keys=False):
-        contents = self.get_data()
-
-        json = {}
-        for content in contents:
-            content = content.strip()
-            if not content or content.startswith(":"):
-                continue
-
-            regex = "([^:\s]*)[:|\s]*(.*)"
-
-            result = tools.get_info(content, regex, fetch_one=True)
-            if result[0] in json:
-                json[result[0]] = json[result[0]] + "&" + result[1]
-            else:
-                json[result[0]] = result[1].strip()
-
-        print(tools.dumps_json(json, sort_keys=sort_keys))
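CreateJson turns pasted key: value lines (typically request headers) into a JSON object via the regex shown above. One line in isolation behaves like this simplified equivalent (the header value is invented):

    import re

    line = "User-Agent: Mozilla/5.0"
    key, value = re.search(r"([^:\s]*)[:|\s]*(.*)", line, re.S).groups()
    print({key: value.strip()})  # {'User-Agent': 'Mozilla/5.0'}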

+ 0 - 51
zgztb_cookie/FworkSpider/feapder/commands/create/create_params.py

@@ -1,51 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/25 10:22 上午
----------
-@summary: 解析请求地址中的参数
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import sys
-
-from feapder.utils.tools import dumps_json
-
-
-class CreateParams:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入请求地址")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-
-            data.append(line)
-
-        return "".join(data)
-
-    def get_params(self, url):
-        params_json = {}
-        params = url.split("?")[-1].split("&")
-        for param in params:
-            key_value = param.split("=", 1)
-            params_json[key_value[0]] = key_value[1]
-
-        return params_json
-
-    def create(self):
-        data = self.get_data()
-
-        params = self.get_params(data)
-        url = data.split("?")[0]
-
-        print(f'url = "{url}"')
-        print(f"params = {dumps_json(params)}")
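CreateParams splits a pasted URL into the bare address plus a params dict, mirroring how requests-style calls are written. Roughly, with a made-up URL:

    url = "http://www.example.com/search?page=2&size=20"
    base, query = url.split("?", 1)
    params = dict(kv.split("=", 1) for kv in query.split("&"))
    print(base)    # http://www.example.com/search
    print(params)  # {'page': '2', 'size': '20'}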

+ 0 - 52
zgztb_cookie/FworkSpider/feapder/commands/create/create_project.py

@@ -1,52 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建项目
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-import shutil
-
-import feapder.utils.tools as tools
-
-
-def deal_file_info(file):
-    file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
-    return file
-
-
-class CreateProject:
-    def copy_callback(self, src, dst, *, follow_symlinks=True):
-        if src.endswith(".py"):
-            with open(src, "r", encoding="utf-8") as src_file, open(
-                dst, "w", encoding="utf8"
-            ) as dst_file:
-                content = src_file.read()
-                content = deal_file_info(content)
-                dst_file.write(content)
-
-        else:
-            shutil.copy2(src, dst, follow_symlinks=follow_symlinks)
-
-    def create(self, project_name):
-        if os.path.exists(project_name):
-            print("%s 项目已经存在" % project_name)
-        else:
-            template_path = os.path.abspath(
-                os.path.join(__file__, "../../../templates/project_template")
-            )
-            shutil.copytree(
-                template_path, project_name, copy_function=self.copy_callback
-            )
-
-            print("\n%s 项目生成成功" % project_name)
-
-
-

+ 0 - 27
zgztb_cookie/FworkSpider/feapder/commands/create/create_setting.py

@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/23 13:20
----------
-@summary: 生成配置文件
----------
-@author: mkdir700
-@email:  mkdir700@gmail.com
-"""
-
-import os
-import shutil
-
-
-class CreateSetting:
-    def create(self):
-        if os.path.exists("setting.py"):
-            confirm = input("配置文件已存在 是否覆盖 (y/n).  ")
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        template_file_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates/project_template/setting.py")
-        )
-        shutil.copy(template_file_path, "./", follow_symlinks=False)
-        print("配置文件生成成功")

+ 0 - 102
zgztb_cookie/FworkSpider/feapder/commands/create/create_spider.py

@@ -1,102 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建spider
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-import re
-
-import feapder.utils.tools as tools
-from .create_init import CreateInit
-
-
-def deal_file_info(file):
-    file = file.replace("{DATE}", tools.get_current_date())
-    # file = file.replace("{USER}", getpass.getuser())
-    file = file.replace("{USER}", os.path.basename(os.getcwd()))
-
-    return file
-
-
-class CreateSpider:
-    def __init__(self):
-        self._create_init = CreateInit()
-
-    def cover_to_underline(self, key):
-        regex = "[A-Z]*"
-        capitals = re.findall(regex, key)
-
-        if capitals:
-            for pos, capital in enumerate(capitals):
-                if not capital:
-                    continue
-                if pos == 0:
-                    if len(capital) > 1:
-                        key = key.replace(capital, capital.lower() + "_", 1)
-                    else:
-                        key = key.replace(capital, capital.lower(), 1)
-                else:
-                    if len(capital) > 1:
-                        key = key.replace(capital, "_" + capital.lower() + "_", 1)
-                    else:
-                        key = key.replace(capital, "_" + capital.lower(), 1)
-
-        return key
-
-    def get_spider_template(self, spider_type):
-        if spider_type == 1:
-            template_path = "air_spider_template.tmpl"
-        elif spider_type == 2:
-            template_path = "spider_template.tmpl"
-        elif spider_type == 3:
-            template_path = "batch_spider_template.tmpl"
-        elif spider_type == 4:
-            template_path = "spider_list_template.tmpl"
-        else:
-            raise ValueError("spider type error, support 1 2 3 4")
-
-        template_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates", template_path)
-        )
-        with open(template_path, "r", encoding="utf-8") as file:
-            spider_template = file.read()
-
-        return spider_template
-
-    def create_spider(self, spider_template, spider_name):
-        spider_template = spider_template.replace("${spider_name}", spider_name)
-        spider_template = deal_file_info(spider_template)
-        return spider_template
-
-    def save_spider_to_file(self, spider, spider_name):
-        spider_underline = self.cover_to_underline(spider_name)
-        spider_file = spider_underline + ".py"
-
-        if os.path.exists(spider_file):
-            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % spider_file)
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        with open(spider_file, "w", encoding="utf-8") as file:
-            file.write(spider)
-            print("\n%s 生成成功" % spider_name)
-
-        self._create_init.create()
-
-    def create(self, spider_name, spider_type):
-        # 检查spider_name
-        if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
-            raise Exception("爬虫名不符合命名规范,请用下划线命名或驼峰命名方式")
-
-        if spider_name.islower():
-            spider_name = tools.key2hump(spider_name)
-        spider_template = self.get_spider_template(spider_type)
-        spider = self.create_spider(spider_template, spider_name)
-        self.save_spider_to_file(spider, spider_name)

+ 0 - 135
zgztb_cookie/FworkSpider/feapder/commands/create/create_table.py

@@ -1,135 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 根据json生成表
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import sys
-import time
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.mysqldb import MysqlDB
-from feapder.utils.tools import key2underline
-
-
-class CreateTable:
-    def __init__(self):
-        self._db = MysqlDB()
-
-    def is_vaild_date(self, date):
-        try:
-            if ":" in date:
-                time.strptime(date, "%Y-%m-%d %H:%M:%S")
-            else:
-                time.strptime(date, "%Y-%m-%d")
-            return True
-        except:
-            return False
-
-    def get_key_type(self, value):
-        try:
-            value = eval(value)
-        except:
-            value = value
-
-        key_type = "varchar(255)"
-        if isinstance(value, int):
-            key_type = "int"
-        elif isinstance(value, float):
-            key_type = "double"
-        elif isinstance(value, str):
-            if self.is_vaild_date(value):
-                if ":" in value:
-                    key_type = "datetime"
-                else:
-                    key_type = "date"
-            elif len(value) > 255:
-                key_type = "text"
-            else:
-                key_type = "varchar(255)"
-
-        return key_type
-
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        data = ""
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-            data += line
-
-        return tools.get_json(data)
-
-    def create(self, table_name):
-        # 输入表字段
-        print('请输入表数据 json格式 如 {"name":"张三"}\n等待输入:\n')
-        data = self.get_data()
-
-        if not isinstance(data, dict):
-            raise Exception("表数据格式不正确")
-
-        # 拼接表结构
-        sql = """
-            CREATE TABLE `{db}`.`{table_name}` (
-                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id 自动递增',
-                {other_key}
-                `gtime` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间',
-                PRIMARY KEY (`id`),
-                {unique}
-            ) COMMENT='';
-        """
-
-        print("请设置注释 回车跳过")
-        other_key = ""
-        for key, value in data.items():
-            key = key2underline(key)
-            key_type = self.get_key_type(value)
-
-            comment = input("%s : %s  -> comment:" % (key, key_type))
-
-            other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
-                key=key, key_type=key_type, comment=comment
-            )
-
-        print("\n")
-
-        while True:
-            is_need_batch_date = input("是否添加batch_date 字段 (y/n):")
-            if is_need_batch_date == "y":
-                other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
-                    key="batch_date", key_type="date", comment="批次时间"
-                )
-                break
-            elif is_need_batch_date == "n":
-                break
-
-        print("\n")
-
-        while True:
-            unique = input("请设置唯一索引, 多个逗号间隔\n等待输入:\n").replace(",", ",")
-            if unique:
-                break
-        unique = "UNIQUE `idx` USING BTREE (`%s`) comment ''" % "`,`".join(
-            unique.split(",")
-        )
-
-        sql = sql.format(
-            db=setting.MYSQL_DB,
-            table_name=table_name,
-            other_key=other_key,
-            unique=unique,
-        )
-        print(sql)
-        self._db.execute(sql)
-        print("\n%s 创建成功" % table_name)
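get_key_type above drives the column types in the generated DDL. The sketch below restates that inference standalone, with invented sample values, to show the mapping:

    import time

    def infer_key_type(value):
        # Re-implementation of get_key_type above, for illustration only.
        try:
            value = eval(value)  # "12.5" -> 12.5, "100" -> 100
        except Exception:
            pass
        if isinstance(value, int):
            return "int"
        if isinstance(value, float):
            return "double"
        if isinstance(value, str):
            for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
                try:
                    time.strptime(value, fmt)
                    return "datetime" if ":" in value else "date"
                except ValueError:
                    pass
            return "text" if len(value) > 255 else "varchar(255)"
        return "varchar(255)"

    print(infer_key_type("2022-03-24 10:00:00"))  # datetime
    print(infer_key_type("12.5"))                 # double
    print(infer_key_type("招标公告"))              # varchar(255)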

+ 0 - 118
zgztb_cookie/FworkSpider/feapder/commands/create_builder.py

@@ -1,118 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/2/8 11:21 上午
----------
-@summary: 生成器
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import argparse
-
-import feapder.setting as setting
-from feapder.commands.create import *
-
-
-def main():
-    spider = argparse.ArgumentParser(description="生成器")
-
-    spider.add_argument(
-        "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
-    )
-    spider.add_argument(
-        "-s",
-        "--spider",
-        nargs="+",
-        help="创建爬虫\n"
-        "如 feapder create -s <spider_name> <spider_type> "
-        "spider_type=1  AirSpider; "
-        "spider_type=2  Spider; "
-        "spider_type=3  BatchSpider;",
-        metavar="",
-    )
-    spider.add_argument(
-        "-i",
-        "--item",
-        nargs="+",
-        help="创建item 如 feapder create -i test 则生成test表对应的item。 "
-        "支持like语法模糊匹配所要生成的表。 "
-        "若想生成支持字典方式赋值的item,则create -i test 1",
-        metavar="",
-    )
-    spider.add_argument(
-        "-t", "--table", help="根据json创建表 如 feapder create -t <table_name>", metavar=""
-    )
-    spider.add_argument(
-        "-init", help="创建__init__.py 如 feapder create -init", action="store_true"
-    )
-    spider.add_argument("-j", "--json", help="创建json", action="store_true")
-    spider.add_argument("-sj", "--sort_json", help="创建有序json", action="store_true")
-    spider.add_argument("-c", "--cookies", help="创建cookie", action="store_true")
-    spider.add_argument("--params", help="解析地址中的参数", action="store_true")
-    spider.add_argument(
-        "--setting", help="创建全局配置文件" "feapder create --setting", action="store_true"
-    )
-
-    # 指定数据库
-    spider.add_argument("--host", type=str, help="mysql 连接地址", metavar="")
-    spider.add_argument("--port", type=str, help="mysql 端口", metavar="")
-    spider.add_argument("--username", type=str, help="mysql 用户名", metavar="")
-    spider.add_argument("--password", type=str, help="mysql 密码", metavar="")
-    spider.add_argument("--db", type=str, help="mysql 数据库名", metavar="")
-    args = spider.parse_args()
-
-    if args.host:
-        setting.MYSQL_IP = args.host
-    if args.port:
-        setting.MYSQL_PORT = int(args.port)
-    if args.username:
-        setting.MYSQL_USER_NAME = args.username
-    if args.password:
-        setting.MYSQL_USER_PASS = args.password
-    if args.db:
-        setting.MYSQL_DB = args.db
-
-    if args.item:
-        item_name, *support_dict = args.item
-        support_dict = bool(support_dict)
-        CreateItem().create(item_name, support_dict)
-
-    elif args.spider:
-        spider_name, *spider_type = args.spider
-        if not spider_type:
-            spider_type = 1
-        else:
-            spider_type = spider_type[0]
-        try:
-            spider_type = int(spider_type)
-        except:
-            raise ValueError("spider_type error, support 1, 2, 3, 4")
-        CreateSpider().create(spider_name, spider_type)
-
-    elif args.project:
-        CreateProject().create(args.project)
-
-    elif args.table:
-        CreateTable().create(args.table)
-
-    elif args.init:
-        CreateInit().create()
-
-    elif args.json:
-        CreateJson().create()
-
-    elif args.sort_json:
-        CreateJson().create(sort_keys=True)
-
-    elif args.cookies:
-        CreateCookies().create()
-
-    elif args.setting:
-        CreateSetting().create()
-
-    elif args.params:
-        CreateParams().create()
-
-
-if __name__ == "__main__":
-    main()
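The deleted create_builder.py is only a command-line front end: it parses the feapder create options, optionally overrides the MySQL settings, and dispatches to the Create* helpers imported from feapder.commands.create. A rough sketch of the equivalent direct calls, assuming those helper classes remain importable (the project, spider and table names are placeholders):

from feapder.commands.create import CreateProject, CreateSpider, CreateItem

CreateProject().create("my_project")    # feapder create -p my_project
CreateSpider().create("my_spider", 2)   # spider_type 2 -> Spider template
CreateItem().create("my_table", True)   # item supporting dict-style assignment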

+ 0 - 93
zgztb_cookie/FworkSpider/feapder/commands/shell.py

@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/5/9 12:37 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import json
-import re
-import sys
-
-import IPython
-
-from feapder import Request
-
-
-def request(**kwargs):
-    kwargs.setdefault("proxies", None)
-    response = Request(**kwargs).get_response()
-    print(response)
-
-    IPython.embed(header="now you can use response")
-
-
-def fetch_url(url):
-    request(url=url)
-
-
-def fetch_curl(curl_args):
-    """
-    解析及抓取curl请求
-    :param curl_args:
-    [url, '-H', 'xxx', '-H', 'xxx', '--data-binary', '{"xxx":"xxx"}', '--compressed']
-    :return:
-    """
-    url = curl_args[0]
-    curl_args.pop(0)
-
-    headers = {}
-    data = {}
-    for i in range(0, len(curl_args), 2):
-        if curl_args[i] == "-H":
-            regex = "([^:\s]*)[:|\s]*(.*)"
-            result = re.search(regex, curl_args[i + 1], re.S).groups()
-            if result[0] in headers:
-                headers[result[0]] = headers[result[0]] + "&" + result[1]
-            else:
-                headers[result[0]] = result[1].strip()
-
-        elif curl_args[i] == "--data-binary":
-            data = json.loads(curl_args[i + 1])
-
-    request(url=url, data=data, headers=headers)
-
-
-def usage():
-    """
-下载调试器
-
-usage: feapder shell [options] [args]
-
-optional arguments:
-  -u, --url     抓取指定url
-  -c, --curl    抓取curl格式的请求
-
-    """
-    print(usage.__doc__)
-    sys.exit()
-
-
-def main():
-    args = sys.argv
-    if len(args) < 3:
-        usage()
-
-    elif args[1] in ("-h", "--help"):
-        usage()
-
-    elif args[1] in ("-u", "--url"):
-        fetch_url(args[2])
-
-    elif args[1] in ("-c", "--curl"):
-        fetch_curl(args[2:])
-
-    else:
-        usage()
-
-
-if __name__ == "__main__":
-    main()
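The deleted shell.py backs the feapder shell download debugger (-u fetches a URL, -c replays a curl command) and drops into IPython with the response in scope. A hedged in-process sketch of fetch_curl using the argument layout from its docstring; the URL, header and payload values are placeholders, and feapder plus IPython are assumed to be installed:

from feapder.commands.shell import fetch_curl  # module removed by this commit

fetch_curl([
    "https://example.com/api",            # url comes first
    "-H", "Accept: application/json",     # repeated -H pairs become headers
    "--data-binary", '{"page": 1}',       # parsed with json.loads
    "--compressed",
])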

+ 0 - 9
zgztb_cookie/FworkSpider/feapder/core/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-'''

+ 0 - 216
zgztb_cookie/FworkSpider/feapder/core/base_parser.py

@@ -1,216 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:41:57
----------
-@summary: parser 的基类
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-import os
-
-import feapder.utils.tools as tools
-from feapder.db.mysqldb import MysqlDB
-from feapder.network.item import UpdateItem
-from feapder.utils.log import log
-
-
-class BaseParser(object):
-    def start_requests(self):
-        """
-        @summary: 添加初始url
-        ---------
-        ---------
-        @result: yield Request()
-        """
-
-        pass
-
-    def download_midware(self, request):
-        """
-        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
-        ---------
-        @param request:
-        ---------
-        @result: return request / request, response
-        """
-
-        pass
-
-    def validate(self, request, response):
-        """
-        @summary: 校验函数, 可用于校验response是否正确
-        若函数内抛出异常,则重试请求
-        若返回True 或 None,则进入解析函数
-        若返回False,则抛弃当前请求
-        可通过request.callback_name 区分不同的回调函数,编写不同的校验逻辑
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result: True / None / False
-        """
-        pass
-
-    def parse(self, request, response):
-        """
-        @summary: 默认的解析函数
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result:
-        """
-
-        pass
-
-    def exception_request(self, request, response, e):
-        """
-        @summary: 请求或者parser里解析出异常的request
-        ---------
-        @param request:
-        @param response:
-        @param e:
-        ---------
-        @result: request / callback / None (返回值必须可迭代)
-        """
-
-        pass
-
-    def failed_request(self, request, response):
-        """
-        @summary: 超过最大重试次数的request
-        可返回修改后的request  若不返回request,则将传进来的request直接人redis的failed表。否则将修改后的request入failed表
-        ---------
-        @param request:
-        ---------
-        @result: request / item / callback / None (返回值必须可迭代)
-        """
-
-        pass
-
-    def start_callback(self):
-        """
-        @summary: 程序开始的回调
-        ---------
-        ---------
-        @result: None
-        """
-
-        pass
-
-    def end_callback(self):
-        """
-        @summary: 程序结束的回调
-        ---------
-        ---------
-        @result: None
-        """
-
-        pass
-
-    @property
-    def name(self):
-        return self.__class__.__name__
-
-    def close(self):
-        pass
-
-
-class BatchParser(BaseParser):
-    """
-    @summary: 批次爬虫模版
-    ---------
-    """
-
-    def __init__(
-        self, task_table, batch_record_table, task_state, date_format, mysqldb=None
-    ):
-        self._mysqldb = mysqldb or MysqlDB()  # mysqldb
-
-        self._task_table = task_table  # mysql中的任务表
-        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
-        self._task_state = task_state  # mysql中任务表的state字段名
-        self._date_format = date_format  # 批次日期格式
-
-    def add_task(self):
-        """
-        @summary: 添加任务, 每次启动start_monitor 都会调用,且在init_task之前调用
-        ---------
-        ---------
-        @result:
-        """
-
-    def start_requests(self, task):
-        """
-        @summary:
-        ---------
-        @param task: 任务信息 list
-        ---------
-        @result:
-        """
-
-    def update_task_state(self, task_id, state=1, **kwargs):
-        """
-        @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用。可能会重写
-        调用方法为 yield lambda : self.update_task_state(task_id, state)
-        ---------
-        @param task_id:
-        @param state:
-        ---------
-        @result:
-        """
-
-        kwargs["id"] = task_id
-        kwargs[self._task_state] = state
-
-        sql = tools.make_update_sql(
-            self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
-        )
-
-        if self._mysqldb.update(sql):
-            log.debug("置任务%s状态成功" % task_id)
-        else:
-            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
-
-    def update_task_batch(self, task_id, state=1, **kwargs):
-        """
-        批量更新任务 多处调用,更新的字段必须一致
-        注意:需要 写成 yield update_task_batch(...) 否则不会更新
-        @param task_id:
-        @param state:
-        @param kwargs:
-        @return:
-        """
-        kwargs["id"] = task_id
-        kwargs[self._task_state] = state
-
-        update_item = UpdateItem(**kwargs)
-        update_item.table_name = self._task_table
-        update_item.name_underline = self._task_table + "_item"
-
-        return update_item
-
-    @property
-    def batch_date(self):
-        """
-        @summary: 获取批次时间
-        ---------
-        ---------
-        @result:
-        """
-
-        batch_date = os.environ.get("batch_date")
-        if not batch_date:
-            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
-                date_format=self._date_format.replace(":%M", ":%i"),
-                batch_record_table=self._batch_record_table,
-            )
-            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
-            if batch_info:
-                os.environ["batch_date"] = batch_date = batch_info[0][0]
-            else:
-                log.error("需先运行 start_monitor_task()")
-                os._exit(137)  # 使退出码为35072 方便爬虫管理器重启
-
-        return batch_date
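The deleted base_parser.py defines the contract every parser implements: start_requests yields seed Requests, parse handles responses, validate can retry (by raising), continue (True/None) or drop (False) a reply, and BatchParser adds the MySQL task-table helpers. A minimal sketch of a subclass against that contract; the class name and URL are illustrative only:

import feapder
from feapder.core.base_parser import BaseParser  # module removed by this commit


class ExampleParser(BaseParser):
    def start_requests(self):
        yield feapder.Request("https://example.com/list")

    def validate(self, request, response):
        # True/None -> go on to parse, False -> drop, raising -> retry
        return response.status_code == 200

    def parse(self, request, response):
        print(self.name, response)  # self.name is the class-name property above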

+ 0 - 176
zgztb_cookie/FworkSpider/feapder/core/collector.py

@@ -1,176 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-12-23 11:24
----------
-@summary: request 管理
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import collections
-import threading
-import time
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-
-class Collector(threading.Thread):
-    def __init__(self, redis_key):
-        """
-        @summary:
-        ---------
-        @param redis_key:
-        ---------
-        @result:
-        """
-
-        super(Collector, self).__init__()
-        self._db = RedisDB()
-
-        self._thread_stop = False
-
-        self._todo_requests = collections.deque()
-
-        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
-
-        self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"
-
-        self._interval = setting.COLLECTOR_SLEEP_TIME
-        self._request_count = setting.COLLECTOR_TASK_COUNT
-        self._is_collector_task = False
-        self._first_get_task = True
-
-        self.__delete_dead_node()
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                self.__report_node_heartbeat()
-                self.__input_data()
-            except Exception as e:
-                log.exception(e)
-
-            self._is_collector_task = False
-
-            time.sleep(self._interval)
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def __input_data(self):
-        current_timestamp = tools.get_current_timestamp()
-        if len(self._todo_requests) >= self._request_count:
-            return
-
-        request_count = self._request_count  # 先赋值
-        # 查询最近有心跳的节点数量
-        spider_count = self._db.zget_count(
-            self._tab_spider_status,
-            priority_min=current_timestamp - (self._interval + 10),
-            priority_max=current_timestamp,
-        )
-        # 根据等待节点数量,动态分配request
-        if spider_count:
-            # 任务数量
-            task_count = self._db.zget_count(self._tab_requests)
-            # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
-            request_count = task_count // spider_count + 1
-
-        request_count = (
-            request_count
-            if request_count <= self._request_count
-            else self._request_count
-        )
-
-        if not request_count:
-            return
-
-        # 当前无其他节点,并且是首次取任务,则重置丢失的任务
-        if self._first_get_task and spider_count <= 1:
-            datas = self._db.zrangebyscore_set_score(
-                self._tab_requests,
-                priority_min=current_timestamp,
-                priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
-                score=300,
-                count=None,
-            )
-            self._first_get_task = False
-            lose_count = len(datas)
-            if lose_count:
-                log.info("重置丢失任务完毕,共{}条".format(len(datas)))
-
-        # 取任务,只取当前时间搓以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
-        requests_list = self._db.zrangebyscore_set_score(
-            self._tab_requests,
-            priority_min="-inf",
-            priority_max=current_timestamp,
-            score=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
-            count=request_count,
-        )
-
-        if requests_list:
-            self._is_collector_task = True
-            # 存request
-            self.__put_requests(requests_list)
-
-    def __report_node_heartbeat(self):
-        """
-        汇报节点心跳,以便任务平均分配
-        """
-        self._db.zadd(
-            self._tab_spider_status, self._spider_mark, tools.get_current_timestamp()
-        )
-
-    def __delete_dead_node(self):
-        """
-        删除没有心跳的节点信息
-        """
-        self._db.zremrangebyscore(
-            self._tab_spider_status,
-            "-inf",
-            tools.get_current_timestamp() - (self._interval + 10),
-        )
-
-    def __put_requests(self, requests_list):
-        for request in requests_list:
-            try:
-                request_dict = {
-                    "request_obj": Request.from_dict(eval(request)),
-                    "request_redis": request,
-                }
-            except Exception as e:
-                log.exception(
-                    """
-                error %s
-                request %s
-                """
-                    % (e, request)
-                )
-
-                request_dict = None
-
-            if request_dict:
-                self._todo_requests.append(request_dict)
-
-    def get_requests(self, count):
-        requests = []
-        count = count if count <= len(self._todo_requests) else len(self._todo_requests)
-        while count:
-            requests.append(self._todo_requests.popleft())
-            count -= 1
-
-        return requests
-
-    def get_requests_count(self):
-        return len(self._todo_requests) or self._db.zget_count(self._tab_requests) or 0
-
-    def is_collector_task(self):
-        return self._is_collector_task
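The deleted Collector is a thread that reports a heartbeat into the spider-status zset, sizes its batch by the number of live nodes, and moves due requests from the redis request table into a local deque for the parser threads. A rough usage sketch, assuming redis and the TAB_* / COLLECTOR_* settings referenced above are configured; "test" stands in for the real redis_key:

from feapder.core.collector import Collector  # module removed by this commit

collector = Collector(redis_key="test")   # table names come from setting.TAB_REQUESTS / TAB_SPIDER_STATUS
collector.start()                         # background loop: heartbeat + __input_data

batch = collector.get_requests(count=32)  # up to 32 requests buffered locally
print(collector.get_requests_count())     # local buffer size, else the redis backlog
collector.stop()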

+ 0 - 56
zgztb_cookie/FworkSpider/feapder/core/handle_failed_requests.py

@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-13 11:43:01
----------
-@summary:
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-import feapder.setting as setting
-from feapder.buffer.request_buffer import RequestBuffer
-from feapder.db.redisdb import RedisDB
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-
-class HandleFailedRequests(object):
-    """docstring for HandleFailedRequests"""
-
-    def __init__(self, redis_key):
-        super(HandleFailedRequests, self).__init__()
-        self._redis_key = redis_key
-
-        self._redisdb = RedisDB()
-        self._request_buffer = RequestBuffer(self._redis_key)
-
-        self._table_failed_request = setting.TAB_FAILED_REQUESTS.format(
-            redis_key=redis_key
-        )
-
-    def get_failed_requests(self, count=10000):
-        failed_requests = self._redisdb.zget(self._table_failed_request, count=count)
-        failed_requests = [eval(failed_request) for failed_request in failed_requests]
-        return failed_requests
-
-    def reput_failed_requests_to_requests(self):
-        log.debug("正在重置失败的requests...")
-        total_count = 0
-        while True:
-            try:
-                failed_requests = self.get_failed_requests()
-                if not failed_requests:
-                    break
-
-                for request in failed_requests:
-                    request["retry_times"] = 0
-                    request_obj = Request.from_dict(request)
-                    self._request_buffer.put_request(request_obj)
-
-                    total_count += 1
-            except Exception as e:
-                log.exception(e)
-
-        self._request_buffer.flush()
-
-        log.debug("重置%s条失败requests为待抓取requests" % total_count)

+ 0 - 721
zgztb_cookie/FworkSpider/feapder/core/parser_control.py

@@ -1,721 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2017-01-03 16:06
----------
-@summary: parser 控制类
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import random
-import threading
-import time
-from collections import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.db.memory_db import MemoryDB
-from feapder.network.item import Item
-from feapder.network.request import Request
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-
-class PaserControl(threading.Thread):
-    DOWNLOAD_EXCEPTION = "download_exception"
-    DOWNLOAD_SUCCESS = "download_success"
-    DOWNLOAD_TOTAL = "download_total"
-    PAESERS_EXCEPTION = "parser_exception"
-
-    is_show_tip = False
-
-    # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警
-    _success_task_count = 0
-    _failed_task_count = 0
-
-    def __init__(self, collector, redis_key, request_buffer, item_buffer):
-        super(PaserControl, self).__init__()
-        self._parsers = []
-        self._collector = collector
-        self._redis_key = redis_key
-        self._request_buffer = request_buffer
-        self._item_buffer = item_buffer
-
-        self._thread_stop = False
-
-        self._wait_task_time = 0
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
-                if not requests:
-                    if not self.is_show_tip:
-                        log.debug("parser 等待任务...")
-                        self.is_show_tip = True
-
-                    # log.debug('parser 等待任务{}...'.format(tools.format_seconds(self._wait_task_time)))
-
-                    time.sleep(1)
-                    self._wait_task_time += 1
-                    continue
-
-                self.is_show_tip = False
-                self.deal_requests(requests)
-
-            except (Exception, BaseException) as e:
-                log.exception(e)
-                time.sleep(3)
-
-    def is_not_task(self):
-        return self.is_show_tip
-
-    @classmethod
-    def get_task_status_count(cls):
-        return cls._failed_task_count, cls._success_task_count
-
-    def deal_requests(self, requests):
-        for request in requests:
-
-            response = None
-            request_redis = request["request_redis"]
-            request = request["request_obj"]
-
-            del_request_redis_after_item_to_db = False
-            del_request_redis_after_request_to_db = False
-
-            for parser in self._parsers:
-                if parser.name == request.parser_name:
-                    used_download_midware_enable = False
-                    try:
-                        # 记录需下载的文档
-                        self.record_download_status(
-                            PaserControl.DOWNLOAD_TOTAL, parser.name
-                        )
-
-                        # 解析request
-                        if request.auto_request:
-                            request_temp = None
-                            response = None
-
-                            # 下载中间件
-                            if request.download_midware:
-                                if isinstance(request.download_midware, (list, tuple)):
-                                    request_temp = request
-                                    for download_midware in request.download_midware:
-                                        download_midware = (
-                                            download_midware
-                                            if callable(download_midware)
-                                            else tools.get_method(
-                                                parser, download_midware
-                                            )
-                                        )
-                                        request_temp = download_midware(request_temp)
-                                else:
-                                    download_midware = (
-                                        request.download_midware
-                                        if callable(request.download_midware)
-                                        else tools.get_method(
-                                            parser, request.download_midware
-                                        )
-                                    )
-                                    request_temp = download_midware(request)
-                            elif request.download_midware != False:
-                                request_temp = parser.download_midware(request)
-
-                            # 请求
-                            if request_temp:
-                                if (
-                                    isinstance(request_temp, (tuple, list))
-                                    and len(request_temp) == 2
-                                ):
-                                    request_temp, response = request_temp
-
-                                if not isinstance(request_temp, Request):
-                                    raise Exception(
-                                        "download_midware need return a request, but received type: {}".format(
-                                            type(request_temp)
-                                        )
-                                    )
-                                used_download_midware_enable = True
-                                if not response:
-                                    response = (
-                                        request_temp.get_response()
-                                        if not setting.RESPONSE_CACHED_USED
-                                        else request_temp.get_response_from_cached(
-                                            save_cached=False
-                                        )
-                                    )
-                            else:
-                                response = (
-                                    request.get_response()
-                                    if not setting.RESPONSE_CACHED_USED
-                                    else request.get_response_from_cached(
-                                        save_cached=False
-                                    )
-                                )
-
-                            if response == None:
-                                raise Exception("连接超时 url: %s" % (
-                                            request.url or request_temp.url))
-
-                            # 校验
-                            if parser.validate(request, response) == False:
-                                continue
-
-                        else:
-                            response = None
-
-                        if request.callback:  # 如果有parser的回调函数,则用回调处理
-                            callback_parser = (
-                                request.callback
-                                if callable(request.callback)
-                                else tools.get_method(parser, request.callback)
-                            )
-                            results = callback_parser(request, response)
-                        else:  # 否则默认用parser处理
-                            results = parser.parse(request, response)
-
-                        if results and not isinstance(results, Iterable):
-                            raise Exception(
-                                "%s.%s返回值必须可迭代"
-                                % (parser.name, request.callback or "parse")
-                            )
-
-                        # 标识上一个result是什么
-                        result_type = 0  # 0\1\2 (初始值\request\item)
-                        # 此处判断是request 还是 item
-                        for result in results or []:
-                            if isinstance(result, Request):
-                                result_type = 1
-                                # 给request的 parser_name 赋值
-                                result.parser_name = result.parser_name or parser.name
-
-                                # 判断是同步的callback还是异步的
-                                if result.request_sync:  # 同步
-                                    request_dict = {
-                                        "request_obj": result,
-                                        "request_redis": None,
-                                    }
-                                    requests.append(request_dict)
-                                else:  # 异步
-                                    # 将next_request 入库
-                                    self._request_buffer.put_request(result)
-                                    del_request_redis_after_request_to_db = True
-
-                            elif isinstance(result, Item):
-                                result_type = 2
-                                # 将item入库
-                                self._item_buffer.put_item(result)
-                                # 需删除已完成的request
-                                del_request_redis_after_item_to_db = True
-
-                            elif callable(result):  # result为可执行的无参函数
-                                if (
-                                    result_type == 2
-                                ):  # item 的 callback,buffer里的item均入库后再执行
-                                    self._item_buffer.put_item(result)
-                                    del_request_redis_after_item_to_db = True
-
-                                else:  # result_type == 1: # request 的 callback,buffer里的request均入库后再执行。可能有的parser直接返回callback
-                                    self._request_buffer.put_request(result)
-                                    del_request_redis_after_request_to_db = True
-
-                            # else:
-                            #     raise TypeError('Expect Request、Item、callback func, bug get type: {}'.format(type(result)))
-
-                    except (Exception, BaseException) as e:
-                        exception_type = (
-                            str(type(e)).replace("<class '", "").replace("'>", "")
-                        )
-                        if exception_type.startswith("requests"):
-                            # 记录下载失败的文档
-                            self.record_download_status(
-                                PaserControl.DOWNLOAD_EXCEPTION, parser.name
-                            )
-
-                        else:
-                            # 记录解析程序异常
-                            self.record_download_status(
-                                PaserControl.PAESERS_EXCEPTION, parser.name
-                            )
-
-                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
-                            log.exception(e)
-
-                        log.error(
-                            """
-                            -------------- %s.%s error -------------
-                            error          %s
-                            response       %s
-                            deal request   %s
-                            """
-                            % (
-                                parser.name,
-                                (
-                                    request.callback
-                                    and callable(request.callback)
-                                    and getattr(request.callback, "__name__")
-                                    or request.callback
-                                )
-                                or "parse",
-                                str(e),
-                                response,
-                                tools.dumps_json(request.to_dict, indent=28)
-                                if setting.LOG_LEVEL == "DEBUG"
-                                else request,
-                            )
-                        )
-
-                        request.error_msg = "%s: %s" % (exception_type, e)
-                        request.response = str(response)
-
-                        if "Invalid URL" in str(e):
-                            request.is_abandoned = True
-
-                        requests = parser.exception_request(request, response, e) or [
-                            request
-                        ]
-                        if not isinstance(requests, Iterable):
-                            raise Exception(
-                                "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
-                            )
-                        for request in requests:
-                            if callable(request):
-                                self._request_buffer.put_request(request)
-                                continue
-
-                            if not isinstance(request, Request):
-                                raise Exception("exception_request 需 yield request")
-
-                            if (
-                                request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
-                                or request.is_abandoned
-                            ):
-                                self.__class__._failed_task_count += 1  # 记录失败任务数
-
-                                # 处理failed_request的返回值 request 或 func
-                                results = parser.failed_request(request, response) or [
-                                    request
-                                ]
-                                if not isinstance(results, Iterable):
-                                    raise Exception(
-                                        "%s.%s返回值必须可迭代"
-                                        % (parser.name, "failed_request")
-                                    )
-
-                                for result in results:
-                                    if isinstance(result, Request):
-                                        if setting.SAVE_FAILED_REQUEST:
-                                            if used_download_midware_enable:
-                                                # 去掉download_midware 添加的属性
-                                                original_request = (
-                                                    Request.from_dict(
-                                                        eval(request_redis)
-                                                    )
-                                                    if request_redis
-                                                    else result
-                                                )
-                                                original_request.error_msg = (
-                                                    request.error_msg
-                                                )
-                                                original_request.response = (
-                                                    request.response
-                                                )
-
-                                                self._request_buffer.put_failed_request(
-                                                    original_request
-                                                )
-                                            else:
-                                                self._request_buffer.put_failed_request(
-                                                    result
-                                                )
-
-                                    elif callable(result):
-                                        self._request_buffer.put_request(result)
-
-                                    elif isinstance(result, Item):
-                                        self._item_buffer.put_item(result)
-
-                                del_request_redis_after_request_to_db = True
-
-                            else:
-                                # 将 requests 重新入库 爬取
-                                request.retry_times += 1
-                                request.filter_repeat = False
-                                log.info(
-                                    """
-                                    入库 等待重试
-                                    url     %s
-                                    重试次数 %s
-                                    最大允许重试次数 %s"""
-                                    % (
-                                        request.url,
-                                        request.retry_times,
-                                        setting.SPIDER_MAX_RETRY_TIMES,
-                                    )
-                                )
-                                if used_download_midware_enable:
-                                    # 去掉download_midware 添加的属性 使用原来的requests
-                                    original_request = (
-                                        Request.from_dict(eval(request_redis))
-                                        if request_redis
-                                        else request
-                                    )
-                                    if hasattr(request, "error_msg"):
-                                        original_request.error_msg = request.error_msg
-                                    if hasattr(request, "response"):
-                                        original_request.response = request.response
-                                    original_request.retry_times = request.retry_times
-                                    original_request.filter_repeat = (
-                                        request.filter_repeat
-                                    )
-
-                                    self._request_buffer.put_request(original_request)
-                                else:
-                                    self._request_buffer.put_request(request)
-                                del_request_redis_after_request_to_db = True
-
-                    else:
-                        # 记录下载成功的文档
-                        self.record_download_status(
-                            PaserControl.DOWNLOAD_SUCCESS, parser.name
-                        )
-                        # 记录成功任务数
-                        self.__class__._success_task_count += 1
-
-                        # 缓存下载成功的文档
-                        if setting.RESPONSE_CACHED_ENABLE:
-                            request.save_cached(
-                                response=response,
-                                expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
-                            )
-
-                    finally:
-                        # 释放浏览器
-                        if response and hasattr(response, "browser"):
-                            request._webdriver_pool.put(response.browser)
-
-                    break
-
-            # 删除正在做的request 跟随item优先
-            if request_redis:
-                if del_request_redis_after_item_to_db:
-                    self._item_buffer.put_item(request_redis)
-
-                elif del_request_redis_after_request_to_db:
-                    self._request_buffer.put_del_request(request_redis)
-
-                else:
-                    self._request_buffer.put_del_request(request_redis)
-
-        if setting.SPIDER_SLEEP_TIME:
-            if (
-                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
-                and len(setting.SPIDER_SLEEP_TIME) == 2
-            ):
-                sleep_time = random.randint(
-                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
-                )
-                time.sleep(sleep_time)
-            else:
-                time.sleep(setting.SPIDER_SLEEP_TIME)
-
-    def record_download_status(self, status, spider):
-        """
-        记录html等文档下载状态
-        @return:
-        """
-
-        metrics.emit_counter(f"{spider}:{status}", 1, classify="document")
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def add_parser(self, parser):
-        self._parsers.append(parser)
-
-
-class AirSpiderParserControl(PaserControl):
-    is_show_tip = False
-
-    # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警
-    _success_task_count = 0
-    _failed_task_count = 0
-
-    def __init__(self, memory_db: MemoryDB, item_buffer: ItemBuffer):
-        super(PaserControl, self).__init__()
-        self._parsers = []
-        self._memory_db = memory_db
-        self._thread_stop = False
-        self._wait_task_time = 0
-        self._item_buffer = item_buffer
-
-    def run(self):
-        while not self._thread_stop:
-            try:
-                request = self._memory_db.get()
-                if not request:
-                    if not self.is_show_tip:
-                        log.debug("parser 等待任务...")
-                        self.is_show_tip = True
-
-                    time.sleep(1)
-                    self._wait_task_time += 1
-                    continue
-
-                self.is_show_tip = False
-                self.deal_request(request)
-
-            except (Exception, BaseException) as e:
-                log.exception(e)
-                time.sleep(3)
-
-    def deal_request(self, request):
-        response = None
-
-        for parser in self._parsers:
-            if parser.name == request.parser_name:
-                try:
-                    # 记录需下载的文档
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_TOTAL, parser.name
-                    )
-
-                    # 解析request
-                    if request.auto_request:
-                        request_temp = None
-                        response = None
-
-                        # 下载中间件
-                        if request.download_midware:
-                            if isinstance(request.download_midware, (list, tuple)):
-                                request_temp = request
-                                for download_midware in request.download_midware:
-                                    download_midware = (
-                                        download_midware
-                                        if callable(download_midware)
-                                        else tools.get_method(
-                                            parser, download_midware
-                                        )
-                                    )
-                                    request_temp = download_midware(request_temp)
-                            else:
-                                download_midware = (
-                                    request.download_midware
-                                    if callable(request.download_midware)
-                                    else tools.get_method(
-                                        parser, request.download_midware
-                                    )
-                                )
-                                request_temp = download_midware(request)
-                        elif request.download_midware != False:
-                            request_temp = parser.download_midware(request)
-
-                        # 请求
-                        if request_temp:
-                            if (
-                                isinstance(request_temp, (tuple, list))
-                                and len(request_temp) == 2
-                            ):
-                                request_temp, response = request_temp
-
-                            if not isinstance(request_temp, Request):
-                                raise Exception(
-                                    "download_midware need return a request, but received type: {}".format(
-                                        type(request_temp)
-                                    )
-                                )
-                            request = request_temp
-
-                        if not response:
-                            response = (
-                                request.get_response()
-                                if not setting.RESPONSE_CACHED_USED
-                                else request.get_response_from_cached(
-                                    save_cached=False
-                                )
-                            )
-
-                        # 校验
-                        if parser.validate(request, response) == False:
-                            break
-
-                    else:
-                        response = None
-
-                    if request.callback:  # 如果有parser的回调函数,则用回调处理
-                        callback_parser = (
-                            request.callback
-                            if callable(request.callback)
-                            else tools.get_method(parser, request.callback)
-                        )
-                        results = callback_parser(request, response)
-                    else:  # 否则默认用parser处理
-                        results = parser.parse(request, response)
-
-                    if results and not isinstance(results, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代"
-                            % (parser.name, request.callback or "parse")
-                        )
-
-                    # 此处判断是request 还是 item
-                    for result in results or []:
-                        if isinstance(result, Request):
-                            # 给request的 parser_name 赋值
-                            result.parser_name = result.parser_name or parser.name
-
-                            # 判断是同步的callback还是异步的
-                            if result.request_sync:  # 同步
-                                self.deal_request(result)
-                            else:  # 异步
-                                # 将next_request 入库
-                                self._memory_db.add(result)
-
-                        elif isinstance(result, Item):
-                            self._item_buffer.put_item(result)
-
-                except (Exception, BaseException) as e:
-                    exception_type = (
-                        str(type(e)).replace("<class '", "").replace("'>", "")
-                    )
-                    if exception_type.startswith("requests"):
-                        # 记录下载失败的文档
-                        self.record_download_status(
-                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
-                        )
-
-                    else:
-                        # 记录解析程序异常
-                        self.record_download_status(
-                            PaserControl.PAESERS_EXCEPTION, parser.name
-                        )
-
-                    if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
-                        log.exception(e)
-
-                    log.error(
-                        """
-                            -------------- %s.%s error -------------
-                            error          %s
-                            response       %s
-                            deal request   %s
-                            """
-                        % (
-                            parser.name,
-                            (
-                                request.callback
-                                and callable(request.callback)
-                                and getattr(request.callback, "__name__")
-                                or request.callback
-                            )
-                            or "parse",
-                            str(e),
-                            response,
-                            tools.dumps_json(request.to_dict, indent=28)
-                            if setting.LOG_LEVEL == "DEBUG"
-                            else request,
-                        )
-                    )
-
-                    request.error_msg = "%s: %s" % (exception_type, e)
-                    request.response = str(response)
-
-                    if "Invalid URL" in str(e):
-                        request.is_abandoned = True
-
-                    requests = parser.exception_request(request, response, e) or [
-                        request
-                    ]
-                    if not isinstance(requests, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
-                        )
-                    for request in requests:
-                        if not isinstance(request, Request):
-                            raise Exception("exception_request 需 yield request")
-
-                        if (
-                            request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
-                            or request.is_abandoned
-                        ):
-                            self.__class__._failed_task_count += 1  # 记录失败任务数
-
-                            # 处理failed_request的返回值 request 或 func
-                            results = parser.failed_request(request, response) or [
-                                request
-                            ]
-                            if not isinstance(results, Iterable):
-                                raise Exception(
-                                    "%s.%s返回值必须可迭代"
-                                    % (parser.name, "failed_request")
-                                )
-
-                            log.info(
-                                """
-                                任务超过最大重试次数,丢弃
-                                url     %s
-                                重试次数 %s
-                                最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-
-                        else:
-                            # 将 requests 重新入库 爬取
-                            request.retry_times += 1
-                            request.filter_repeat = False
-                            log.info(
-                                """
-                                    入库 等待重试
-                                    url     %s
-                                    重试次数 %s
-                                    最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-                            self._memory_db.add(request)
-
-                else:
-                    # 记录下载成功的文档
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_SUCCESS, parser.name
-                    )
-                    # 记录成功任务数
-                    self.__class__._success_task_count += 1
-
-                    # 缓存下载成功的文档
-                    if setting.RESPONSE_CACHED_ENABLE:
-                        request.save_cached(
-                            response=response,
-                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
-                        )
-
-                finally:
-                    # 释放浏览器
-                    if response and hasattr(response, "browser"):
-                        request._webdriver_pool.put(response.browser)
-
-                break
-
-        if setting.SPIDER_SLEEP_TIME:
-            if (
-                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
-                and len(setting.SPIDER_SLEEP_TIME) == 2
-            ):
-                sleep_time = random.randint(
-                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
-                )
-                time.sleep(sleep_time)
-            else:
-                time.sleep(setting.SPIDER_SLEEP_TIME)
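At its core the deleted parser control applies one dispatch rule to whatever a parse callback yields: Requests go back to the request buffer (or are handled synchronously), Items go to the item buffer, and bare callables are deferred until the corresponding buffer has flushed to the database. A simplified standalone sketch of that rule; dispatch is not a feapder API, just an illustration of the branch above:

from feapder.network.item import Item
from feapder.network.request import Request


def dispatch(results, request_buffer, item_buffer):
    # Mirrors the result handling in deal_requests() / deal_request() above.
    for result in results or []:
        if isinstance(result, Request):
            request_buffer.put_request(result)   # queued for the next fetch round
        elif isinstance(result, Item):
            item_buffer.put_item(result)         # batched for storage
        elif callable(result):
            request_buffer.put_request(result)   # executed only after the buffer flushes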

+ 0 - 579
zgztb_cookie/FworkSpider/feapder/core/scheduler.py

@@ -1,579 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2017-01-09 10:38
----------
-@summary: 组装parser、 parser_control 和 collector
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import threading
-import time
-from collections import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.buffer.request_buffer import RequestBuffer
-from feapder.core.base_parser import BaseParser
-from feapder.core.collector import Collector
-from feapder.core.handle_failed_requests import HandleFailedRequests
-from feapder.core.parser_control import PaserControl
-from feapder.db.redisdb import RedisDB
-from feapder.network.item import Item
-from feapder.network.request import Request
-from feapder.utils.log import log
-from feapder.utils.redis_lock import RedisLock
-from feapder.utils import metrics
-
-SPIDER_START_TIME_KEY = "spider_start_time"
-SPIDER_END_TIME_KEY = "spider_end_time"
-SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
-
-
-class Scheduler(threading.Thread):
-    __custom_setting__ = {}
-
-    def __init__(
-        self,
-        redis_key=None,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        delete_keys=(),
-        keep_alive=None,
-        auto_start_requests=None,
-        batch_interval=0,
-        wait_lock=True,
-        task_table=None,
-        **kwargs
-    ):
-        """
-        @summary: 调度器
-        ---------
-        @param redis_key: 爬虫request及item存放redis中的文件夹
-        @param thread_count: 线程数,默认为配置文件中的线程数
-        @param begin_callback: 爬虫开始回调函数
-        @param end_callback: 爬虫结束回调函数
-        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则
-        @param keep_alive: 爬虫是否常驻,默认否
-        @param auto_start_requests: 爬虫是否自动添加任务
-        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
-        @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
-        @param task_table: 任务表, 批次爬虫传递
-        ---------
-        @result:
-        """
-
-        super(Scheduler, self).__init__()
-
-        for key, value in self.__class__.__custom_setting__.items():
-            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # 兼容老版本的配置
-                setattr(setting, "KEEP_ALIVE", not value)
-            else:
-                setattr(setting, key, value)
-        
-
-        self._redis_key = redis_key or setting.REDIS_KEY
-        if not self._redis_key:
-            raise Exception(
-                """
-                redis_key 为redis中存放request与item的目录。不能为空,
-                可在setting中配置,如 REDIS_KEY = 'test'
-                或spider初始化时传参, 如 TestSpider(redis_key='test')
-                """
-            )
-
-        self._request_buffer = RequestBuffer(redis_key)
-        self._item_buffer = ItemBuffer(redis_key, task_table)
-
-        self._collector = Collector(redis_key)
-        self._parsers = []
-        self._parser_controls = []
-        self._parser_control_obj = PaserControl
-
-        # 兼容老版本的参数
-        if "auto_stop_when_spider_done" in kwargs:
-            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
-        else:
-            self._keep_alive = (
-                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
-            )
-        self._auto_start_requests = (
-            auto_start_requests
-            if auto_start_requests is not None
-            else setting.SPIDER_AUTO_START_REQUESTS
-        )
-        self._batch_interval = batch_interval
-
-        self._begin_callback = (
-            begin_callback
-            if begin_callback
-            else lambda: log.info("\n********** feapder begin **********")
-        )
-        self._end_callback = (
-            end_callback
-            if end_callback
-            else lambda: log.info("\n********** feapder end **********")
-        )
-
-        self._thread_count = (
-            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
-        )
-
-        self._spider_name = redis_key
-        self._project_name = redis_key.split(":")[0]
-
-        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
-        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
-        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS.format(
-            redis_key=redis_key
-        )
-
-        self._is_notify_end = False  # 是否已经通知结束
-        self._last_task_count = 0  # 最近一次任务数量
-        self._redisdb = RedisDB()
-
-        self._project_total_state_table = "{}_total_state".format(self._project_name)
-        self._is_exist_project_total_state_table = False
-
-        # Request 缓存设置
-        Request.cached_redis_key = redis_key
-        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME
-
-        delete_keys = delete_keys or setting.DELETE_KEYS
-        if delete_keys:
-            self.delete_tables(delete_keys)
-
-        self._last_check_task_status_time = 0
-        self.wait_lock = wait_lock
-
-        self.init_metrics()
-
-    def init_metrics(self):
-        """
-        初始化打点系统
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def add_parser(self, parser):
-        parser = parser()  # parser 实例化
-        if isinstance(parser, BaseParser):
-            self._parsers.append(parser)
-        else:
-            raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
-
-    def run(self):
-        if not self.is_reach_next_spider_time():
-            return
-
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    if not self._is_notify_end:
-                        self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
-                        self._is_notify_end = True
-
-                    if not self._keep_alive:
-                        self._stop_all_thread()
-                        break
-
-                else:
-                    self._is_notify_end = False
-
-                self.check_task_status()
-
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-    def __add_task(self):
-        # 启动parser 的 start_requests
-        self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
-        self.record_spider_state(
-            spider_type=1,
-            state=0,
-            batch_date=tools.get_current_date(),
-            spider_start_time=tools.get_current_date(),
-            batch_interval=self._batch_interval,
-        )
-
-        # 判断任务池中属否还有任务,若有接着抓取
-        todo_task_count = self._collector.get_requests_count()
-        if todo_task_count:
-            log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
-        else:
-            for parser in self._parsers:
-                results = parser.start_requests()
-                # 添加request到请求队列,由请求队列统一入库
-                if results and not isinstance(results, Iterable):
-                    raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-                result_type = 1
-                for result in results or []:
-                    if isinstance(result, Request):
-                        result.parser_name = result.parser_name or parser.name
-                        self._request_buffer.put_request(result)
-                        result_type = 1
-
-                    elif isinstance(result, Item):
-                        self._item_buffer.put_item(result)
-                        result_type = 2
-
-                    elif callable(result):  # callbale的request可能是更新数据库操作的函数
-                        if result_type == 1:
-                            self._request_buffer.put_request(result)
-                        else:
-                            self._item_buffer.put_item(result)
-                    else:
-                        raise TypeError(
-                            "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
-                                type(result)
-                            )
-                        )
-
-                self._request_buffer.flush()
-                self._item_buffer.flush()
-
-    def _start(self):
-        # 启动request_buffer
-        self._request_buffer.start()
-        # 启动item_buffer
-        self._item_buffer.start()
-        # 启动collector
-        self._collector.start()
-
-        # 启动parser control
-        for i in range(self._thread_count):
-            parser_control = self._parser_control_obj(
-                self._collector,
-                self._redis_key,
-                self._request_buffer,
-                self._item_buffer,
-            )
-
-            for parser in self._parsers:
-                parser_control.add_parser(parser)
-
-            parser_control.start()
-            self._parser_controls.append(parser_control)
-
-        # 下发任务 因为时间可能比较长,放到最后面
-        if setting.RETRY_FAILED_REQUESTS:
-            # 重设失败的任务, 不用加锁,原子性操作
-            handle_failed_requests = HandleFailedRequests(self._redis_key)
-            handle_failed_requests.reput_failed_requests_to_requests()
-
-        # 下发新任务
-        if self._auto_start_requests:  # 自动下发
-            if self.wait_lock:
-                # 将添加任务处加锁,防止多进程之间添加重复的任务
-                with RedisLock(key=self._spider_name) as lock:
-                    if lock.locked:
-                        self.__add_task()
-            else:
-                self.__add_task()
-
-    def all_thread_is_done(self):
-        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
-            # 检测 collector 状态
-            if (
-                self._collector.is_collector_task()
-                or self._collector.get_requests_count() > 0
-            ):
-                return False
-
-            # 检测 parser_control 状态
-            for parser_control in self._parser_controls:
-                if not parser_control.is_not_task():
-                    return False
-
-            # 检测 item_buffer 状态
-            if (
-                self._item_buffer.get_items_count() > 0
-                or self._item_buffer.is_adding_to_db()
-            ):
-                return False
-
-            # 检测 request_buffer 状态
-            if (
-                self._request_buffer.get_requests_count() > 0
-                or self._request_buffer.is_adding_to_db()
-            ):
-                return False
-
-            tools.delay_time(1)
-
-        return True
-
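A condensed, framework-free sketch of the repeated-sampling idea used above: because the collector, parser controls and buffers are sampled one after another rather than atomically, the idle state has to hold across several passes before it is trusted (the callables passed in are illustrative):

import time

def is_really_idle(checks, rounds=3, interval=1.0):
    """checks: a list of zero-argument callables, each returning True when its component is idle."""
    for _ in range(rounds):
        if not all(check() for check in checks):
            return False        # any busy component ends the probe early
        time.sleep(interval)    # give in-flight work a chance to surface
    return True

# e.g. is_really_idle([task_queue.empty, lambda: not any(w.busy for w in workers)])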
-    @tools.run_safe_model("check_task_status")
-    def check_task_status(self):
-        """
-        检查任务状态 预警
-        """
-        # 每分钟检查一次
-        now_time = time.time()
-        if now_time - self._last_check_task_status_time > 60:
-            self._last_check_task_status_time = now_time
-        else:
-            return
-
-        # 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
-        task_count = self._redisdb.zget_count(self._tab_requests)
-
-        if task_count:
-            if task_count != self._last_task_count:
-                self._last_task_count = task_count
-                self._redisdb.hset(
-                    self._tab_spider_time,
-                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
-                    tools.get_current_timestamp(),
-                )  # 多进程会重复发消息, 使用redis记录上次统计时间
-            else:
-                # 判断时间间隔是否超过20分钟
-                lua = """
-                    -- local key = KEYS[1]
-                    local field = ARGV[1]
-                    local current_timestamp = ARGV[2]
-
-                    -- 取值
-                    local last_timestamp = redis.call('hget', KEYS[1], field)
-                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
-                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
-                    end
-
-                    if not last_timestamp then
-                        redis.call('hset', KEYS[1], field, current_timestamp)
-                    end
-
-                    return 0
-
-                """
-                redis_obj = self._redisdb.get_redis_obj()
-                cmd = redis_obj.register_script(lua)
-                overtime = cmd(
-                    keys=[self._tab_spider_time],
-                    args=[
-                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
-                        tools.get_current_timestamp(),
-                    ],
-                )
-
-                if overtime:
-                    # 发送报警
-                    msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
-                        self._spider_name, tools.format_seconds(overtime)
-                    )
-                    log.error(msg)
-                    self.send_msg(
-                        msg,
-                        level="error",
-                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
-                    )
-
-        else:
-            self._last_task_count = 0
-
-        # 检查失败任务数量 超过1000 报警,
-        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
-        log.debug("当前失败任务数:%s" % failed_count)
-        if failed_count > setting.WARNING_FAILED_COUNT:
-            # 发送报警
-            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
-            log.error(msg)
-            self.send_msg(
-                msg,
-                level="error",
-                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
-            )
-
-        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
-        failed_task_count, success_task_count = PaserControl.get_task_status_count()
-        total_count = success_task_count + failed_task_count
-        if total_count > 0:
-            task_success_rate = success_task_count / total_count
-            if task_success_rate < 0.5:
-                # 发送报警
-                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
-                    self._spider_name,
-                    success_task_count,
-                    failed_task_count,
-                    task_success_rate,
-                )
-                log.error(msg)
-                self.send_msg(
-                    msg,
-                    level="error",
-                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
-                )
-
-        # 检查入库失败次数
-        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
-            msg = "《{}》爬虫导出数据失败,失败次数:{}, 请检查爬虫是否正常".format(
-                self._spider_name, self._item_buffer.export_falied_times
-            )
-            log.error(msg)
-            self.send_msg(
-                msg, level="error", message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
-            )
-
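The stall alarm above hinges on one atomic Lua call: the timestamp of the last task-count change lives in a Redis hash field, and the script reports how long the count has been frozen once the threshold is crossed. A standalone sketch with redis-py, assuming a local Redis instance; the key and field names are placeholders:

import time
import redis

STALL_SECONDS = 1200  # 20 minutes, the same threshold as above

client = redis.Redis(host="localhost", port=6379, decode_responses=True)

stall_check = client.register_script("""
    local field = ARGV[1]
    local current_timestamp = tonumber(ARGV[2])
    local last_timestamp = redis.call('hget', KEYS[1], field)
    if last_timestamp and current_timestamp - tonumber(last_timestamp) >= tonumber(ARGV[3]) then
        return current_timestamp - tonumber(last_timestamp)  -- stalled seconds
    end
    if not last_timestamp then
        redis.call('hset', KEYS[1], field, current_timestamp)
    end
    return 0
""")

def stalled_seconds(tab_spider_time, field="last_task_count_record_time"):
    """Returns 0 while the task count keeps changing, otherwise how long it has been frozen (seconds)."""
    return stall_check(keys=[tab_spider_time], args=[field, int(time.time()), STALL_SECONDS])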
-    def delete_tables(self, delete_tables_list):
-        if isinstance(delete_tables_list, bool):
-            delete_tables_list = [self._redis_key + "*"]
-        elif not isinstance(delete_tables_list, (list, tuple)):
-            delete_tables_list = [delete_tables_list]
-
-        redis = RedisDB()
-        for delete_tab in delete_tables_list:
-            if not delete_tab.startswith(self._redis_key):
-                delete_tab = self._redis_key + delete_tab
-            tables = redis.getkeys(delete_tab)
-            for table in tables:
-                if table != self._tab_spider_time:
-                    log.info("正在删除key %s" % table)
-                    redis.clear(table)
-
-    def _stop_all_thread(self):
-        self._request_buffer.stop()
-        self._item_buffer.stop()
-        # 停止 collector
-        self._collector.stop()
-        # 停止 parser_controls
-        for parser_control in self._parser_controls:
-            parser_control.stop()
-
-        self._started.clear()
-
-    def send_msg(self, msg, level="debug", message_prefix=""):
-        # log.debug("发送报警 level:{} msg{}".format(level, msg))
-        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
-
-    def spider_begin(self):
-        """
-        @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
-        ---------
-        ---------
-        @result:
-        """
-
-        if self._begin_callback:
-            self._begin_callback()
-
-        for parser in self._parsers:
-            parser.start_callback()
-
-        # 记录开始时间
-        if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
-            current_timestamp = tools.get_current_timestamp()
-            self._redisdb.hset(
-                self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
-            )
-
-            # 发送消息
-            # self.send_msg("《%s》爬虫开始" % self._spider_name)
-
-    def spider_end(self):
-        self.record_end_time()
-
-        if self._end_callback:
-            self._end_callback()
-
-        for parser in self._parsers:
-            if not self._keep_alive:
-                parser.close()
-            parser.end_callback()
-
-        if not self._keep_alive:
-            # 关闭webdriver
-            if Request.webdriver_pool:
-                Request.webdriver_pool.close()
-
-            # 关闭打点
-            metrics.close()
-        else:
-            metrics.flush()
-
-        # 计算抓取时长
-        data = self._redisdb.hget(
-            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
-        )
-        if data:
-            begin_timestamp = int(data)
-
-            spend_time = tools.get_current_timestamp() - begin_timestamp
-
-            msg = "《%s》爬虫结束,耗时 %s" % (
-                self._spider_name,
-                tools.format_seconds(spend_time),
-            )
-            log.info(msg)
-
-            # self.send_msg(msg)
-
-        if self._keep_alive:
-            log.info("爬虫不自动结束, 等待下一轮任务...")
-        else:
-            self.delete_tables(self._tab_spider_status)
-
-    def record_end_time(self):
-        # 记录结束时间
-        if self._batch_interval:
-            current_timestamp = tools.get_current_timestamp()
-            self._redisdb.hset(
-                self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
-            )
-
-    def is_reach_next_spider_time(self):
-        if not self._batch_interval:
-            return True
-
-        last_spider_end_time = self._redisdb.hget(
-            self._tab_spider_time, SPIDER_END_TIME_KEY
-        )
-        if last_spider_end_time:
-            last_spider_end_time = int(last_spider_end_time)
-            current_timestamp = tools.get_current_timestamp()
-            time_interval = current_timestamp - last_spider_end_time
-
-            if time_interval < self._batch_interval * 86400:
-                log.info(
-                    "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
-                        tools.timestamp_to_date(last_spider_end_time),
-                        tools.format_seconds(time_interval),
-                        tools.format_seconds(self._batch_interval * 86400),
-                    )
-                )
-                return False
-
-        return True
-
-    def record_spider_state(
-        self,
-        spider_type,
-        state,
-        batch_date=None,
-        spider_start_time=None,
-        spider_end_time=None,
-        batch_interval=None,
-    ):
-        pass
-
-    def join(self, timeout=None):
-        """
-        重写线程的join
-        """
-        if not self._started.is_set():
-            return
-
-        super().join()

+ 0 - 15
zgztb_cookie/FworkSpider/feapder/core/spiders/__init__.py

@@ -1,15 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:08 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-__all__ = ["AirSpider", "Spider", "BatchSpider"]
-
-from feapder.core.spiders.air_spider import AirSpider
-from feapder.core.spiders.spider import Spider
-from feapder.core.spiders.batch_spider import BatchSpider

+ 0 - 125
zgztb_cookie/FworkSpider/feapder/core/spiders/air_spider.py

@@ -1,125 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:05 AM
----------
-@summary: 基于内存队列的爬虫,不支持分布式
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-from threading import Thread
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.core.base_parser import BaseParser
-from feapder.core.parser_control import AirSpiderParserControl
-from feapder.db.memory_db import MemoryDB
-from feapder.network.request import Request
-from feapder.utils.log import log
-from feapder.utils import metrics
-
-
-class AirSpider(BaseParser, Thread):
-    __custom_setting__ = {}
-
-    def __init__(self, thread_count=None):
-        """
-        基于内存队列的爬虫,不支持分布式
-        :param thread_count: 线程数
-        """
-        super(AirSpider, self).__init__()
-
-        for key, value in self.__class__.__custom_setting__.items():
-            setattr(setting, key, value)
-
-        self._thread_count = (
-            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
-        )
-
-        self._memory_db = MemoryDB()
-        self._parser_controls = []
-        self._item_buffer = ItemBuffer(redis_key="air_spider")
-
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
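For context, the class above is normally driven like this (a minimal usage sketch based on feapder's documented interface; the URL is a placeholder):

import feapder

class DemoAirSpider(feapder.AirSpider):
    def start_requests(self):
        # seed requests go straight into the in-memory queue
        yield feapder.Request("https://www.example.com")

    def parse(self, request, response):
        print(request.url, len(response.text))

if __name__ == "__main__":
    DemoAirSpider(thread_count=4).start()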
-    def distribute_task(self):
-        for request in self.start_requests():
-            if not isinstance(request, Request):
-                raise ValueError("仅支持 yield Request")
-
-            request.parser_name = request.parser_name or self.name
-            self._memory_db.add(request)
-
-    def all_thread_is_done(self):
-        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
-            # 检测 parser_control 状态
-            for parser_control in self._parser_controls:
-                if not parser_control.is_not_task():
-                    return False
-
-            # 检测 任务队列 状态
-            if not self._memory_db.empty():
-                return False
-
-            # 检测 item_buffer 状态
-            if (
-                self._item_buffer.get_items_count() > 0
-                or self._item_buffer.is_adding_to_db()
-            ):
-                return False
-
-            tools.delay_time(1)
-
-        return True
-
-    def run(self):
-        self.start_callback()
-
-        for i in range(self._thread_count):
-            parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
-            parser_control.add_parser(self)
-            parser_control.start()
-            self._parser_controls.append(parser_control)
-
-        self._item_buffer.start()
-
-        self.distribute_task()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    # 停止 parser_controls
-                    for parser_control in self._parser_controls:
-                        parser_control.stop()
-
-                    # 关闭item_buffer
-                    self._item_buffer.stop()
-
-                    # 关闭webdriver
-                    if Request.webdriver_pool:
-                        Request.webdriver_pool.close()
-
-                    log.info("无任务,爬虫结束")
-                    break
-
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-        self.end_callback()
-        # 为了线程可重复start
-        self._started.clear()
-        # 关闭打点
-        metrics.close()
-
-    def join(self, timeout=None):
-        """
-        重写线程的join
-        """
-        if not self._started.is_set():
-            return
-
-        super().join()

+ 0 - 1273
zgztb_cookie/FworkSpider/feapder/core/spiders/batch_spider.py

@@ -1,1273 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:06 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import datetime
-import os
-import time
-import warnings
-from collections.abc import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import MAX_ITEM_COUNT
-from feapder.core.base_parser import BatchParser
-from feapder.core.scheduler import Scheduler
-from feapder.db.mysqldb import MysqlDB
-from feapder.db.redisdb import RedisDB
-from feapder.network.item import Item
-from feapder.network.item import UpdateItem
-from feapder.network.request import Request
-from feapder.utils.log import log
-from feapder.utils.perfect_dict import PerfectDict
-from feapder.utils.redis_lock import RedisLock
-
-CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
-MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
-
-
-class BatchSpider(BatchParser, Scheduler):
-    def __init__(
-        self,
-        task_table,
-        batch_record_table,
-        batch_name,
-        batch_interval,
-        task_keys,
-        task_state="state",
-        min_task_count=10000,
-        check_task_interval=5,
-        task_limit=10000,
-        related_redis_key=None,
-        related_batch_record=None,
-        task_condition="",
-        task_order_by="",
-        redis_key=None,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        delete_keys=(),
-        keep_alive=None,
-        **kwargs,
-    ):
-        """
-        @summary: 批次爬虫
-        必要条件
-        1、需有任务表
-            任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段,则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充
-
-            参考建表语句如下:
-            CREATE TABLE `table_name` (
-              `id` int(11) NOT NULL AUTO_INCREMENT,
-              `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数',
-              `state` int(11) DEFAULT NULL COMMENT '任务状态',
-              `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名',
-              PRIMARY KEY (`id`),
-              UNIQUE KEY `nui` (`param`) USING BTREE
-            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-
-        2、需有批次记录表 不存在自动创建
-        ---------
-        @param task_table: mysql中的任务表
-        @param batch_record_table: mysql 中的批次记录表
-        @param batch_name: 批次采集程序名称
-        @param batch_interval: 批次间隔 天为单位。 如想一小时一批次,可写成1/24
-        @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser,则需将parser_name字段取出来。
-        @param task_state: mysql中任务表的任务状态字段
-        @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务
-        @param check_task_interval: 检查是否还有任务的时间间隔;
-        @param task_limit: 从数据库中取任务的数量
-        @param redis_key: 任务等数据存放在redis中的key前缀
-        @param thread_count: 线程数,默认为配置文件中的线程数
-        @param begin_callback: 爬虫开始回调函数
-        @param end_callback: 爬虫结束回调函数
-        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
-        @param keep_alive: 爬虫是否常驻,默认否
-        @param related_redis_key: 有关联的其他爬虫任务表(redis)注意:要避免环路 如 A -> B & B -> A 。
-        @param related_batch_record: 有关联的其他爬虫批次表(mysql)注意:要避免环路 如 A -> B & B -> A 。
-            related_redis_key 与 related_batch_record 选其一配置即可;用于相关联的爬虫没结束时,本爬虫也不结束
-            若相关连的爬虫为批次爬虫,推荐以related_batch_record配置,
-            若相关连的爬虫为普通爬虫,无批次表,可以以related_redis_key配置
-        @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务,即where后的条件语句
-        @param task_order_by: 取任务时的排序条件 如 id desc
-        ---------
-        @result:
-        """
-        Scheduler.__init__(
-            self,
-            redis_key=redis_key,
-            thread_count=thread_count,
-            begin_callback=begin_callback,
-            end_callback=end_callback,
-            delete_keys=delete_keys,
-            keep_alive=keep_alive,
-            auto_start_requests=False,
-            batch_interval=batch_interval,
-            task_table=task_table,
-            **kwargs,
-        )
-
-        self._redisdb = RedisDB()
-        self._mysqldb = MysqlDB()
-
-        self._task_table = task_table  # mysql中的任务表
-        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
-        self._batch_name = batch_name  # 批次采集程序名称
-        self._task_keys = task_keys  # 需要获取的任务字段
-
-        self._task_state = task_state  # mysql中任务表的state字段名
-        self._min_task_count = min_task_count  # redis 中最少任务数
-        self._check_task_interval = check_task_interval
-        self._task_limit = task_limit  # mysql中一次取的任务数量
-        self._related_task_tables = [
-            setting.TAB_REQUESTS.format(redis_key=redis_key)
-        ]  # 自己的task表也需要检查是否有任务
-        if related_redis_key:
-            self._related_task_tables.append(
-                setting.TAB_REQUESTS.format(redis_key=related_redis_key)
-            )
-
-        self._related_batch_record = related_batch_record
-        self._task_condition = task_condition
-        self._task_condition_prefix_and = task_condition and " and {}".format(
-            task_condition
-        )
-        self._task_condition_prefix_where = task_condition and " where {}".format(
-            task_condition
-        )
-        self._task_order_by = task_order_by and " order by {}".format(task_order_by)
-
-        self._batch_date_cache = None
-        if self._batch_interval >= 1:
-            self._date_format = "%Y-%m-%d"
-        elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
-            self._date_format = "%Y-%m-%d %H"
-        else:
-            self._date_format = "%Y-%m-%d %H:%M"
-
-        # 报警相关
-        self._send_msg_interval = datetime.timedelta(hours=1)  # 每隔1小时发送一次报警
-        self._last_send_msg_time = None
-
-        self._spider_last_done_time = None  # 爬虫最近已做任务数量时间
-        self._spider_last_done_count = 0  # 爬虫最近已做任务数量
-        self._spider_deal_speed_cached = None
-
-        self._is_more_parsers = True  # 多模版类爬虫
-
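A usage sketch matching the constructor above (based on feapder's documented BatchSpider interface; the table names, task_keys and redis_key are placeholders):

import feapder

class DemoBatchSpider(feapder.BatchSpider):
    def start_requests(self, task):
        # task carries the fields listed in task_keys, e.g. task.id and task.url
        yield feapder.Request(task.url, task_id=task.id)

    def parse(self, request, response):
        # mark the MySQL task as done once its page has been handled
        yield lambda: self.update_task_state(request.task_id, 1)

if __name__ == "__main__":
    spider = DemoBatchSpider(
        task_table="batch_task",                 # MySQL task table
        batch_record_table="batch_task_record",  # batch progress table, created automatically
        batch_name="demo_batch_spider",
        batch_interval=1,                        # one batch per day
        task_keys=["id", "url"],
        redis_key="test:demo_batch_spider",
    )
    spider.start_monitor_task()   # master process: dispatches tasks
    # spider.start()              # worker process: consumes tasks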
-    def init_property(self):
-        """
-        每个批次开始时需要重置的属性
-        @return:
-        """
-        self._last_send_msg_time = None
-
-        self._spider_last_done_time = None
-        self._spider_last_done_count = 0  # 爬虫刚开始启动时已做任务数量
-
-    def add_parser(self, parser):
-        parser = parser(
-            self._task_table,
-            self._batch_record_table,
-            self._task_state,
-            self._date_format,
-            self._mysqldb,
-        )  # parser 实例化
-        self._parsers.append(parser)
-
-    def start_monitor_task(self):
-        """
-        @summary: 监控任务状态
-        ---------
-        ---------
-        @result:
-        """
-        if not self._parsers:  # 不是多模版模式, 将自己注入到parsers,自己为模版
-            self._is_more_parsers = False
-            self._parsers.append(self)
-
-        elif len(self._parsers) <= 1:
-            self._is_more_parsers = False
-
-        self.create_batch_record_table()
-
-        # 添加任务
-        for parser in self._parsers:
-            parser.add_task()
-
-        is_first_check = True
-        while True:
-            try:
-                if self.check_batch(is_first_check):  # 该批次已经做完
-                    if self._keep_alive:
-                        is_first_check = True
-                        log.info("爬虫所有任务已做完,不自动结束,等待新任务...")
-                        time.sleep(self._check_task_interval)
-                        continue
-                    else:
-                        break
-
-                is_first_check = False
-
-                # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
-                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
-                todo_task_count = self._redisdb.zget_count(tab_requests)
-
-                tasks = []
-                if todo_task_count < self._min_task_count:  # 从mysql中取任务
-                    # 更新batch表的任务状态数量
-                    self.update_task_done_count()
-
-                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" % todo_task_count)
-                    tasks = self.get_todo_task_from_mysql()
-                    if not tasks:  # 状态为0的任务已经做完,需要检查状态为2的任务是否丢失
-
-                        if (
-                            todo_task_count == 0
-                        ):  # redis 中无待做任务,此时mysql中状态为2的任务为丢失任务。需重新做
-                            lose_task_count = self.get_lose_task_count()
-
-                            if not lose_task_count:
-                                time.sleep(self._check_task_interval)
-                                continue
-
-                            elif (
-                                lose_task_count > self._task_limit * 5
-                            ):  # 丢失任务太多,直接重置,否则每次等redis任务消耗完再取下一批丢失任务,速度过慢
-                                log.info("正在重置丢失任务为待做 共 {} 条".format(lose_task_count))
-                                # 重置正在做的任务为待做
-                                if self.reset_lose_task_from_mysql():
-                                    log.info("重置丢失任务成功")
-                                else:
-                                    log.info("重置丢失任务失败")
-
-                                continue
-
-                            else:  # 丢失任务少,直接取
-                                log.info(
-                                    "正在取丢失任务 共 {} 条, 取 {} 条".format(
-                                        lose_task_count,
-                                        self._task_limit
-                                        if self._task_limit <= lose_task_count
-                                        else lose_task_count,
-                                    )
-                                )
-                                tasks = self.get_doing_task_from_mysql()
-
-                    else:
-                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))
-
-                else:
-                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)
-
-                if not tasks:
-                    if todo_task_count >= self._min_task_count:
-                        # log.info('任务正在进行 redis中剩余任务 %s' % todo_task_count)
-                        pass
-                    else:
-                        log.info("mysql 中无待做任务 redis中剩余任务 %s" % todo_task_count)
-                else:
-                    # make start requests
-                    self.distribute_task(tasks)
-                    log.info("添加任务到redis成功")
-
-            except Exception as e:
-                log.exception(e)
-
-            time.sleep(self._check_task_interval)
-
-    def create_batch_record_table(self):
-        sql = (
-            "select table_name from information_schema.tables where table_name like '%s'"
-            % self._batch_record_table
-        )
-        tables_name = self._mysqldb.find(sql)
-        if not tables_name:
-            sql = """
-                CREATE TABLE `{table_name}` (
-                      `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT,
-                      `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间',
-                      `total_count` int(11) DEFAULT NULL COMMENT '任务总数',
-                      `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)',
-                      `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)',
-                      `interval` float(11) DEFAULT NULL COMMENT '批次间隔',
-                      `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour',
-                      `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间',
-                      `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间',
-                      `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成  1 完成',
-                      PRIMARY KEY (`id`)
-                    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
-            """.format(
-                table_name=self._batch_record_table,
-                batch_date="date" if self._date_format == "%Y-%m-%d" else "datetime",
-            )
-
-            self._mysqldb.execute(sql)
-
-    def distribute_task(self, tasks):
-        """
-        @summary: 分发任务
-        ---------
-        @param tasks:
-        ---------
-        @result:
-        """
-        if self._is_more_parsers:  # 为多模版类爬虫,需要下发指定的parser
-            for task in tasks:
-                for parser in self._parsers:  # 寻找task对应的parser
-                    if parser.name in task:
-                        task = PerfectDict(
-                            _dict=dict(zip(self._task_keys, task)), _values=list(task)
-                        )
-                        requests = parser.start_requests(task)
-                        if requests and not isinstance(requests, Iterable):
-                            raise Exception(
-                                "%s.%s返回值必须可迭代" % (parser.name, "start_requests")
-                            )
-
-                        result_type = 1
-                        for request in requests or []:
-                            if isinstance(request, Request):
-                                request.parser_name = request.parser_name or parser.name
-                                self._request_buffer.put_request(request)
-                                result_type = 1
-
-                            elif isinstance(request, Item):
-                                self._item_buffer.put_item(request)
-                                result_type = 2
-
-                                if (
-                                    self._item_buffer.get_items_count()
-                                    >= MAX_ITEM_COUNT
-                                ):
-                                    self._item_buffer.flush()
-
-                            elif callable(request):  # callable的request可能是更新数据库操作的函数
-                                if result_type == 1:
-                                    self._request_buffer.put_request(request)
-                                else:
-                                    self._item_buffer.put_item(request)
-
-                                    if (
-                                        self._item_buffer.get_items_count()
-                                        >= MAX_ITEM_COUNT
-                                    ):
-                                        self._item_buffer.flush()
-
-                            else:
-                                raise TypeError(
-                                    "start_requests yield result type error, expect Request、Item、callback func, but got type: {}".format(
-                                        type(request)
-                                    )
-                                )
-
-                        break
-
-        else:  # task没对应的parser 则将task下发到所有的parser
-            for task in tasks:
-                for parser in self._parsers:
-                    task = PerfectDict(
-                        _dict=dict(zip(self._task_keys, task)), _values=list(task)
-                    )
-                    requests = parser.start_requests(task)
-                    if requests and not isinstance(requests, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代" % (parser.name, "start_requests")
-                        )
-
-                    result_type = 1
-                    for request in requests or []:
-                        if isinstance(request, Request):
-                            request.parser_name = request.parser_name or parser.name
-                            self._request_buffer.put_request(request)
-                            result_type = 1
-
-                        elif isinstance(request, Item):
-                            self._item_buffer.put_item(request)
-                            result_type = 2
-
-                            if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
-                                self._item_buffer.flush()
-
-                        elif callable(request):  # callable的request可能是更新数据库操作的函数
-                            if result_type == 1:
-                                self._request_buffer.put_request(request)
-                            else:
-                                self._item_buffer.put_item(request)
-
-                                if (
-                                    self._item_buffer.get_items_count()
-                                    >= MAX_ITEM_COUNT
-                                ):
-                                    self._item_buffer.flush()
-
-        self._request_buffer.flush()
-        self._item_buffer.flush()
-
-    def __get_task_state_count(self):
-        sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format(
-            state=self._task_state,
-            task_table=self._task_table,
-            task_condition=self._task_condition_prefix_where,
-        )
-        task_state_count = self._mysqldb.find(sql)
-
-        task_state = {
-            "total_count": sum(count for state, count in task_state_count),
-            "done_count": sum(
-                count for state, count in task_state_count if state in (1, -1)
-            ),
-            "failed_count": sum(
-                count for state, count in task_state_count if state == -1
-            ),
-        }
-
-        return task_state
-
-    def update_task_done_count(self):
-        """
-        @summary: 更新批次表中的任务状态
-        ---------
-        ---------
-        @result:
-        """
-        task_count = self.__get_task_state_count()
-
-        # log.info('《%s》 批次进度 %s/%s' % (self._batch_name, done_task_count, total_task_count))
-
-        # 更新批次表
-        sql = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format(
-            self._batch_record_table,
-            task_count.get("done_count"),
-            task_count.get("total_count"),
-            task_count.get("failed_count"),
-            self._batch_interval
-            if self._batch_interval >= 1
-            else self._batch_interval * 24,
-            "day" if self._batch_interval >= 1 else "hour",
-            self.batch_date,
-        )
-        self._mysqldb.update(sql)
-
-    def update_is_done(self):
-        sql = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0".format(
-            self._batch_record_table, self.batch_date
-        )
-        self._mysqldb.update(sql)
-
-    def get_todo_task_from_mysql(self):
-        """
-        @summary: 取待做的任务
-        ---------
-        ---------
-        @result:
-        """
-        # TODO 分批取数据 每批最大取 1000000个,防止内存占用过大
-        # 查询任务
-        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
-        sql = "select %s from %s where %s = 0%s%s limit %s" % (
-            task_keys,
-            self._task_table,
-            self._task_state,
-            self._task_condition_prefix_and,
-            self._task_order_by,
-            self._task_limit,
-        )
-        tasks = self._mysqldb.find(sql)
-
-        if tasks:
-            # 更新任务状态
-            for i in range(0, len(tasks), 10000):  # 10000 一批量更新
-                task_ids = str(
-                    tuple([task[0] for task in tasks[i : i + 10000]])
-                ).replace(",)", ")")
-                sql = "update %s set %s = 2 where id in %s" % (
-                    self._task_table,
-                    self._task_state,
-                    task_ids,
-                )
-                self._mysqldb.update(sql)
-
-        return tasks
-
-    def get_doing_task_from_mysql(self):
-        """
-        @summary: 取正在做的任务
-        ---------
-        ---------
-        @result:
-        """
-
-        # 查询任务
-        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
-        sql = "select %s from %s where %s = 2%s%s limit %s" % (
-            task_keys,
-            self._task_table,
-            self._task_state,
-            self._task_condition_prefix_and,
-            self._task_order_by,
-            self._task_limit,
-        )
-        tasks = self._mysqldb.find(sql)
-
-        return tasks
-
-    def get_lose_task_count(self):
-        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
-            date_format=self._date_format.replace(":%M", ":%i"),
-            batch_record_table=self._batch_record_table,
-        )
-        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
-        batch_date, total_count, done_count = batch_info[0]
-        return total_count - done_count
-
-    def reset_lose_task_from_mysql(self):
-        """
-        @summary: 重置丢失任务为待做
-        ---------
-        ---------
-        @result:
-        """
-
-        sql = "update {table} set {state} = 0 where {state} = 2{task_condition}".format(
-            table=self._task_table,
-            state=self._task_state,
-            task_condition=self._task_condition_prefix_and,
-        )
-        return self._mysqldb.update(sql)
-
-    def get_deal_speed(self, total_count, done_count, last_batch_date):
-        """
-        获取处理速度
-        @param total_count: 总数量
-        @param done_count: 做完数量
-        @param last_batch_date: 批次时间 datetime
-        @return:
-            deal_speed (条/小时), need_time (秒), overflow_time(秒) ( overflow_time < 0 时表示提前多少秒完成 )
-            或
-            None
-        """
-        if not self._spider_last_done_count:
-            now_date = datetime.datetime.now()
-            self._spider_last_done_count = done_count
-            self._spider_last_done_time = now_date
-
-        if done_count > self._spider_last_done_count:
-            now_date = datetime.datetime.now()
-
-            time_interval = (now_date - self._spider_last_done_time).total_seconds()
-            deal_speed = (
-                done_count - self._spider_last_done_count
-            ) / time_interval  # 条/秒
-            need_time = (total_count - done_count) / deal_speed  # 单位秒
-            overflow_time = (
-                (now_date - last_batch_date).total_seconds()
-                + need_time
-                - datetime.timedelta(days=self._batch_interval).total_seconds()
-            )  # 溢出时间 秒
-            calculate_speed_time = now_date.strftime("%Y-%m-%d %H:%M:%S")  # 统计速度时间
-
-            deal_speed = int(deal_speed * 3600)  # 条/小时
-
-            # 更新最近已做任务数及时间
-            self._spider_last_done_count = done_count
-            self._spider_last_done_time = now_date
-
-            self._spider_deal_speed_cached = (
-                deal_speed,
-                need_time,
-                overflow_time,
-                calculate_speed_time,
-            )
-
-        return self._spider_deal_speed_cached
-
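The same arithmetic, reduced to a self-contained helper (standard library only, no feapder objects): two progress samples give the hourly speed, the time still needed, and how far the batch would run past its window.

import datetime

def estimate_progress(total, done_prev, done_now, t_prev, t_now, batch_start, batch_days):
    elapsed = (t_now - t_prev).total_seconds()
    if done_now <= done_prev or elapsed <= 0:
        return None                                     # no measurable progress yet
    speed = (done_now - done_prev) / elapsed            # tasks per second
    need_seconds = (total - done_now) / speed           # time still required
    deadline = batch_start + datetime.timedelta(days=batch_days)
    finish = t_now + datetime.timedelta(seconds=need_seconds)
    overflow = (finish - deadline).total_seconds()      # > 0 means the batch will overrun
    return int(speed * 3600), need_seconds, overflow    # tasks/hour, seconds, seconds

# e.g. progress went from 1500 to 2000 of 10000 tasks in one hour of a 1-day batch started at t0:
# estimate_progress(10000, 1500, 2000, t0, t0 + datetime.timedelta(hours=1), t0, 1)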
-    def init_task(self):
-        """
-        @summary: 初始化任务表中的任务, 新一个批次开始时调用。 可能会重写
-        ---------
-        ---------
-        @result:
-        """
-
-        sql = "update {task_table} set {state} = 0 where {state} != -1{task_condition}".format(
-            task_table=self._task_table,
-            state=self._task_state,
-            task_condition=self._task_condition_prefix_and,
-        )
-        return self._mysqldb.update(sql)
-
-    def check_batch(self, is_first_check=False):
-        """
-        @summary: 检查批次是否完成
-        ---------
-        @param: is_first_check 是否为首次检查,若首次检查,且检查结果为批次已完成,则不发送批次完成消息。因为之前发送过了
-        ---------
-        @result: 完成返回True 否则False
-        """
-
-        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
-            date_format=self._date_format.replace(":%M", ":%i"),
-            batch_record_table=self._batch_record_table,
-        )
-        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
-
-        if batch_info:
-            batch_date, total_count, done_count = batch_info[0]
-
-            now_date = datetime.datetime.now()
-            last_batch_date = datetime.datetime.strptime(batch_date, self._date_format)
-            time_difference = now_date - last_batch_date
-
-            if total_count == done_count and time_difference < datetime.timedelta(
-                days=self._batch_interval
-            ):  # 若在本批次内,再次检查任务表是否有新增任务
-                # # 改成查询任务表 看是否真的没任务了,因为batch_record表里边的数量可能没来得及更新
-                task_count = self.__get_task_state_count()
-
-                total_count = task_count.get("total_count")
-                done_count = task_count.get("done_count")
-
-            if total_count == done_count:
-                # 检查相关联的爬虫是否完成
-                releated_spider_is_done = self.related_spider_is_done()
-                if releated_spider_is_done == False:
-                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
-                        self._batch_name,
-                        self._related_batch_record or self._related_task_tables,
-                        batch_date,
-                        done_count,
-                        total_count,
-                    )
-                    log.info(msg)
-                    # 检查是否超时 超时发出报警
-                    if time_difference >= datetime.timedelta(
-                        days=self._batch_interval
-                    ):  # 已经超时
-                        if (
-                            not self._last_send_msg_time
-                            or now_date - self._last_send_msg_time
-                            >= self._send_msg_interval
-                        ):
-                            self._last_send_msg_time = now_date
-                            self.send_msg(
-                                msg,
-                                level="error",
-                                message_prefix="《{}》本批次未完成, 正在等待依赖爬虫 {} 结束".format(
-                                    self._batch_name,
-                                    self._related_batch_record
-                                    or self._related_task_tables,
-                                ),
-                            )
-
-                    return False
-
-                elif releated_spider_is_done == True:
-                    # 更新is_done 状态
-                    self.update_is_done()
-
-                else:
-                    self.update_is_done()
-
-                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
-                    self._batch_name, batch_date, done_count
-                )
-                log.info(msg)
-                if not is_first_check:
-                    self.send_msg(msg)
-
-                # 判断下一批次是否到
-                if time_difference >= datetime.timedelta(days=self._batch_interval):
-                    msg = "《{}》下一批次开始".format(self._batch_name)
-                    log.info(msg)
-                    self.send_msg(msg)
-
-                    # 初始化任务表状态
-                    if self.init_task() != False:  # 更新失败返回False 其他返回True/None
-                        # 初始化属性
-                        self.init_property()
-
-                        is_success = (
-                            self.record_batch()
-                        )  # 有可能插入不成功,但是任务表已经重置了,不过由于当前时间为下一批次的时间,检查批次是否结束时不会检查任务表,所以下次执行时仍然会重置
-                        if is_success:
-                            # 看是否有等待任务的worker,若有则需要等会再下发任务,防止work批次时间没来得及更新
-                            current_timestamp = tools.get_current_timestamp()
-                            spider_count = self._redisdb.zget_count(
-                                self._tab_spider_status,
-                                priority_min=current_timestamp
-                                - (setting.COLLECTOR_SLEEP_TIME + 10),
-                                priority_max=current_timestamp,
-                            )
-                            if spider_count:
-                                log.info(
-                                    f"插入新批次记录成功,检测到有{spider_count}个爬虫进程在等待任务,本批任务1分钟后开始下发, 防止爬虫端缓存的批次时间没来得及更新"
-                                )
-                                tools.delay_time(60)
-                            else:
-                                log.info("插入新批次记录成功")
-
-                            return False  # 下一批次开始
-
-                        else:
-                            return True  # 下一批次不开始。先不派发任务,因为批次表新批次插入失败了,需要插入成功后再派发任务
-
-                else:
-                    log.info("《{}》下次批次时间未到".format(self._batch_name))
-                    if not is_first_check:
-                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
-                    return True
-
-            else:
-                if time_difference >= datetime.timedelta(
-                    days=self._batch_interval
-                ):  # 已经超时
-                    time_out = time_difference - datetime.timedelta(
-                        days=self._batch_interval
-                    )
-                    time_out_pretty = tools.format_seconds(time_out.total_seconds())
-
-                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
-                        self._batch_name,
-                        time_out_pretty,
-                        batch_date,
-                        done_count,
-                        total_count,
-                    )
-                    if self._batch_interval >= 1:
-                        msg += ", 期望时间{}天".format(self._batch_interval)
-                    else:
-                        msg += ", 期望时间{}小时".format(self._batch_interval * 24)
-
-                    result = self.get_deal_speed(
-                        total_count=total_count,
-                        done_count=done_count,
-                        last_batch_date=last_batch_date,
-                    )
-                    if result:
-                        deal_speed, need_time, overflow_time, calculate_speed_time = (
-                            result
-                        )
-                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
-                            calculate_speed_time,
-                            deal_speed,
-                            tools.format_seconds(need_time),
-                        )
-
-                        if overflow_time > 0:
-                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
-                                tools.format_seconds(overflow_time)
-                            )
-
-                    log.info(msg)
-
-                    if (
-                        not self._last_send_msg_time
-                        or now_date - self._last_send_msg_time
-                        >= self._send_msg_interval
-                    ):
-                        self._last_send_msg_time = now_date
-                        self.send_msg(
-                            msg,
-                            level="error",
-                            message_prefix="《{}》批次超时".format(self._batch_name),
-                        )
-
-                else:  # 未超时
-                    remaining_time = (
-                        datetime.timedelta(days=self._batch_interval) - time_difference
-                    )
-                    remaining_time_pretty = tools.format_seconds(
-                        remaining_time.total_seconds()
-                    )
-
-                    if self._batch_interval >= 1:
-                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
-                            self._batch_name,
-                            batch_date,
-                            done_count,
-                            total_count,
-                            self._batch_interval,
-                            remaining_time_pretty,
-                        )
-                    else:
-                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
-                            self._batch_name,
-                            batch_date,
-                            done_count,
-                            total_count,
-                            self._batch_interval * 24,
-                            remaining_time_pretty,
-                        )
-
-                    result = self.get_deal_speed(
-                        total_count=total_count,
-                        done_count=done_count,
-                        last_batch_date=last_batch_date,
-                    )
-                    if result:
-                        deal_speed, need_time, overflow_time, calculate_speed_time = (
-                            result
-                        )
-                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
-                            calculate_speed_time,
-                            deal_speed,
-                            tools.format_seconds(need_time),
-                        )
-
-                        if overflow_time > 0:
-                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
-                                tools.format_seconds(overflow_time)
-                            )
-                            # 发送警报
-                            if (
-                                not self._last_send_msg_time
-                                or now_date - self._last_send_msg_time
-                                >= self._send_msg_interval
-                            ):
-                                self._last_send_msg_time = now_date
-                                self.send_msg(
-                                    msg,
-                                    level="error",
-                                    message_prefix="《{}》批次可能超时".format(
-                                        self._batch_name
-                                    ),
-                                )
-
-                        elif overflow_time < 0:
-                            msg += ", 该批次预计提前 {} 完成".format(
-                                tools.format_seconds(-overflow_time)
-                            )
-
-                    log.info(msg)
-
-        else:
-            # 插入batch_date
-            self.record_batch()
-
-            # 初始化任务表状态 可能有产生任务的代码
-            self.init_task()
-
-            return False
-
-    def related_spider_is_done(self):
-        """
-        相关连的爬虫是否跑完
-        @return: True / False / None 表示无相关的爬虫 可由自身的total_count 和 done_count 来判断
-        """
-
-        for related_redis_task_table in self._related_task_tables:
-            if self._redisdb.exists_key(related_redis_task_table):
-                return False
-
-        if self._related_batch_record:
-            sql = "select is_done from {} order by id desc limit 1".format(
-                self._related_batch_record
-            )
-            is_done = self._mysqldb.find(sql)
-            is_done = is_done[0][0] if is_done else None
-
-            if is_done is None:
-                log.warning("相关联的批次表不存在或无批次信息")
-                return None
-
-            if not is_done:
-                return False
-
-        return True
-
-    def record_batch(self):
-        """
-        @summary: 记录批次信息(初始化)
-        ---------
-        ---------
-        @result:
-        """
-
-        # 查询总任务数
-        sql = "select count(1) from %s%s" % (
-            self._task_table,
-            self._task_condition_prefix_where,
-        )
-        total_task_count = self._mysqldb.find(sql)[0][0]
-
-        batch_date = tools.get_current_date(self._date_format)
-
-        sql = (
-            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
-            % (
-                self._batch_record_table,
-                batch_date,
-                0,
-                total_task_count,
-                self._batch_interval
-                if self._batch_interval >= 1
-                else self._batch_interval * 24,
-                "day" if self._batch_interval >= 1 else "hour",
-            )
-        )
-
-        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 为成功)
-        if affect_count:
-            # 重置批次日期
-            self._batch_date_cache = batch_date
-            # 重新刷下self.batch_date 中的 os.environ.get('batch_date') 否则日期还停留在上一个批次
-            os.environ["batch_date"] = self._batch_date_cache
-
-            # 爬虫开始
-            self.spider_begin()
-            self.record_spider_state(
-                spider_type=2,
-                state=0,
-                batch_date=batch_date,
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
-        else:
-            log.error("插入新批次失败")
-
-        return affect_count
-
-    # -------- 批次结束逻辑 ------------
-
-    def task_is_done(self):
-        """
-        @summary: 检查任务状态 是否做完 同时更新批次时间 (不能挂 挂了批次时间就不更新了)
-        ---------
-        ---------
-        @result: True / False (做完 / 未做完)
-        """
-
-        is_done = False
-
-        # 查看批次记录表任务状态
-        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
-            date_format=self._date_format.replace(":%M", ":%i"),
-            batch_record_table=self._batch_record_table,
-        )
-
-        batch_info = self._mysqldb.find(sql)
-        if batch_info is None:
-            raise Exception("查询批次信息失败")
-
-        if batch_info:
-            self._batch_date_cache, total_count, done_count, is_done = batch_info[
-                0
-            ]  # 更新self._batch_date_cache, 防止新批次已经开始了,但self._batch_date_cache还是原来的批次时间
-
-            log.info(
-                "《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d"
-                % (
-                    self._batch_name,
-                    self._batch_date_cache,
-                    done_count,
-                    total_count,
-                    is_done,
-                )
-            )
-            os.environ["batch_date"] = self._batch_date_cache  # 更新BatchParser里边的批次时间
-
-        if is_done:  # 检查任务表中是否有没做的任务 若有则is_done 为 False
-            # 比较耗时 加锁防止多进程同时查询
-            with RedisLock(key=self._spider_name) as lock:
-                if lock.locked:
-                    log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")
-
-                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
-                        self._task_table,
-                        self._task_state,
-                        self._task_state,
-                        self._task_condition_prefix_and,
-                    )
-                    tasks = self._mysqldb.find(sql)  # [(1,)]  / []
-                    if tasks:
-                        log.info("检测到任务表中有未完成任务,等待任务下发")
-                        is_done = False
-
-                        # 更新batch_record 表的is_done 状态,减少查询任务表的次数
-                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
-                            batch_record_table=self._batch_record_table,
-                            batch_date=self._batch_date_cache,
-                        )
-                        self._mysqldb.update(sql)
-
-                    else:
-                        log.info("任务表中任务均已完成,爬虫结束")
-                else:
-                    log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
-
-                    is_done = False
-
-        return is_done
-
-    def run(self):
-        """
-        @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止
-        ---------
-        ---------
-        @result:
-        """
-        try:
-            self.create_batch_record_table()
-
-            if not self._parsers:  # 不是add_parser 模式
-                self._parsers.append(self)
-
-            self._start()
-
-            while True:
-                try:
-                    if (
-                        self.task_is_done() and self.all_thread_is_done()
-                    ):  # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况)
-                        if not self._is_notify_end:
-                            self.spider_end()
-                            self.record_spider_state(
-                                spider_type=2,
-                                state=1,
-                                batch_date=self._batch_date_cache,
-                                spider_end_time=tools.get_current_date(),
-                                batch_interval=self._batch_interval,
-                            )
-
-                            self._is_notify_end = True
-
-                        if not self._keep_alive:
-                            self._stop_all_thread()
-                            break
-                    else:
-                        self._is_notify_end = False
-
-                    self.check_task_status()
-
-                except Exception as e:
-                    log.exception(e)
-
-                tools.delay_time(10)  # 10秒钟检查一次爬虫状态
-
-        except Exception as e:
-            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
-            log.error(msg)
-            self.send_msg(
-                msg, level="error", message_prefix="《%s》爬虫异常结束" % self._batch_name
-            )
-
-            os._exit(137)  # 使退出码为35072 方便爬虫管理器重启
-
-    @classmethod
-    def to_DebugBatchSpider(cls, *args, **kwargs):
-        # DebugBatchSpider 继承 cls
-        DebugBatchSpider.__bases__ = (cls,)
-        DebugBatchSpider.__name__ = cls.__name__
-        return DebugBatchSpider(*args, **kwargs)
-
-
-class DebugBatchSpider(BatchSpider):
-    """
-    Debug批次爬虫
-    """
-
-    __debug_custom_setting__ = dict(
-        COLLECTOR_SLEEP_TIME=1,
-        COLLECTOR_TASK_COUNT=1,
-        # SPIDER
-        SPIDER_THREAD_COUNT=1,
-        SPIDER_SLEEP_TIME=0,
-        SPIDER_TASK_COUNT=1,
-        SPIDER_MAX_RETRY_TIMES=10,
-        REQUEST_LOST_TIMEOUT=600,  # 10分钟
-        PROXY_ENABLE=False,
-        RETRY_FAILED_REQUESTS=False,
-        # 保存失败的request
-        SAVE_FAILED_REQUEST=False,
-        # 过滤
-        ITEM_FILTER_ENABLE=False,
-        REQUEST_FILTER_ENABLE=False,
-        OSS_UPLOAD_TABLES=(),
-        DELETE_KEYS=True,
-        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
-    )
-
-    def __init__(
-        self,
-        task_id=None,
-        task=None,
-        save_to_db=False,
-        update_task=False,
-        *args,
-        **kwargs,
-    ):
-        """
-        @param task_id:  任务id
-        @param task:  任务  task 与 task_id 二者选一即可
-        @param save_to_db: 数据是否入库 默认否
-        @param update_task: 是否更新任务 默认否
-        @param args:
-        @param kwargs:
-        """
-        warnings.warn(
-            "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。正式发布前请更改为正常模式", category=Warning
-        )
-
-        if not task and not task_id:
-            raise Exception("task_id 与 task 不能同时为null")
-
-        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
-        if save_to_db and not self.__class__.__custom_setting__.get("ITEM_PIPELINES"):
-            self.__class__.__debug_custom_setting__.update(
-                ITEM_PIPELINES=[MYSQL_PIPELINE_PATH]
-            )
-        self.__class__.__custom_setting__.update(
-            self.__class__.__debug_custom_setting__
-        )
-
-        super(DebugBatchSpider, self).__init__(*args, **kwargs)
-
-        self._task_id = task_id
-        self._task = task
-        self._update_task = update_task
-
-    def start_monitor_task(self):
-        """
-        @summary: 监控任务状态
-        ---------
-        ---------
-        @result:
-        """
-        if not self._parsers:  # 不是多模版模式, 将自己注入到parsers,自己为模版
-            self._is_more_parsers = False
-            self._parsers.append(self)
-
-        elif len(self._parsers) <= 1:
-            self._is_more_parsers = False
-
-        if self._task:
-            self.distribute_task([self._task])
-        else:
-            tasks = self.get_todo_task_from_mysql()
-            if not tasks:
-                raise Exception("未获取到任务 请检查 task_id: {} 是否存在".format(self._task_id))
-            self.distribute_task(tasks)
-
-        os.environ.setdefault("batch_date", "1970-00-00")
-        log.debug("下发任务完毕")
-
-    def get_todo_task_from_mysql(self):
-        """
-        @summary: 取待做的任务
-        ---------
-        ---------
-        @result:
-        """
-
-        # 查询任务
-        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
-        sql = "select %s from %s where id=%s" % (
-            task_keys,
-            self._task_table,
-            self._task_id,
-        )
-        tasks = self._mysqldb.find(sql)
-
-        return tasks
-
-    def save_cached(self, request, response, table):
-        pass
-
-    def update_task_state(self, task_id, state=1, *args, **kwargs):
-        """
-        @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用。可能会重写
-        调用方法为 yield lambda : self.update_task_state(task_id, state)
-        ---------
-        @param task_id:
-        @param state:
-        ---------
-        @result:
-        """
-        if self._update_task:
-            kwargs["id"] = task_id
-            kwargs[self._task_state] = state
-
-            sql = tools.make_update_sql(
-                self._task_table,
-                kwargs,
-                condition="id = {task_id}".format(task_id=task_id),
-            )
-
-            if self._mysqldb.update(sql):
-                log.debug("置任务%s状态成功" % task_id)
-            else:
-                log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
-
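
As the docstring above says, the state update only happens when the parser yields it; a minimal hedged sketch (the `task_id` attribute on the request is illustrative):

    def parse(self, request, response):
        # ... yield the scraped items first ...
        # then mark the originating task as done (state=1)
        yield lambda: self.update_task_state(request.task_id, 1)
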
-    def update_task_batch(self, task_id, state=1, *args, **kwargs):
-        """
-        批量更新任务 多处调用,更新的字段必须一致
-        注意:需要 写成 yield update_task_batch(...) 否则不会更新
-        @param task_id:
-        @param state:
-        @param kwargs:
-        @return:
-        """
-        if self._update_task:
-            kwargs["id"] = task_id
-            kwargs[self._task_state] = state
-
-            update_item = UpdateItem(**kwargs)
-            update_item.table_name = self._task_table
-            update_item.name_underline = self._task_table + "_item"
-
-            return update_item
-
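
update_task_batch only returns an UpdateItem, so the caller must yield it for the item buffer to pick it up; a hedged sketch:

    def parse(self, request, response):
        # ... yield data items ...
        # forgetting the yield here means the task row is never updated
        yield self.update_task_batch(request.task_id, state=1)
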
-    def delete_tables(self, delete_tables_list):
-        if isinstance(delete_tables_list, bool):
-            delete_tables_list = [self._redis_key + "*"]
-        elif not isinstance(delete_tables_list, (list, tuple)):
-            delete_tables_list = [delete_tables_list]
-
-        redis = RedisDB()
-        for delete_tab in delete_tables_list:
-            if delete_tab == "*":
-                delete_tab = self._redis_key + "*"
-
-            tables = redis.getkeys(delete_tab)
-            for table in tables:
-                log.info("正在删除表 %s" % table)
-                redis.clear(table)
-
-    def run(self):
-        self.start_monitor_task()
-
-        if not self._parsers:  # 不是add_parser 模式
-            self._parsers.append(self)
-
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    self._stop_all_thread()
-                    break
-
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-        self.delete_tables([self._redis_key + "*"])
-
-    def record_spider_state(
-        self,
-        spider_type,
-        state,
-        batch_date=None,
-        spider_start_time=None,
-        spider_end_time=None,
-        batch_interval=None,
-    ):
-        pass

+ 0 - 437
zgztb_cookie/FworkSpider/feapder/core/spiders/spider.py

@@ -1,437 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:05 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import time
-import warnings
-from collections.abc import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.core.base_parser import BaseParser
-from feapder.core.scheduler import Scheduler
-from feapder.db.redisdb import RedisDB
-from feapder.network.item import Item
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
-
-
-class Spider(
-    BaseParser, Scheduler
-):  # threading has a "name" attribute; BaseParser must be inherited first, otherwise its name would be overridden by Scheduler's base class threading.Thread
-    """
-    @summary: 为了简化搭建爬虫
-    ---------
-    """
-
-    def __init__(
-        self,
-        redis_key=None,
-        min_task_count=1,
-        check_task_interval=5,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        delete_keys=(),
-        keep_alive=None,
-        auto_start_requests=None,
-        batch_interval=0,
-        wait_lock=True,
-        **kwargs
-    ):
-        """
-        @summary: 爬虫
-        ---------
-        @param redis_key: 任务等数据存放在redis中的key前缀
-        @param min_task_count: 任务队列中最少任务数, 少于这个数量才会添加任务,默认1。start_monitor_task 模式下生效
-        @param check_task_interval: 检查是否还有任务的时间间隔;默认5秒
-        @param thread_count: 线程数,默认为配置文件中的线程数
-        @param begin_callback: 爬虫开始回调函数
-        @param end_callback: 爬虫结束回调函数
-        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
-        @param keep_alive: 爬虫是否常驻
-        @param auto_start_requests: 爬虫是否自动添加任务
-        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
-        @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
-        ---------
-        @result:
-        """
-        super(Spider, self).__init__(
-            redis_key=redis_key,
-            thread_count=thread_count,
-            begin_callback=begin_callback,
-            end_callback=end_callback,
-            delete_keys=delete_keys,
-            keep_alive=keep_alive,
-            auto_start_requests=auto_start_requests,
-            batch_interval=batch_interval,
-            wait_lock=wait_lock,
-            **kwargs
-        )
-
-        self._min_task_count = min_task_count
-        self._check_task_interval = check_task_interval
-
-        self._is_distributed_task = False
-        self._is_show_not_task = False
-
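
For reference, a minimal hedged sketch of a spider built on the constructor above; the class name, URL and redis key are placeholders:

    import feapder

    class MySpider(feapder.Spider):
        def start_requests(self):
            yield feapder.Request("https://www.example.com")

        def parse(self, request, response):
            # Item() here is only a sketch; real spiders usually yield a
            # user-defined Item subclass mapped to a table.
            yield feapder.Item(title=response.xpath("//title/text()").extract_first())

    if __name__ == "__main__":
        # keep_alive=True would keep the process resident instead of exiting when idle
        MySpider(redis_key="test:my_spider", thread_count=4).start()
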
-    def start_monitor_task(self, *args, **kws):
-        if not self.is_reach_next_spider_time():
-            return
-
-        self._auto_start_requests = False
-        redisdb = RedisDB()
-
-        if not self._parsers:  # 不是add_parser 模式
-            self._parsers.append(self)
-
-        while True:
-            try:
-                # 检查redis中是否有任务
-                tab_requests = setting.TAB_REQUESTS.format(redis_key=self._redis_key)
-                todo_task_count = redisdb.zget_count(tab_requests)
-
-                if todo_task_count < self._min_task_count:  # 添加任务
-                    # make start requests
-                    self.distribute_task(*args, **kws)
-
-                else:
-                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)
-
-            except Exception as e:
-                log.exception(e)
-
-            if not self._keep_alive:
-                break
-
-            time.sleep(self._check_task_interval)
-
-    def distribute_task(self, *args, **kws):
-        """
-        @summary: 分发任务 并将返回的request入库
-        ---------
-        @param tasks:
-        ---------
-        @result:
-        """
-        self._is_distributed_task = False
-
-        for parser in self._parsers:
-            requests = parser.start_requests(*args, **kws)
-            if requests and not isinstance(requests, Iterable):
-                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-            result_type = 1
-            for request in requests or []:
-                if isinstance(request, Request):
-                    request.parser_name = request.parser_name or parser.name
-                    self._request_buffer.put_request(request)
-
-                    self._is_distributed_task = True
-                    result_type = 1
-
-                elif isinstance(request, Item):
-                    self._item_buffer.put_item(request)
-                    result_type = 2
-
-                elif callable(request):  # a callable request is usually a deferred database-update function
-                    if result_type == 1:
-                        self._request_buffer.put_request(request)
-                    else:
-                        self._item_buffer.put_item(request)
-                else:
-                    raise TypeError(
-                        "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
-                            type(request)
-                        )
-                    )
-
-            self._request_buffer.flush()
-            self._item_buffer.flush()
-
-        if self._is_distributed_task:  # 有任务时才提示启动爬虫
-            # begin
-            self.spider_begin()
-            self.record_spider_state(
-                spider_type=1,
-                state=0,
-                batch_date=tools.get_current_date(),
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
-
-            # 重置已经提示无任务状态为False
-            self._is_show_not_task = False
-
-        elif not self._is_show_not_task:  # 无任务,且没推送过无任务信息
-            # 发送无任务消息
-            msg = "《%s》start_requests无任务添加" % (self._spider_name)
-            log.info(msg)
-
-            # self.send_msg(msg)
-
-            self._is_show_not_task = True
-
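
distribute_task above accepts three kinds of yields from start_requests; a hedged sketch exercising all three (URL and field names illustrative):

    def start_requests(self):
        # 1. a Request goes to the request buffer
        yield feapder.Request("https://www.example.com/list", page=1)
        # 2. an Item goes to the item buffer
        yield feapder.Item(seed="example")
        # 3. a callable rides along with whichever buffer the previous yield
        #    used, which is what the result_type bookkeeping above tracks
        yield lambda: print("deferred database update would run here")
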
-    def run(self):
-        if not self.is_reach_next_spider_time():
-            return
-
-        if not self._parsers:  # 不是add_parser 模式
-            self._parsers.append(self)
-
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    if not self._is_notify_end:
-                        self.spider_end()  # 跑完一轮
-                        self.record_spider_state(
-                            spider_type=1,
-                            state=1,
-                            spider_end_time=tools.get_current_date(),
-                            batch_interval=self._batch_interval,
-                        )
-
-                        self._is_notify_end = True
-
-                    if not self._keep_alive:
-                        self._stop_all_thread()
-                        break
-
-                else:
-                    self._is_notify_end = False
-
-                self.check_task_status()
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-    @classmethod
-    def to_DebugSpider(cls, *args, **kwargs):
-        # DebugSpider 继承 cls
-        DebugSpider.__bases__ = (cls,)
-        DebugSpider.__name__ = cls.__name__
-        return DebugSpider(*args, **kwargs)
-
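
A hedged sketch of replaying one request through the class swap above; `MySpider` and the URL are placeholders:

    if __name__ == "__main__":
        spider = MySpider.to_DebugSpider(
            redis_key="test:my_spider",   # "_debug" is appended in __init__
            request=feapder.Request("https://www.example.com/detail/1"),
        )
        spider.run()
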
-
-class DebugSpider(Spider):
-    """
-    Debug爬虫
-    """
-
-    __debug_custom_setting__ = dict(
-        COLLECTOR_SLEEP_TIME=1,
-        COLLECTOR_TASK_COUNT=1,
-        # SPIDER
-        SPIDER_THREAD_COUNT=1,
-        SPIDER_SLEEP_TIME=0,
-        SPIDER_TASK_COUNT=1,
-        SPIDER_MAX_RETRY_TIMES=10,
-        REQUEST_LOST_TIMEOUT=600,  # 10分钟
-        PROXY_ENABLE=False,
-        RETRY_FAILED_REQUESTS=False,
-        # 保存失败的request
-        SAVE_FAILED_REQUEST=False,
-        # 过滤
-        ITEM_FILTER_ENABLE=False,
-        REQUEST_FILTER_ENABLE=False,
-        OSS_UPLOAD_TABLES=(),
-        DELETE_KEYS=True,
-        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
-    )
-
-    def __init__(self, request=None, request_dict=None, *args, **kwargs):
-        """
-        @param request: request 类对象
-        @param request_dict: request 字典。 request 与 request_dict 二者选一即可
-        @param kwargs:
-        """
-        warnings.warn(
-            "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。正式发布前请更改为正常模式", category=Warning
-        )
-
-        if not request and not request_dict:
-            raise Exception("request 与 request_dict 不能同时为null")
-
-        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
-        self.__class__.__custom_setting__.update(
-            self.__class__.__debug_custom_setting__
-        )
-
-        super(DebugSpider, self).__init__(*args, **kwargs)
-
-        self._request = request or Request.from_dict(request_dict)
-
-    def save_cached(self, request, response, table):
-        pass
-
-    def delete_tables(self, delete_tables_list):
-        if isinstance(delete_tables_list, bool):
-            delete_tables_list = [self._redis_key + "*"]
-        elif not isinstance(delete_tables_list, (list, tuple)):
-            delete_tables_list = [delete_tables_list]
-
-        redis = RedisDB()
-        for delete_tab in delete_tables_list:
-            if delete_tab == "*":
-                delete_tab = self._redis_key + "*"
-
-            tables = redis.getkeys(delete_tab)
-            for table in tables:
-                log.info("正在删除表 %s" % table)
-                redis.clear(table)
-
-    def __start_requests(self):
-        yield self._request
-
-    def distribute_task(self):
-        """
-        @summary: 分发任务 并将返回的request入库
-        ---------
-        ---------
-        @result:
-        """
-        self._is_distributed_task = False
-
-        for parser in self._parsers:
-            requests = parser.__start_requests()
-            if requests and not isinstance(requests, Iterable):
-                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-            result_type = 1
-            for request in requests or []:
-                if isinstance(request, Request):
-                    request.parser_name = request.parser_name or parser.name
-                    self._request_buffer.put_request(request)
-
-                    self._is_distributed_task = True
-                    result_type = 1
-
-                elif isinstance(request, Item):
-                    self._item_buffer.put_item(request)
-                    result_type = 2
-
-                elif callable(request):  # a callable request is usually a deferred database-update function
-                    if result_type == 1:
-                        self._request_buffer.put_request(request)
-                    else:
-                        self._item_buffer.put_item(request)
-
-            self._request_buffer.flush()
-            self._item_buffer.flush()
-
-        if self._is_distributed_task:  # 有任务时才提示启动爬虫
-            # begin
-            self.spider_begin()
-            self.record_spider_state(
-                spider_type=1,
-                state=0,
-                batch_date=tools.get_current_date(),
-                spider_start_time=tools.get_current_date(),
-                batch_interval=self._batch_interval,
-            )
-
-            # 重置已经提示无任务状态为False
-            self._is_show_not_task = False
-
-        elif not self._is_show_not_task:  # 无任务,且没推送过无任务信息
-            # 发送无任务消息
-            msg = "《%s》start_requests无任务添加" % (self._spider_name)
-            log.info(msg)
-
-            # self.send_msg(msg)
-
-            self._is_show_not_task = True
-
-    def record_spider_state(
-        self,
-        spider_type,
-        state,
-        batch_date=None,
-        spider_start_time=None,
-        spider_end_time=None,
-        batch_interval=None,
-    ):
-        pass
-
-    def _start(self):
-        # 启动parser 的 start_requests
-        self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
-
-        for parser in self._parsers:
-            results = parser.__start_requests()
-            # 添加request到请求队列,由请求队列统一入库
-            if results and not isinstance(results, Iterable):
-                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-            result_type = 1
-            for result in results or []:
-                if isinstance(result, Request):
-                    result.parser_name = result.parser_name or parser.name
-                    self._request_buffer.put_request(result)
-                    result_type = 1
-
-                elif isinstance(result, Item):
-                    self._item_buffer.put_item(result)
-                    result_type = 2
-
-                elif callable(result):  # a callable result is usually a deferred database-update function
-                    if result_type == 1:
-                        self._request_buffer.put_request(result)
-                    else:
-                        self._item_buffer.put_item(result)
-
-            self._request_buffer.flush()
-            self._item_buffer.flush()
-
-        # 启动collector
-        self._collector.start()
-
-        # 启动parser control
-        for i in range(self._thread_count):
-            parser_control = self._parser_control_obj(
-                self._collector,
-                self._redis_key,
-                self._request_buffer,
-                self._item_buffer,
-            )
-
-            for parser in self._parsers:
-                parser_control.add_parser(parser)
-
-            parser_control.start()
-            self._parser_controls.append(parser_control)
-
-        # 启动request_buffer
-        self._request_buffer.start()
-
-        # 启动item_buffer
-        self._item_buffer.start()
-
-    def run(self):
-        if not self._parsers:  # 不是add_parser 模式
-            self._parsers.append(self)
-
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    self._stop_all_thread()
-                    break
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-        self.delete_tables([self._redis_key + "*"])

+ 0 - 9
zgztb_cookie/FworkSpider/feapder/db/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""

+ 0 - 37
zgztb_cookie/FworkSpider/feapder/db/memory_db.py

@@ -1,37 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/21 11:42 PM
----------
-@summary: 基于内存的队列,代替redis
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-from queue import Empty, PriorityQueue
-
-
-class MemoryDB:
-    def __init__(self):
-        self.priority_queue = PriorityQueue()
-
-    def add(self, item):
-        """
-        添加任务
-        :param item: 数据: 支持小于号比较的类 或者 (priority, item)
-        :return:
-        """
-        self.priority_queue.put(item)
-
-    def get(self):
-        """
-        获取任务
-        :return:
-        """
-        try:
-            item = self.priority_queue.get_nowait()
-            return item
-        except Empty:
-            return
-
-    def empty(self):
-        return self.priority_queue.empty()
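
A small hedged sketch of the queue above; because PriorityQueue orders tuples ascending, lower priority numbers pop first:

    db = MemoryDB()
    db.add((10, "low priority task"))
    db.add((1, "high priority task"))
    print(db.get())    # (1, 'high priority task')
    print(db.get())    # (10, 'low priority task')
    print(db.empty())  # True
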

+ 0 - 422
zgztb_cookie/FworkSpider/feapder/db/mongodb.py

@@ -1,422 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 操作mongo数据库
----------
-@author: Mkdir700
-@email:  mkdir700@gmail.com
-"""
-import re
-from typing import List, Dict, Optional
-from urllib import parse
-
-import pymongo
-from pymongo import MongoClient
-from pymongo.collection import Collection
-from pymongo.database import Database
-from pymongo.errors import DuplicateKeyError, BulkWriteError
-
-import feapder.setting as setting
-from feapder.utils.log import log
-
-
-class MongoDB:
-    def __init__(
-        self,
-        ip=None,
-        port=None,
-        db=None,
-        user_name=None,
-        user_pass=None,
-        url=None,
-        **kwargs,
-    ):
-        if url:
-            self.client = MongoClient(url, **kwargs)
-        else:
-            if not ip:
-                ip = setting.MONGO_IP
-            if not port:
-                port = setting.MONGO_PORT
-            if not db:
-                db = setting.MONGO_DB
-            if not user_name:
-                user_name = setting.MONGO_USER_NAME
-            if not user_pass:
-                user_pass = setting.MONGO_USER_PASS
-            self.client = MongoClient(
-                host=ip, port=port, username=user_name, password=user_pass
-            )
-
-        self.db = self.get_database(db)
-
-        # 缓存索引信息
-        self.__index__cached = {}
-
-    @classmethod
-    def from_url(cls, url, **kwargs):
-        """
-        Args:
-            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
-                 参考:http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html
-            **kwargs:
-
-        Returns:
-
-        """
-        url_parsed = parse.urlparse(url)
-
-        db_type = url_parsed.scheme.strip()
-        if db_type != "mongodb":
-            raise Exception(
-                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
-                    url
-                )
-            )
-
-        return cls(url=url, **kwargs)
-
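
A hedged connection sketch for the URL form documented above; host, credentials and database name are placeholders:

    db = MongoDB.from_url("mongodb://user:password@127.0.0.1:27017/spider_data")
    # equivalent keyword form
    db = MongoDB(ip="127.0.0.1", port=27017, db="spider_data",
                 user_name="user", user_pass="password")
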
-    def get_database(self, database, **kwargs) -> Database:
-        """
-        获取数据库对象
-        @param database: 数据库名
-        @return:
-        """
-        return self.client.get_database(database, **kwargs)
-
-    def get_collection(self, coll_name, **kwargs) -> Collection:
-        """
-        根据集合名获取集合对象
-        @param coll_name: 集合名
-        @return:
-        """
-        return self.db.get_collection(coll_name, **kwargs)
-
-    def find(
-        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
-    ) -> List[Dict]:
-        """
-        @summary:
-        无数据: 返回[]
-        有数据: [{'_id': 'xx', ...}, ...]
-        ---------
-        @param coll_name: 集合名(表名)
-        @param condition: 查询条件
-        @param limit: 结果数量
-        @param kwargs:
-            更多参数 https://docs.mongodb.com/manual/reference/command/find/#command-fields
-
-        ---------
-        @result:
-        """
-        condition = {} if condition is None else condition
-        command = {"find": coll_name, "filter": condition, "limit": limit}
-        command.update(kwargs)
-        result = self.run_command(command)
-        cursor = result["cursor"]
-        cursor_id = cursor["id"]
-        dataset = cursor["firstBatch"]
-        while True:
-            if cursor_id == 0:
-                break
-            result = self.run_command(
-                {
-                    "getMore": cursor_id,
-                    "collection": coll_name,
-                    "batchSize": kwargs.get("batchSize", 100),
-                }
-            )
-            cursor = result["cursor"]
-            cursor_id = cursor["id"]
-            dataset.extend(cursor["nextBatch"])
-        return dataset
-
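
A hedged usage sketch of find; the collection and field names are illustrative:

    docs = db.find("news", condition={"site": "example"}, limit=10)
    for doc in docs:
        print(doc["_id"], doc.get("title"))
    # an empty list comes back when nothing matches
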
-    def add(
-        self,
-        coll_name,
-        data: Dict,
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        insert_ignore=False,
-    ):
-        """
-        添加单条数据
-        Args:
-            coll_name: 集合名
-            data: 单条数据
-            replace: 唯一索引冲突时直接覆盖旧数据,默认为False
-            update_columns: 更新指定的列(如果数据唯一索引冲突,则更新指定字段,如 update_columns = ["name", "title"]
-            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
-            insert_ignore: 索引冲突是否忽略 默认False
-
-        Returns: 插入成功的行数
-
-        """
-        affect_count = 1
-        collection = self.get_collection(coll_name)
-        try:
-            collection.insert_one(data)
-        except DuplicateKeyError as e:
-            # 存在则更新
-            if update_columns:
-                if not isinstance(update_columns, (tuple, list)):
-                    update_columns = [update_columns]
-
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-
-                # 更新指定的列
-                if update_columns_value:
-                    # 使用指定的值更新
-                    doc = {
-                        key: value
-                        for key, value in zip(update_columns, update_columns_value)
-                    }
-                else:
-                    # 使用数据本身的值更新
-                    doc = {key: data[key] for key in update_columns}
-
-                collection.update_one(condition, {"$set": doc})
-
-            # 覆盖更新
-            elif replace:
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-                # 替换已存在的数据
-                collection.replace_one(condition, data)
-
-            elif not insert_ignore:
-                raise e
-
-        return affect_count
-
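
A hedged sketch of the duplicate-key branches above; the collection, the unique index and the fields are illustrative:

    db.create_index("news", ["url"])  # unique index on "url"
    db.add("news", {"url": "https://example.com/1", "title": "first"})
    # same url again: the insert conflicts, so only "title" is refreshed
    db.add("news", {"url": "https://example.com/1", "title": "second"},
           update_columns=["title"])
    # or swallow the conflict entirely
    db.add("news", {"url": "https://example.com/1", "title": "third"},
           insert_ignore=True)
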
-    def add_batch(
-        self,
-        coll_name: str,
-        datas: List[Dict],
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        condition_fields: dict = None,
-    ):
-        """
-        批量添加数据
-        Args:
-            coll_name: 集合名
-            datas: 数据 [{'_id': 'xx'}, ... ]
-            replace:  唯一索引冲突时直接覆盖旧数据,默认为False
-            update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"]
-            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
-            condition_fields: 用于条件查找的字段,不指定则用索引冲突中的字段查找
-
-        Returns: 添加行数,不包含更新
-
-        """
-        add_count = 0
-
-        if not datas:
-            return add_count
-
-        collection = self.get_collection(coll_name)
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-
-        try:
-            add_count = len(datas)
-            collection.insert_many(datas, ordered=False)
-        except BulkWriteError as e:
-            write_errors = e.details.get("writeErrors")
-            for error in write_errors:
-                if error.get("code") == 11000:
-                    # 数据重复
-                    # 获取重复的数据
-                    data = error.get("op")
-
-                    def get_condition():
-                        # 获取更新条件
-                        if condition_fields:
-                            condition = {
-                                condition_field: data[condition_field]
-                                for condition_field in condition_fields
-                            }
-                        else:
-                            # 根据重复的值获取更新条件
-                            condition = self.__get_update_condition(
-                                coll_name, data, error.get("errmsg")
-                            )
-
-                        return condition
-
-                    if update_columns:
-                        # 更新指定的列
-                        if update_columns_value:
-                            # 使用指定的值更新
-                            doc = {
-                                key: value
-                                for key, value in zip(
-                                    update_columns, update_columns_value
-                                )
-                            }
-                        else:
-                            # 使用数据本身的值更新
-                            doc = {key: data.get(key) for key in update_columns}
-
-                        collection.update_one(get_condition(), {"$set": doc})
-                        add_count -= 1
-
-                    elif replace:
-                        # 覆盖更新
-                        collection.replace_one(get_condition(), data)
-                        add_count -= 1
-
-                    else:
-                        # log.error(error)
-                        add_count -= 1
-
-        return add_count
-
-    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
-        """
-        计数
-        @param coll_name: 集合名
-        @param condition: 查询条件
-        @param limit: 限制数量
-        @param kwargs:
-        ----
-        command = {
-          count: <collection or view>,
-          query: <document>,
-          limit: <integer>,
-          skip: <integer>,
-          hint: <hint>,
-          readConcern: <document>,
-          collation: <document>,
-          comment: <any>
-        }
-        https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count
-        @return: 数据数量
-        """
-        command = {"count": coll_name, "query": condition, "limit": limit, **kwargs}
-        result = self.run_command(command)
-        return result["n"]
-
-    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
-        """
-        更新
-        Args:
-            coll_name: 集合名
-            data: 单条数据 {"xxx":"xxx"}
-            condition: 更新条件 {"_id": "xxxx"}
-            upsert: 数据不存在则插入,默认为 False
-
-        Returns: True / False
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.update_one(condition, {"$set": data}, upsert=upsert)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def delete(self, coll_name, condition: Dict) -> bool:
-        """
-        删除
-        Args:
-            coll_name: 集合名
-            condition: 查找条件
-        Returns: True / False
-
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.delete_one(condition)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def run_command(self, command: Dict):
-        """
-        运行指令
-        参考文档 https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html
-        @param command:
-        @return:
-        """
-        return self.db.command(command)
-
-    def create_index(self, coll_name, keys, unique=True):
-        collection = self.get_collection(coll_name)
-        _keys = [(key, pymongo.ASCENDING) for key in keys]
-        collection.create_index(_keys, unique=unique)
-
-    def get_index(self, coll_name):
-        return self.get_collection(coll_name).index_information()
-
-    def drop_collection(self, coll_name):
-        return self.db.drop_collection(coll_name)
-
-    def get_index_key(self, coll_name, index_name):
-        """
-        获取参与索引的key
-        Args:
-            index_name: 索引名
-
-        Returns:
-
-        """
-        cache_key = f"{coll_name}:{index_name}"
-
-        if cache_key in self.__index__cached:
-            return self.__index__cached.get(cache_key)
-
-        index = self.get_index(coll_name)
-        index_detail = index.get(index_name)
-        if not index_detail:
-            errmsg = f"not found index {index_name} in collection {coll_name}"
-            raise Exception(errmsg)
-
-        index_keys = [val[0] for val in index_detail.get("key")]
-        self.__index__cached[cache_key] = index_keys
-        return index_keys
-
-    def __get_update_condition(
-        self, coll_name: str, data: dict, duplicate_errmsg: str
-    ) -> dict:
-        """
-        根据索引冲突的报错信息 获取更新条件
-        Args:
-            duplicate_errmsg: E11000 duplicate key error collection: feapder.test index: a_1_b_1 dup key: { : 1, : "你好" }
-            data: {"a": 1, "b": "你好", "c": "嘻嘻"}
-
-        Returns: {"a": 1, "b": "你好"}
-
-        """
-        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
-        index_keys = self.get_index_key(coll_name, index_name)
-
-        condition = {key: data.get(key) for key in index_keys}
-        return condition
-
-    def __getattr__(self, name):
-        return getattr(self.db, name)

+ 0 - 381
zgztb_cookie/FworkSpider/feapder/db/mysqldb.py

@@ -1,381 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-11-16 16:25
----------
-@summary: 操作mysql数据库
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import datetime
-import json
-from urllib import parse
-from typing import List, Dict
-
-import pymysql
-from dbutils.pooled_db import PooledDB
-from pymysql import cursors
-from pymysql import err
-
-import feapder.setting as setting
-from feapder.utils.log import log
-from feapder.utils.tools import make_insert_sql, make_batch_sql, make_update_sql
-
-
-def auto_retry(func):
-    def wrapper(*args, **kwargs):
-        for i in range(3):
-            try:
-                return func(*args, **kwargs)
-            except (err.InterfaceError, err.OperationalError) as e:
-                log.error(
-                    """
-                    error:%s
-                    sql:  %s
-                    """
-                    % (e, kwargs.get("sql") or args[1])
-                )
-
-    return wrapper
-
-
-class MysqlDB:
-    def __init__(
-        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
-    ):
-        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
-        if not ip:
-            ip = setting.MYSQL_IP
-        if not port:
-            port = setting.MYSQL_PORT
-        if not db:
-            db = setting.MYSQL_DB
-        if not user_name:
-            user_name = setting.MYSQL_USER_NAME
-        if not user_pass:
-            user_pass = setting.MYSQL_USER_PASS
-
-        try:
-
-            self.connect_pool = PooledDB(
-                creator=pymysql,
-                mincached=1,
-                maxcached=100,
-                maxconnections=100,
-                blocking=True,
-                ping=7,
-                host=ip,
-                port=port,
-                user=user_name,
-                passwd=user_pass,
-                db=db,
-                charset="utf8mb4",
-                cursorclass=cursors.SSCursor,
-            )  # cursorclass 使用服务的游标,默认的在多线程下大批量插入数据会使内存递增
-
-        except Exception as e:
-            log.error(
-                """
-            连接数据失败:
-            ip: {}
-            port: {}
-            db: {}
-            user_name: {}
-            user_pass: {}
-            exception: {}
-            """.format(
-                    ip, port, db, user_name, user_pass, e
-                )
-            )
-        else:
-            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
-
-    @classmethod
-    def from_url(cls, url, **kwargs):
-        # mysql://username:password@ip:port/db?charset=utf8mb4
-        url_parsed = parse.urlparse(url)
-
-        db_type = url_parsed.scheme.strip()
-        if db_type != "mysql":
-            raise Exception(
-                "url error, expect mysql://username:ip:port/db?charset=utf8mb4, but get {}".format(
-                    url
-                )
-            )
-
-        connect_params = {}
-        connect_params["ip"] = url_parsed.hostname.strip()
-        connect_params["port"] = url_parsed.port
-        connect_params["user_name"] = url_parsed.username.strip()
-        connect_params["user_pass"] = url_parsed.password.strip()
-        connect_params["db"] = url_parsed.path.strip("/").strip()
-
-        connect_params.update(kwargs)
-
-        return cls(**connect_params)
-
-    @staticmethod
-    def unescape_string(value):
-        if not isinstance(value, str):
-            return value
-
-        value = value.replace("\\0", "\0")
-        value = value.replace("\\\\", "\\")
-        value = value.replace("\\n", "\n")
-        value = value.replace("\\r", "\r")
-        value = value.replace("\\Z", "\032")
-        value = value.replace('\\"', '"')
-        value = value.replace("\\'", "'")
-
-        return value
-
-    def get_connection(self):
-        conn = self.connect_pool.connection(shareable=False)
-        # cursor = conn.cursor(cursors.SSCursor)
-        cursor = conn.cursor()
-
-        return conn, cursor
-
-    def close_connection(self, conn, cursor):
-        cursor.close()
-        conn.close()
-
-    def size_of_connections(self):
-        """
-        当前活跃的连接数
-        @return:
-        """
-        return self.connect_pool._connections
-
-    def size_of_connect_pool(self):
-        """
-        池子里一共有多少连接
-        @return:
-        """
-        return len(self.connect_pool._idle_cache)
-
-    @auto_retry
-    def find(self, sql, limit=0, to_json=False):
-        """
-        @summary:
-        无数据: 返回()
-        有数据: 若limit == 1 则返回 (data1, data2)
-                否则返回 ((data1, data2),)
-        ---------
-        @param sql:
-        @param limit:
-        @param to_json 是否将查询结果转为json
-        ---------
-        @result:
-        """
-        conn, cursor = self.get_connection()
-
-        cursor.execute(sql)
-
-        if limit == 1:
-            result = cursor.fetchone()  # 全部查出来,截取 不推荐使用
-        elif limit > 1:
-            result = cursor.fetchmany(limit)  # 全部查出来,截取 不推荐使用
-        else:
-            result = cursor.fetchall()
-
-        if to_json:
-            columns = [i[0] for i in cursor.description]
-
-            # 处理数据
-            def convert(col):
-                if isinstance(col, (datetime.date, datetime.time)):
-                    return str(col)
-                elif isinstance(col, str) and (
-                    col.startswith("{") or col.startswith("[")
-                ):
-                    try:
-                        # col = self.unescape_string(col)
-                        return json.loads(col)
-                    except:
-                        return col
-                else:
-                    # col = self.unescape_string(col)
-                    return col
-
-            if limit == 1:
-                result = [convert(col) for col in result]
-                result = dict(zip(columns, result))
-            else:
-                result = [[convert(col) for col in row] for row in result]
-                result = [dict(zip(columns, r)) for r in result]
-
-        self.close_connection(conn, cursor)
-
-        return result
-
-    def add(self, sql, exception_callfunc=None):
-        """
-
-        Args:
-            sql:
-            exception_callfunc: 异常回调
-
-        Returns: 添加行数
-
-        """
-        affect_count = None
-
-        try:
-            conn, cursor = self.get_connection()
-            affect_count = cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            if exception_callfunc:
-                exception_callfunc(e)
-        finally:
-            self.close_connection(conn, cursor)
-
-        return affect_count
-
-    def add_smart(self, table, data: Dict, **kwargs):
-        """
-        添加数据, 直接传递json格式的数据,不用拼sql
-        Args:
-            table: 表名
-            data: 字典 {"xxx":"xxx"}
-            **kwargs:
-
-        Returns: 添加行数
-
-        """
-        sql = make_insert_sql(table, data, **kwargs)
-        return self.add(sql)
-
-    def add_batch(self, sql, datas: List[Dict]):
-        """
-        @summary: 批量添加数据
-        ---------
-        @ param sql: insert ignore into (xxx,xxx) values (%s, %s, %s)
-        # param datas: 列表 [{}, {}, {}]
-        ---------
-        @result: 添加行数
-        """
-        affect_count = None
-
-        try:
-            conn, cursor = self.get_connection()
-            affect_count = cursor.executemany(sql, datas)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-                """
-                % (e, sql)
-            )
-        finally:
-            self.close_connection(conn, cursor)
-
-        return affect_count
-
-    def add_batch_smart(self, table, datas: List[Dict], **kwargs):
-        """
-        批量添加数据, 直接传递list格式的数据,不用拼sql
-        Args:
-            table: 表名
-            datas: 列表 [{}, {}, {}]
-            **kwargs:
-
-        Returns: 添加行数
-
-        """
-        sql, datas = make_batch_sql(table, datas, **kwargs)
-        return self.add_batch(sql, datas)
-
-    def update(self, sql):
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)
-
-    def update_smart(self, table, data: Dict, condition):
-        """
-        更新, 不用拼sql
-        Args:
-            table: 表名
-            data: 数据 {"xxx":"xxx"}
-            condition: 更新条件 where后面的条件,如 condition='status=1'
-
-        Returns: True / False
-
-        """
-        sql = make_update_sql(table, data, condition)
-        return self.update(sql)
-
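
A hedged sketch of the *_smart helpers above, which build the SQL from plain dicts; the table and columns are illustrative and the connection falls back to feapder.setting:

    db = MysqlDB()
    db.add_smart("task", {"url": "https://example.com/1", "state": 0})
    db.add_batch_smart("task", [
        {"url": "https://example.com/2", "state": 0},
        {"url": "https://example.com/3", "state": 0},
    ])
    db.update_smart("task", {"state": 1}, condition="url = 'https://example.com/1'")
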
-    def delete(self, sql):
-        """
-        删除
-        Args:
-            sql:
-
-        Returns: True / False
-
-        """
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)
-
-    def execute(self, sql):
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)

+ 0 - 848
zgztb_cookie/FworkSpider/feapder/db/redisdb.py

@@ -1,848 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-11-16 16:25
----------
-@summary: 操作redis数据库
----------
-@author: Boris
-"""
-
-import time
-
-import redis
-from redis._compat import unicode, long, basestring
-from redis.connection import Encoder as _Encoder
-from redis.exceptions import ConnectionError, TimeoutError
-from redis.exceptions import DataError
-from redis.sentinel import Sentinel
-from rediscluster import RedisCluster
-
-import feapder.setting as setting
-from feapder.utils.log import log
-
-
-class Encoder(_Encoder):
-    def encode(self, value):
-        "Return a bytestring or bytes-like representation of the value"
-        if isinstance(value, (bytes, memoryview)):
-            return value
-        # elif isinstance(value, bool):
-        #     # special case bool since it is a subclass of int
-        #     raise DataError(
-        #         "Invalid input of type: 'bool'. Convert to a "
-        #         "bytes, string, int or float first."
-        #     )
-        elif isinstance(value, float):
-            value = repr(value).encode()
-        elif isinstance(value, (int, long)):
-            # python 2 repr() on longs is '123L', so use str() instead
-            value = str(value).encode()
-        elif isinstance(value, (list, dict, tuple)):
-            value = unicode(value)
-        elif not isinstance(value, basestring):
-            # a value we don't know how to deal with. throw an error
-            typename = type(value).__name__
-            raise DataError(
-                "Invalid input of type: '%s'. Convert to a "
-                "bytes, string, int or float first." % typename
-            )
-        if isinstance(value, unicode):
-            value = value.encode(self.encoding, self.encoding_errors)
-        return value
-
-
-redis.connection.Encoder = Encoder
-
-
-class RedisDB:
-    def __init__(
-        self,
-        ip_ports=None,
-        db=None,
-        user_pass=None,
-        url=None,
-        decode_responses=True,
-        service_name=None,
-        max_connections=32,
-        **kwargs,
-    ):
-        """
-        redis的封装
-        Args:
-            ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-            db:
-            user_pass:
-            url:
-            decode_responses:
-            service_name: 适用于redis哨兵模式
-        """
-
-        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
-        if ip_ports is None:
-            ip_ports = setting.REDISDB_IP_PORTS
-        if db is None:
-            db = setting.REDISDB_DB
-        if user_pass is None:
-            user_pass = setting.REDISDB_USER_PASS
-        if service_name is None:
-            service_name = setting.REDISDB_SERVICE_NAME
-
-        self._is_redis_cluster = False
-
-        self.__redis = None
-        self._url = url
-        self._ip_ports = ip_ports
-        self._db = db
-        self._user_pass = user_pass
-        self._decode_responses = decode_responses
-        self._service_name = service_name
-        self._max_connections = max_connections
-        self._kwargs = kwargs
-        self.get_connect()
-
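
A hedged sketch of the three connection shapes the constructor above accepts; all addresses and passwords are placeholders:

    # single node
    db = RedisDB(ip_ports="127.0.0.1:6379", db=0, user_pass="password")
    # cluster mode: several ip:port pairs, no service_name
    db = RedisDB(ip_ports="10.0.0.1:6379,10.0.0.2:6379,10.0.0.3:6379")
    # sentinel mode: several nodes plus the monitored service name
    db = RedisDB(ip_ports="10.0.0.1:26379,10.0.0.2:26379", service_name="mymaster")
    # or a single URL
    db = RedisDB.from_url("redis://:password@127.0.0.1:6379/0")
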
-    def __repr__(self):
-        if self._url:
-            return "<Redisdb url:{}>".format(self._url)
-
-        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
-            self._ip_ports, self._db, self._user_pass
-        )
-
-    @property
-    def _redis(self):
-        try:
-            if not self.__redis.ping():
-                raise ConnectionError("unable to connect to redis")
-        except:
-            self._reconnect()
-
-        return self.__redis
-
-    @_redis.setter
-    def _redis(self, val):
-        self.__redis = val
-
-    def get_connect(self):
-        # 获取数据库连接
-        try:
-            if not self._url:
-                if not self._ip_ports:
-                    raise Exception("未设置 redis 连接信息")
-
-                ip_ports = (
-                    self._ip_ports
-                    if isinstance(self._ip_ports, list)
-                    else self._ip_ports.split(",")
-                )
-                if len(ip_ports) > 1:
-                    startup_nodes = []
-                    for ip_port in ip_ports:
-                        ip, port = ip_port.split(":")
-                        startup_nodes.append({"host": ip, "port": port})
-
-                    if self._service_name:
-                        # log.debug("使用redis哨兵模式")
-                        hosts = [(node["host"], node["port"]) for node in startup_nodes]
-                        sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs)
-                        self._redis = sentinel.master_for(
-                            self._service_name,
-                            password=self._user_pass,
-                            db=self._db,
-                            redis_class=redis.StrictRedis,
-                            decode_responses=self._decode_responses,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    else:
-                        # log.debug("使用redis集群模式")
-                        self._redis = RedisCluster(
-                            startup_nodes=startup_nodes,
-                            decode_responses=self._decode_responses,
-                            password=self._user_pass,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    self._is_redis_cluster = True
-                else:
-                    ip, port = ip_ports[0].split(":")
-                    self._redis = redis.StrictRedis(
-                        host=ip,
-                        port=port,
-                        db=self._db,
-                        password=self._user_pass,
-                        decode_responses=self._decode_responses,
-                        max_connections=self._max_connections,
-                        **self._kwargs,
-                    )
-                    self._is_redis_cluster = False
-            else:
-                self._redis = redis.StrictRedis.from_url(
-                    self._url, decode_responses=self._decode_responses
-                )
-                self._is_redis_cluster = False
-
-        except Exception as e:
-            raise
-
-        # 不要写成self._redis.ping() 否则循环调用了
-        return self.__redis.ping()
-
-    @classmethod
-    def from_url(cls, url):
-        """
-
-        Args:
-            url: redis://[[username]:[password]]@[host]:[port]/[db]
-
-        Returns:
-
-        """
-        return cls(url=url)
-
-    def sadd(self, table, values):
-        """
-        @summary: 使用无序set集合存储数据, 去重
-        ---------
-        @param table:
-        @param values: 值; 支持list 或 单个值
-        ---------
-        @result: 若库中存在 返回0,否则入库,返回1。 批量添加返回None
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.sadd(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.sadd(table, values)
-
-    def sget(self, table, count=1, is_pop=True):
-        """
-        返回 list 如 ['1'] 或 []
-        @param table:
-        @param count:
-        @param is_pop:
-        @return:
-        """
-
-        datas = []
-        if is_pop:
-            count = count if count <= self.sget_count(table) else self.sget_count(table)
-            if count:
-                if count > 1:
-                    pipe = self._redis.pipeline()
-
-                    if not self._is_redis_cluster:
-                        pipe.multi()
-                    while count:
-                        pipe.spop(table)
-                        count -= 1
-                    datas = pipe.execute()
-
-                else:
-                    datas.append(self._redis.spop(table))
-
-        else:
-            datas = self._redis.srandmember(table, count)
-
-        return datas
-
-    def srem(self, table, values):
-        """
-        @summary: 移除集合中的指定元素
-        ---------
-        @param table:
-        @param values: 一个或者列表
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.srem(table, value)
-            pipe.execute()
-        else:
-            self._redis.srem(table, values)
-
-    def sget_count(self, table):
-        return self._redis.scard(table)
-
-    def sdelete(self, table):
-        """
-        @summary: 删除set集合的大键(数据量大的表)
-        删除大set键,使用sscan命令,每次扫描集合中500个元素,再用srem命令每次删除一个键
-        若直接用delete命令,会导致Redis阻塞,出现故障切换和应用程序崩溃的故障。
-        ---------
-        @param table:
-        ---------
-        @result:
-        """
-
-        # 当 SCAN 命令的游标参数被设置为 0 时, 服务器将开始一次新的迭代, 而当服务器向用户返回值为 0 的游标时, 表示迭代已结束
-        cursor = "0"
-        while cursor != 0:
-            cursor, data = self._redis.sscan(table, cursor=cursor, count=500)
-            for item in data:
-                # pipe.srem(table, item)
-                self._redis.srem(table, item)
-
-            # pipe.execute()
-
-    def sismember(self, table, key):
-        "Return a boolean indicating if ``value`` is a member of set ``name``"
-        return self._redis.sismember(table, key)
-
-    def zadd(self, table, values, prioritys=0):
-        """
-        @summary: 使用有序set集合存储数据, 去重(值存在更新)
-        ---------
-        @param table:
-        @param values: 值; 支持list 或 单个值
-        @param prioritys: 优先级; double类型,支持list 或 单个值。 根据此字段的值来排序, 值越小越优先。 可不传值,默认value的优先级为0
-        ---------
-        @result:若库中存在 返回0,否则入库,返回1。 批量添加返回 [0, 1 ...]
-        """
-        if isinstance(values, list):
-            if not isinstance(prioritys, list):
-                prioritys = [prioritys] * len(values)
-            else:
-                assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
-
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value, priority in zip(values, prioritys):
-                pipe.execute_command(
-                    "ZADD", table, priority, value
-                )  # 为了兼容2.x与3.x版本的redis
-            return pipe.execute()
-
-        else:
-            return self._redis.execute_command(
-                "ZADD", table, prioritys, values
-            )  # 为了兼容2.x与3.x版本的redis
-
-    def zget(self, table, count=1, is_pop=True):
-        """
-        @summary: 从有序set集合中获取数据 优先返回分数小的(优先级高的)
-        ---------
-        @param table:
-        @param count: 数量 -1 返回全部数据
-        @param is_pop:获取数据后,是否在原set集合中删除,默认是
-        ---------
-        @result: 列表
-        """
-
-        start_pos = 0  # 包含
-        end_pos = count - 1 if count > 0 else count
-
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()  # 标记事务的开始 参考 http://www.runoob.com/redis/redis-transactions.html
-        pipe.zrange(table, start_pos, end_pos)  # 取值
-        if is_pop:
-            pipe.zremrangebyrank(table, start_pos, end_pos)  # 删除
-        results, *count = pipe.execute()
-        return results
-
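
A hedged sketch of the priority semantics above: the member with the lower score is returned (and popped) first; the key name is illustrative:

    db.zadd("test:requests", ["task_a", "task_b"], prioritys=[5, 1])
    print(db.zget("test:requests", count=2))  # ['task_b', 'task_a']
    print(db.zget("test:requests"))           # [] once the set is drained
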
-    def zremrangebyscore(self, table, priority_min, priority_max):
-        """
-        根据分数移除成员 闭区间
-        @param table:
-        @param priority_min:
-        @param priority_max:
-        @return: 被移除的成员个数
-        """
-        return self._redis.zremrangebyscore(table, priority_min, priority_max)
-
-    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
-        """
-        @summary: 返回指定分数区间的数据 闭区间
-        ---------
-        @param table:
-        @param priority_min: 优先级越小越优先
-        @param priority_max:
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        @param is_pop: 是否删除
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[2]
-            local max_score = ARGV[3]
-            local is_pop = ARGV[4]
-            local count = ARGV[5]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            -- 删除redis中刚取到的值
-            if (is_pop=='True' or is_pop=='1') then
-                for i=1, #datas do
-                    redis.call('zrem', KEYS[1], datas[i])
-                end
-            end
-
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[table, priority_min, priority_max, is_pop, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[table, priority_min, priority_max, is_pop])
-
-        return res
-
-    def zrangebyscore_increase_score(
-        self, table, priority_min, priority_max, increase_score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param increase_score: 分数值增量 正数则在原有的分数上叠加,负数则相减
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local increase_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            --修改优先级
-            for i=1, #datas do
-                redis.call('zincrby', KEYS[1], increase_score, datas[i])
-            end
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[priority_min, priority_max, increase_score, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, increase_score])
-
-        return res
-
-    def zrangebyscore_set_score(
-        self, table, priority_min, priority_max, score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param score: 分数值
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local set_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
-            end
-
-            local real_datas = {} -- 数据
-            --修改优先级
-            for i=1, #datas, 2 do
-               local data = datas[i]
-               local score = datas[i+1]
-
-               table.insert(real_datas, data) -- 添加数据
-
-               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
-            end
-
-            return real_datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score, count])
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score])
-
-        return res
-
-    def zincrby(self, table, amount, value):
-        return self._redis.zincrby(table, amount, value)
-
-    def zget_count(self, table, priority_min=None, priority_max=None):
-        """
-        @summary: 获取表数据的数量
-        ---------
-        @param table:
-        @param priority_min:优先级范围 最小值(包含)
-        @param priority_max:优先级范围 最大值(包含)
-        ---------
-        @result:
-        """
-
-        if priority_min != None and priority_max != None:
-            return self._redis.zcount(table, priority_min, priority_max)
-        else:
-            return self._redis.zcard(table)
-
-    def zrem(self, table, values):
-        """
-        @summary: 移除集合中的指定元素
-        ---------
-        @param table:
-        @param values: 一个或者列表
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            self._redis.zrem(table, *values)
-        else:
-            self._redis.zrem(table, values)
-
-    def zexists(self, table, values):
-        """
-        利用zscore判断某元素是否存在
-        @param values:
-        @return:
-        """
-
-        is_exists = []
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for value in values:
-                pipe.zscore(table, value)
-            is_exists_temp = pipe.execute()
-            for is_exist in is_exists_temp:
-                if is_exist != None:
-                    is_exists.append(1)
-                else:
-                    is_exists.append(0)
-
-        else:
-            is_exists = self._redis.zscore(table, values)
-            is_exists = 1 if is_exists != None else 0
-
-        return is_exists
-
-    def lpush(self, table, values):
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.rpush(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.rpush(table, values)
-
-    def lpop(self, table, count=1):
-        """
-        @summary:
-        ---------
-        @param table:
-        @param count:
-        ---------
-        @result: count>1时返回列表
-        """
-
-        datas = None
-
-        count = count if count <= self.lget_count(table) else self.lget_count(table)
-
-        if count:
-            if count > 1:
-                pipe = self._redis.pipeline()
-
-                if not self._is_redis_cluster:
-                    pipe.multi()
-                while count:
-                    pipe.lpop(table)
-                    count -= 1
-                datas = pipe.execute()
-
-            else:
-                datas = self._redis.lpop(table)
-
-        return datas
-
-    def rpoplpush(self, from_table, to_table=None):
-        """
-        将列表 from_table 中的最后一个元素(尾元素)弹出,并返回给客户端。
-        将 from_table 弹出的元素插入到列表 to_table ,作为 to_table 列表的的头元素。
-        如果 from_table 和 to_table 相同,则列表中的表尾元素被移动到表头,并返回该元素,可以把这种特殊情况视作列表的旋转(rotation)操作
-        @param from_table:
-        @param to_table:
-        @return:
-        """
-
-        if not to_table:
-            to_table = from_table
-
-        return self._redis.rpoplpush(from_table, to_table)
-
-    def lget_count(self, table):
-        return self._redis.llen(table)
-
-    def lrem(self, table, value, num=0):
-        """
-        @summary:
-        删除value
-        ---------
-        @param table:
-        @param value:
-        @param num:
-        ---------
-        @result: 删除的条数
-        """
-        return self._redis.lrem(table, num, value)
-
-    def lrange(self, table, start=0, end=-1):
-        return self._redis.lrange(table, start, end)
-
-    def hset(self, table, key, value):
-        """
-        @summary:
-        如果 key 不存在,一个新的哈希表被创建并进行 HSET 操作。
-        如果域 field 已经存在于哈希表中,旧值将被覆盖
-        ---------
-        @param table:
-        @param key:
-        @param value:
-        ---------
-        @result: 1 新插入; 0 覆盖
-        """
-        return self._redis.hset(table, key, value)
-
-    def hset_batch(self, table, datas):
-        """
-        批量插入
-        Args:
-            datas:
-                [[key, value]]
-        Returns:
-
-        """
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()
-        for key, value in datas:
-            pipe.hset(table, key, value)
-        return pipe.execute()
-
-    def hincrby(self, table, key, increment):
-        return self._redis.hincrby(table, key, increment)
-
-    def hget(self, table, key, is_pop=False):
-        if not is_pop:
-            return self._redis.hget(table, key)
-        else:
-            lua = """
-                -- local key = KEYS[1]
-                local field = ARGV[1]
-
-                -- 取值
-                local datas = redis.call('hget', KEYS[1], field)
-                -- 删除值
-                redis.call('hdel', KEYS[1], field)
-
-                return datas
-
-                    """
-            cmd = self._redis.register_script(lua)
-            res = cmd(keys=[table], args=[key])
-
-            return res
-
-    def hgetall(self, table):
-        return self._redis.hgetall(table)
-
-    def hexists(self, table, key):
-        return self._redis.hexists(table, key)
-
-    def hdel(self, table, *keys):
-        """
-        @summary: 删除对应的key 可传多个
-        ---------
-        @param table:
-        @param *keys:
-        ---------
-        @result:
-        """
-        self._redis.hdel(table, *keys)
-
-    def hget_count(self, table):
-        return self._redis.hlen(table)
-
-    def setbit(self, table, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param table:
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            if not isinstance(values, list):
-                values = [values] * len(offsets)
-            else:
-                assert len(offsets) == len(values), "offsets值要与values值一一对应"
-
-            pipe = self._redis.pipeline()
-            pipe.multi()
-
-            for offset, value in zip(offsets, values):
-                pipe.setbit(table, offset, value)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.setbit(table, offsets, values)
-
-    def getbit(self, table, offsets):
-        """
-        取字符串数组某一位的值
-        @param table:
-        @param offsets: 支持列表
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for offset in offsets:
-                pipe.getbit(table, offset)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.getbit(table, offsets)
-
-    def bitcount(self, table):
-        return self._redis.bitcount(table)
-
-    def strset(self, table, value, **kwargs):
-        return self._redis.set(table, value, **kwargs)
-
-    def str_incrby(self, table, value):
-        return self._redis.incrby(table, value)
-
-    def strget(self, table):
-        return self._redis.get(table)
-
-    def strlen(self, table):
-        return self._redis.strlen(table)
-
-    def getkeys(self, regex):
-        return self._redis.keys(regex)
-
-    def exists_key(self, key):
-        return self._redis.exists(key)
-
-    def set_expire(self, key, seconds):
-        """
-        @summary: 设置过期时间
-        ---------
-        @param key:
-        @param seconds: 秒
-        ---------
-        @result:
-        """
-        self._redis.expire(key, seconds)
-
-    def get_expire(self, key):
-        """
-        @summary: 查询过期时间
-        ---------
-        @param key:
-        ---------
-        @result:
-        """
-        return self._redis.ttl(key)
-
-    def clear(self, table):
-        try:
-            self._redis.delete(table)
-        except Exception as e:
-            log.error(e)
-
-    def get_redis_obj(self):
-        return self._redis
-
-    def _reconnect(self):
-        # 检测连接状态, 当数据库重启或设置 timeout 导致断开连接时自动重连
-        retry_count = 0
-        while True:
-            try:
-                retry_count += 1
-                log.error(f"redis 连接断开, 重新连接 {retry_count}")
-                if self.get_connect():
-                    log.info(f"redis 连接成功")
-                    return True
-            except (ConnectionError, TimeoutError) as e:
-                log.error(f"连接失败 e: {e}")
-
-            time.sleep(2)
-
-    def __getattr__(self, name):
-        return getattr(self._redis, name)

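The `zadd` / `zget` pair removed above amounts to a small priority queue on top of a redis sorted set: lower score means higher priority, and re-adding an existing member only updates its score. A minimal usage sketch (not part of the deleted module), assuming the wrapper is importable as `feapder.db.redisdb.RedisDB` — the path the dedup modules below use — and that a local redis is reachable at the URL shown:

```python
from feapder.db.redisdb import RedisDB  # the wrapper removed above

db = RedisDB(url="redis://@localhost:6379/0")  # assumed local instance

# Lower score = higher priority; duplicates are collapsed, only the score is updated.
db.zadd("demo:z_tasks", ["task_a", "task_b", "task_c"], prioritys=[5, 1, 3])

# Read and delete the two lowest-score members in one pipeline (zrange + zremrangebyrank).
print(db.zget("demo:z_tasks", count=2, is_pop=True))  # e.g. [b'task_b', b'task_c']
```
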
+ 0 - 156
zgztb_cookie/FworkSpider/feapder/dedup/README.md

@@ -1,156 +0,0 @@
-# Dedup
-
-Dedup is feapder's large-scale deduplication module. It ships with three built-in dedup mechanisms that share the same API; the amount of data that can be deduplicated depends on available memory. Unlike a plain BloomFilter, whose dedup capacity is limited by the number of slots, Dedup uses an elastic mechanism and can deduplicate massive volumes of data.
-
-
-## Dedup modes
-
-### Temporary dedup
-
-> Redis-based, supports batches; entries expire after a configurable time. Deduplicating 10,000 items takes about 0.26 s; 100 million items use roughly 1.43 GB of memory.
-
-```
-from feapder.dedup import Dedup
-
-data = {"xxx": 123, "xxxx": "xxxx"}
-datas = ["xxx", "bbb"]
-
-def test_ExpireFilter():
-    dedup = Dedup(
-        Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0"
-    )
-
-    # dedup one item at a time
-    assert dedup.add(data) == 1
-    assert dedup.get(data) == 1
-
-    # dedup in batches
-    assert dedup.add(datas) == [1, 1]
-    assert dedup.get(datas) == [1, 1]
-```
-
-
-### In-memory dedup
-
-> Memory-based, supports batches. Deduplicating 10,000 items takes about 0.5 s; 100 million items use roughly 285 MB of memory.
-
-```
-from feapder.dedup import Dedup
-
-data = {"xxx": 123, "xxxx": "xxxx"}
-datas = ["xxx", "bbb"]
-
-def test_MemoryFilter():
-    dedup = Dedup(Dedup.MemoryFilter)  # in-process memory filter, no redis required
-
-    # dedup one item at a time
-    assert dedup.add(data) == 1
-    assert dedup.get(data) == 1
-
-    # dedup in batches
-    assert dedup.add(datas) == [1, 1]
-    assert dedup.get(datas) == [1, 1]
-```
-
-### Permanent dedup
-
-> Redis-based, supports batches, entries never expire. Deduplicating 10,000 items takes about 3.5 s; 100 million items use roughly 285 MB of memory.
-
-```
-from feapder.dedup import Dedup
-
-datas = {
-    "xxx": 123,
-    "xxxx": "xxxx",
-}
-
-dedup = Dedup()
-
-print(dedup)             # <ScalableBloomFilter: RedisBitArray: dedup:bloomfilter:bloomfilter>
-print(dedup.add(datas))  # 1 -> new and added; 0 -> already existed
-print(dedup.get(datas))  # 1 -> exists; 0 -> not seen
-```
-
-## Filtering data
-
-Dedup can filter out data that already exists, as shown in the examples below:
-
-
-```python
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
-
-    # seed some already-seen data
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing "xxx" and "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    dedup.filter_exist_data(datas)
-    assert datas == ["ccc"]
-```
-
-```python
-# dedup backed by a redis cluster
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, ip_ports=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
-
-    # seed some already-seen data
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing "xxx" and "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    ss = dedup.filter_exist_data(datas)
-    print(ss)
-    assert datas == ["ccc"]
-```
-
-```python
-# dedup backed by a single redis instance
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, expire_time=60)
-
-    # seed some already-seen data
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing "xxx" and "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    ss = dedup.filter_exist_data(datas)
-    print(ss)
-    assert datas == ["ccc"]
-```
-
-```python
-# dedup across multiple redis instances
-from feapder.dedup import Dedup
-
-def test_filter():
-    redis_conf = dict(
-        pylist_=dict(
-            redisdb_ip_port="192.168.3.71:8371",
-            redisdb_user_pass="top@123",
-            redisdb_db=0
-        ),
-        list_=dict(
-            redisdb_ip_port="192.168.3.165:8165",
-            redisdb_user_pass="",
-            redisdb_db=0
-        )
-    )
-    
-    dedup = Dedup(filter_type=6, to_md5=False, redis_conf=redis_conf, expire_time=60)
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-    
-    # filter out the existing "xxx" and "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    dedup.filter_exist_data(datas)
-    print(datas)
-    assert datas == ["ccc"]
-```

+ 0 - 177
zgztb_cookie/FworkSpider/feapder/dedup/__init__.py

@@ -1,177 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-12-13 21:08
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import copy
-from typing import Any, List, Union, Tuple, Callable, Optional
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-from .litefilter import LiteFilter
-from .redisfilter import RedisFilter, MRedisFilter
-
-
-class Dedup:
-    BloomFilter = 1
-    MemoryFilter = 2
-    ExpireFilter = 3
-    LiteFilter = 4
-    RedisFilter = 5
-    MRedisFilter = 6
-
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except KeyError:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-        elif filter_type == Dedup.RedisFilter:
-            self.dedup = RedisFilter(
-                ip_ports=kwargs.get("ip_ports"),
-                user_pass=kwargs.get("user_pass", ""),
-                redis_url=kwargs.get("redis_url"),
-                expire_time=kwargs.get("expire_time")
-            )
-        elif filter_type == Dedup.MRedisFilter:
-            self.dedup = MRedisFilter(
-                redis_conf=kwargs.get("redis_conf"),
-                expire_time=kwargs.get("expire_time")
-            )
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get("name", "bloomfilter")
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            elif filter_type == Dedup.LiteFilter:
-                self.dedup = LiteFilter()
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter、Dedup.LiteFilter、Dedup.RedisFilter、Dedup.MRedisFilter"
-                )
-
-        self._to_md5 = to_md5
-
-    def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
-            if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
-            else:
-                keys = get_md5(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def add(
-        self, datas: Union[List[Any], Any], skip_check: bool = False
-    ) -> Union[List[Any], Any]:
-        """
-        添加数据
-        @param datas: list / 单个值
-        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
-        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-
-        keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
-
-        return is_added
-
-    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
-        """
-        检查数据是否存在
-        @param datas: list / 单个值
-        @return: list / 单个值 (存在返回1 不存在返回0)
-        """
-        keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
-
-        return is_exists
-
-    def filter_exist_data(
-        self,
-        datas: List[Any],
-        *,
-        datas_fingerprints: Optional[List] = None,
-        callback: Callable[[Any], None] = None
-    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
-        """
-        过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
-        @param datas: 数据 列表
-        @param callback: 数据已存在时的回调 callback(data)
-        @return: 过滤后的 datas;若传入 datas_fingerprints,则返回 (datas, datas_fingerprints)
-        """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
-        dedup_datas = []
-
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas

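`filter_exist_data` above mutates its arguments in place and can additionally track a separate fingerprint list and invoke a callback for every duplicate. A short sketch of that path (illustrative names; assumes a local redis at the URL used in the README above):

```python
from feapder.dedup import Dedup

def on_duplicate(data):
    print("dropped duplicate:", data)

dedup = Dedup(Dedup.ExpireFilter, expire_time=60, redis_url="redis://@localhost:6379/0")

datas = [{"title": "a"}, {"title": "b"}, {"title": "c"}]
fingerprints = ["fp_a", "fp_b", "fp_c"]

dedup.add(["fp_a", "fp_b"])  # mark the first two fingerprints as already seen

# Both lists are filtered in place; on_duplicate fires once per existing item.
dedup.filter_exist_data(datas, datas_fingerprints=fingerprints, callback=on_duplicate)
assert datas == [{"title": "c"}] and fingerprints == ["fp_c"]
```
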
+ 0 - 41
zgztb_cookie/FworkSpider/feapder/dedup/basefilter.py

@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/21 11:17 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import abc
-from typing import List, Union
-
-
-class BaseFilter:
-    @abc.abstractmethod
-    def add(
-        self, keys: Union[List[str], str], *args, **kwargs
-    ) -> Union[List[bool], bool]:
-        """
-
-        Args:
-            keys: list / 单个值
-            *args:
-            **kwargs:
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-        pass
-
-    @abc.abstractmethod
-    def get(self, keys: Union[List[str], str]) -> Union[List[bool], bool]:
-        """
-        检查数据是否存在
-        Args:
-            keys: list / 单个值
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 1 否则返回 0)
-        """
-        pass

+ 0 - 143
zgztb_cookie/FworkSpider/feapder/dedup/bitarray.py

@@ -1,143 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/14 1:05 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-from __future__ import absolute_import
-
-
-from feapder.db.redisdb import RedisDB
-
-
-class BitArray:
-    def setall(self, value):
-        pass
-
-    def __repr__(self):
-        raise NotImplementedError("this method must be implemented by a subclass")
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        raise NotImplementedError("this method must be implemented by a subclass")
-
-    def get(self, offsets):
-        """
-        取字符串数组某一位的值
-        @param offsets: 支持列表或单个值
-        @return: list / 单个值
-        """
-        raise NotImplementedError("this method must be implemented by a subclass")
-
-    def count(self, value=True):
-        raise NotImplementedError("this method must be implemented by a subclass")
-
-
-class MemoryBitArray(BitArray):
-    def __init__(self, num_bits):
-        try:
-            import bitarray
-        except Exception as e:
-            raise Exception(
-                "需要安装feapder完整版\ncommand: pip install feapder[all]\n若安装出错,参考:https://boris.org.cn/feapder/#/question/%E5%AE%89%E8%A3%85%E9%97%AE%E9%A2%98"
-            )
-
-        self.num_bits = num_bits
-        self.bitarray = bitarray.bitarray(num_bits, endian="little")
-
-        self.setall(0)
-
-    def __repr__(self):
-        return "MemoryBitArray: {}".format(self.num_bits)
-
-    def setall(self, value):
-        self.bitarray.setall(value)
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-
-        old_values = []
-
-        if isinstance(offsets, list):
-            if not isinstance(values, list):
-                values = [values] * len(offsets)
-            else:
-                assert len(offsets) == len(values), "offsets值要与values值一一对应"
-
-            for offset, value in zip(offsets, values):
-                old_values.append(int(self.bitarray[offset]))
-                self.bitarray[offset] = value
-
-        else:
-            old_values = int(self.bitarray[offsets])
-            self.bitarray[offsets] = values
-
-        return old_values
-
-    def get(self, offsets):
-        """
-        取字符串数组某一位的值
-        @param offsets: 支持列表或单个值
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            return [self.bitarray[offset] for offset in offsets]
-        else:
-            return self.bitarray[offsets]
-
-    def count(self, value=True):
-        return self.bitarray.count(value)
-
-
-class RedisBitArray(BitArray):
-    """
-    仿bitarray 基于redis
-    """
-
-    redis_db = None
-
-    def __init__(self, name, redis_url=None):
-        self.name = name
-        self.count_cached_name = name + "_count_cached"
-
-        if not self.__class__.redis_db:
-            self.__class__.redis_db = RedisDB(url=redis_url)
-
-    def __repr__(self):
-        return "RedisBitArray: {}".format(self.name)
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        return self.redis_db.setbit(self.name, offsets, values)
-
-    def get(self, offsets):
-        return self.redis_db.getbit(self.name, offsets)
-
-    def count(self, value=True):
-        # 先查redis的缓存,若没有 在统计数量
-        count = self.redis_db.strget(self.count_cached_name)
-        if count:
-            return int(count)
-        else:
-            count = self.redis_db.bitcount(self.name)
-            self.redis_db.strset(self.count_cached_name, count, ex=1800)  # 半小时过期
-            return count

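`RedisBitArray` above is a thin bit-array facade over the `setbit` / `getbit` / `bitcount` helpers of the RedisDB wrapper removed earlier in this commit, with the total bit count cached in redis for 30 minutes. A quick sketch (key name and URL are illustrative):

```python
from feapder.dedup.bitarray import RedisBitArray

bits = RedisBitArray("demo:bloom_bits", redis_url="redis://@localhost:6379/0")

print(bits.set([3, 7], 1))   # previous values at those offsets, e.g. [0, 0]
print(bits.get([3, 7, 8]))   # [1, 1, 0]
print(bits.count())          # 2 -- cached under demo:bloom_bits_count_cached for 30 minutes
```
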
+ 0 - 379
zgztb_cookie/FworkSpider/feapder/dedup/bloomfilter.py

@@ -1,379 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/13 4:11 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import hashlib
-import math
-import threading
-import time
-from struct import unpack, pack
-
-from feapder.dedup.basefilter import BaseFilter
-from feapder.utils.redis_lock import RedisLock
-from . import bitarray
-
-
-def make_hashfuncs(num_slices, num_bits):
-    if num_bits >= (1 << 31):
-        fmt_code, chunk_size = "Q", 8
-    elif num_bits >= (1 << 15):
-        fmt_code, chunk_size = "I", 4
-    else:
-        fmt_code, chunk_size = "H", 2
-    total_hash_bits = 8 * num_slices * chunk_size
-    if total_hash_bits > 384:
-        hashfn = hashlib.sha512
-    elif total_hash_bits > 256:
-        hashfn = hashlib.sha384
-    elif total_hash_bits > 160:
-        hashfn = hashlib.sha256
-    elif total_hash_bits > 128:
-        hashfn = hashlib.sha1
-    else:
-        hashfn = hashlib.md5
-    fmt = fmt_code * (hashfn().digest_size // chunk_size)
-    num_salts, extra = divmod(num_slices, len(fmt))
-    if extra:
-        num_salts += 1
-    salts = tuple(hashfn(hashfn(pack("I", i)).digest()) for i in range(num_salts))
-
-    def _make_hashfuncs(key):
-        if isinstance(key, str):
-            key = key.encode("utf-8")
-        else:
-            key = str(key).encode("utf-8")
-
-        i = 0
-        for salt in salts:
-            h = salt.copy()
-            h.update(key)
-            for uint in unpack(fmt, h.digest()):
-                yield uint % num_bits
-                i += 1
-                if i >= num_slices:
-                    return
-
-    return _make_hashfuncs
-
-
-class BloomFilter(object):
-    BASE_MEMORY = 1
-    BASE_REDIS = 2
-
-    def __init__(
-        self,
-        capacity: int,
-        error_rate: float = 0.00001,
-        bitarray_type=BASE_REDIS,
-        name=None,
-        redis_url=None,
-    ):
-        if not (0 < error_rate < 1):
-            raise ValueError("Error_Rate must be between 0 and 1.")
-        if not capacity > 0:
-            raise ValueError("Capacity must be > 0")
-
-        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
-        # k = log2(1/P)
-        # solving for m = bits_per_slice
-        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
-        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
-        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
-        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
-        bits_per_slice = int(
-            math.ceil(
-                (capacity * abs(math.log(error_rate)))
-                / (num_slices * (math.log(2) ** 2))
-            )
-        )
-        self._setup(error_rate, num_slices, bits_per_slice, capacity)
-
-        if bitarray_type == BloomFilter.BASE_MEMORY:
-            self.bitarray = bitarray.MemoryBitArray(self.num_bits)
-            self.bitarray.setall(False)
-        elif bitarray_type == BloomFilter.BASE_REDIS:
-            assert name, "name can't be None "
-            self.bitarray = bitarray.RedisBitArray(name, redis_url)
-        else:
-            raise ValueError("not support this bitarray type")
-
-    def _setup(self, error_rate, num_slices, bits_per_slice, capacity):
-        self.error_rate = error_rate
-        self.num_slices = num_slices
-        self.bits_per_slice = bits_per_slice
-        self.capacity = capacity
-        self.num_bits = num_slices * bits_per_slice
-        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
-
-        self._is_at_capacity = False
-        self._check_capacity_time = 0
-
-    def __repr__(self):
-        return "<BloomFilter: {}>".format(self.bitarray)
-
-    def get(self, keys, to_list=False):
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-        is_exists = []
-
-        offsets = []
-        for key in keys:
-            hashes = self.make_hashes(key)
-            offset = 0
-            for k in hashes:
-                offsets.append(offset + k)
-                offset += self.bits_per_slice
-
-        old_values = self.bitarray.get(offsets)
-        for i in range(0, len(old_values), self.num_slices):
-            is_exists.append(int(all(old_values[i : i + self.num_slices])))
-
-        if to_list:
-            return is_exists
-        else:
-            return is_exists if is_list else is_exists[0]
-
-    @property
-    def is_at_capacity(self):
-        """
-        是否容量已满, 1的个数满位数组的一半的时,则看做已满
-        比较耗时 半小时检查一次
-        @return:
-        """
-        if self._is_at_capacity:
-            return self._is_at_capacity
-
-        if (
-            not self._check_capacity_time
-            or time.time() - self._check_capacity_time > 1800
-        ):
-            bit_count = self.bitarray.count()
-            if bit_count and bit_count / self.num_bits > 0.5:
-                self._is_at_capacity = True
-
-            self._check_capacity_time = time.time()
-
-        return self._is_at_capacity
-
-    def add(self, keys):
-        """
-        Adds a key to this bloom filter. If the key already exists in this
-        filter it will return False. Otherwise True. keys support list
-        @param keys: list or one key
-        @return:
-        """
-        # if self.is_at_capacity:
-        #     raise IndexError("BloomFilter is at capacity")
-
-        is_list = isinstance(keys, list)
-
-        keys = keys if is_list else [keys]
-        is_added = []
-
-        offsets = []
-        for key in keys:
-            hashes = self.make_hashes(key)
-            offset = 0
-            for k in hashes:
-                offsets.append(offset + k)
-                offset += self.bits_per_slice
-
-        old_values = self.bitarray.set(offsets, 1)
-        for i in range(0, len(old_values), self.num_slices):
-            is_added.append(1 ^ int(all(old_values[i : i + self.num_slices])))
-
-        return is_added if is_list else is_added[0]
-
-
-class ScalableBloomFilter(BaseFilter):
-    """
-    自动扩展空间的bloomfilter, 当一个filter满一半的时候,创建下一个
-    """
-
-    BASE_MEMORY = BloomFilter.BASE_MEMORY
-    BASE_REDIS = BloomFilter.BASE_REDIS
-
-    def __init__(
-        self,
-        initial_capacity: int = 100000000,
-        error_rate: float = 0.00001,
-        bitarray_type=BASE_REDIS,
-        name=None,
-        redis_url=None,
-    ):
-
-        if not error_rate or error_rate < 0:
-            raise ValueError("Error_Rate must be a decimal less than 0.")
-
-        self._setup(
-            initial_capacity, error_rate, name, bitarray_type, redis_url=redis_url
-        )
-
-    def _setup(self, initial_capacity, error_rate, name, bitarray_type, redis_url):
-        self.initial_capacity = initial_capacity
-        self.error_rate = error_rate
-        self.name = name
-        self.bitarray_type = bitarray_type
-        self.redis_url = redis_url
-
-        self.filters = []
-
-        self.filters.append(self.create_filter())
-        self._thread_lock = threading.RLock()
-        self._check_capacity_time = 0
-
-    def __repr__(self):
-        return "<ScalableBloomFilter: {}>".format(self.filters[-1].bitarray)
-
-    def create_filter(self):
-        filter = BloomFilter(
-            capacity=self.initial_capacity,
-            error_rate=self.error_rate,
-            bitarray_type=self.bitarray_type,
-            name=self.name + str(len(self.filters)) if self.name else self.name,
-            redis_url=self.redis_url,
-        )
-
-        return filter
-
-    def check_filter_capacity(self):
-        """
-        检测filter状态,如果已满,加载新的filter
-        @return:
-        """
-        if (
-            not self._check_capacity_time
-            or time.time() - self._check_capacity_time > 1800
-        ):
-            if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
-                with self._thread_lock:
-                    while True:
-                        if self.filters[-1].is_at_capacity:
-                            self.filters.append(self.create_filter())
-                        else:
-                            break
-
-                    self._check_capacity_time = time.time()
-            else:
-                # 全局锁 同一时间只有一个进程在真正的创建新的filter,等这个进程创建完,其他进程只是把刚创建的filter append进来
-                key = (
-                    f"ScalableBloomFilter:{self.name}"
-                    if self.name
-                    else "ScalableBloomFilter"
-                )
-                with RedisLock(key=key, redis_url=self.redis_url) as lock:
-                    if lock.locked:
-                        while True:
-                            if self.filters[-1].is_at_capacity:
-                                self.filters.append(self.create_filter())
-                            else:
-                                break
-
-                        self._check_capacity_time = time.time()
-
-    def add(self, keys, skip_check=False):
-        """
-        Adds a key to this bloom filter. If the key already exists in this
-        filter it will return False. Otherwise True. keys support list
-        @param keys: list or one key
-        @param skip_check: add directly,not check if is exist in bloomfilters
-        @return:
-        """
-
-        self.check_filter_capacity()
-
-        current_filter = self.filters[-1]
-
-        if skip_check:
-            return current_filter.add(keys)
-
-        else:
-            is_list = isinstance(keys, list)
-
-            keys = keys if is_list else [keys]
-            not_exist_keys = list(set(keys))
-
-            # 检查之前的bloomfilter是否存在
-            # 记录下每级filter存在的key,不存在的key继续向下检查
-            for filter in reversed(self.filters):
-                current_filter_is_exists = filter.get(
-                    not_exist_keys, to_list=True
-                )  # 当前的filter是否存在
-
-                not_exist_keys_temp = []
-
-                for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
-                    if not is_exist:  # 当前filter不存在的key 需要继续向下检查
-                        not_exist_keys_temp.append(key)
-
-                not_exist_keys = not_exist_keys_temp
-
-                if not not_exist_keys:
-                    break
-
-            # 仍有不存在的关键词,记录该关键词
-            if not_exist_keys:
-                current_filter.add(not_exist_keys)
-
-            # 比较key是否已存在, 内部重复的key 若不存在则只留其一算为不存在,其他看作已存在
-            for i, key in enumerate(keys):
-                for j, not_exist_key in enumerate(not_exist_keys):
-                    if key == not_exist_key:
-                        keys[i] = 1
-                        not_exist_keys.pop(j)
-                        break
-                else:
-                    keys[i] = 0
-
-            is_added = keys
-            return is_added if is_list else is_added[0]
-
-    def get(self, keys):
-        self.check_filter_capacity()
-
-        is_list = isinstance(keys, list)
-
-        keys = keys if is_list else [keys]  # 最终会修改为 [0, 1, ...] 0表示不存在 1 已存在
-        not_exist_keys = list(set(keys))
-
-        # 检查之前的bloomfilter是否存在
-        # 记录下每级filter存在的key,不存在的key继续向下检查
-        for filter in reversed(self.filters):
-            current_filter_is_exists = filter.get(
-                not_exist_keys, to_list=True
-            )  # 当前的filter是否存在
-
-            not_exist_keys_temp = []
-
-            for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
-                if not is_exist:  # 当前filter不存在的key 需要继续向下检查
-                    not_exist_keys_temp.append(checked_key)
-
-            not_exist_keys = not_exist_keys_temp
-
-            if not not_exist_keys:
-                break
-
-        # 比较key是否已存在, 内部重复的key 若不存在则只留其一算为不存在,其他看作已存在
-        for i, key in enumerate(keys):
-            for j, not_exist_key in enumerate(not_exist_keys):
-                if key == not_exist_key:
-                    keys[i] = 0
-                    not_exist_keys.pop(j)
-                    break
-            else:
-                keys[i] = 1
-
-        is_exists = keys
-        return is_exists if is_list else is_exists[0]
-
-    @property
-    def capacity(self):
-        """Returns the total capacity for all filters in this SBF"""
-        return sum(f.capacity for f in self.filters)

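The sizing comments in `BloomFilter.__init__` above can be checked numerically; with the defaults `error_rate=0.00001` and `initial_capacity=100000000` they reproduce the ~285 MB figure quoted in the dedup README earlier in this commit. A small sketch of that arithmetic:

```python
import math

error_rate, capacity = 0.00001, 100_000_000  # the defaults used above

num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))  # 17 hash slices
bits_per_slice = int(math.ceil(
    (capacity * abs(math.log(error_rate))) / (num_slices * (math.log(2) ** 2))
))  # ~1.41e8 bits per slice

num_bits = num_slices * bits_per_slice
print(num_bits / 8 / 1024 ** 2)  # ~285 MB of bit array for 100 million keys
```
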
+ 0 - 81
zgztb_cookie/FworkSpider/feapder/dedup/expirefilter.py

@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/13 9:44 PM
----------
-@summary: 带有有效期的去重集合
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import time
-
-from feapder.db.redisdb import RedisDB
-from feapder.dedup.basefilter import BaseFilter
-
-
-class ExpireFilter(BaseFilter):
-    redis_db = None
-
-    def __init__(
-        self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None
-    ):
-        if not name:
-            raise ValueError("name cant't be None")
-        if not expire_time:
-            raise ValueError("please set expire time, units is seconds")
-
-        if not self.__class__.redis_db:
-            self.__class__.redis_db = RedisDB(url=redis_url)
-
-        self.name = name
-        self.expire_time = expire_time
-        self.expire_time_record_key = expire_time_record_key
-        self.del_expire_key_time = None
-
-        self.record_expire_time()
-
-        self.del_expire_key()
-
-    def __repr__(self):
-        return "<ExpireSet: {}>".format(self.name)
-
-    @property
-    def current_timestamp(self):
-        return int(time.time())
-
-    def add(self, keys, *args, **kwargs):
-        """
-        @param keys: 检查关键词在zset中是否存在,支持列表批量
-        @return: list / 单个值
-        """
-        if self.current_timestamp - self.del_expire_key_time > self.expire_time:
-            self.del_expire_key()
-
-        is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp)
-        return is_added
-
-    def get(self, keys):
-        is_exist = self.redis_db.zexists(self.name, keys)
-        if isinstance(keys, list):
-            # 判断数据本身是否重复
-            temp_set = set()
-            for i, key in enumerate(keys):
-                if key in temp_set:
-                    is_exist[i] = 1
-                else:
-                    temp_set.add(key)
-
-        return is_exist
-
-    def del_expire_key(self):
-        self.redis_db.zremrangebyscore(
-            self.name, "-inf", self.current_timestamp - self.expire_time
-        )
-        self.del_expire_key_time = self.current_timestamp
-
-    def record_expire_time(self):
-        if self.expire_time_record_key:
-            self.redis_db.hset(
-                self.expire_time_record_key, key=self.name, value=self.expire_time
-            )

+ 0 - 70
zgztb_cookie/FworkSpider/feapder/dedup/litefilter.py

@@ -1,70 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/21 11:28 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-from typing import List, Union, Set
-
-from feapder.dedup.basefilter import BaseFilter
-
-
-class LiteFilter(BaseFilter):
-    def __init__(self):
-        self.datas: Set[str] = set()
-
-    def add(
-        self, keys: Union[List[str], str], *args, **kwargs
-    ) -> Union[List[int], int]:
-        """
-
-        Args:
-            keys: list / 单个值
-            *args:
-            **kwargs:
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-        if isinstance(keys, list):
-            is_add = []
-            for key in keys:
-                if key not in self.datas:
-                    self.datas.add(key)
-                    is_add.append(1)
-                else:
-                    is_add.append(0)
-        else:
-            if keys not in self.datas:
-                is_add = 1
-                self.datas.add(keys)
-            else:
-                is_add = 0
-        return is_add
-
-    def get(self, keys: Union[List[str], str]) -> Union[List[int], int]:
-        """
-        检查数据是否存在
-        Args:
-            keys: list / 单个值
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 1 否则返回 0)
-        """
-        if isinstance(keys, list):
-            temp_set = set()
-            is_exist = []
-            for key in keys:
-                # 数据本身重复或者数据在去重库里
-                if key in temp_set or key in self.datas:
-                    is_exist.append(1)
-                else:
-                    is_exist.append(0)
-                    temp_set.add(key)
-
-            return is_exist
-        else:
-            return int(keys in self.datas)

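`LiteFilter` above is the only filter with no external storage: a plain in-process `set`, handy for one-off scripts and tests. A minimal sketch:

```python
from feapder.dedup import Dedup

dedup = Dedup(Dedup.LiteFilter)          # pure in-memory set, nothing persisted

print(dedup.add(["xxx", "bbb", "xxx"]))  # [1, 1, 0] -- the repeated "xxx" counts as already seen
print(dedup.get("xxx"))                  # 1
```
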
+ 0 - 131
zgztb_cookie/FworkSpider/feapder/dedup/redisfilter.py

@@ -1,131 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-03-01
----------
-@summary: redis 过滤器
----------
-@author: dzr
-"""
-from feapder.db.redisdb import RedisDB
-from feapder.dedup.basefilter import BaseFilter
-
-
-class RedisFilter(BaseFilter):
-    redis_db = None
-
-    def __init__(self, ip_ports=None, user_pass=None, redis_url=None, expire_time=None):
-        if ip_ports:
-            self.__class__.redis_db = RedisDB(
-                ip_ports=ip_ports,
-                user_pass=user_pass,
-                decode_responses=True,
-            )  # 集群
-        elif redis_url:
-            self.__class__.redis_db = RedisDB.from_url(redis_url)  # 单机
-        else:
-            self.__class__.redis_db = RedisDB()
-
-        self._ex = expire_time or 86400 * 365 * 2  # 2年 = 86400 * 365 * 2
-        self._prefix1 = 'list_'
-        self._prefix2 = 'pylist_'
-
-    def __repr__(self):
-        return "<RedisFilter: {}>".format(self.redis_db)
-
-    def exists(self, key):
-        """全量python检索"""
-        if self.redis_db.exists(self._prefix2 + key) > 0:
-            return True
-        return False
-
-    def add(self, keys, *args, **kwargs):
-        """
-        添加数据
-        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
-        @return: list / 单个值(如果数据已存在 返回 False 否则返回 True, 可以理解为是否添加成功)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-
-        is_added = []
-        for key in keys:
-            if not self.exists(key):
-                is_added.append(
-                    self.redis_db.set(self._prefix2 + key, 1, ex=self._ex)
-                )
-            else:
-                is_added.append(False)
-
-        return is_added if is_list else is_added[0]
-
-    def get(self, keys):
-        """
-        检查数据是否存在
-        @param keys: list / 单个值
-        @return: list / 单个值 (存在返回True 不存在返回False)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-
-        is_exist = []
-        for key in keys:
-            is_exist.append(self.exists(key))
-
-        # 判断数据本身是否重复
-        temp_set = set()
-        for i, key in enumerate(keys):
-            if key in temp_set:
-                is_exist[i] = True
-            else:
-                temp_set.add(key)
-
-        return is_exist if is_list else is_exist[0]
-
-
-class MRedisFilter(RedisFilter):
-    redis_dbs = {}
-
-    def __init__(self, redis_conf=None, **kwargs):
-        super(MRedisFilter, self).__init__(**kwargs)
-        if not redis_conf:
-            self.__class__.redis_dbs[self._prefix2] = RedisDB()
-        else:
-            if not isinstance(redis_conf, dict):
-                raise ValueError("redis_conf 必须是一个 dict")
-
-            for prefix, conf in redis_conf.items():
-                self.__class__.redis_dbs[prefix] = RedisDB(
-                    ip_ports=conf['redisdb_ip_port'],
-                    user_pass=conf['redisdb_user_pass'],
-                    db=conf['redisdb_db']
-                )
-
-    def __repr__(self):
-        return "<MRedisFilter: {}>".format(self.redis_dbs)
-
-    def exists(self, key):
-        """lua增量检索/python增量检索"""
-        for prefix, redis_db in self.redis_dbs.items():
-            if redis_db.exists(prefix + key) > 0:
-                return True
-        return False
-
-    def add(self, keys, *args, **kwargs):
-        """
-        添加数据
-        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
-        @return: list / 单个值(如果数据已存在 返回 False 否则返回 True, 可以理解为是否添加成功)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-
-        redis_db = self.redis_dbs[self._prefix2]
-
-        is_added = []
-        for key in keys:
-            if not self.exists(key):
-                is_added.append(redis_db.set(self._prefix2 + key, 1, ex=self._ex))
-            else:
-                is_added.append(False)
-
-        return is_added if is_list else is_added[0]

+ 0 - 799
zgztb_cookie/FworkSpider/feapder/network/cookie_pool.py

@@ -1,799 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/27 11:32 AM
----------
-@summary: cookie池
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import abc
-import datetime
-import random
-import time
-import warnings
-from collections.abc import Iterable
-from enum import Enum, unique
-
-import requests
-from func_timeout import func_set_timeout
-
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.db.mongodb import MongoDB
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
-from feapder.utils import metrics
-from feapder.utils.log import log
-from feapder.utils.redis_lock import RedisLock
-from feapder.utils.tools import send_msg
-
-
-class CookiePoolInterface(metaclass=abc.ABCMeta):
-    """
-    cookie pool interface
-    """
-
-    @abc.abstractmethod
-    def create_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def get_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def del_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def run(self):
-        raise NotImplementedError
-
-
-class PageCookiePool(CookiePoolInterface):
-    """
-    由页面产生的cookie 不需要用户登陆
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        page_url=None,
-        min_cookies=10000,
-        must_contained_keys=(),
-        keep_alive=False,
-        **kwargs,
-    ):
-        """
-        @param redis_key: 项目名
-        @param page_url: 生产cookie的url
-        @param min_cookies: 最小cookie数
-        @param must_contained_keys: cookie 必须包含的key
-        @param keep_alive: 当cookie数量足够时是否保持随时待命、持续生产cookie的状态。False为否,满足数量则退出
-        ---
-        @param kwargs: WebDriver的一些参数
-            load_images: 是否加载图片
-            user_agent_pool: user-agent池 为None时不使用
-            proxies_pool: 代理池 为None时不使用
-            headless: 是否启用无头模式
-            driver_type: web driver 类型
-            timeout: 请求超时时间 默认16s
-            window_size: 屏幕分辨率 (width, height)
-
-        """
-
-        self._redisdb = RedisDB()
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
-            redis_key
-        )  # 存储上一次统计cookie 数量的时间,格式为 时间戳:数量
-        self._page_url = page_url
-        self._min_cookies = min_cookies
-        self._must_contained_keys = must_contained_keys
-        self._keep_alive = keep_alive
-
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-
-    def create_cookie(self):
-        """
-        可能会重写
-        @return:
-        """
-        url = self._page_url
-        header = {
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": user_agent.get()
-        }
-        res = requests.get(url, headers=header)
-        cookies = requests.utils.dict_from_cookiejar(res.cookies)
-        return cookies
-
-    def add_cookies(self, cookies):
-        log.info("添加cookie {}".format(cookies))
-        self._redisdb.lpush(self._tab_cookie_pool, cookies)
-
-    def run(self):
-        for i in range(10):
-            try:
-                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
-                need_cookie_count = self._min_cookies - now_cookie_count
-
-                if need_cookie_count > 0:
-                    log.info(
-                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
-                            now_cookie_count, self._min_cookies
-                        )
-                    )
-                    try:
-                        cookies = self.create_cookie()
-                        if cookies:
-                            self.add_cookies(cookies)
-                    except Exception as e:
-                        log.exception(e)
-                else:
-                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-
-                    # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
-                    last_count_info = self._redisdb.strget(
-                        self._tab_cookie_pool_last_count
-                    )
-                    if not last_count_info:
-                        self._redisdb.strset(
-                            self._tab_cookie_pool_last_count,
-                            "{}:{}".format(time.time(), now_cookie_count),
-                        )
-                    else:
-                        last_time, last_count = last_count_info.split(":")
-                        last_time = float(last_time)
-                        last_count = int(last_count)
-
-                        if time.time() - last_time > 60:
-                            if now_cookie_count == last_count:
-                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
-                                break
-                            else:
-                                self._redisdb.strset(
-                                    self._tab_cookie_pool_last_count,
-                                    "{}:{}".format(time.time(), now_cookie_count),
-                                )
-
-                    if self._keep_alive:
-                        log.info("sleep 10")
-                        tools.delay_time(10)
-                    else:
-                        break
-
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    @func_set_timeout(300)
-    def get_cookie(self, wait_when_null=True):
-        for i in range(10):
-            try:
-                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
-                    self._keep_alive = False
-                    self._min_cookies = 1
-                    with RedisLock(
-                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
-                    ) as _lock:
-                        if _lock.locked:
-                            self.run()
-                    continue
-                return eval(cookie_info) if cookie_info else {}
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, cookies):
-        self._redisdb.lrem(self._tab_cookie_pool, cookies)
-
-
-class User:
-    def __init__(self, username, cookie):
-        self.username = username
-        self.cookie = cookie
-
-
-class LoginCookiePool(CookiePoolInterface):
-    """
-    需要登陆的cookie池, 用户账号密码等信息用 MongoDB 保存
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        *,
-        login_site,
-        table_userbase="feapder_login",
-        login_state_key="login_state",
-        lock_state_key="lock_state",
-        username_key="username",
-        password_key="password",
-        login_retry_times=10,
-    ):
-        """
-        @param redis_key: 项目名
-        @param login_site: 站点名
-        @param table_userbase: 用户表名
-        @param login_state_key: 登录状态列名
-        @param lock_state_key: 封锁状态列名
-        @param username_key: 登陆名列名
-        @param password_key: 密码列名
-        @param login_retry_times: 登陆失败重试次数
-        """
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._login_retry_times = login_retry_times
-        self._table_userbase = table_userbase
-        self._login_state_key = login_state_key
-        self._lock_state_key = lock_state_key
-        self._username_key = username_key
-        self._password_key = password_key
-        self._login_site = login_site
-        self._redisdb = RedisDB()
-        self._mongo = MongoDB(db='user_login')
-
-
-    def create_cookie(self, username, password):
-
-        """
-        创建cookie
-        @param username: 用户名
-        @param password: 密码
-        @return: return cookie / None
-        """
-        raise NotImplementedError
-
-    def get_user_info(self):
-        """
-        返回用户信息
-        @return: 可迭代对象,每个元素为包含 username、password 的用户信息 dict
-        """
-
-        return self._mongo.find(self._table_userbase,{"site":self._login_site,self._lock_state_key:0,self._login_state_key:0})
-
-    def handle_login_failed_user(self, username, password):
-        """
-        处理登录失败的user
-        @param username:
-        @param password:
-        @return:
-        """
-
-        pass
-
-    def handel_exception(self, e):
-        """
-        处理异常
-        @param e:
-        @return:
-        """
-        log.exception(e)
-
-    def save_cookie(self, username, cookie):
-        user_cookie = {"username": username, "cookie": cookie}
-
-        self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
-        self._mongo.add(
-                # coll_name=self._table_userbase,
-                coll_name="feapder_login_record",
-                data={self._login_state_key: 1,
-                      "status": "create",
-                      "site": self._login_site,
-                      "login_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(round(time.time()))))},
-                update_columns=self._username_key,
-                update_columns_value=username)
-
-    def get_cookie(self, wait_when_null=True) -> User:
-        for i in range(10):
-            try:
-                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not user_cookie and wait_when_null:
-                    log.info("暂无cookie 生产中..."+self._tab_cookie_pool)
-                    self.login()
-                    continue
-
-                if user_cookie:
-                    user_cookie = eval(user_cookie)
-                    return User(**user_cookie)
-
-                return None
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, user: User):
-        """
-        删除失效的cookie
-        @param user:
-        @return:
-        """
-        user_info = {"username": user.username, "cookie": user.cookie}
-        self._redisdb.lrem(self._tab_cookie_pool, user_info)
-
-        self._mongo.add(
-            # coll_name=self._table_userbase,
-            coll_name="feapder_login_record",
-            data={self._login_state_key: 1,
-                  "status": "remove",
-                  "site": self._login_site,
-                  "login_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(round(time.time()))))},
-            update_columns=self._username_key,
-            update_columns_value=user.username)
-
-    def user_is_locked(self, user: User):
-
-        self._mongo.add(
-            # coll_name=self._table_userbase,
-            coll_name="feapder_login_record",
-            data={self._lock_state_key: 1,
-                  "site": self._login_site,
-                  "login_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(round(time.time()))))},
-            update_columns=self._username_key,
-            update_columns_value=user.username)
-
-    def run(self):
-        with RedisLock(
-            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
-        ) as _lock:
-            if _lock.locked:
-                user_infos = self.get_user_info()
-                if not isinstance(user_infos, Iterable):
-                    raise ValueError("get_user_info 返回值必须可迭代")
-
-                if not user_infos:
-                    log.info("无可用用户")
-
-                for info in user_infos:
-                    username = info.get("username")
-                    password = info.get("password")
-                    for i in range(self._login_retry_times):
-                        try:
-                            cookie = self.create_cookie(username, password)
-                            if cookie:
-                                self.save_cookie(username, cookie)
-                            else:
-                                self.handle_login_failed_user(username, password)
-
-                            break
-                        except Exception as e:
-                            self.handel_exception(e)
-
-                    else:
-                        self.handle_login_failed_user(username, password)
-
-    login = run
-
-
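For reference, a minimal sketch of how the removed LoginCookiePool was meant to be subclassed: only create_cookie is abstract, everything else (user lookup, retries, Redis storage) is inherited. The project name, login URL and form fields below are illustrative assumptions, not part of the original code.

import requests

class ExampleLoginCookiePool(LoginCookiePool):
    def create_cookie(self, username, password):
        # Illustrative login flow; the endpoint and form fields are assumptions.
        session = requests.Session()
        resp = session.post(
            "https://example.com/login",
            data={"username": username, "password": password},
            timeout=10,
        )
        if resp.ok and session.cookies:
            return requests.utils.dict_from_cookiejar(session.cookies)
        return None  # triggers handle_login_failed_user

pool = ExampleLoginCookiePool("demo_project", login_site="example.com")
pool.login()              # fill the pool for all unlocked, logged-out users
user = pool.get_cookie()  # -> User(username=..., cookie=...)
if user:
    print(user.username, user.cookie)  # use the cookie; call pool.del_cookie(user) once it expires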
-@unique
-class LimitTimesUserStatus(Enum):
-    # 使用状态
-    USED = "used"
-    SUCCESS = "success"
-    OVERDUE = "overdue"  # cookie 过期
-    SLEEP = "sleep"
-    EXCEPTION = "exception"
-    # 登陆状态
-    LOGIN_SUCCESS = "login_success"
-    LOGIN_FALIED = "login_failed"
-
-
-class LimitTimesUser:
-    """
-    有次数限制的账户
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
-    SITE_NAME = ""  # 网站名
-
-    redisdb = None
-
-    def __init__(
-        self,
-        username,
-        password,
-        max_search_times,
-        proxies=None,
-        search_interval=0,
-        **kwargs,
-    ):
-        """
-        @param username:
-        @param password:
-        @param max_search_times:
-        @param proxies:
-        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如(5,10)即5到10秒;或直接传整数
-        """
-        self.__dict__.update(kwargs)
-        self.username = username
-        self.password = password
-        self.max_search_times = max_search_times
-        self.proxies = proxies
-        self.search_interval = search_interval
-        self.delay_use = 0  # 延时使用,用于等待解封的用户
-
-        if isinstance(search_interval, (tuple, list)):
-            if len(search_interval) != 2:
-                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
-
-            self.used_for_time_length = (
-                search_interval[1] * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-        else:
-            self.used_for_time_length = (
-                search_interval * 5
-            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
-
-        self.account_info = {
-            "login_time": 0,
-            "cookies": {},
-            "search_times": 0,
-            "last_search_time": 0,
-            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
-            "init_search_times_time": 0,  # 初始化搜索次数的时间
-        }
-
-        if not self.__class__.redisdb:
-            self.__class__.redisdb = RedisDB()
-
-        self.sync_account_info_from_redis()
-
-        self.__init_metrics()
-
-    def __init_metrics(self):
-        """
-        初始化打点系统
-        @return:
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def record_user_status(self, status: LimitTimesUserStatus):
-        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
-
-    def __repr__(self):
-        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
-
-    def __eq__(self, other):
-        return self.username == other.username
-
-    def sync_account_info_from_redis(self):
-        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
-        if account_info:
-            account_info = eval(account_info)
-            self.account_info.update(account_info)
-
-    @property
-    def cookies(self):
-        cookies = self.account_info.get("cookies")
-        return cookies
-
-    def set_cookies(self, cookies):
-        self.account_info["cookies"] = cookies
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def set_login_time(self, login_time=None):
-        self.account_info["login_time"] = login_time or time.time()
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def get_login_time(self):
-        return self.account_info.get("login_time")
-
-    def is_time_to_login(self):
-        return time.time() - self.get_login_time() > 40 * 60
-
-    def get_last_search_time(self):
-        return self.account_info.get("last_search_time", 0)
-
-    def is_time_to_search(self):
-        if self.delay_use:
-            is_time = time.time() - self.get_last_search_time() > self.delay_use
-            if is_time:
-                self.delay_use = 0
-
-        else:
-            is_time = time.time() - self.get_last_search_time() > (
-                random.randint(*self.search_interval)
-                if isinstance(self.search_interval, (tuple, list))
-                else self.search_interval
-            )
-
-        return is_time
-
-    @property
-    def used_for_spider_name(self):
-        return self.account_info.get("used_for_spider_name")
-
-    @used_for_spider_name.setter
-    def used_for_spider_name(self, spider_name):
-        self.account_info["used_for_spider_name"] = spider_name
-
-    def update_status(self):
-        """
-        更新search的一些状态
-        @return:
-        """
-        self.account_info["search_times"] += 1
-        self.account_info["last_search_time"] = time.time()
-
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    @property
-    def search_times(self):
-        init_search_times_time = self.account_info.get("init_search_times_time")
-        current_time = time.time()
-        if (
-            current_time - init_search_times_time >= 86400
-        ):  # 如果距离上次初始化搜索次数时间大于1天,则搜索次数清零
-            self.account_info["search_times"] = 0
-            self.account_info["init_search_times_time"] = current_time
-
-            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
-
-        return self.account_info["search_times"]
-
-    def is_overwork(self):
-        if self.search_times > self.max_search_times:
-            log.warning("账号 {} 请求次数超限制".format(self.username))
-            return True
-
-        return False
-
-    def is_at_work_time(self):
-        if datetime.datetime.now().hour in list(range(7, 23)):
-            return True
-
-        log.warning("账号 {} 不再工作时间内".format(self.username))
-        return False
-
-    def del_cookie(self):
-        self.account_info["cookies"] = {}
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def create_cookie(self):
-        """
-        生产cookie 有异常需要抛出
-        @return: cookie_dict
-        """
-
-        raise NotImplementedError
-
-    def login(self):
-        """
-        @return: 1 成功 0 失败
-        """
-
-        try:
-            # 预检查
-            if not self.is_time_to_login():
-                log.info("此账号尚未到登陆时间: {}".format(self.username))
-                time.sleep(5)
-                return 0
-
-            cookies = self.create_cookie()
-            if not cookies:
-                raise Exception("登陆失败 未获取到合法cookie")
-
-            if not isinstance(cookies, dict):
-                raise Exception("cookie 必须为字典格式")
-
-            # 保存cookie
-            self.set_login_time()
-            self.set_cookies(cookies)
-            log.info("登录成功 {}".format(self.username))
-            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
-            return 1
-
-        except Exception as e:
-            log.exception(e)
-            send_msg(
-                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
-                level="error",
-                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
-            )
-
-        log.info("登录失败 {}".format(self.username))
-        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
-        return 0
-
-
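A hedged sketch of a concrete account class for the removed LimitTimesUser: SITE_NAME and the login endpoint are assumptions; create_cookie must return a cookie dict (or raise on failure).

import requests

class ExampleLimitTimesUser(LimitTimesUser):
    SITE_NAME = "example"  # assumed site name, used in alarm messages

    def create_cookie(self):
        # Illustrative login; a real implementation drives the site's login flow,
        # optionally through self.proxies, and returns the cookies as a dict.
        session = requests.Session()
        session.post(
            "https://example.com/login",
            data={"username": self.username, "password": self.password},
            proxies=self.proxies,
            timeout=10,
        )
        return requests.utils.dict_from_cookiejar(session.cookies)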
-class LimitTimesUserPool:
-    """
-    限制查询次数的用户的User pool
-    基于本地做的缓存,不支持多进程调用
-    """
-
-    LOAD_USER_INTERVAL = 60
-
-    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
-        """
-        @param accounts_dict: 账户信息字典
-            {
-                "15011300228": {
-                    "password": "300228",
-                    "proxies": {},
-                    "max_search_times": 500,
-                    "search_interval": 1, # 使用时间间隔
-                    # 其他携带信息
-                }
-            }
-        @param limit_user_class: 用户重写的 limit_user_class
-        @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
-        """
-        self.accounts_dict = accounts_dict
-        self.limit_user_class = limit_user_class
-
-        self.limit_times_users = []
-        self.current_user_index = -1
-
-        self.support_more_client = support_more_client
-
-        self.last_load_user_time = 0
-
-    def __load_users(self, username=None):
-        # 装载user
-        log.info("更新可用用户")
-
-        for _username, detail in self.accounts_dict.items():
-            if username and username != _username:
-                continue
-
-            limit_times_users = self.limit_user_class(username=_username, **detail)
-            if limit_times_users in self.limit_times_users:
-                continue
-
-            if limit_times_users.is_overwork():
-                continue
-            else:
-                if (
-                    limit_times_users.cookies or limit_times_users.login()
-                ):  # 如果有cookie 或者登陆成功 则添加到可用的user队列
-                    self.limit_times_users.append(limit_times_users)
-
-        self.last_load_user_time = time.time()
-
-    def get_user(
-        self,
-        username=None,
-        used_for_spider_name=None,
-        wait_when_null=True,
-        not_limit_frequence=False,
-    ) -> LimitTimesUser:
-        """
-        @params username: 获取指定的用户
-        @params used_for_spider_name: 独享式使用,独享爬虫的名字。其他爬虫不可抢占
-        @params wait_when_null: 无用户时是否等待
-        @params not_limit_frequence: 不限制使用频率
-        @return: LimitTimesUser
-        """
-        if not self.support_more_client:
-            warnings.warn(
-                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
-                category=Warning,
-            )
-            self._is_show_warning = True
-
-        while True:
-            if (
-                not self.limit_times_users
-                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
-            ):
-                self.__load_users(username)
-                if not self.limit_times_users:
-                    log.warning("无可用的用户")
-                    if wait_when_null:
-                        time.sleep(1)
-                        continue
-                    else:
-                        return None
-
-            self.current_user_index += 1
-            self.current_user_index = self.current_user_index % len(
-                self.limit_times_users
-            )
-
-            limit_times_user = self.limit_times_users[self.current_user_index]
-            if self.support_more_client:  # 需要先同步下最新数据
-                limit_times_user.sync_account_info_from_redis()
-
-            if username and limit_times_user.username != username:
-                log.info(
-                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
-                )
-                time.sleep(1)
-                continue
-
-            # 独占式使用,若为其他爬虫,检查等待使用时间是否超过独占时间,若超过则可以使用
-            if (
-                limit_times_user.used_for_spider_name
-                and limit_times_user.used_for_spider_name != used_for_spider_name
-            ):
-                wait_time = time.time() - limit_times_user.get_last_search_time()
-                if wait_time < limit_times_user.used_for_time_length:
-                    log.info(
-                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
-                            limit_times_user.username,
-                            limit_times_user.used_for_spider_name,
-                            limit_times_user.used_for_time_length - wait_time,
-                        )
-                    )
-                    time.sleep(1)
-                    continue
-
-            if (
-                not limit_times_user.is_overwork()
-                and limit_times_user.is_at_work_time()
-            ):
-                if not limit_times_user.cookies:
-                    self.limit_times_users.remove(limit_times_user)
-                    continue
-
-                if not_limit_frequence or limit_times_user.is_time_to_search():
-                    limit_times_user.used_for_spider_name = used_for_spider_name
-
-                    limit_times_user.update_status()
-                    log.info("使用用户 {}".format(limit_times_user.username))
-                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
-                    return limit_times_user
-                else:
-                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
-                    time.sleep(1)
-                    continue
-            else:
-                self.limit_times_users.remove(limit_times_user)
-                self.current_user_index -= 1
-
-                if not limit_times_user.is_at_work_time():
-                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
-                    if wait_when_null:
-                        time.sleep(30)
-                        continue
-                    else:
-                        return None
-
-    def del_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.del_cookie()
-                self.limit_times_users.remove(limit_times_user)
-                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
-                self.__load_users(username)
-                break
-
-    def update_cookies(self, username, cookies):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.set_cookies(cookies)
-                break
-
-    def delay_use(self, username, delay_seconds):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.delay_use = delay_seconds
-                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
-                break
-
-    def record_success_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
-
-    def record_exception_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
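Putting the two classes together, a hedged usage sketch of the removed pool; the account data and URLs are fabricated, and ExampleLimitTimesUser refers to the sketch above.

import requests

accounts = {
    "user_a": {
        "password": "secret",
        "max_search_times": 500,
        "search_interval": (5, 10),  # wait 5-10 s between uses of the same account
    },
}
user_pool = LimitTimesUserPool(
    accounts_dict=accounts,
    limit_user_class=ExampleLimitTimesUser,
    support_more_client=True,
)

user = user_pool.get_user(used_for_spider_name="example_spider", wait_when_null=False)
if user:
    resp = requests.get("https://example.com/search?q=demo", cookies=user.cookies)
    if resp.ok:
        user_pool.record_success_user(user.username)
    else:
        user_pool.del_user(user.username)  # drop the expired cookie and force a re-login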

+ 0 - 145
zgztb_cookie/FworkSpider/feapder/network/item.py

@@ -1,145 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-26 22:28:10
----------
-@summary: 定义实体
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import feapder.utils.tools as tools
-
-
-class ItemMetaclass(type):
-    def __new__(cls, name, bases, attrs):
-        attrs.setdefault("__name__", None)
-        attrs.setdefault("__table_name__", None)
-        attrs.setdefault("__name_underline__", None)
-        attrs.setdefault("__update_key__", None)
-        attrs.setdefault("__unique_key__", None)
-
-        return type.__new__(cls, name, bases, attrs)
-
-
-class Item(metaclass=ItemMetaclass):
-    __unique_key__ = []
-
-    def __init__(self, **kwargs):
-        self.__dict__ = kwargs
-
-    def __repr__(self):
-        return "<{}: {}>".format(self.item_name, tools.dumps_json(self.to_dict))
-
-    def __getitem__(self, key):
-        return self.__dict__[key]
-
-    def __setitem__(self, key, value):
-        self.__dict__[key] = value
-
-    def pre_to_db(self):
-        """
-        入库前的处理
-        """
-        pass
-
-    @property
-    def to_dict(self):
-        propertys = {}
-        for key, value in self.__dict__.items():
-            if key not in (
-                "__name__",
-                "__table_name__",
-                "__name_underline__",
-                "__update_key__",
-                "__unique_key__",
-            ):
-                if key.startswith(f"_{self.__class__.__name__}"):
-                    key = key.replace(f"_{self.__class__.__name__}", "")
-                propertys[key] = value
-
-        return propertys
-
-    def to_sql(self, auto_update=False, update_columns=()):
-        return tools.make_insert_sql(
-            self.table_name, self.to_dict, auto_update, update_columns
-        )
-
-    @property
-    def item_name(self):
-        return self.__name__ or self.__class__.__name__
-
-    @item_name.setter
-    def item_name(self, name):
-        self.__name__ = name
-        self.__table_name__ = self.name_underline.replace("_item", "")
-
-    @property
-    def table_name(self):
-        if not self.__table_name__:
-            self.__table_name__ = self.name_underline.replace("_item", "")
-        return self.__table_name__
-
-    @table_name.setter
-    def table_name(self, name):
-        self.__table_name__ = name
-        self.__name__ = tools.key2hump(name) + "Item"
-
-    @property
-    def name_underline(self):
-        if not self.__name_underline__:
-            self.__name_underline__ = tools.key2underline(self.item_name)
-
-        return self.__name_underline__
-
-    @name_underline.setter
-    def name_underline(self, name):
-        self.__name_underline__ = name
-
-    @property
-    def unique_key(self):
-        return self.__unique_key__ or self.__class__.__unique_key__
-
-    @unique_key.setter
-    def unique_key(self, keys):
-        if isinstance(keys, (tuple, list)):
-            self.__unique_key__ = keys
-        else:
-            self.__unique_key__ = (keys,)
-
-    @property
-    def fingerprint(self):
-        args = []
-        for key, value in self.to_dict.items():
-            if value:
-                if (self.unique_key and key in self.unique_key) or not self.unique_key:
-                    args.append(str(value))
-
-        if args:
-            args = sorted(args)
-            return tools.get_sha256(*args)
-        else:
-            return None
-
-    def to_UpdateItem(self):
-        update_item = UpdateItem(**self.__dict__)
-        update_item.item_name = self.item_name
-        return update_item
-
-
-class UpdateItem(Item):
-    __update_key__ = []
-
-    def __init__(self, **kwargs):
-        super(UpdateItem, self).__init__(**kwargs)
-
-    @property
-    def update_key(self):
-        return self.__update_key__ or self.__class__.__update_key__
-
-    @update_key.setter
-    def update_key(self, keys):
-        if isinstance(keys, (tuple, list)):
-            self.__update_key__ = keys
-        else:
-            self.__update_key__ = (keys,)
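For context, typical use of the removed Item / UpdateItem (field names are illustrative): attributes become columns, the class name maps to a table name, unique_key drives the fingerprint, and update_key marks the columns to refresh when a row already exists.

class NoticeItem(Item):
    pass

item = NoticeItem(title="demo", href="https://example.com/1")
item.unique_key = "href"      # de-duplicate on href only
print(item.table_name)        # -> "notice"
print(item.to_dict)           # -> {"title": "demo", "href": "https://example.com/1"}
print(item.fingerprint)       # sha256 over the unique_key value(s)

update_item = item.to_UpdateItem()
update_item.update_key = ["title"]  # columns refreshed on conflict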

+ 0 - 753
zgztb_cookie/FworkSpider/feapder/network/proxy_pool.py

@@ -1,753 +0,0 @@
-# coding:utf8
-"""
-代理池
-"""
-import datetime
-import json
-import os
-import random
-import socket
-import time
-from urllib import parse
-
-import redis
-import requests
-
-from feapder import setting
-from feapder.utils import tools
-from feapder.utils.log import log
-
-
-def decrypt(input_str: str) -> str:
-    """
-    改写:新增
-    定义base64解密函数
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # 对前面不是“=”的字节取索引,然后转换为2进制
-    # 补齐“=”的个数
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # 转换成2进制字符串
-        temp_str = ''.join(temp_list)
-        # 对没有8位2进制的字符串补够8位2进制
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # 4个6字节的二进制  转换  为三个8字节的二进制
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # 二进制转为10进制
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # 连接成字符串
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-
-# 建立本地缓存代理文件夹
-proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
-if not os.path.exists(proxy_path):
-    os.mkdir(proxy_path)
-
-
-def swordfish_proxy():
-    """剑鱼代理"""
-    headers = {"Authorization": setting.JIANYU_PROXY_AUTHOR}
-    proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
-    proxies = proxy.get('data')
-    log.info(f"切换代理:{proxies}")
-    return proxies
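Usage of the removed helper, assuming (as its call sites did) that the internal API's data field is a requests-style proxies mapping:

import requests

proxies = swordfish_proxy()  # e.g. {"http": "socks5://host:port", "https": "socks5://host:port"}
resp = requests.get("https://example.com", proxies=proxies, timeout=10)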
-
-
-def get_proxy_from_url(**kwargs):
-    """
-    获取指定url的代理
-    :param kwargs:
-    :return:
-    """
-    proxy_source_url = kwargs.get("proxy_source_url", [])
-    if not isinstance(proxy_source_url, list):
-        proxy_source_url = [proxy_source_url]
-    proxy_source_url = [x for x in proxy_source_url if x]
-    if not proxy_source_url:
-        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
-    kwargs = kwargs.copy()
-    kwargs.pop("proxy_source_url")
-    proxies_list = []
-    for url in proxy_source_url:
-        if url.startswith("http"):
-            proxies_list.extend(get_proxy_from_http(url, **kwargs))
-        elif url.startswith("redis"):
-            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
-
-    if proxies_list:
-        # 顺序打乱
-        random.shuffle(proxies_list)
-    return proxies_list
-
-
-def get_proxy_from_http(proxy_source_url, **kwargs):
-    """
-    从指定 http 地址获取代理
-    :param proxy_source_url:
-    :param kwargs:
-    :return:
-    """
-    filename = tools.get_md5(proxy_source_url) + ".txt"
-    abs_filename = os.path.join(proxy_path, filename)
-    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
-    update_flag = 0
-    if not update_interval:
-        # 强制更新
-        update_flag = 1
-    elif not os.path.exists(abs_filename):
-        # 文件不存在则更新
-        update_flag = 1
-    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
-        # 超过更新间隔
-        update_flag = 1
-    if update_flag:
-        pool = []
-        response = requests.get(proxy_source_url, timeout=20)
-        # 改写:获取socks5代理与代理格式处理
-        for proxy in response.json():
-            host = decrypt(proxy['ip'])
-            port = proxy['ports'][0]
-            endTime = proxy['lifetime']
-            pool.append(f"{host}:{port}&&{endTime}")
-
-        with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write('\n'.join(pool))
-    return get_proxy_from_file(filename)
-
-
-def get_proxy_from_file(filename, **kwargs):
-    """
-    从指定本地文件获取代理
-        文件格式
-        ip:port:https
-        ip:port:http
-        ip:port
-    :param filename:
-    :param kwargs:
-    :return:
-    """
-    proxies_list = []
-    with open(os.path.join(proxy_path, filename), "r") as f:
-        lines = f.readlines()
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # 解析
-        auth = ""
-        if "@" in line:
-            auth, line = line.split("@")
-        # 改写,解析代理有效期结束时间
-        line, end = line.split("&&")
-
-        items = line.split(":")
-        if len(items) < 2:
-            continue
-
-        ip, port, *protocol = items
-        if not all([port, ip]):
-            continue
-        if auth:
-            ip = "{}@{}".format(auth, ip)
-        if not protocol:
-            # 改写:判断代理是否在有效期内,并将代理格式从http格式改成socks格式
-            if time.time() < int(end):
-                proxies = {
-                    "https": "socks5://%s:%s" % (ip, port),
-                    "http": "socks5://%s:%s" % (ip, port),
-                    # "end":end
-                }
-            else:
-                continue
-        else:
-            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
-        proxies_list.append(proxies)
-
-    return proxies_list
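The rewritten cache file consumed above therefore holds one proxy per line as host:port&&expiry-timestamp. A hedged illustration of the round trip (values and file name are fabricated):

# A cached line such as
#   1.2.3.4:4215&&1767196800
# is, while time.time() < 1767196800, turned into
#   {"https": "socks5://1.2.3.4:4215", "http": "socks5://1.2.3.4:4215"}
proxies_list = get_proxy_from_file("cached_proxies.txt")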
-
-
-def get_proxy_from_redis(proxy_source_url, **kwargs):
-    """
-    从指定 redis 地址获取代理
-    @param proxy_source_url: redis://:passwd@host:ip/db
-        redis 存储结构 zset
-        ip:port ts
-    @param kwargs:
-        {"redis_proxies_key": "xxx"}
-    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
-    """
-
-    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
-    key = kwargs.get("redis_proxies_key")
-    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
-    proxies = redis_conn.zrange(key, 0, -1)
-    proxies_list = []
-    for proxy in proxies:
-        proxy = proxy.decode()
-        proxies_list.append(
-            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
-        )
-    return proxies_list
-
-
-def check_proxy(
-        ip="",
-        port="",
-        proxies=None,
-        type=0,
-        timeout=5,
-        logger=None,
-        show_error_log=True,
-        **kwargs,
-):
-    """
-    代理有效性检查
-    :param ip:
-    :param port:
-    :param type: 0:socket  1:requests
-    :param timeout:
-    :param logger:
-    :return:
-    """
-    if not logger:
-        logger = log
-    ok = 0
-    if type == 0 and ip and port:
-        # socket检测成功 不代表代理一定可用 Connection closed by foreign host. 这种情况就不行
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(timeout)
-            try:
-                # 必须检测 否则代理永远不刷新
-                sk.connect((ip, int(port)))
-                ok = 1
-            except Exception as e:
-                if show_error_log:
-                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
-            sk.close()
-    else:
-        if not proxies:
-            proxies = {
-                "http": "socks5://{}:{}".format(ip, port),
-                "https": "socks5//{}:{}".format(ip, port),
-            }
-        try:
-            # 改写:代理检测的url
-            r = requests.get(
-                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
-            )
-            ok = 1
-            r.close()
-        except Exception as e:
-            if show_error_log:
-                logger.debug(
-                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
-                )
-    return ok
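Both check modes of the removed helper can be driven directly (addresses are placeholders):

# Fast TCP-connect check (type=0): only verifies the port accepts connections.
ok = check_proxy(ip="1.2.3.4", port=4215, type=0, timeout=5)

# Full request through the proxy (type!=0) against the rewritten probe URL.
ok = check_proxy(
    proxies={"http": "socks5://1.2.3.4:4215", "https": "socks5://1.2.3.4:4215"},
    type=1,
    timeout=5,
)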
-
-
-class ProxyItem(object):
-    """单个代理对象"""
-
-    # 代理标记
-    proxy_tag_list = (-1, 0, 1)
-
-    def __init__(
-            self,
-            proxies=None,
-            valid_timeout=20,
-            check_interval=180,
-            max_proxy_use_num=10000,
-            delay=30,
-            use_interval=None,
-            **kwargs,
-    ):
-        """
-        :param proxies:
-        :param valid_timeout: 代理检测超时时间(秒);传 -1 表示不检测有效性(20181008 起默认不再检测)
-        :param check_interval:
-        :param max_proxy_use_num:
-        :param delay:
-        :param use_interval: 使用间隔 单位秒 默认不限制
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs:
-        """
-        # {"http": ..., "https": ...}
-        self.proxies = proxies
-        # 检测超时时间 秒
-        self.valid_timeout = valid_timeout
-        # 检测间隔 秒
-        self.check_interval = check_interval
-
-        # 标记  0:正常 -1:丢弃  1: 待会再用 ...
-        self.flag = 0
-        # 上次状态变化时间
-        self.flag_ts = 0
-        # 上次更新时间 有效时间
-        self.update_ts = 0
-        # 最大被使用次数
-        self.max_proxy_use_num = max_proxy_use_num
-        # 被使用次数记录
-        self.use_num = 0
-        # 延迟使用时间
-        self.delay = delay
-        # 使用间隔 单位秒
-        self.use_interval = use_interval
-        # 使用时间
-        self.use_ts = 0
-
-        self.proxy_args = self.parse_proxies(self.proxies)
-        self.proxy_ip = self.proxy_args["ip"]
-        self.proxy_port = self.proxy_args["port"]
-        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
-        if self.proxy_args["user"]:
-            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
-        else:
-            self.proxy_id = self.proxy_ip_port
-
-        # 日志处理器
-        self.logger = log
-
-    def get_proxies(self):
-        self.use_num += 1
-        return self.proxies
-
-    def is_delay(self):
-        return self.flag == 1
-
-    def is_valid(self, force=0, type=0):
-        """
-        检测代理是否有效
-            1 有效
-            2 延时使用
-            0 无效 直接在代理池删除
-        :param force:
-        :param type:
-        :return:
-        """
-        if self.use_num > self.max_proxy_use_num > 0:
-            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
-            return 0
-        if self.flag == -1:
-            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
-            return 0
-        if self.delay > 0 and self.flag == 1:
-            if time.time() - self.flag_ts < self.delay:
-                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
-                return 2
-            else:
-                self.flag = 0
-                self.logger.debug("延迟代理释放: {}".format(self.proxies))
-        if self.use_interval:
-            if time.time() - self.use_ts < self.use_interval:
-                return 2
-        if not force:
-            if time.time() - self.update_ts < self.check_interval:
-                return 1
-        if self.valid_timeout > 0:
-            ok = check_proxy(
-                proxies=self.proxies,
-                type=type,
-                timeout=self.valid_timeout,
-                logger=self.logger,
-            )
-        else:
-            ok = 1
-        self.update_ts = time.time()
-        return ok
-
-    @classmethod
-    def parse_proxies(self, proxies):
-        """
-        分解代理组成部分
-        :param proxies:
-        :return:
-        """
-        if not proxies:
-            return {}
-        if isinstance(proxies, (str, bytes)):
-            proxies = json.loads(proxies)
-        protocol = list(proxies.keys())
-        if not protocol:
-            return {}
-        _url = proxies.get(protocol[0])
-        # 改写:注释http代理url的拼接,以正常生成代理池
-        # if not _url.startswith("http"):
-        #     _url = "http://" + _url
-        _url_parse = parse.urlparse(_url)
-        netloc = _url_parse.netloc
-        if "@" in netloc:
-            netloc_auth, netloc_host = netloc.split("@")
-        else:
-            netloc_auth, netloc_host = "", netloc
-        ip, *port = netloc_host.split(":")
-        port = port[0] if port else "80"
-        user, *password = netloc_auth.split(":")
-        password = password[0] if password else ""
-        return {
-            "protocol": protocol,
-            "ip": ip,
-            "port": port,
-            "user": user,
-            "password": password,
-            "ip_port": "{}:{}".format(ip, port),
-        }
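As a concrete illustration, parse_proxies on an authenticated socks5 mapping (fabricated values) yields:

ProxyItem.parse_proxies({"http": "socks5://user:passwd@1.2.3.4:4215"})
# -> {"protocol": ["http"], "ip": "1.2.3.4", "port": "4215",
#     "user": "user", "password": "passwd", "ip_port": "1.2.3.4:4215"}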
-
-
-class ProxyPoolBase(object):
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def get(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class ProxyPool(ProxyPoolBase):
-    """代理池"""
-
-    def __init__(self, **kwargs):
-        """
-        :param size: 代理池大小  -1 为不限制
-        :param proxy_source_url: 代理文件地址 支持列表
-        :param proxy_instance:  提供代理的实例
-        :param reset_interval:  代理池重置间隔 最小间隔
-        :param reset_interval_max:  代理池重置间隔 最大间隔 默认2分钟
-        :param check_valid: 是否在获取代理时进行检测有效性
-        :param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
-        :param logger: 日志处理器 默认 log.get_logger()
-        :param kwargs: 其他的参数
-        """
-        kwargs.setdefault("size", -1)
-        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
-
-        super(ProxyPool, self).__init__(**kwargs)
-        # 队列最大长度
-        self.max_queue_size = kwargs.get("size", -1)
-        # 实际代理数量
-        self.real_max_proxy_count = 1000
-        # 代理可用最大次数
-        # 代理获取地址 http://localhost/proxy.txt
-        self.proxy_source_url = kwargs.get("proxy_source_url", [])
-        if not isinstance(self.proxy_source_url, list):
-            self.proxy_source_url = [self.proxy_source_url]
-        self.proxy_source_url = [x for x in self.proxy_source_url if x]
-        self.proxy_source_url = list(set(self.proxy_source_url))
-        kwargs.update({"proxy_source_url": self.proxy_source_url})
-        # 处理日志
-        self.logger = kwargs.get("logger") or log
-        kwargs["logger"] = self.logger
-        if not self.proxy_source_url:
-            self.logger.warn("need set proxy_source_url or proxy_instance")
-
-        # 代理池重置间隔
-        self.reset_interval = kwargs.get("reset_interval", 5)
-        # 强制重置一下代理 添加新的代理进来 防止一直使用旧的被封的代理
-        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
-        # 是否监测代理有效性
-        self.check_valid = kwargs.get("check_valid", True)
-
-        # 代理队列
-        self.proxy_queue = None
-        # {代理id: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 失效代理队列
-        self.invalid_proxy_dict = {}
-
-        self.kwargs = kwargs
-
-        # 重置代理池锁
-        self.reset_lock = None
-        # 重置时间
-        self.last_reset_time = 0
-        # 重置的太快了  计数
-        self.reset_fast_count = 0
-        # 计数 获取代理重试3次仍然失败 次数
-        self.no_valid_proxy_times = 0
-
-        # 上次获取代理时间
-        self.last_get_ts = time.time()
-
-        # 记录ProxyItem的update_ts 防止由于重置太快导致重复检测有效性
-        self.proxy_item_update_ts_dict = {}
-
-        # 警告
-        self.warn_flag = False
-
-    def warn(self):
-        if not self.warn_flag:
-            for url in self.proxy_source_url:
-                if "zhima" in url:
-                    continue
-            self.warn_flag = True
-        return
-
-    @property
-    def queue_size(self):
-        """
-        当前代理池中代理数量
-        :return:
-        """
-        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
-
-    def clear(self):
-        """
-        清空自己
-        :return:
-        """
-        self.proxy_queue = None
-        # {代理ip: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 清理失效代理集合
-        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
-        self.invalid_proxy_dict = {
-            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
-        }
-        # 清理超时的update_ts记录
-        _limit = time.time() - 600
-        self.proxy_item_update_ts_dict = {
-            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
-        }
-        return
-
-    def get(self, retry: int = 0) -> dict:
-        """
-        从代理池中获取代理
-        :param retry:
-        :return:
-        """
-        retry += 1
-        if retry > 3:
-            self.no_valid_proxy_times += 1
-            return None
-        if time.time() - self.last_get_ts > 3 * 60:
-            # 3分钟没有获取过 重置一下
-            try:
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        # 记录获取时间
-        self.last_get_ts = time.time()
-        #
-        self.warn()
-        proxy_item = self.get_random_proxy()
-        if proxy_item:
-            # 不检测
-            if not self.check_valid:  #
-                # 塞回去
-                proxies = proxy_item.get_proxies()
-                self.put_proxy_item(proxy_item)
-                return proxies
-            else:
-                is_valid = proxy_item.is_valid()
-                if is_valid:
-                    # 记录update_ts
-                    self.proxy_item_update_ts_dict[
-                        proxy_item.proxy_id
-                    ] = proxy_item.update_ts
-                    # 塞回去
-                    proxies = proxy_item.get_proxies()
-                    self.put_proxy_item(proxy_item)
-                    if is_valid == 1:
-                        if proxy_item.use_interval:
-                            proxy_item.use_ts = time.time()
-                        return proxies
-                else:
-                    # 处理失效代理
-                    self.proxy_dict.pop(proxy_item.proxy_id, "")
-                    self.invalid_proxy_dict[
-                        proxy_item.proxy_id
-                    ] = datetime.datetime.now()
-        else:
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        if self.no_valid_proxy_times >= 5:
-            # 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多(时间越长越多) 可能出现一直获取不到代理的情况
-            # 导致爬虫烂尾
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        return self.get(retry)
-
-    get_proxy = get
-
-    def get_random_proxy(self) -> ProxyItem:
-        """
-        随机获取代理
-        :return:
-        """
-        if self.proxy_queue is not None:
-            if random.random() < 0.5:
-                # 一半概率检查 这是个高频操作 优化一下
-                if time.time() - self.last_reset_time > self.reset_interval_max:
-                    time.sleep(3)
-                    self.reset_proxy_pool(force=True)
-                else:
-                    min_q_size = (
-                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
-                        if self.max_queue_size > 0
-                        else self.real_max_proxy_count / 2
-                    )
-                    if self.proxy_queue.qsize() < min_q_size:
-                        time.sleep(3)
-                        self.reset_proxy_pool()
-            try:
-                return self.proxy_queue.get_nowait()
-            except Exception:
-                pass
-        return None
-
-    def append_proxies(self, proxies_list: list) -> int:
-        """
-        添加代理到代理池
-        :param proxies_list:
-        :return:
-        """
-        count = 0
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if proxies:
-                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
-                # 增加失效判断 2018/12/18
-                if proxy_item.proxy_id in self.invalid_proxy_dict:
-                    continue
-                if proxy_item.proxy_id not in self.proxy_dict:
-                    # 补充update_ts
-                    if not proxy_item.update_ts:
-                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
-                            proxy_item.proxy_id, 0
-                        )
-                    self.put_proxy_item(proxy_item)
-                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
-                    count += 1
-        return count
-
-    def put_proxy_item(self, proxy_item: ProxyItem):
-        """
-        添加 ProxyItem 到代理池
-        :param proxy_item:
-        :return:
-        """
-        return self.proxy_queue.put_nowait(proxy_item)
-
-    def reset_proxy_pool(self, force: bool = False):
-        """
-        重置代理池
-        :param force: 是否强制重置代理池
-        :return:
-        """
-        if not self.reset_lock:
-            # 必须用时调用 否则 可能存在 gevent patch前 threading就已经被导入 导致的Rlock patch失效
-            import threading
-
-            self.reset_lock = threading.RLock()
-        with self.reset_lock:
-            if (
-                    force
-                    or self.proxy_queue is None
-                    or (
-                    self.max_queue_size > 0
-                    and self.proxy_queue.qsize() < self.max_queue_size / 2
-            )
-                    or (
-                    self.max_queue_size < 0
-                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
-            )
-                    or self.no_valid_proxy_times >= 5
-            ):
-                if time.time() - self.last_reset_time < self.reset_interval:
-                    self.reset_fast_count += 1
-                    if self.reset_fast_count % 10 == 0:
-                        self.logger.debug(
-                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
-                        )
-                        time.sleep(1)
-                else:
-                    self.clear()
-                    if self.proxy_queue is None:
-                        import queue
-
-                        self.proxy_queue = queue.Queue()
-                    # TODO 这里获取到的可能重复
-                    proxies_list = get_proxy_from_url(**self.kwargs)
-                    self.real_max_proxy_count = len(proxies_list)
-                    if 0 < self.max_queue_size < self.real_max_proxy_count:
-                        proxies_list = random.sample(proxies_list, self.max_queue_size)
-                    _valid_count = self.append_proxies(proxies_list)
-                    self.last_reset_time = time.time()
-                    self.no_valid_proxy_times = 0
-                    self.logger.debug(
-                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
-                            len(proxies_list),
-                            _valid_count,
-                            len(self.invalid_proxy_dict),
-                            len(self.proxy_dict),
-                        )
-                    )
-        return
-
-    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
-        """
-        对代理进行标记
-        :param proxies_list:
-        :param flag:
-                    -1  废弃
-                    1 延迟使用
-        :param delay: 延迟时间
-        :return:
-        """
-        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
-            return False
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if not proxies:
-                continue
-            proxy_id = ProxyItem(proxies).proxy_id
-            if proxy_id not in self.proxy_dict:
-                continue
-            self.proxy_dict[proxy_id].flag = flag
-            self.proxy_dict[proxy_id].flag_ts = time.time()
-            self.proxy_dict[proxy_id].delay = delay
-
-        return True
-
-    def get_proxy_item(self, proxy_id="", proxies=None):
-        """
-        获取代理对象
-        :param proxy_id:
-        :param proxies:
-        :return:
-        """
-        if proxy_id:
-            return self.proxy_dict.get(proxy_id)
-        if proxies:
-            proxy_id = ProxyItem(proxies).proxy_id
-            return self.proxy_dict.get(proxy_id)
-        return
-
-    def copy(self):
-        return ProxyPool(**self.kwargs)
-
-    def all(self) -> list:
-        """
-        获取当前代理池中的全部代理
-        :return:
-        """
-        return get_proxy_from_url(**self.kwargs)
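A hedged end-to-end sketch of the removed pool; it assumes setting.PROXY_EXTRACT_API points at the rewritten extraction API described above:

pool = ProxyPool(size=50, check_valid=False)  # proxy_source_url defaults to setting.PROXY_EXTRACT_API
proxies = pool.get()  # {"http": "socks5://...", "https": "socks5://..."} or None

# After a ban or timeout, park the proxy for 60 s instead of discarding it.
if proxies:
    pool.tag_proxy([proxies], flag=1, delay=60)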

+ 0 - 527
zgztb_cookie/FworkSpider/feapder/network/request.py

@@ -1,527 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:49:08
----------
-@summary: 请求结构体
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.cookies import RequestsCookieJar
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
-from feapder.network.proxy_pool import ProxyPool
-from feapder.network.response import Response
-from feapder.utils.log import log
-from feapder.utils.webdriver import WebDriverPool
-
-# 屏蔽warning信息
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
-
-class Request(object):
-    session = None
-    webdriver_pool: WebDriverPool = None
-    user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
-
-    cache_db = None  # redis / pika
-    cached_redis_key = None  # 缓存response的文件夹 response_cached:cached_redis_key:md5
-    cached_expire_time = 1200  # 缓存过期时间
-
-    local_filepath = None
-    oss_handler = None
-
-    __REQUEST_ATTRS__ = {
-        # 'method', 'url', 必须传递 不加入**kwargs中
-        "params",
-        "data",
-        "headers",
-        "cookies",
-        "files",
-        "auth",
-        "timeout",
-        "allow_redirects",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "json",
-    }
-
-    DEFAULT_KEY_VALUE = dict(
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-    )
-
-    def __init__(
-        self,
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        splash=False,
-        iframes=0,
-        **kwargs,
-    ):
-        """
-        @summary: Request参数
-        ---------
-        框架参数
-        @param url: 待抓取url
-        @param retry_times: 当前重试次数
-        @param priority: 优先级 越小越优先 默认300
-        @param parser_name: 回调函数所在的类名 默认为当前类
-        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
-        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
-        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
-        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的request会立即响应,而不是去排队
-        @param use_session: 是否使用session方式
-        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
-        @param download_midware: 下载中间件。默认为parser中的download_midware
-        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
-        @param render: 是否用浏览器渲染
-        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
-        --
-        以下参数与requests参数使用方式一致
-        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
-        @param params: 请求参数
-        @param data: 请求body
-        @param json: 请求json字符串,同 json.dumps(data)
-        @param headers:
-        @param cookies: 字典 或 CookieJar 对象
-        @param files:
-        @param auth:
-        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
-        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
-        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
-        @param verify: 为 True 时将会验证 SSL 证书
-        @param stream: 如果为 False,将会立即下载响应内容
-        @param cert:
-        --
-        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
-        ---------
-        @result:
-        """
-
-        self.url = url
-        self.retry_times = retry_times
-        self.priority = priority
-        self.parser_name = parser_name
-        self.callback = callback
-        self.filter_repeat = filter_repeat
-        self.auto_request = auto_request
-        self.request_sync = request_sync
-        self.use_session = use_session
-        self.random_user_agent = random_user_agent
-        self.download_midware = download_midware
-        self.is_abandoned = is_abandoned
-        self.render = render
-        self.splash = splash
-        self.iframes = iframes
-        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
-
-        self.requests_kwargs = {}
-        for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
-                self.requests_kwargs[key] = value
-
-            self.__dict__[key] = value
-
-    def __repr__(self):
-        try:
-            return "<Request {}>".format(self.url)
-        except:
-            return "<Request {}>".format(str(self.to_dict)[:40])
-
-    def __setattr__(self, key, value):
-        """
-        针对 request.xxx = xxx 的形式,更新request及内部参数值
-        @param key:
-        @param value:
-        @return:
-        """
-        self.__dict__[key] = value
-
-        if key in self.__class__.__REQUEST_ATTRS__:
-            self.requests_kwargs[key] = value
-
-    def __lt__(self, other):
-        return self.priority < other.priority
-
-    @property
-    def _session(self):
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-        if use_session and not self.__class__.session:
-            self.__class__.session = requests.Session()
-            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
-            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-            # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
-            self.__class__.session.mount("http", http_adapter)
-
-        return self.__class__.session
-
-    @property
-    def _webdriver_pool(self):
-        if not self.__class__.webdriver_pool:
-            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
-
-        return self.__class__.webdriver_pool
-
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool(check_valid=False)
-
-        return self.__class__.proxies_pool
-
-    @property
-    def to_dict(self):
-        request_dict = {}
-
-        self.callback = (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-        self.download_midware = (
-            getattr(self.download_midware, "__name__")
-            if callable(self.download_midware)
-            else self.download_midware
-        )
-
-        for key, value in self.__dict__.items():
-            if (
-                key in self.__class__.DEFAULT_KEY_VALUE
-                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
-                or key == "requests_kwargs"
-            ):
-                continue
-
-            if key in self.__class__.__REQUEST_ATTRS__:
-                if not isinstance(
-                    value, (bytes, bool, float, int, str, tuple, list, dict)
-                ):
-                    value = tools.dumps_obj(value)
-            else:
-                if not isinstance(value, (bytes, bool, float, int, str)):
-                    value = tools.dumps_obj(value)
-
-            request_dict[key] = value
-
-        return request_dict
-
-    @property
-    def callback_name(self):
-        return (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-    def get_response(self, save_cached=False):
-        """
-        获取带有selector功能的response
-        @param save_cached: 保存缓存 方便调试时不用每次都重新下载
-        @return:
-        """
-        # 设置超时默认时间
-        self.requests_kwargs.setdefault(
-            "timeout", setting.REQUEST_TIMEOUT
-        )  # connect=22 read=22
-
-        # 设置stream
-        # 默认情况下,当你进行网络请求后,响应体会立即被下载。
-        # 你可以通过 stream 参数覆盖这个行为,推迟下载响应体直到访问 Response.content 属性。
-        # 此时仅有响应头被下载下来了。
-        # 缺点: stream 设为 True,Requests 无法将连接释放回连接池,
-        # 除非你 消耗了所有的数据,或者调用了 Response.close。
-        # 这样会带来连接效率低下的问题。
-        self.requests_kwargs.setdefault("stream", True)
-
-        # 关闭证书验证
-        self.requests_kwargs.setdefault("verify", False)
-
-        # 设置请求方法
-        method = self.__dict__.get("method")
-        if not method:
-            if "data" in self.requests_kwargs:
-                method = "POST"
-            else:
-                method = "GET"
-
-        # 随机user—agent
-        headers = self.requests_kwargs.get("headers", {})
-        if "user-agent" not in headers and "User-Agent" not in headers:
-            if self.render:  # 如果是渲染模式,优先使用WEBDRIVER中配置的ua
-                ua = setting.WEBDRIVER.get(
-                    "user_agent"
-                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-            else:
-                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-
-            if self.random_user_agent and setting.RANDOM_HEADERS:
-                headers.update({"User-Agent": ua})
-                self.requests_kwargs.update(headers=headers)
-        else:
-            self.requests_kwargs.setdefault(
-                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
-            )
-
-        # 代理
-        proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
-
-        log.debug(
-            """
-                -------------- %srequest for ----------------
-                url  = %s
-                method = %s
-                body = %s
-                """
-            % (
-                ""
-                if not self.parser_name
-                else "%s.%s "
-                % (
-                    self.parser_name,
-                    (
-                        self.callback
-                        and callable(self.callback)
-                        and getattr(self.callback, "__name__")
-                        or self.callback
-                    )
-                    or "parse",
-                ),
-                self.url,
-                method,
-                self.requests_kwargs,
-            )
-        )
-
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session 优先级高
-
-        if self.render:
-            # 使用request的user_agent、cookies、proxy
-            user_agent = headers.get("User-Agent") or headers.get("user-agent")
-            cookies = self.requests_kwargs.get("cookies")
-            if cookies and isinstance(cookies, RequestsCookieJar):
-                cookies = cookies.get_dict()
-
-            if not cookies:
-                cookie_str = headers.get("Cookie") or headers.get("cookie")
-                if cookie_str:
-                    cookies = tools.get_cookies_from_str(cookie_str)
-
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
-
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
-            try:
-                browser.get(self.url)
-                if cookies:
-                    browser.cookies = cookies
-                if self.render_time:
-                    tools.delay_time(self.render_time)
-
-                html = browser.page_source
-                response = Response.from_dict(
-                    {
-                        "url": browser.current_url,
-                        "cookies": browser.cookies,
-                        "_content": html.encode(),
-                        "status_code": 200,
-                        "elapsed": 666,
-                        "headers": {
-                            "User-Agent": browser.execute_script(
-                                "return navigator.userAgent"
-                            ),
-                            "Cookie": tools.cookies2str(browser.cookies),
-                        },
-                    }
-                )
-
-                response.browser = browser
-            except Exception as e:
-                self._webdriver_pool.remove(browser)
-                raise e
-
-        elif use_session:
-            response = self._session.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-        elif self.splash:
-            headers = self.requests_kwargs.get("headers")
-            if not headers:
-                headers = {"User-Agent": self.user_agent()}
-
-            proxy = None
-            if proxies and proxies != -1:
-                # use str.replace, not str.strip, to drop the scheme prefix
-                proxy = (
-                    proxies.get("http", "") or proxies.get("https", "")
-                ).replace("http://", "", 1).replace("https://", "", 1)
-
-            params = {
-                "iframes": self.iframes,
-                "wait": self.render_time,
-                "html": 1,
-                "proxy": proxy,
-                "url": self.url,
-            }
-            splash_url = setting.JIANYU_SPLASH_URL
-            resp = requests.get(splash_url, params=params, headers=headers)
-            response = Response(resp)
-        else:
-            response = requests.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-
-        if save_cached:
-            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-
-        return response
-
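The stream comment at the top of get_response references the deferred-download behaviour of the requests library; below is a minimal standalone sketch of it, assuming only the requests package (httpbin.org is just a placeholder URL):

import requests

# With stream=True only the headers are fetched up front; the body is downloaded
# when .content is accessed, and the connection goes back to the pool only after
# the body has been consumed or close() is called.
resp = requests.get("https://httpbin.org/get", stream=True, timeout=10)
try:
    body = resp.content  # the body is actually downloaded here
finally:
    resp.close()  # release the connection back to the pool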
-    def proxies(self):
-        """
-        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-        """
-        return self.requests_kwargs.get("proxies")
-
-    def proxy(self):
-        """
-
-        Returns: ip:port
-
-        """
-        proxies = self.proxies()
-        if proxies:
-            # use str.replace, not str.strip, to drop the scheme prefix
-            return (
-                proxies.get("http", "") or proxies.get("https", "")
-            ).replace("http://", "", 1).replace("https://", "", 1)
-
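For reference, a small sketch of the mapping that proxies() returns and how proxy() reduces it to "ip:port"; the strip_scheme helper below is illustrative, not part of the module:

proxies = {"http": "http://1.2.3.4:8888", "https": "https://1.2.3.4:8888"}

def strip_scheme(proxies):
    # pick the http entry first, then drop the scheme prefix to get "ip:port"
    value = proxies.get("http", "") or proxies.get("https", "")
    return value.replace("http://", "", 1).replace("https://", "", 1)

assert strip_scheme(proxies) == "1.2.3.4:8888"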
-    def user_agent(self):
-        headers = self.requests_kwargs.get("headers")
-        if headers:
-            return headers.get("user-agent") or headers.get("User-Agent")
-
-    @property
-    def fingerprint(self):
-        """
-        Unique identifier (fingerprint) of the request
-        @return:
-        """
-        url = self.__dict__.get("url", "")
-        # normalize the url
-        url = tools.canonicalize_url(url)
-        args = [url]
-
-        for arg in ["params", "data", "files", "auth", "cert", "json"]:
-            if self.requests_kwargs.get(arg):
-                args.append(self.requests_kwargs.get(arg))
-
-        return tools.get_md5(*args)
-
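A minimal sketch of the fingerprint idea, with hashlib standing in for tools.get_md5 and URL canonicalization (parameter sorting, fragment stripping) left out for brevity:

import hashlib

def simple_fingerprint(url, **requests_kwargs):
    parts = [url]
    # only the kwargs that influence the response take part in the hash
    for key in ("params", "data", "files", "auth", "cert", "json"):
        if requests_kwargs.get(key):
            parts.append(str(requests_kwargs[key]))
    return hashlib.md5("".join(parts).encode("utf-8")).hexdigest()

# identical url + body => identical fingerprint, which is what dedup and caching rely on
assert simple_fingerprint("http://a.com/list", data={"page": 1}) == simple_fingerprint(
    "http://a.com/list", data={"page": 1}
)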
-    @property
-    def _cache_db(self):
-        if not self.__class__.cache_db:
-            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
-
-        return self.__class__.cache_db
-
-    @property
-    def _cached_redis_key(self):
-        if self.__class__.cached_redis_key:
-            return (
-                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
-            )
-        else:
-            return f"response_cached:test:{self.fingerprint}"
-
-    def save_cached(self, response, expire_time=1200):
-        """
-        Cache the response in redis for debugging, so it does not have to be downloaded every time
-        @param response:
-        @param expire_time: expiry time in seconds
-        @return:
-        """
-
-        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
-
-    def get_response_from_cached(self, save_cached=True):
-        """
-        Get the response from the cache
-        Note:
-            Attributes that come back empty:
-                - raw: urllib3.response.HTTPResponse
-                - connection: requests.adapters.HTTPAdapter
-                - history
-
-            Attributes whose meaning changes:
-                - request: changed from the requests library's request to a Request object
-        @param save_cached: if there is no cache, download directly; whether to cache the response after downloading
-        @return:
-        """
-        response_dict = self._cache_db.strget(self._cached_redis_key)
-        if not response_dict:
-            log.info("无response缓存  重新下载")
-            response_obj = self.get_response(save_cached=save_cached)
-        else:
-            response_dict = eval(response_dict)  # the cached value is stored as a python literal; eval turns it back into a dict
-            response_obj = Response.from_dict(response_dict)
-        return response_obj
-
-    def del_response_cached(self):
-        self._cache_db.clear(self._cached_redis_key)
-
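A rough sketch of the debug-cache cycle implemented by save_cached/get_response_from_cached, assuming the redis package and a local Redis instance; JSON is used here in place of the str/eval round trip of the original:

import json
import redis

r = redis.Redis()
key = "response_cached:test:<fingerprint>"

# save: serialize the response dict with a 20-minute expiry
r.set(key, json.dumps({"url": "http://example.com", "status_code": 200}), ex=1200)

# restore: rebuild the dict instead of downloading the page again
cached = r.get(key)
if cached:
    response_dict = json.loads(cached)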
-    @classmethod
-    def from_dict(cls, request_dict):
-        for key, value in request_dict.items():
-            if isinstance(value, bytes):  # deserialize values such as item objects
-                request_dict[key] = tools.loads_obj(value)
-
-        return cls(**request_dict)
-
-    def copy(self):
-        return self.__class__.from_dict(self.to_dict)
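A small sketch of the dict round trip behind from_dict and copy, with pickle standing in for tools.dumps_obj/tools.loads_obj:

import pickle

request_dict = {
    "url": "http://example.com",
    "item": pickle.dumps({"title": "demo"}),  # nested objects travel as bytes
}

# bytes values are deserialized back into objects before rebuilding the request
restored = {
    key: pickle.loads(value) if isinstance(value, bytes) else value
    for key, value in request_dict.items()
}
assert restored["item"] == {"title": "demo"}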
