瀏覽代碼

删除缓存

dongzhaorui 2 月之前
父節點
當前提交
080df5681d
共有 5 個文件被更改,包括 297 次插入和 964 次删除
  1. 二進制
      .DS_Store
  2. 297 0
      .gitignore
  3. 0 436
      .idea/workspace.xml
  4. 0 15
      jzsc/README.md
  5. 0 513
      jzsc/spider.py

二進制
.DS_Store


+ 297 - 0
.gitignore

@@ -58,3 +58,300 @@ docs/_build/
 # PyBuilder
 target/
 
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Windows template
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+### macOS template
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+

+ 0 - 436
.idea/workspace.xml

@@ -1,436 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ChangeListManager">
-    <list default="true" id="8c2f903f-55ef-4ced-987d-6ac010bf606d" name="Changes" comment="add project - 中国招标投标公共服务平台(未按规范数据)">
-      <change beforePath="$PROJECT_DIR$/jzsc/chaojiying.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/config/__init__.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/config/conf.yaml" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/config/constants.yaml" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/config/load.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/spider.py" beforeDir="false" afterPath="$PROJECT_DIR$/jzsc/spider.py" afterDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/__init__.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/databases.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/execptions.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/log.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/socks5.py" beforeDir="false" />
-      <change beforePath="$PROJECT_DIR$/jzsc/utils/tools.py" beforeDir="false" />
-    </list>
-    <option name="SHOW_DIALOG" value="false" />
-    <option name="HIGHLIGHT_CONFLICTS" value="true" />
-    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
-    <option name="LAST_RESOLUTION" value="IGNORE" />
-  </component>
-  <component name="FileTemplateManagerImpl">
-    <option name="RECENT_TEMPLATES">
-      <list>
-        <option value="JavaScript File" />
-        <option value="HTML File" />
-        <option value="Python Script" />
-      </list>
-    </option>
-  </component>
-  <component name="FlaskConsoleOptions" custom-start-script="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
-    <envs>
-      <env key="FLASK_APP" value="app" />
-    </envs>
-    <option name="myCustomStartScript" value="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
-    <option name="myEnvs">
-      <map>
-        <entry key="FLASK_APP" value="app" />
-      </map>
-    </option>
-  </component>
-  <component name="Git.Settings">
-    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
-  </component>
-  <component name="GitSEFilterConfiguration">
-    <file-type-list>
-      <filtered-out-file-type name="LOCAL_BRANCH" />
-      <filtered-out-file-type name="REMOTE_BRANCH" />
-      <filtered-out-file-type name="TAG" />
-      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
-    </file-type-list>
-  </component>
-  <component name="MarkdownSettingsMigration">
-    <option name="stateVersion" value="1" />
-  </component>
-  <component name="ProblemsViewState">
-    <option name="selectedTabId" value="CurrentFile" />
-  </component>
-  <component name="ProjectColorInfo"><![CDATA[{
-  "associatedIndex": 1
-}]]></component>
-  <component name="ProjectId" id="26cDQQPzrwjA2Sg1Lq9nLzcrVU6" />
-  <component name="ProjectViewState">
-    <option name="hideEmptyMiddlePackages" value="true" />
-    <option name="showLibraryContents" value="true" />
-  </component>
-  <component name="PropertiesComponent"><![CDATA[{
-  "keyToString": {
-    "RunOnceActivity.ShowReadmeOnStart": "true",
-    "git-widget-placeholder": "master",
-    "node.js.detected.package.eslint": "true",
-    "node.js.selected.package.eslint": "(autodetect)",
-    "nodejs_package_manager_path": "npm",
-    "vue.rearranger.settings.migration": "true"
-  }
-}]]></component>
-  <component name="RecentsManager">
-    <key name="CopyFile.RECENT_KEYS">
-      <recent name="$PROJECT_DIR$" />
-      <recent name="$PROJECT_DIR$/codes_hospital" />
-    </key>
-    <key name="MoveFile.RECENT_KEYS">
-      <recent name="$PROJECT_DIR$/codes_hospital" />
-    </key>
-  </component>
-  <component name="RunManager" selected="Python.crawl_hospital">
-    <configuration name="aaa (1)" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
-      <module name="topic_spider" />
-      <option name="ENV_FILES" value="" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <envs>
-        <env name="PYTHONUNBUFFERED" value="1" />
-      </envs>
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
-      <option name="IS_MODULE_SDK" value="true" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/aaa.py" />
-      <option name="PARAMETERS" value="" />
-      <option name="SHOW_COMMAND_LINE" value="false" />
-      <option name="EMULATE_TERMINAL" value="false" />
-      <option name="MODULE_MODE" value="false" />
-      <option name="REDIRECT_INPUT" value="false" />
-      <option name="INPUT_FILE" value="" />
-      <method v="2" />
-    </configuration>
-    <configuration name="b" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
-      <module name="topic_spider" />
-      <option name="ENV_FILES" value="" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <envs>
-        <env name="PYTHONUNBUFFERED" value="1" />
-      </envs>
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
-      <option name="IS_MODULE_SDK" value="true" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/b.py" />
-      <option name="PARAMETERS" value="" />
-      <option name="SHOW_COMMAND_LINE" value="false" />
-      <option name="EMULATE_TERMINAL" value="false" />
-      <option name="MODULE_MODE" value="false" />
-      <option name="REDIRECT_INPUT" value="false" />
-      <option name="INPUT_FILE" value="" />
-      <method v="2" />
-    </configuration>
-    <configuration name="c" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
-      <module name="topic_spider" />
-      <option name="ENV_FILES" value="" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <envs>
-        <env name="PYTHONUNBUFFERED" value="1" />
-      </envs>
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
-      <option name="IS_MODULE_SDK" value="true" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/c.py" />
-      <option name="PARAMETERS" value="" />
-      <option name="SHOW_COMMAND_LINE" value="false" />
-      <option name="EMULATE_TERMINAL" value="false" />
-      <option name="MODULE_MODE" value="false" />
-      <option name="REDIRECT_INPUT" value="false" />
-      <option name="INPUT_FILE" value="" />
-      <method v="2" />
-    </configuration>
-    <configuration name="crawl_hospital" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
-      <module name="topic_spider" />
-      <option name="ENV_FILES" value="" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <envs>
-        <env name="PYTHONUNBUFFERED" value="1" />
-      </envs>
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
-      <option name="IS_MODULE_SDK" value="true" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/crawl_hospital.py" />
-      <option name="PARAMETERS" value="" />
-      <option name="SHOW_COMMAND_LINE" value="false" />
-      <option name="EMULATE_TERMINAL" value="false" />
-      <option name="MODULE_MODE" value="false" />
-      <option name="REDIRECT_INPUT" value="false" />
-      <option name="INPUT_FILE" value="" />
-      <method v="2" />
-    </configuration>
-    <configuration name="defaults" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
-      <module name="topic_spider" />
-      <option name="ENV_FILES" value="" />
-      <option name="INTERPRETER_OPTIONS" value="" />
-      <option name="PARENT_ENVS" value="true" />
-      <envs>
-        <env name="PYTHONUNBUFFERED" value="1" />
-      </envs>
-      <option name="SDK_HOME" value="" />
-      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/codes_hospital" />
-      <option name="IS_MODULE_SDK" value="true" />
-      <option name="ADD_CONTENT_ROOTS" value="true" />
-      <option name="ADD_SOURCE_ROOTS" value="true" />
-      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
-      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/codes_hospital/defaults.py" />
-      <option name="PARAMETERS" value="" />
-      <option name="SHOW_COMMAND_LINE" value="false" />
-      <option name="EMULATE_TERMINAL" value="false" />
-      <option name="MODULE_MODE" value="false" />
-      <option name="REDIRECT_INPUT" value="false" />
-      <option name="INPUT_FILE" value="" />
-      <method v="2" />
-    </configuration>
-    <recent_temporary>
-      <list>
-        <item itemvalue="Python.crawl_hospital" />
-        <item itemvalue="Python.c" />
-        <item itemvalue="Python.b" />
-        <item itemvalue="Python.defaults" />
-        <item itemvalue="Python.aaa (1)" />
-      </list>
-    </recent_temporary>
-  </component>
-  <component name="SharedIndexes">
-    <attachedChunks>
-      <set>
-        <option value="bundled-js-predefined-1d06a55b98c1-0b3e54e931b4-JavaScript-PY-241.18034.82" />
-        <option value="bundled-python-sdk-975db3bf15a3-2767605e8bc2-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.18034.82" />
-      </set>
-    </attachedChunks>
-  </component>
-  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
-  <component name="TaskManager">
-    <task active="true" id="Default" summary="Default task">
-      <changelist id="8c2f903f-55ef-4ced-987d-6ac010bf606d" name="Changes" comment="" />
-      <created>1647713963501</created>
-      <option name="number" value="Default" />
-      <option name="presentableId" value="Default" />
-      <updated>1647713963501</updated>
-      <workItem from="1647713965332" duration="763000" />
-      <workItem from="1647910768281" duration="540000" />
-      <workItem from="1648631632649" duration="336000" />
-      <workItem from="1648694544136" duration="742000" />
-      <workItem from="1649240699733" duration="190000" />
-      <workItem from="1649661973115" duration="99000" />
-      <workItem from="1649668394783" duration="10000" />
-      <workItem from="1650511827592" duration="9000" />
-      <workItem from="1651908647592" duration="13000" />
-      <workItem from="1653448397924" duration="786000" />
-      <workItem from="1653622679966" duration="1000" />
-      <workItem from="1653900611887" duration="199000" />
-      <workItem from="1655950837952" duration="610000" />
-      <workItem from="1656551153853" duration="524000" />
-      <workItem from="1658974433813" duration="14000" />
-      <workItem from="1659510721458" duration="62787000" />
-      <workItem from="1659931805307" duration="2748000" />
-      <workItem from="1660016111330" duration="184000" />
-      <workItem from="1660020728622" duration="1379000" />
-      <workItem from="1660032595629" duration="9306000" />
-      <workItem from="1660108524756" duration="3319000" />
-      <workItem from="1660117699024" duration="24296000" />
-      <workItem from="1660203314924" duration="6999000" />
-      <workItem from="1660266196873" duration="16000" />
-      <workItem from="1660290776459" duration="3956000" />
-      <workItem from="1660525419402" duration="7502000" />
-      <workItem from="1660611424284" duration="4765000" />
-      <workItem from="1660640960007" duration="468000" />
-      <workItem from="1660697369474" duration="24146000" />
-      <workItem from="1660789633638" duration="4806000" />
-      <workItem from="1661216042374" duration="2194000" />
-      <workItem from="1661308591662" duration="149000" />
-      <workItem from="1661330414839" duration="415000" />
-      <workItem from="1661390503915" duration="498000" />
-      <workItem from="1661739076245" duration="78000" />
-      <workItem from="1661820771088" duration="10000" />
-      <workItem from="1661849635521" duration="90000" />
-      <workItem from="1662100724285" duration="28000" />
-      <workItem from="1662367616316" duration="715000" />
-      <workItem from="1662705496621" duration="566000" />
-      <workItem from="1734574047690" duration="179000" />
-    </task>
-    <task id="LOCAL-00001" summary="新增 - 全国统一组织查询">
-      <created>1660201252821</created>
-      <option name="number" value="00001" />
-      <option name="presentableId" value="LOCAL-00001" />
-      <option name="project" value="LOCAL" />
-      <updated>1660201252821</updated>
-    </task>
-    <task id="LOCAL-00002" summary="fixbug">
-      <created>1660207650214</created>
-      <option name="number" value="00002" />
-      <option name="presentableId" value="LOCAL-00002" />
-      <option name="project" value="LOCAL" />
-      <updated>1660207650214</updated>
-    </task>
-    <task id="LOCAL-00003" summary="fixbug">
-      <created>1660211038526</created>
-      <option name="number" value="00003" />
-      <option name="presentableId" value="LOCAL-00003" />
-      <option name="project" value="LOCAL" />
-      <updated>1660211038526</updated>
-    </task>
-    <task id="LOCAL-00004" summary="fixbug">
-      <created>1660302204646</created>
-      <option name="number" value="00004" />
-      <option name="presentableId" value="LOCAL-00004" />
-      <option name="project" value="LOCAL" />
-      <updated>1660302204646</updated>
-    </task>
-    <task id="LOCAL-00005" summary="fixbug">
-      <created>1660542942569</created>
-      <option name="number" value="00005" />
-      <option name="presentableId" value="LOCAL-00005" />
-      <option name="project" value="LOCAL" />
-      <updated>1660542942569</updated>
-    </task>
-    <task id="LOCAL-00006" summary="fixbug">
-      <created>1660616796886</created>
-      <option name="number" value="00006" />
-      <option name="presentableId" value="LOCAL-00006" />
-      <option name="project" value="LOCAL" />
-      <updated>1660616796886</updated>
-    </task>
-    <task id="LOCAL-00007" summary="update">
-      <created>1660707257585</created>
-      <option name="number" value="00007" />
-      <option name="presentableId" value="LOCAL-00007" />
-      <option name="project" value="LOCAL" />
-      <updated>1660707257585</updated>
-    </task>
-    <task id="LOCAL-00008" summary="fixbug">
-      <created>1660707264013</created>
-      <option name="number" value="00008" />
-      <option name="presentableId" value="LOCAL-00008" />
-      <option name="project" value="LOCAL" />
-      <updated>1660707264013</updated>
-    </task>
-    <task id="LOCAL-00009" summary="update">
-      <created>1660718674713</created>
-      <option name="number" value="00009" />
-      <option name="presentableId" value="LOCAL-00009" />
-      <option name="project" value="LOCAL" />
-      <updated>1660718674713</updated>
-    </task>
-    <task id="LOCAL-00010" summary="fixbug">
-      <created>1660718680064</created>
-      <option name="number" value="00010" />
-      <option name="presentableId" value="LOCAL-00010" />
-      <option name="project" value="LOCAL" />
-      <updated>1660718680064</updated>
-    </task>
-    <task id="LOCAL-00011" summary="fixbug">
-      <created>1660725321099</created>
-      <option name="number" value="00011" />
-      <option name="presentableId" value="LOCAL-00011" />
-      <option name="project" value="LOCAL" />
-      <updated>1660725321099</updated>
-    </task>
-    <task id="LOCAL-00012" summary="fixbug">
-      <created>1660734122824</created>
-      <option name="number" value="00012" />
-      <option name="presentableId" value="LOCAL-00012" />
-      <option name="project" value="LOCAL" />
-      <updated>1660734122824</updated>
-    </task>
-    <task id="LOCAL-00013" summary="update">
-      <created>1660872544902</created>
-      <option name="number" value="00013" />
-      <option name="presentableId" value="LOCAL-00013" />
-      <option name="project" value="LOCAL" />
-      <updated>1660872544903</updated>
-    </task>
-    <task id="LOCAL-00014" summary="update">
-      <created>1660872629395</created>
-      <option name="number" value="00014" />
-      <option name="presentableId" value="LOCAL-00014" />
-      <option name="project" value="LOCAL" />
-      <updated>1660872629395</updated>
-    </task>
-    <task id="LOCAL-00015" summary="update">
-      <created>1660896035987</created>
-      <option name="number" value="00015" />
-      <option name="presentableId" value="LOCAL-00015" />
-      <option name="project" value="LOCAL" />
-      <updated>1660896035987</updated>
-    </task>
-    <task id="LOCAL-00016" summary="add project - 中国招标投标公共服务平台(未按规范数据)">
-      <created>1661330637516</created>
-      <option name="number" value="00016" />
-      <option name="presentableId" value="LOCAL-00016" />
-      <option name="project" value="LOCAL" />
-      <updated>1661330637517</updated>
-    </task>
-    <option name="localTasksCounter" value="17" />
-    <servers />
-  </component>
-  <component name="TypeScriptGeneratedFilesManager">
-    <option name="version" value="3" />
-  </component>
-  <component name="Vcs.Log.Tabs.Properties">
-    <option name="TAB_STATES">
-      <map>
-        <entry key="MAIN">
-          <value>
-            <State />
-          </value>
-        </entry>
-      </map>
-    </option>
-  </component>
-  <component name="VcsManagerConfiguration">
-    <MESSAGE value="新增 - 全国统一组织查询" />
-    <MESSAGE value="fixbug" />
-    <MESSAGE value="update" />
-    <MESSAGE value="add project - 中国招标投标公共服务平台(未按规范数据)" />
-    <option name="LAST_COMMIT_MESSAGE" value="add project - 中国招标投标公共服务平台(未按规范数据)" />
-  </component>
-  <component name="XDebuggerManager">
-    <breakpoint-manager>
-      <breakpoints>
-        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
-          <url>file://$PROJECT_DIR$/codes_hospital/sp1.py</url>
-          <line>421</line>
-          <option name="timeStamp" value="13" />
-        </line-breakpoint>
-      </breakpoints>
-    </breakpoint-manager>
-  </component>
-  <component name="com.intellij.coverage.CoverageDataManagerImpl">
-    <SUITE FILE_PATH="coverage/topic_spider$query_hospital.coverage" NAME="query_spider Coverage Results" MODIFIED="1659847227682" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$aaa.coverage" NAME="aaa Coverage Results" MODIFIED="1660111607631" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
-    <SUITE FILE_PATH="coverage/topic_spider$aaaa.coverage" NAME="aaaa Coverage Results" MODIFIED="1660124559170" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$aaa__1_.coverage" NAME="aaa (1) Coverage Results" MODIFIED="1660706922075" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$c.coverage" NAME="c Coverage Results" MODIFIED="1660731422968" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$crawl_hospital.coverage" NAME="crawl_hospital Coverage Results" MODIFIED="1660733510071" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$search.coverage" NAME="retrieval Coverage Results" MODIFIED="1659587342978" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$crawl_spider.coverage" NAME="crawl_spider Coverage Results" MODIFIED="1659587372586" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$b.coverage" NAME="b Coverage Results" MODIFIED="1660731393321" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$a3.coverage" NAME="a3 Coverage Results" MODIFIED="1659786080163" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
-    <SUITE FILE_PATH="coverage/topic_spider$crawl_hospital_3.coverage" NAME="crawl_hospital_3 Coverage Results" MODIFIED="1660704358659" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$a2.coverage" NAME="t1 Coverage Results" MODIFIED="1660103182145" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$defaults.coverage" NAME="defaults Coverage Results" MODIFIED="1660730508043" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$query_spider.coverage" NAME="query_spider Coverage Results" MODIFIED="1660099299022" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-    <SUITE FILE_PATH="coverage/topic_spider$step1.coverage" NAME="step1 Coverage Results" MODIFIED="1660112070606" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
-    <SUITE FILE_PATH="coverage/topic_spider$t1.coverage" NAME="t1 Coverage Results" MODIFIED="1660117720005" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/codes_hospital" />
-  </component>
-</project>

+ 0 - 15
jzsc/README.md

@@ -1,15 +0,0 @@
-项目配置及启动方式
-
-`
-    开启远程浏览器接管配置方案:
-    参考地址:https://www.kingname.info/2021/12/23/remote-debug-selenium/
-    
-    打开准备接管的chrome浏览器
-    mac:
-        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"  --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir="/Users/dongzhaorui/Desktop/fffx"
-    windows:
-        待补充
-
-    启动
-    >>> python3 spider.py
-`

+ 0 - 513
jzsc/spider.py

@@ -1,513 +0,0 @@
-import hashlib
-import random
-import time
-from pathlib import Path
-
-import pandas as pd
-import redis
-import requests
-from loguru import logger
-from lxml.html import fromstring, tostring
-from pymongo import MongoClient
-from selenium import webdriver
-from selenium.webdriver import Chrome
-from selenium.webdriver.common.by import By
-
-'''MongoDB'''
-client = MongoClient('192.168.3.182', 27017)
-company_tab = client['national']['company']
-
-'''redis服务'''
-r = redis.Redis(
-    connection_pool=redis.ConnectionPool(
-        host='192.168.3.182',
-        port=6379,
-        password='jianyu@python',
-        db=10
-    ),
-    decode_responses=True
-)
-redis_key = 'jzsc_2022'
-
-'''日志'''
-log_path = (Path(__file__).absolute().parent / 'logs/log_{time:YYYYMMDD}.log').resolve()
-logger.add(
-    log_path,
-    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
-    level='INFO',
-    rotation='00:00',
-    retention='1 week',
-    encoding='utf-8',
-)
-
-'''企业资质'''
-COMPANY_QUALITY_MAPS = {
-    '资质类别': 'quality_type',
-    '资质证书号': 'quality_no',
-    '资质名称': 'quality_name',
-    '发证日期': 'fzrq',
-    '发证有效期': 'fzyxq',
-    '发证机关': 'fzjg',
-}
-'''不良行为'''
-BAD_BEHAVIOR_MAPS = {
-    '诚信记录主体及编号': 'integrity_no',
-    '决定内容': 'decide_content',
-    '实施部门': 'ssbm',
-    '决定日期与有效期': 'execution_date',
-}
-'''黑名单记录'''
-BLACK_LIST_MAPS = {
-    '黑名单记录主体及编号': 'black_list_no',
-    '黑名单认定依据': 'black_list_rdyj',
-    '认定部门': 'rdbm',
-    '决定日期与有效期': 'execution_date',
-}
-'''失信联合惩戒记录'''
-PUNISH_MAPS = {
-    '失信记录编号': 'punish_no',
-    '失信联合惩戒记录主体': 'punish_subject',
-    '法人姓名': 'legal_person',
-    '列入名单事由': 'reason',
-    '认定部门': 'rdbm',
-    '列入日期': 'join_date',
-}
-
-CRAWL_SITE = 'http://jzsc.mohurd.gov.cn/data/company'
-
-
-def sha1(*args):
-    """
-    十六进制数字字符串形式摘要值
-
-    @param args: 字符串
-    @return: 摘要值
-    """
-    hash_sha1 = hashlib.sha1()
-    for arg in args:
-        hash_sha1.update(arg.encode('utf-8'))
-    return hash_sha1.hexdigest()
-
-
-def get_proxy(scheme=None, default=None, socks5h=False):
-    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
-    headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
-
-    try:
-        proxy = requests.get(url, headers=headers, timeout=15).json()
-    except requests.RequestException:
-        return default
-
-    if not proxy:
-        logger.debug('暂无代理...')
-        return default
-
-    proxies = proxy.get('data')
-    if proxies:
-        if socks5h:
-            proxy_items = proxies.get('http')
-            proxy_h = {
-                'http': proxy_items.replace('socks5', 'socks5h'),
-                'https': proxy_items.replace('socks5', 'socks5h')
-            }
-            proxies = proxy_h
-
-    return proxies if not scheme else proxies.get(scheme, default)
-
-
-def html2element(html):
-    return fromstring(html)
-
-
-def element2html(lxml_element):
-    return tostring(lxml_element, encoding='utf-8').decode()
-
-
-def display_prompt_popup(html):
-    _element = html2element(html)
-    node = _element.xpath('//div[@class="el-dialog__wrapper"]')[0]
-    _popup_style = node.attrib.get('style')
-    if _popup_style is not None:
-        _styles = str(_popup_style).split(';')
-        res = list(filter(lambda x: len(x) > 0, _styles))[-1].strip().lower()
-        if res == 'display: none':
-            '''无提示弹框'''
-            return False
-    '''有提示弹框'''
-    return True
-
-
-def display_geetest_panel(html):
-    _element = html2element(html)
-    node = _element.xpath('//div[@class="geetest_panel_next"]')
-    if len(node) == 0:
-        '''无验证码'''
-        return False
-    _geetest_panel = node[0]
-    geetest_style = _geetest_panel.attrib.get('style')
-    if geetest_style is not None and geetest_style == 'display: block;':
-        '''有验证码'''
-        return True
-    else:
-        '''无验证码'''
-        return False
-
-
-def prompt_popup(driver: Chrome, wait_time=None):
-    while True:
-        if not display_prompt_popup(driver.page_source):
-            break
-        logger.info(">>> 点击提示弹框")
-        driver.find_element_by_xpath('//div[@class="el-dialog captchaDilaog"]/div[3]/div/button[1]').click()
-        time.sleep(1)
-    '''流程之间的间隔时间'''
-    _wait_time = (wait_time or 1)
-    time.sleep(_wait_time)
-
-
-def geetest_panel(driver: Chrome, wait_time=None, save_img_to_local=False):
-    while True:
-        if not display_geetest_panel(driver.page_source):
-            break
-        logger.info(">>> 验证码检测")
-
-        text = input("通过验证后,结束等待。请输入:y")
-        if text == 'y':
-            continue
-
-    _wait_time = (wait_time or 1)
-    time.sleep(wait_time)
-
-
-def check_page(driver: Chrome, wait_time=None, **kwargs):
-    """检查页面"""
-    wait_time = (wait_time or 1)
-    prompt_popup(driver, wait_time=wait_time)
-    geetest_panel(
-        driver,
-        wait_time=wait_time,
-        save_img_to_local=kwargs.get('save_img_to_local'),
-    )
-
-
-def click(driver: Chrome, button, wait_time=None, allow_check_page=False, run_js=True):
-    if run_js:
-        driver.execute_script("arguments[0].click();", button)
-    else:
-        button.click()
-
-    wait_time = (wait_time or 1)
-    time.sleep(wait_time)
-    if allow_check_page:
-        check_page(driver, wait_time=wait_time)
-
-
-def click_query(driver: Chrome, wait_time=None):
-    """查询按钮"""
-    button = driver.find_element_by_class_name("ssButton")
-    wait_time = (wait_time or 1)
-    click(driver, button, wait_time=wait_time)
-
-
-def next_page(driver: Chrome):
-    element = html2element(driver.page_source)
-    node = element.xpath('//button[@class="btn-next"]')[0]
-    attrib = node.attrib.get('disabled')
-    if attrib is not None and attrib == 'disabled':
-        '''最大页码'''
-        return False
-    else:
-        '''继续翻页'''
-        button = driver.find_element_by_class_name('btn-next')
-        click(driver, button)
-        return True
-
-
-def current_page(html):
-    element = html2element(html)
-    nodes = element.xpath('//ul[@class="el-pager"]/li')
-    for node in nodes:
-        if node.attrib.get('class') == 'number active':
-            return node.text
-
-
-def extract_content(html):
-    """抽取页面结构化数据"""
-    results = []
-    '''字段映射表'''
-    _maps = {
-        **COMPANY_QUALITY_MAPS,
-        **BAD_BEHAVIOR_MAPS,
-        **BLACK_LIST_MAPS,
-        **PUNISH_MAPS,
-    }
-    '''转化成dataframe'''
-    dfs = pd.read_html(html)
-    if len(dfs) == 2:
-        columns = list(dfs[0].columns.array)
-        values = dfs[1].values
-        '''合并内容'''
-        panel_container = [dict(zip(columns, val)) for val in values]
-        '''转换字段'''
-        for item in panel_container:
-            _item = {}
-            for key, val in item.items():
-                if key in _maps:
-                    _item[_maps[key]] = val
-            results.append(_item)
-    return results
-
-
-def crawl_spider(driver: Chrome, handler):
-    """采集爬虫"""
-    exception_count = 0
-    td_elements = driver.find_elements(By.XPATH, value='//table[@class="el-table__body"]//tr/td[3]')
-    for td_element in td_elements:
-        if exception_count > 3:
-            '''数据异常,停止采集'''
-            return False
-        title = td_element.text
-        '''使用公司名称进行去重'''
-        if r.hexists(redis_key, sha1(title)):
-            logger.info(f"[重复数据]{title} - 丢弃")
-            continue
-        button = td_element.find_element_by_class_name("link")
-        click(driver, button, wait_time=random.randint(3, 10), run_js=False)
-        for current_handler in driver.window_handles:
-            if current_handler == handler:
-                continue
-            '''切换到弹出页面'''
-            driver.switch_to.window(current_handler)
-            current_url = driver.current_url
-            '''首次进入详情页,检查提示弹框和验证码面板'''
-            check_page(driver, wait_time=random.randint(2, 6))
-            '''企业数据处理'''
-            company = {}
-            '''企业基础数据'''
-            element = html2element(driver.page_source)
-            nodes = element.xpath('//div[@class="detaile-header__info--table"]')
-            for node in nodes:
-                credit_no = "".join(node.xpath('./div[1]/div[1]/div[2]/text()')).strip()
-                legal_person = "".join(node.xpath('./div[1]/div[2]/div[2]/text()')).strip()
-                company_type = "".join(node.xpath('./div[2]/div[1]/div[2]/text()')).strip()
-                address = "".join(node.xpath('./div[2]/div[2]/div[2]/text()')).strip()
-                business_address = "".join(node.xpath('./div[3]/div[1]/div[2]/text()')).strip()
-                company = {
-                    'company_name': title,  # 企业名称
-                    'credit_no': credit_no,  # 统一社会信用代码
-                    'legal_person': legal_person,  # 企业法定代表人
-                    'company_type': company_type,  # 企业登记注册类型
-                    'address': address,  # 企业注册属地
-                    'business_address': business_address,  # 企业经营地址
-                    'industry': '',  # 所属行业
-                    'register_date': '',  # 注册时间
-                    'tel_phone': '',  # 联系方式
-                }
-                # logger.info(item)
-
-            '''企业资质'''
-            try:
-                element = html2element(driver.page_source)
-                node = element.xpath('//div[@class="panel-container"]')[0]
-                company_quality_html = element2html(node)
-                company_quality = extract_content(company_quality_html)
-                company['company_quality'] = company_quality
-                company['company_quality_html'] = {'html': company_quality_html}
-            except IndexError:
-                pass
-
-            '''注册人员'''
-            try:
-                company_staff = driver.find_element_by_id("tab-companyStaff")
-                click(driver, company_staff, allow_check_page=True)
-                reg_buttons = driver.find_elements(by=By.XPATH, value='//div[contains(@id, "tab-")]/span')
-                logger.info(f'>>> 人员注册类别 <<<')
-                for element in reg_buttons:
-                    # TODO 页面需翻页的逻辑未添加
-                    logger.info(f'[{element.text}]')
-                    click(driver, element, wait_time=random.randint(1, 3))
-
-                registrar = []
-                element = html2element(driver.page_source)
-                nodes = element.xpath('//div[@id="pane-companyStaff"]//table[@class="el-table__body"]//tr')
-                for node in nodes:
-                    name = "".join(node.xpath('./td[2]//span/text()')).strip()
-                    id_no = "".join(node.xpath('./td[3]/div/text()')).strip()
-                    reg_type = "".join(node.xpath('./td[4]/div/text()')).strip()
-                    reg_no = "".join(node.xpath('./td[5]/div/text()')).strip()
-                    reg_major = "".join(node.xpath('./td[6]/div/text()')).strip()
-                    registrar.append({
-                        'name': name,  # 姓名
-                        'id_no': id_no,  # 身份证号
-                        'reg_type': reg_type,  # 注册类别
-                        'reg_no': reg_no,  # 注册号(执业印章号)
-                        'reg_major': reg_major,  # 注册专业
-                    })
-                company['company_staff'] = registrar
-            except IndexError:
-                pass
-
-            '''不良行为'''
-            try:
-                bad_behavior = driver.find_element_by_id('tab-badBehavior')
-                click(driver, bad_behavior, allow_check_page=True)
-                element = html2element(driver.page_source)
-                node = element.xpath('//div[@aria-labelledby="tab-badBehavior"]/div')[0]
-                bad_behavior_html = element2html(node)
-                bad_behaviors = extract_content(bad_behavior_html)
-                company['bad_behavior'] = bad_behaviors
-                company['bad_behavior_html'] = {'html': bad_behavior_html}
-            except IndexError:
-                pass
-
-            '''黑名单记录'''
-            try:
-                black_list = driver.find_element_by_id('tab-blackList')
-                click(driver, black_list, allow_check_page=True)
-                element = html2element(driver.page_source)
-                node = element.xpath('//div[@id="pane-blackList"]/div')[0]
-                black_list_html = element2html(node)
-                black_list_array = extract_content(black_list_html)
-                company['black_list'] = black_list_array
-                company['black_list_html'] = {'html': black_list_html}
-            except IndexError:
-                pass
-
-            '''失信联合惩戒记录'''
-            try:
-                punish = driver.find_element_by_id('tab-punishLog')
-                click(driver, punish, allow_check_page=True)
-                element = html2element(driver.page_source)
-                node = element.xpath('//div[@id="pane-punishLog"]/div')[0]
-                punish_html = element2html(node)
-                punish_array = extract_content(punish_html)
-                company['punish'] = punish_array
-                company['punish_html'] = {'html': punish_html}
-            except IndexError:
-                pass
-
-            '''保存企业数据'''
-            if len(company['credit_no']) > 0:
-                company_tab.insert_one(company)
-                r.hset(redis_key, sha1(title), title)
-                logger.info(f'>>> {title} - {current_url} - 采集成功 - 保存入库')
-            else:
-                exception_count += 1  # 页面无企业数据
-                logger.info(f'>>> {title} - {current_url} - 采集失败 - 无社会信用代码')
-
-            '''关闭详情页标签'''
-            driver.close()
-            '''返回列表页'''
-            driver.switch_to.window(handler)
-            '''下一条执行时间'''
-            time.sleep(2)
-    else:
-        return True
-
-
-def downloader(driver: Chrome, handler):
-    while True:
-        logger.info(f">>> 第{current_page(driver.page_source)}页 <<<")
-        allow_crawl = crawl_spider(driver, handler)
-        '''是否继续采集'''
-        if not allow_crawl:
-            logger.info("网站数据异常,终止采集")
-            return False
-        '''翻页'''
-        if not next_page(driver):
-            logger.info('采集结束')
-            break
-    return True
-
-
-def select_province(driver: Chrome, records):
-    """选择注册属地"""
-    '''点击省份下拉框'''
-    drop_down_button = driver.find_element_by_xpath('//div[@class="region-select"]/div[1]/div[1]/span[1]/span[1]/i[contains(@class,"el-select__caret el-input__icon el-icon-arrow-up")]')
-    click(driver, drop_down_button, wait_time=1)
-    '''选择省份'''
-    li_elements = driver.find_elements(by=By.XPATH, value='/html/body/div[@class="el-select-dropdown el-popper"][1]/div[1]/div[1]/ul/li')
-    for element in li_elements:
-        province = element.text
-        if province not in records:
-            logger.info(f'>> 企业注册属地省份:{province} <<')
-            click(driver, element, wait_time=1.5)
-            records.append(province)
-            return False
-    else:
-        return True
-
-
-def select_categories(driver: Chrome, records):
-    span_elements = driver.find_elements(by=By.XPATH, value='//div[@class="labelInPut labelInPutRadio"]/span')
-    for element in span_elements:
-        qualification = element.text
-        if qualification not in records:
-            logger.info(f'>> 企业资质类别:{qualification} <<')
-            records.setdefault(qualification, [])
-
-        provinces = records.get(qualification)
-        if provinces is not None:
-            if len(provinces) < 32:
-                click(driver, element, wait_time=1.5)
-                crawl_finished = select_province(driver, provinces)
-                if not crawl_finished:
-                    click_query(driver, wait_time=2)
-                    return False
-    else:
-        return True
-
-
-def start(enable_remote_driver=False):
-    '''
-
-    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"  --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir="./data"
-
-    '''
-    options = webdriver.ChromeOptions()
-    if enable_remote_driver:
-        options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
-    options.add_argument("--disable-gpu")
-
-    chrome_driver = webdriver.Chrome(
-        executable_path="/Users/dongzhaorui/Downloads/chromedriver-mac-x64/chromedriver",
-        options=options
-    )
-    main_handler = chrome_driver.current_window_handle  # 获取句柄
-    '''清除其余窗口'''
-    for handler in chrome_driver.window_handles:
-        if handler != main_handler:
-            chrome_driver.switch_to.window(handler)
-            chrome_driver.close()
-            chrome_driver.switch_to.window(main_handler)
-
-    chrome_driver.get(CRAWL_SITE)
-    time.sleep(3)
-    '''采集记录'''
-    records = {
-        '全部': None,
-        '造价咨询企业': None,
-    }
-    while True:
-        crawl_finished = select_categories(chrome_driver, records)
-        if crawl_finished:
-            logger.info('任务结束')
-            break
-        '''下载数据'''
-        _continue = downloader(chrome_driver, main_handler)
-        if not _continue:
-            break
-
-    if not enable_remote_driver:
-        chrome_driver.quit()
-
-
-if __name__ == '__main__':
-    # Disabled alternative entry point: call start() in an endless loop and,
-    # after any exception, log a message and sleep 100 seconds before retrying.
-    # while True:
-    #     try:
-    #         start(enable_remote_driver=True)
-    #     except:
-    #         logger.info("等待100秒")
-    #         time.sleep(100)
-
-    # Single run, attaching to the Chrome instance on the debugger address.
-    start(enable_remote_driver=True)