Browse Source

新增-去重域名表|新发现的域名|种子urls表

dongzhaorui 3 years ago
parent
commit
53ee767ace
1 changed file with 11 additions and 7 deletions
  1. 11 7
      find_source/settings.py

+ 11 - 7
find_source/settings.py

@@ -5,12 +5,16 @@ from common.databases import mongo_table, redis_client
 MGO_DATABASE = 'shujuziyuan'
 '''判重库'''
 MGO_REPETITION = mongo_table(db=MGO_DATABASE, name='repetition_url')
-'''结果'''
-MGO_RECORDS = mongo_table(db=MGO_DATABASE, name='records')
-'''搜索|组织|单位'''
-MGO_VISIT_ORGANIZATION = mongo_table(db=MGO_DATABASE, name='visit_organization')
-'''搜索|关键词'''
-MGO_VISIT_KEYWORDS = mongo_table(db=MGO_DATABASE, name='visit_keywords')
+'''去重域名表'''
+MGO_REMOVAL_DOMAIN = mongo_table(db=MGO_DATABASE, name='removal_duplicate_domains')
+'''新发现的域名'''
+MGO_DOMAIN = mongo_table(db=MGO_DATABASE, name='new_domains')
+'''搜索【组织|单位】'''
+MGO_SEED_ORGS = mongo_table(db=MGO_DATABASE, name='seed_organizations')
+'''搜索关键词'''
+MGO_SEED_KEYWORDS = mongo_table(db=MGO_DATABASE, name='seed_keywords')
+'''种子urls'''
+MGO_SEED_URLS = mongo_table(db=MGO_DATABASE, name='seed_urls')
 '''redis'''
 REDIS = redis_client()
 REDIS_KEY = 'retrieve_urls'
@@ -21,7 +25,7 @@ REQUIREMENT_PHRASE = [
     '议价', '中选', '答疑', '合同', '竞价', '变更', '更正', '预告', '集采', '抽取', '抽签',
     '中止公告', '终止公告', '竞卖', '竞买', '论证', '拟建', '审批', '环评'
 ]
-'''无用词组|广告|涉黄信息|涉黑信息|垃圾'''
+'''需要剔除或者过滤的关键字'''
 SENSITIVE_WORDS = [
     '通知', '邮箱', '登录'
 ]