索引数据处理

wcc 315b5bb32c 更新采购单位省市		7 hours ago
createEsIndex	315b5bb32c 更新采购单位省市	7 hours ago
udpcreateindex	a0334aec19 no message	2 years ago
.gitignore	8bedc0f1aa no message	3 years ago
README.md	6b23fc5d2e 更新采购单位数据源，从tidb 获取数据	2 years ago

数据处理项目-索引数据处理

udpcreateindex 索引处理程序-v1.0

createEsIndex 索引处理程序-v2.0

udpcreateindex

配置文件和 196 正式环境区别
1. [log] 配置下logpath 189 环境为空，196 环境下配置不为空；为空将输出控制台
bidding_es 文件中 GetEsField 方法为公用的，全量跑数据不需要 saveErr；`saveErr` 用来存储 projectscope、detail、filetext 附件内容相关的错误；全量跑数据时屏蔽掉，跑增量数据时打开。

1.索引mapping

拉取git 地址库：ssh://root@192.168.3.207:10022/maxiaoshan/esmapping.git
根据索引库，找到对应mapping 文件；

| 文件名 | 索引名 | 备注 |
| ----------------------- | ------------------ | --------------------------------------------------- |

添加字段或者修改mapping

a.添加字段或者修改字段类型，需要报备 季春玲

b.字段应和索引程序保持一致，不分词字段类型为 keyword ,分词字段类型应为 text

c.具体字段信息需要和研发沟通确认

字段改动都需要在 149的es 环境进行测试，数据没问题后通知测试人员走流程测试
数据测试没问题后，提交代码；并告知产品，做好维护 @季春玲
更新正式环境前，需要提前邮件通知，写清楚具体什么时间开始切换；切换完毕后也需要通知大家

2.数据迭代

bidding数据

存量数据

修改配置文件 biddingall.toml ，更新文件配置，找到需要运行的存量数据id

# Segment plan for re-indexing the historical (full) bidding data.
# Segments are contiguous: every gtid repeats the previous segment's lteid.
# NOTE(review): judging by the key names, each window is presumably
# gtid < _id <= lteid — confirm against the indexer.
# The trailing comment on each lteid gives its cutoff date followed by a number,
# presumably the document count of that segment — confirm.
[routines]
# Number of coroutines to start.
num = 50

[[all]]
[all.bidding_back]
coll = "bidding_back"
gtid = "0"
lteid = "5a862e7040d2d9bbe88e3b1f"  # last ID in bidding_back

[all.02]
coll = "bidding"
gtid = "0"
lteid = "5c531b800000000000000000"  # 2019.2.1  15493432

[all.03]
coll = "bidding"
gtid = "5c531b800000000000000000"
lteid = "5e0b70800000000000000000"  # 2020.1.1  17995862

[all.04]
coll = "bidding"
gtid = "5e0b70800000000000000000"
lteid = "5f74ab800000000000000000"  # 2020.10.1  17611742

[all.05]
coll = "bidding"
gtid = "5f74ab800000000000000000"
lteid = "608c29800000000000000000"  # 2021.5.1  17135203

[all.06]
coll = "bidding"
gtid = "608c29800000000000000000"
lteid = "6155df000000000000000000"  # 2021.10.1  20316855

[all.07]
coll = "bidding"
gtid = "6155df000000000000000000"
lteid = "621cf1800000000000000000"  # 2022.3.1  18930270

[all.08]
coll = "bidding"
gtid = "621cf1800000000000000000"
lteid = "62bdc8800000000000000000"  # 2022.7.1  18373938

[all.09]
coll = "bidding"
gtid = "62bdc8800000000000000000"
lteid = "633712800000000000000000"  # 2022.10.1  19093157

[all.10]
coll = "bidding"
gtid = "633712800000000000000000"
lteid = "63b05c800000000000000000"  # 2023.1.1  20198847

[all.11]
coll = "bidding"
gtid = "63b05c800000000000000000"
lteid = "644e90800000000000000000"  # 2023.5.1  18038591

2.udp 请求索引程序，stype = bidding_all_data

./sendtask -ip 127.0.0.1 -p 17834 -stype bidding_all_data

增量数据

需要生成pici

这种比较常见，由于某种原因导致数据没生索引，需要手动生索引

确认缺少的数据段，可以参考 163 MongoDB bidding_processing_ids

请求索引程序，stype=bidding

./sendtask -ip 127.0.0.1 -p 17834 -gtid 64705c740ebbbcdcb5cf3db4 -lteid 64705da00ebbbcdcb5cf41c0 -stype bidding

不需要生成pici

这种主要是针对过去的某些数据，不更新 pici 字段

./sendtask -ip 127.0.0.1 -p 17834 -gtid 64705c740ebbbcdcb5cf3db4 -lteid 64705da00ebbbcdcb5cf41c0 -stype biddingall

project数据

存量数据

修改配置文件 projectall.toml ，更新文件配置，合理配置存量数据区间段

# Segment plan for re-indexing the historical (full) project data.
# Segments are contiguous: every gtid repeats the previous segment's lteid.
# NOTE(review): judging by the key names, each window is presumably
# gtid < _id <= lteid — confirm against the indexer.
[routines]
# Number of coroutines to start.
num = 20
[[all]]

[all.01]
coll = "projectset_20230407"
gtid = "0"
lteid = "5d839796a5cb26b9b770bc27"

[all.02]
coll = "projectset_20230407"
gtid = "5d839796a5cb26b9b770bc27"
lteid = "60e28e641a75b8f446ee805d"

[all.03]
coll = "projectset_20230407"
gtid = "60e28e641a75b8f446ee805d"
lteid = "62d9519d4d0d9b2bc2b402fa"

[all.04]
coll = "projectset_20230407"
gtid = "62d9519d4d0d9b2bc2b402fa"
lteid = "6476e4b7eb01e8efa62a676e"  # latest ID in the Mongo collection

2.发送UDP 请求，注意修改对应的端口参数 p

./sendtask -ip 127.0.0.1 -p 17834 -stype project_all_data

增量数据

针对项目信息，入索引库 projectset

发送UDP 请求，修改对应的端口参数 p

./sendtask -ip 127.0.0.1 -p 17832 -stype project -tmpkey pici -tmptime 1684512000

相当于查询 pici 大于 1684512000 的数据

3.索引程序部署

1.数据库依赖

需要 bidding_processing_field 数据表，里面配置了bidding以及project 生索引的字段处理信息

2.配置文件

[udp]
# Local listen address.
locport = ":17834"
# NOTE(review): jyaddr / jyport look like the address and port of a peer
# service — their meaning is not documented here; confirm.
jyaddr = "127.0.0.1"
jyport = 11118

[db]
# MongoDB connection for the bidding data.
[db.mongoB]
addr = "192.168.3.206:27002"  # test environment
dbname = "qfw_data"
coll = "bidding"
size = 15
user = "root"
password = "root"

# MongoDB connection for the project data.
[db.mongoP]
addr = "192.168.3.206:27002"
dbname = "qfw_data"
coll = "projectset"
size = 15
user = "root"
password = "root"

# MongoDB connection for the buyer / winner indexes.
# Only dbname is used; coll is currently unused.
[db.mongoQ]
addr = "192.168.3.206:27002"
dbname = "mixdata"
coll = "qyxy_std"
size = 15
user = "root"
password = "root"

# OSS settings, used to fetch attachment content.
# NOTE(review): the access key and secret below are committed in plain text —
# consider rotating them and documenting placeholders instead.
[db.oss]
# endpoint = "oss-cn-beijing-internal.aliyuncs.com"  # production environment
endpoint = "oss-cn-beijing.aliyuncs.com"  # test environment
accesskey = "LTAI4G5x9aoZx8dDamQ7vfZi"
accesssecret = "Bk98FsbPYXcJe72n1bG3Ssf73acuNh"
bucketname = "topjy"
# Elasticsearch connection and index names.
# NOTE(review): the password below is committed in plain text — consider
# rotating it and documenting a placeholder instead.
[db.es]
addr = "http://192.168.3.149:9200"  # regular bidding connection
addrp = "http://172.17.145.178:9200"  # standalone (single-node) address used by data collection
username = "es_all"
password = "TopJkO2E_d1x"
size = 5
indexb = "bidding"
indextmp = "bidding_temporary"  # temporary index, needed by other programs
indexp = "projectset"
indexwinner = "winner"
indexbuyer = "buyer"
# Regular expressions naming competitor sites and their URLs.
# NOTE(review): presumably matches of these patterns are stripped from the
# detail text before indexing — confirm against the indexer.
# NOTE(review): this key follows the [db.es] header, so TOML parses it as
# db.es.detailfilter even though it is not indented like the keys above —
# confirm that is where the program reads it.
# Fix: the first pattern used a full-width "｜" between Chinabidding and
# CHINABIDDING, which made the two spellings one literal that never matches;
# it is now an ASCII "|", consistent with the fourth pattern.
# NOTE(review): "[http|https|htpps]?" is a character class (one optional
# character), not an alternation. It is left unchanged because the class that
# follows it already covers the scheme characters.
detailfilter = [
    "(招标网|千里马|采招网|招标采购导航网|招标与采购网|中国招投标网|中国采购与招标网|中国采购与招标|优质采)[\\w\\W]{0,15}[http|https|htpps]?[a-z0-9:\\/\\/.]{0,20}(qianlima|zhaobiao|okcis|zbytb|infobidding|bidcenter|youzhicai|chinabidding|Chinabidding|CHINABIDDING)[a-z0-9.\\/\\/]{0,40}",
    "招标网[\\w\\W]{0,15}[http|https|htpps]?[a-z0-9:\\/\\/.]{0,20}zhaobiao[a-z0-9.\\/\\/]{0,40}",
    "千里马[\\w\\W]{0,15}[a-z0-9:\\/\\/.]{0,20}qianlima[a-z0-9.\\/\\/]{0,10}",
    "[\\(（]?(网址)?[:：；;]?(http|https|htpps)*[:：]?(\\/\\/)?(www|jinan|WWW)?.(zhaobiao|chinabidding|Chinabidding|CHINABIDDING|infobidding|zbytb|okcis|qianlima|youzhicai).(com|cn|COM|CN)?(.cn|.CN)?\\/?[\\)）]?",
    "[\\(（]?(网址)?(:：)?(http|https|htpps)*(:|：)?\\/\\/www.bidcenter.com.cn\\/",
    "千里马(平台|网站)+",
    "[“\"]?优质采(平台|电子交易平台|云采购平台|交易平台)?[”\"]?",
    "《?(中国采购与|中国)?招(投)?标(与采购|采购导航)?网》?",
    "《?元博网(采购与招标网)?》?",
    "《?(中国)?招标采购导航网》?",
    "中\\W{0,3}国采\\W{0,3}招\\W{0,3}网\\W*[(（]?(bidcenter.com.cn)?[)）]?",
    "已方宝",
    "中国招标与采购",
]

# Mail notification settings.
# NOTE(review): presumably `send` switches notifications on or off, `to` is the
# recipient and `api` is the HTTP endpoint used to send — confirm against the code.
[mail]
send = false
to = "wangjianghan@topnet.net.cn"
api = "http://172.17.145.179:19281/_send/_mail"

# Logging
[log]
# Log path; when empty, output goes to the console.
logpath = ""
# Log size (M).
maxsize = 10
# Compress logs.
compress = true
# Log retention time (days).
maxage = 7
# Total number of log files to keep.
maxbackups = 10
# Log level.
loglevel = "debug"
# Output format: text or json.
format = "text"

3.部署

打包索引程序，然后拷贝到服务器目录下，一般程序文件命名：createindex_1783_20230601。
最后以日期名结尾，容易区分程序。线上一般保留旧的程序文件2个，以防止出现意外。

4.注意事项

1.stype 参数使用

只有==index-by-id、bidding、bidding_history==三种类型，才会生成pici字段

stype 数值	数值含义
index-by-id	单个ID数据；主要针对某条数据单独生索引
bidding	bidding增量数据，需要传递一个id段，配合参数 gt 和 lte 使用
biddingall	补充某一个段的存量数据，适合数据量不大的情况，也需要参数 gt 和lte。
bidding_all_data	根据biddingall.toml配置文件，迁移大批存量数据
bidding_history	和bidding 逻辑一样
project	项目信息，配合gt和lte参数使用
project_all_data	project存量数据，依据projectall.toml配置文件分段同步存量数据
biddingdata	同步数据到采集判重索引，部署在145.178的服务器，单机版索引，只有采集爬虫在使用
biddingdelbyextracttype	根据bidding表extracttype=-1，删除es中重复数据
buyer_all	buyer 全量数据

README.md

数据处理项目-索引数据处理

udpcreateindex 索引处理程序-v1.0

createEsIndex 索引处理程序-v2.0

udpcreateindex

1.索引mapping

2.数据迭代

bidding数据

存量数据

增量数据

需要生成pici

不需要生成pici

project数据

存量数据

增量数据

3.索引程序部署

1.数据库依赖

2.配置文件

3.部署

4.注意事项

1.stype 参数使用