extractInit.go 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. log "github.com/donnie4w/go-logger/logger"
  14. )
  15. type RegLuaInfo struct { //正则或脚本信息
  16. Code, Name, Field string //
  17. RuleText string //
  18. IsLua bool //
  19. RegPreBac *ExtReg //
  20. RegCore *ExtReg //
  21. LFields map[string]string //lua抽取字段属性组
  22. }
  23. type ExtReg struct {
  24. Reg *regexp.Regexp
  25. Replace string
  26. Bextract bool
  27. ExtractPos map[string]int
  28. }
  29. type RuleCore struct {
  30. Field string //逻辑字段
  31. LuaLogic string //进入逻辑
  32. ExtFrom string //从哪个字段抽取
  33. RulePres []*RegLuaInfo //抽取前置规则
  34. RuleBacks []*RegLuaInfo //抽取后置规则
  35. RuleCores []*RegLuaInfo //抽取规则
  36. }
  37. type Tag struct {
  38. Type string //标签类型 string 字符串、regexp 正则
  39. Key string //
  40. Reg *regexp.Regexp //
  41. }
  42. type TaskInfo struct {
  43. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  44. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  45. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  46. TestColl, LastExtId string //测试结果表、上次抽取信息id
  47. FDB *db.Pool //数据库连接池
  48. TDB *db.Pool //数据库连接池
  49. IsEtxLog bool //是否开启抽取日志
  50. ProcessPool chan bool //任务进程池
  51. TestLua bool //检查测试用
  52. }
  53. type ExtractTask struct {
  54. Id string //任务id
  55. IsRun bool //是否启动
  56. Content string //信息内容
  57. TaskInfo *TaskInfo //任务信息
  58. RulePres []*RegLuaInfo //通用前置规则
  59. RuleBacks []*RegLuaInfo //通用后置规则
  60. RuleCores []*RuleCore //抽取规则
  61. PkgRuleCores []*RuleCore //分包抽取规则
  62. Tag map[string][]*Tag //标签库
  63. ClearFn map[string][]string //清理函数
  64. IsExtractCity bool //是否开启城市抽取
  65. Fields map[string]int //抽取属性组
  66. IsFileField bool //是否开启附件抽取
  67. FileFields *sync.Map //抽取附件属性组
  68. ResultChanel chan bool //抽取结果详情
  69. ResultArr [][]map[string]interface{} //抽取结果详情
  70. BidChanel chan bool //抽取结果
  71. BidArr [][]map[string]interface{} //抽取结果
  72. RecogFieldMap map[string]map[string]interface{} //识别字段
  73. FidClassMap map[string][]map[string]interface{} //分类
  74. CidRuleMap map[string][]map[string]interface{} //规则
  75. AuditFields []string //需要审核的字段名称
  76. ProvinceMap map[string]string
  77. CityBrief map[string]*City //只加载一次即可
  78. ProvinceBrief map[string]*Province //只加载一次
  79. AreaToCity map[string][]*City //两个文件共用
  80. DistrictCityMap map[string]*City
  81. StreetDistrictMap map[string]*District
  82. AreaGet *ju.DFA //市全称
  83. AreaDistrict *ju.DFA //区或县
  84. AreaProvinceGet *ju.DFA //省
  85. AreaSimGet *ju.DFA //市简称
  86. AreaStreet *ju.DFA //街道
  87. }
  88. type ClearTaskInfo struct {
  89. Name, Version, VersionId string //名称、版本、版本id
  90. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  91. FDB *db.Pool //数据库连接池
  92. TDB *db.Pool //数据库连接池
  93. IsCltLog bool //是否开启清理日志
  94. ProcessPool chan bool //任务进程池
  95. }
  96. type ClearLua struct {
  97. Field string //字段字段
  98. Code string //代码
  99. Name string //名称
  100. LuaText string
  101. //LuaLogic string //进入逻辑
  102. //ExtFrom string //从哪个字段抽取
  103. LFields map[string]string //lua抽取字段属性组
  104. }
  105. type ClearTask struct {
  106. Id string //任务id
  107. Content string //信息内容
  108. ClearTaskInfo *ClearTaskInfo //任务信息
  109. ClearLuas map[string][]*ClearLua //清理脚本
  110. UpdateResult [][]map[string]interface{} //清理后结果
  111. ClearChannel chan bool
  112. }
  113. func init() {
  114. TaskList = make(map[string]*ExtractTask)
  115. ClearTaskList = make(map[string]*ClearTask)
  116. go SaveExtLog()
  117. go SaveCltLog() //保存清理日志
  118. }
  119. //加载任务信息
  120. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  121. task, _ := db.Mgo.FindById("task", e.Id, nil)
  122. if len(*task) > 1 {
  123. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  124. e.TaskInfo = &TaskInfo{
  125. Name: (*task)["s_taskname"].(string),
  126. Version: (*task)["s_version"].(string),
  127. VersionId: qu.BsonIdToSId((*v)["_id"]),
  128. TrackColl: trackcoll,
  129. FromDbAddr: (*task)["s_mgoaddr"].(string),
  130. FromDB: (*task)["s_mgodb"].(string),
  131. FromColl: (*task)["s_mgocoll"].(string),
  132. TestColl: resultcoll,
  133. IsEtxLog: true,
  134. ProcessPool: make(chan bool, 1),
  135. }
  136. if (*v)["isextractcity"] != nil {
  137. e.IsExtractCity = (*v)["isextractcity"].(bool)
  138. }
  139. } else {
  140. return
  141. }
  142. }
  143. //加载任务信息
  144. func (e *ExtractTask) InitTaskInfo() {
  145. task, _ := db.Mgo.FindById("task", e.Id, nil)
  146. log.Debug("task", task)
  147. if len(*task) > 1 {
  148. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  149. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  150. log.Debug("s_mgosavecoll", strs)
  151. if len(strs) < 3 {
  152. return
  153. } else {
  154. e.TaskInfo = &TaskInfo{
  155. Name: (*task)["s_taskname"].(string),
  156. Version: (*task)["s_version"].(string),
  157. VersionId: qu.BsonIdToSId((*v)["_id"]),
  158. //TrackColl: (*task)["s_trackcoll"].(string),
  159. FromDbAddr: (*task)["s_mgoaddr"].(string),
  160. FromDB: (*task)["s_mgodb"].(string),
  161. FromColl: (*task)["s_mgocoll"].(string),
  162. ToDbAddr: strs[0],
  163. ToDB: strs[1],
  164. ToColl: strs[2],
  165. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  166. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  167. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  168. }
  169. if (*v)["isextractcity"] != nil {
  170. e.IsExtractCity = (*v)["isextractcity"].(bool)
  171. }
  172. }
  173. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  174. } else {
  175. return
  176. }
  177. }
  178. //加载通用前置规则
  179. func (e *ExtractTask) InitRulePres() {
  180. defer qu.Catch()
  181. e.RulePres = []*RegLuaInfo{}
  182. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  183. for _, v := range *list {
  184. rinfo := &RegLuaInfo{
  185. Code: v["s_code"].(string),
  186. Name: v["s_name"].(string),
  187. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  188. }
  189. if rinfo.IsLua {
  190. rinfo.RuleText = v["s_luascript"].(string)
  191. e.RulePres = append(e.RulePres, rinfo)
  192. } else {
  193. qu.Try(func() {
  194. rinfo.RuleText = v["s_rule"].(string)
  195. tmp := strings.Split(rinfo.RuleText, "__")
  196. var pattern string
  197. if strings.Contains(tmp[0], "\\u") {
  198. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  199. } else {
  200. pattern = tmp[0]
  201. }
  202. if len(tmp) == 2 {
  203. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  204. } else {
  205. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  206. }
  207. e.RulePres = append(e.RulePres, rinfo)
  208. }, func(err interface{}) {
  209. log.Debug(rinfo.Code, rinfo.Field, err)
  210. })
  211. }
  212. }
  213. }
  214. //加载通用后置规则
  215. func (e *ExtractTask) InitRuleBacks() {
  216. defer qu.Catch()
  217. e.RuleBacks = []*RegLuaInfo{}
  218. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  219. for _, v := range *list {
  220. rinfo := &RegLuaInfo{
  221. Code: v["s_code"].(string),
  222. Name: v["s_name"].(string),
  223. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  224. }
  225. if rinfo.IsLua {
  226. rinfo.RuleText = v["s_luascript"].(string)
  227. e.RuleBacks = append(e.RuleBacks, rinfo)
  228. } else {
  229. qu.Try(func() {
  230. rinfo.RuleText = v["s_rule"].(string)
  231. tmp := strings.Split(rinfo.RuleText, "__")
  232. var pattern string
  233. if strings.Contains(tmp[0], "\\u") {
  234. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  235. } else {
  236. pattern = tmp[0]
  237. }
  238. if len(tmp) == 2 {
  239. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  240. } else {
  241. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  242. }
  243. e.RuleBacks = append(e.RuleBacks, rinfo)
  244. }, func(err interface{}) {
  245. log.Debug(rinfo.Code, rinfo.Field, err)
  246. })
  247. }
  248. }
  249. }
  250. //加载抽取规则
  251. func (e *ExtractTask) InitRuleCore() {
  252. defer qu.Catch()
  253. e.Fields = map[string]int{}
  254. e.RuleCores = []*RuleCore{}
  255. vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  256. for _, vinfo := range *vinfos {
  257. if b, _ := vinfo["isuse"].(bool); !b {
  258. continue
  259. }
  260. s_field := qu.ObjToString(vinfo["s_field"])
  261. pid := qu.BsonIdToSId(vinfo["_id"])
  262. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  263. for _, vv := range *list {
  264. if b, _ := vv["isuse"].(bool); !b {
  265. continue
  266. }
  267. rcore := &RuleCore{}
  268. rcore.Field = s_field
  269. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  270. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  271. //前置规则
  272. rulePres := []*RegLuaInfo{}
  273. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  274. for _, v := range *plist {
  275. rinfo := &RegLuaInfo{
  276. Field: qu.ObjToString(v["s_field"]),
  277. Code: v["s_code"].(string),
  278. Name: v["s_name"].(string),
  279. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  280. }
  281. if rinfo.IsLua {
  282. rinfo.RuleText = v["s_luascript"].(string)
  283. rulePres = append(rulePres, rinfo)
  284. } else {
  285. qu.Try(func() {
  286. rinfo.RuleText = v["s_rule"].(string)
  287. tmp := strings.Split(rinfo.RuleText, "__")
  288. var pattern string
  289. if strings.Contains(tmp[0], "\\u") {
  290. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  291. } else {
  292. pattern = tmp[0]
  293. }
  294. if len(tmp) == 2 {
  295. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  296. } else {
  297. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  298. }
  299. rulePres = append(rulePres, rinfo)
  300. }, func(err interface{}) {
  301. log.Debug(rinfo.Code, rinfo.Field, err)
  302. })
  303. }
  304. }
  305. rcore.RulePres = rulePres
  306. //后置规则
  307. ruleBacks := []*RegLuaInfo{}
  308. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  309. for _, v := range *blist {
  310. rinfo := &RegLuaInfo{
  311. Field: qu.ObjToString(v["s_field"]),
  312. Code: v["s_code"].(string),
  313. Name: v["s_name"].(string),
  314. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  315. }
  316. if rinfo.IsLua {
  317. rinfo.RuleText = v["s_luascript"].(string)
  318. ruleBacks = append(ruleBacks, rinfo)
  319. } else {
  320. qu.Try(func() {
  321. rinfo.RuleText = v["s_rule"].(string)
  322. tmp := strings.Split(rinfo.RuleText, "__")
  323. var pattern string
  324. if strings.Contains(tmp[0], "\\u") {
  325. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  326. } else {
  327. pattern = tmp[0]
  328. }
  329. if len(tmp) == 2 {
  330. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  331. } else {
  332. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  333. }
  334. ruleBacks = append(ruleBacks, rinfo)
  335. }, func(err interface{}) {
  336. log.Debug(rinfo.Code, rinfo.Field, err)
  337. })
  338. }
  339. }
  340. rcore.RuleBacks = ruleBacks
  341. //抽取规则
  342. ruleCores := []*RegLuaInfo{}
  343. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  344. for _, v := range *clist {
  345. if b, _ := v["isuse"].(bool); !b {
  346. continue
  347. }
  348. field := qu.ObjToString(v["s_field"])
  349. e.Fields[field] = 1 //加入抽取属性组备用
  350. rinfo := &RegLuaInfo{
  351. Field: field,
  352. Code: v["s_code"].(string),
  353. Name: v["s_name"].(string),
  354. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  355. }
  356. if rinfo.IsLua {
  357. rinfo.RuleText = v["s_luascript"].(string)
  358. //提取全部属性
  359. rinfo.LFields = getALLFields()
  360. ruleCores = append(ruleCores, rinfo)
  361. } else {
  362. qu.Try(func() {
  363. rinfo.RuleText = v["s_rule"].(string)
  364. tmp := strings.Split(rinfo.RuleText, "__")
  365. var pattern string
  366. if strings.Contains(tmp[0], "\\u") {
  367. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  368. } else {
  369. pattern = tmp[0]
  370. }
  371. if len(tmp) == 2 {
  372. epos := strings.Split(tmp[1], ",")
  373. posm := map[string]int{}
  374. for _, v := range epos {
  375. ks := strings.Split(v, ":")
  376. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  377. posm[ks[1]] = qu.IntAll(ks[0])
  378. } else { //(.*)招标公告__2
  379. posm[rinfo.Field] = qu.IntAll(ks[0])
  380. }
  381. }
  382. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  383. } else {
  384. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  385. }
  386. ruleCores = append(ruleCores, rinfo)
  387. }, func(err interface{}) {
  388. log.Debug(rinfo.Code, rinfo.Field, err)
  389. })
  390. }
  391. }
  392. rcore.RuleCores = ruleCores
  393. //
  394. e.RuleCores = append(e.RuleCores, rcore)
  395. }
  396. }
  397. }
  398. //加载分包抽取规则
  399. func (e *ExtractTask) InitPkgCore() {
  400. defer qu.Catch()
  401. e.PkgRuleCores = []*RuleCore{}
  402. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  403. for _, pkginfo := range *pkginfos {
  404. if b, _ := pkginfo["isuse"].(bool); !b {
  405. continue
  406. }
  407. s_field := qu.ObjToString(pkginfo["s_field"])
  408. pid := qu.BsonIdToSId(pkginfo["_id"])
  409. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  410. for _, vv := range *logicList {
  411. if b, _ := vv["isuse"].(bool); !b {
  412. continue
  413. }
  414. rcore := &RuleCore{}
  415. rcore.Field = s_field
  416. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  417. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  418. //后置规则
  419. ruleBacks := []*RegLuaInfo{}
  420. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  421. for _, v := range *blist {
  422. rinfo := &RegLuaInfo{
  423. Field: qu.ObjToString(v["s_field"]),
  424. Code: v["s_code"].(string),
  425. Name: v["s_name"].(string),
  426. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  427. }
  428. if rinfo.IsLua {
  429. rinfo.RuleText = v["s_luascript"].(string)
  430. ruleBacks = append(ruleBacks, rinfo)
  431. } else {
  432. qu.Try(func() {
  433. rinfo.RuleText = v["s_rule"].(string)
  434. tmp := strings.Split(rinfo.RuleText, "__")
  435. var pattern string
  436. if strings.Contains(tmp[0], "\\u") {
  437. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  438. } else {
  439. pattern = tmp[0]
  440. }
  441. if len(tmp) == 2 {
  442. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  443. } else {
  444. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  445. }
  446. ruleBacks = append(ruleBacks, rinfo)
  447. }, func(err interface{}) {
  448. log.Debug(rinfo.Code, rinfo.Field, err)
  449. })
  450. }
  451. }
  452. rcore.RuleBacks = ruleBacks
  453. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  454. }
  455. }
  456. }
  457. //加载标签库
  458. func (e *ExtractTask) InitTag() {
  459. defer qu.Catch()
  460. e.Tag = map[string][]*Tag{}
  461. //字符串标签库
  462. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  463. for _, v := range *list {
  464. field := qu.ObjToString(v["s_field"])
  465. if tmp, ok := v["content"].([]interface{}); ok {
  466. fname := qu.ObjToString(v["s_name"])
  467. tab := ju.TagFile{Name: fname} //用于表格kv
  468. tab.Items = make([]*ju.Tag, len(tmp))
  469. for k, key := range tmp {
  470. tag := &Tag{Type: "string", Key: key.(string)}
  471. e.Tag[field] = append(e.Tag[field], tag)
  472. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  473. }
  474. sort.Sort(tab.Items)
  475. ju.TagdbTable[fname] = &tab
  476. }
  477. }
  478. //正则标签库
  479. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  480. for _, v := range *list {
  481. field := qu.ObjToString(v["s_field"])
  482. if tmp, ok := v["content"].([]interface{}); ok {
  483. fname := qu.ObjToString(v["s_name"])
  484. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  485. tab.Items = make([]*ju.Tag, len(tmp))
  486. for k, key := range tmp {
  487. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  488. e.Tag[field] = append(e.Tag[field], tag)
  489. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  490. }
  491. sort.Sort(tab.Items)
  492. ju.TagdbTable[fname+"_reg"] = &tab
  493. }
  494. }
  495. }
  496. //获取fields
  497. func getALLFields() map[string]string {
  498. fields := map[string]string{}
  499. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  500. for _, v := range *list {
  501. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  502. }
  503. return fields
  504. }
  505. //加载clear函数
  506. func (e *ExtractTask) InitClearFn() {
  507. defer qu.Catch()
  508. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  509. fn := map[string][]string{}
  510. for _, tmp := range *list {
  511. field := tmp["s_field"].(string)
  512. fns := tmp["clear"].([]interface{})
  513. if fn[field] == nil {
  514. fn[field] = []string{}
  515. }
  516. for _, v := range fns {
  517. fn[field] = append(fn[field], v.(string))
  518. }
  519. }
  520. e.ClearFn = fn
  521. }
  522. //加载省份
  523. func InitProvince(version string) map[string]interface{} {
  524. defer qu.Catch()
  525. fn := map[string]interface{}{}
  526. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  527. for _, v := range *list {
  528. name := qu.ObjToString(v["s_name"])
  529. content := v["content"]
  530. switch content.(type) {
  531. case string:
  532. fn[name] = []interface{}{content.(string)}
  533. case []interface{}:
  534. fn[name] = content
  535. }
  536. }
  537. return fn
  538. }
  539. //加载城市简称
  540. func InitCitySim(version string) map[string]map[string]interface{} {
  541. defer qu.Catch()
  542. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  543. fn := map[string]map[string]interface{}{}
  544. for _, v := range *list {
  545. name := qu.ObjToString(v["s_name"])
  546. tmp := v["content"].(map[string]interface{})
  547. fn[name] = tmp
  548. }
  549. return fn
  550. }
  551. //加载城市全称
  552. func InitCityAll(version string) map[string]map[string]interface{} {
  553. defer qu.Catch()
  554. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  555. fn := map[string]map[string]interface{}{}
  556. for _, v := range *list {
  557. name := qu.ObjToString(v["s_name"])
  558. tmp := v["content"].(map[string]interface{})
  559. fn[name] = tmp
  560. }
  561. return fn
  562. }
  563. //初始化城市省份敏感词
  564. func (e *ExtractTask) InitDFA() {
  565. defer qu.Catch()
  566. e.AreaGet = &ju.DFA{}
  567. e.AreaDistrict = &ju.DFA{}
  568. e.AreaProvinceGet = &ju.DFA{}
  569. e.AreaStreet = &ju.DFA{}
  570. //初始化map
  571. if e.ProvinceMap == nil {
  572. e.ProvinceMap = make(map[string]string)
  573. }
  574. if e.CityBrief == nil {
  575. e.CityBrief = make(map[string]*City)
  576. }
  577. if e.ProvinceBrief == nil {
  578. e.ProvinceBrief = make(map[string]*Province)
  579. }
  580. if e.AreaToCity == nil {
  581. e.AreaToCity = make(map[string][]*City)
  582. }
  583. if e.DistrictCityMap == nil {
  584. e.DistrictCityMap = make(map[string]*City)
  585. }
  586. if e.StreetDistrictMap == nil {
  587. e.StreetDistrictMap = make(map[string]*District)
  588. }
  589. //初始化省
  590. fn1 := InitProvince(e.TaskInfo.Version)
  591. for k, v := range fn1 {
  592. for _, p := range v.([]interface{}) {
  593. p1, _ := p.(string)
  594. e.AreaProvinceGet.AddWord(p1)
  595. e.ProvinceMap[p1] = k
  596. }
  597. }
  598. //初始化城市全称
  599. fn2 := InitCityAll(e.TaskInfo.Version)
  600. for k, v := range fn2 {
  601. e.AreaProvinceGet.AddWord(k) //省全称
  602. p := &Province{}
  603. p.Name = k
  604. p.Brief = v["brief"].(string)
  605. e.ProvinceMap[k] = p.Brief
  606. //
  607. e.ProvinceBrief[p.Brief] = p
  608. p.Cap = v["captial"].(string)
  609. city, _ := v["city"].(map[string]interface{})
  610. for k1, v1 := range city {
  611. v1m, _ := v1.(map[string]interface{})
  612. c := &City{}
  613. c.Name = k1
  614. // if v1m["brief"] == nil {
  615. // }
  616. c.Brief = v1m["brief"].(string)
  617. //
  618. e.CityBrief[c.Brief] = c
  619. c.P = p
  620. if c.Brief == p.Cap {
  621. p.Captial = c
  622. }
  623. //加入到城市map中
  624. //
  625. cs := e.AreaToCity[k1]
  626. e.AreaGet.AddWord(k1) //市全称
  627. if cs != nil {
  628. cs = append(cs, c)
  629. } else {
  630. cs = []*City{c}
  631. }
  632. e.AreaToCity[k1] = cs
  633. //区县
  634. districtmap := v1m["area"].(map[string]interface{}) //区或县
  635. for district, streetarr := range districtmap {
  636. d := &District{}
  637. d.Name = district
  638. d.C = c
  639. e.AreaDistrict.AddWord(district) //加入区或县敏感词
  640. ctmp := e.DistrictCityMap[district]
  641. if ctmp == nil {
  642. e.DistrictCityMap[district] = c
  643. }
  644. //街道
  645. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  646. e.AreaStreet.AddWord(s) //加入街道敏感词
  647. dtmp := e.StreetDistrictMap[s]
  648. if dtmp == nil {
  649. e.StreetDistrictMap[s] = d
  650. }
  651. }
  652. }
  653. }
  654. }
  655. //初始化城市简称
  656. fn3 := InitCitySim(e.TaskInfo.Version)
  657. e.AreaSimGet = &ju.DFA{}
  658. for k, v := range fn3 {
  659. pb := v["brief"].(string)
  660. p := e.ProvinceBrief[pb]
  661. //加载
  662. for _, ss := range []string{k, pb} {
  663. cs := e.AreaToCity[ss]
  664. if cs != nil {
  665. cs = append(cs, p.Captial)
  666. } else {
  667. cs = []*City{p.Captial}
  668. }
  669. e.AreaToCity[ss] = cs
  670. e.AreaSimGet.AddWord(ss) //省全称和省简称
  671. }
  672. city, _ := v["city"].(map[string]interface{})
  673. for k1, v1 := range city {
  674. v1m, _ := v1.(map[string]interface{})
  675. if v1m["brief"] == nil {
  676. }
  677. cb := v1m["brief"].(string)
  678. c := e.AreaToCity[k1][0]
  679. //加入到城市map中
  680. for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  681. e.AreaSimGet.AddWord(ss)
  682. cs := e.AreaToCity[ss]
  683. if cs != nil {
  684. cs = append(cs, c)
  685. } else {
  686. cs = []*City{c}
  687. }
  688. e.AreaToCity[ss] = cs
  689. }
  690. arr := v1m["area"].([]interface{})
  691. for _, k2 := range arr {
  692. s := k2.(string)
  693. for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  694. cs := e.AreaToCity[ss]
  695. e.AreaSimGet.AddWord(ss)
  696. if cs != nil {
  697. cs = append(cs, c)
  698. } else {
  699. cs = []*City{c}
  700. }
  701. e.AreaToCity[ss] = cs
  702. //只加入简称
  703. if n == 0 {
  704. d := &District{}
  705. d.Name = ss
  706. d.C = c
  707. e.AreaDistrict.AddWord(ss) //加入区或县简称敏感词
  708. ctmp := e.DistrictCityMap[ss]
  709. if ctmp == nil {
  710. e.DistrictCityMap[ss] = c
  711. }
  712. }
  713. }
  714. }
  715. }
  716. }
  717. }
  718. //保存抽取详情数据
  719. func (e *ExtractTask) ResultSave(init bool) {
  720. defer qu.Catch()
  721. if e.ResultArr == nil {
  722. e.ResultArr = [][]map[string]interface{}{}
  723. }
  724. if init {
  725. go func() {
  726. for {
  727. if len(e.ResultArr) > 500 {
  728. arr := e.ResultArr[:500]
  729. qu.Try(func() {
  730. db.Mgo.UpSertBulk("extract_result", arr...)
  731. }, func(err interface{}) {
  732. log.Debug(err)
  733. })
  734. e.ResultArr = e.ResultArr[500:]
  735. } else {
  736. arr := e.ResultArr
  737. qu.Try(func() {
  738. db.Mgo.UpSertBulk("extract_result", arr...)
  739. }, func(err interface{}) {
  740. log.Debug(err)
  741. })
  742. e.ResultArr = [][]map[string]interface{}{}
  743. }
  744. time.Sleep(10 * time.Second)
  745. }
  746. }()
  747. } else {
  748. arr := e.ResultArr
  749. qu.Try(func() {
  750. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  751. }, func(err interface{}) {
  752. log.Debug(err)
  753. })
  754. e.ResultArr = [][]map[string]interface{}{}
  755. }
  756. }
  757. //保存抽取数据
  758. func (e *ExtractTask) BidSave(init bool) {
  759. defer qu.Catch()
  760. if e.BidArr == nil {
  761. e.BidArr = [][]map[string]interface{}{}
  762. }
  763. if init {
  764. go func() {
  765. for {
  766. if len(e.BidArr) > 500 {
  767. arr := e.BidArr[:500]
  768. qu.Try(func() {
  769. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  770. }, func(err interface{}) {
  771. log.Debug(err)
  772. })
  773. e.BidArr = e.BidArr[500:]
  774. } else {
  775. arr := e.BidArr
  776. qu.Try(func() {
  777. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  778. }, func(err interface{}) {
  779. log.Debug(err)
  780. })
  781. e.BidArr = [][]map[string]interface{}{}
  782. }
  783. time.Sleep(10 * time.Second)
  784. }
  785. }()
  786. } else {
  787. arr := e.BidArr
  788. qu.Try(func() {
  789. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  790. }, func(err interface{}) {
  791. log.Debug(err)
  792. })
  793. e.BidArr = [][]map[string]interface{}{}
  794. time.Sleep(1 * time.Second)
  795. }
  796. }
  797. func (e *ExtractTask) InitAuditRecogField() {
  798. defer qu.Catch()
  799. e.RecogFieldMap = make(map[string]map[string]interface{})
  800. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  801. for _, f := range *recogFieldList {
  802. field := qu.ObjToString(f["s_recogfield"])
  803. e.RecogFieldMap[field] = f
  804. }
  805. }
  806. func (e *ExtractTask) InitAuditClass() {
  807. defer qu.Catch()
  808. e.FidClassMap = make(map[string][]map[string]interface{})
  809. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  810. for _, c := range *class {
  811. classList := []map[string]interface{}{}
  812. fid := qu.ObjToString(c["s_fid"])
  813. if len(e.FidClassMap[fid]) > 0 { //追加
  814. classList = e.FidClassMap[fid]
  815. }
  816. classList = append(classList, c)
  817. e.FidClassMap[fid] = classList
  818. }
  819. }
  820. //加载规则
  821. func (e *ExtractTask) InitAuditRule() {
  822. defer qu.Catch()
  823. var rureg *regexp.Regexp
  824. var rs []rune
  825. var ru string
  826. var err error
  827. e.CidRuleMap = make(map[string][]map[string]interface{})
  828. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  829. for _, v := range *rule {
  830. i_rule := []interface{}{}
  831. ss, _ := (v["s_rule"].([]interface{}))
  832. for _, r := range qu.ObjArrToStringArr(ss) {
  833. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  834. rs = []rune(r)
  835. ru = string(rs[1 : len(rs)-1])
  836. rureg, err = regexp.Compile(ru)
  837. if err != nil {
  838. log.Debug("error---rule:", r)
  839. continue
  840. }
  841. i_rule = append(i_rule, []interface{}{rureg}...)
  842. } else { //规则
  843. i_rule = append(i_rule, r)
  844. }
  845. }
  846. v["rule"] = i_rule
  847. ruleList := []map[string]interface{}{}
  848. classid := qu.ObjToString(v["s_classid"])
  849. if len(e.CidRuleMap[classid]) > 0 { //追加
  850. ruleList = e.CidRuleMap[classid]
  851. }
  852. ruleList = append(ruleList, v)
  853. e.CidRuleMap[classid] = ruleList
  854. }
  855. }
  856. //
  857. func (e *ExtractTask) InitAuditFields() {
  858. if len(e.AuditFields) == 0 {
  859. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  860. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  861. vid := qu.BsonIdToSId((*v)["_id"])
  862. query := map[string]interface{}{
  863. "isaudit": true,
  864. "delete": false,
  865. "vid": vid,
  866. }
  867. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  868. for _, d := range *data {
  869. field := qu.ObjToString(d["s_field"])
  870. e.AuditFields = append(e.AuditFields, field)
  871. }
  872. }
  873. }
  874. }
  875. //加载附件抽取
  876. func (e *ExtractTask) InitFile() {
  877. defer qu.Catch()
  878. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  879. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  880. //ve, _ := db.Mgo.FindOne("version", query)
  881. if ve == nil {
  882. return
  883. }
  884. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  885. e.IsFileField = true
  886. }
  887. syscefiled := new(sync.Map)
  888. if (*ve)["s_filefileds"] != nil {
  889. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  890. syscefiled.Store(vff.(string),1)
  891. }
  892. }
  893. e.FileFields = syscefiled
  894. }
  895. //加载清理任务信息
  896. func (c *ClearTask) InitClearTaskInfo() {
  897. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  898. if len(*cleartask) > 1 {
  899. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  900. c.ClearTaskInfo = &ClearTaskInfo{
  901. Name: (*cleartask)["s_taskname"].(string),
  902. Version: (*cleartask)["s_version"].(string),
  903. VersionId: qu.BsonIdToSId((*v)["_id"]),
  904. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  905. FromDB: (*cleartask)["s_mgodb"].(string),
  906. FromColl: (*cleartask)["s_mgocoll"].(string),
  907. IsCltLog: ju.Config["iscltlog"].(bool),
  908. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  909. }
  910. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  911. } else {
  912. return
  913. }
  914. }
  915. //加载清理脚本
  916. func (c *ClearTask) InitClearLuas() {
  917. defer qu.Catch()
  918. c.ClearLuas = make(map[string][]*ClearLua)
  919. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  920. for _, l := range *list {
  921. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  922. continue
  923. }
  924. s_field := qu.ObjToString(l["s_field"])
  925. pid := qu.BsonIdToSId(l["_id"])
  926. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  927. for _, vv := range *luas {
  928. if b, _ := vv["isuse"].(bool); !b {
  929. continue
  930. }
  931. clearLua := &ClearLua{
  932. Field: s_field,
  933. Code: vv["s_code"].(string),
  934. Name: vv["s_name"].(string),
  935. LuaText: vv["s_luascript"].(string),
  936. LFields: getALLFields(),
  937. }
  938. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  939. }
  940. }
  941. }