extractInit.go 40 KB


  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. log "github.com/donnie4w/go-logger/logger"
  14. )
  15. type RegLuaInfo struct { //正则或脚本信息
  16. Code, Name, Field string //
  17. RuleText string //
  18. IsLua bool //
  19. RegPreBac *ExtReg //
  20. RegCore *ExtReg //
  21. LFields map[string]string //lua抽取字段属性组
  22. }
  // ExtReg is a compiled regular expression together with the settings that
  // say how its matches are used.
  type ExtReg struct {
  	// Reg is the compiled pattern.
  	Reg *regexp.Regexp
  	// Replace is the replacement text used by pre/back rules.
  	Replace string
  	// Bextract is set when the rule carries extraction positions.
  	Bextract bool
  	// ExtractPos maps a field name to the position number given in the rule.
  	ExtractPos map[string]int
  }
  29. type RuleCore struct {
  30. Field string //逻辑字段
  31. LuaLogic string //进入逻辑
  32. ExtFrom string //从哪个字段抽取
  33. RulePres []*RegLuaInfo //抽取前置规则
  34. RuleBacks []*RegLuaInfo //抽取后置规则
  35. RuleCores []*RegLuaInfo //抽取规则
  36. }
  // Tag is one entry of the tag library.
  type Tag struct {
  	// Type is the tag kind: "string" for a literal, "regexp" for a pattern.
  	Type string
  	// Key is the literal text or the pattern source.
  	Key string
  	// Reg is the compiled pattern; it is only set for "regexp" tags.
  	Reg *regexp.Regexp
  }
  42. type TaskInfo struct {
  43. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  44. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  45. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  46. TestColl, LastExtId string //测试结果表、上次抽取信息id
  47. FDB *db.Pool //数据库连接池
  48. TDB *db.Pool //数据库连接池
  49. IsEtxLog bool //是否开启抽取日志
  50. ProcessPool chan bool //任务进程池
  51. TestLua bool //检查测试用
  52. }
  53. type ExtractTask struct {
  54. Id string //任务id
  55. IsRun bool //是否启动
  56. Content string //信息内容
  57. TaskInfo *TaskInfo //任务信息
  58. RulePres []*RegLuaInfo //通用前置规则
  59. RuleBacks []*RegLuaInfo //通用后置规则
  60. RuleBlock *ju.RuleBlock
  61. //RuleCores []*RuleCore //抽取规则
  62. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  63. PkgRuleCores []*RuleCore //分包抽取规则
  64. Tag map[string][]*Tag //标签库
  65. ClearFn map[string][]string //清理函数
  66. IsExtractCity bool //是否开启城市抽取
  67. Fields map[string]int //抽取属性组
  68. IsFileField bool //是否开启附件抽取
  69. FileFields *sync.Map //抽取附件属性组
  70. ResultChanel chan bool //抽取结果详情
  71. ResultArr [][]map[string]interface{} //抽取结果详情
  72. BidChanel chan bool //抽取结果
  73. BidArr [][]map[string]interface{} //抽取结果
  74. RecogFieldMap map[string]map[string]interface{} //识别字段
  75. FidClassMap map[string][]map[string]interface{} //分类
  76. CidRuleMap map[string][]map[string]interface{} //规则
  77. AuditFields []string //需要审核的字段名称
  78. ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
  79. ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
  80. CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
  81. CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
  82. CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
  83. DistrictCityMap map[string]*City //区或县对应的city
  84. DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
  85. StreetDistrictMap map[string]*District //街道对应的区或县
  86. ProvinceAllGet *ju.DFA //省全称
  87. ProvinceSimGet *ju.DFA //省简称
  88. CityAllGet *ju.DFA //市全称
  89. CitySimGet *ju.DFA //市简称
  90. DistrictAllGet *ju.DFA //区或县全称
  91. DistrictSimGet *ju.DFA //区或县简称
  92. StreetGet *ju.DFA //街道
  93. PostCodeMap map[string]*PostCode //邮编
  94. AreaCodeMap map[string]*AreaCode //区号
  95. InfoType []map[string]interface{}
  96. }
  97. type ClearTaskInfo struct {
  98. Name, Version, VersionId string //名称、版本、版本id
  99. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  100. FDB *db.Pool //数据库连接池
  101. TDB *db.Pool //数据库连接池
  102. IsCltLog bool //是否开启清理日志
  103. ProcessPool chan bool //任务进程池
  104. }
  // ClearLua describes one Lua cleanup script.
  type ClearLua struct {
  	// Field is the field the script applies to.
  	Field string
  	// Code is the script code (identifier).
  	Code string
  	// Name is the script name.
  	Name string
  	// LuaText holds the Lua text.
  	LuaText string
  	//LuaLogic string // entry logic
  	//ExtFrom string // field the value is extracted from
  	// LFields is the attribute group of the fields extracted by Lua.
  	LFields map[string]string
  }
  114. type ClearTask struct {
  115. Id string //任务id
  116. Content string //信息内容
  117. ClearTaskInfo *ClearTaskInfo //任务信息
  118. ClearLuas map[string][]*ClearLua //清理脚本
  119. UpdateResult [][]map[string]interface{} //清理后结果
  120. ClearChannel chan bool
  121. }
  122. func init() {
  123. TaskList = make(map[string]*ExtractTask)
  124. ClearTaskList = make(map[string]*ClearTask)
  125. go SaveExtLog()
  126. go SaveCltLog() //保存清理日志
  127. }
  128. //加载任务信息
  129. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  130. task, _ := db.Mgo.FindById("task", e.Id, nil)
  131. if len(*task) > 1 {
  132. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  133. e.TaskInfo = &TaskInfo{
  134. Name: (*task)["s_taskname"].(string),
  135. Version: (*task)["s_version"].(string),
  136. VersionId: qu.BsonIdToSId((*v)["_id"]),
  137. TrackColl: trackcoll,
  138. FromDbAddr: (*task)["s_mgoaddr"].(string),
  139. FromDB: (*task)["s_mgodb"].(string),
  140. FromColl: (*task)["s_mgocoll"].(string),
  141. TestColl: resultcoll,
  142. IsEtxLog: true,
  143. ProcessPool: make(chan bool, 1),
  144. }
  145. if (*v)["isextractcity"] != nil {
  146. e.IsExtractCity = (*v)["isextractcity"].(bool)
  147. }
  148. } else {
  149. return
  150. }
  151. }
  152. //加载任务信息
  153. func (e *ExtractTask) InitTaskInfo() {
  154. task, _ := db.Mgo.FindById("task", e.Id, nil)
  155. log.Debug("task", task)
  156. if len(*task) > 1 {
  157. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  158. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  159. log.Debug("s_mgosavecoll", strs)
  160. if len(strs) < 3 {
  161. return
  162. } else {
  163. e.TaskInfo = &TaskInfo{
  164. Name: (*task)["s_taskname"].(string),
  165. Version: (*task)["s_version"].(string),
  166. VersionId: qu.BsonIdToSId((*v)["_id"]),
  167. //TrackColl: (*task)["s_trackcoll"].(string),
  168. FromDbAddr: (*task)["s_mgoaddr"].(string),
  169. FromDB: (*task)["s_mgodb"].(string),
  170. FromColl: (*task)["s_mgocoll"].(string),
  171. ToDbAddr: strs[0],
  172. ToDB: strs[1],
  173. ToColl: strs[2],
  174. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  175. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  176. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  177. }
  178. if (*v)["isextractcity"] != nil {
  179. e.IsExtractCity = (*v)["isextractcity"].(bool)
  180. }
  181. }
  182. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  183. } else {
  184. return
  185. }
  186. }
  187. //加载通用前置规则
  188. func (e *ExtractTask) InitRulePres() {
  189. defer qu.Catch()
  190. e.RulePres = []*RegLuaInfo{}
  191. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  192. for _, v := range *list {
  193. rinfo := &RegLuaInfo{
  194. Code: v["s_code"].(string),
  195. Name: v["s_name"].(string),
  196. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  197. }
  198. if rinfo.IsLua {
  199. rinfo.RuleText = v["s_luascript"].(string)
  200. e.RulePres = append(e.RulePres, rinfo)
  201. } else {
  202. qu.Try(func() {
  203. rinfo.RuleText = v["s_rule"].(string)
  204. tmp := strings.Split(rinfo.RuleText, "__")
  205. var pattern string
  206. if strings.Contains(tmp[0], "\\u") {
  207. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  208. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  209. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  210. } else {
  211. pattern = tmp[0]
  212. }
  213. if len(tmp) == 2 {
  214. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  215. } else {
  216. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  217. }
  218. e.RulePres = append(e.RulePres, rinfo)
  219. }, func(err interface{}) {
  220. log.Debug(rinfo.Code, rinfo.Field, err)
  221. })
  222. }
  223. }
  224. }
  225. //加载通用后置规则
  226. func (e *ExtractTask) InitRuleBacks() {
  227. defer qu.Catch()
  228. e.RuleBacks = []*RegLuaInfo{}
  229. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  230. for _, v := range *list {
  231. rinfo := &RegLuaInfo{
  232. Code: v["s_code"].(string),
  233. Name: v["s_name"].(string),
  234. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  235. }
  236. if rinfo.IsLua {
  237. rinfo.RuleText = v["s_luascript"].(string)
  238. e.RuleBacks = append(e.RuleBacks, rinfo)
  239. } else {
  240. qu.Try(func() {
  241. rinfo.RuleText = v["s_rule"].(string)
  242. tmp := strings.Split(rinfo.RuleText, "__")
  243. var pattern string
  244. if strings.Contains(tmp[0], "\\u") {
  245. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  246. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  247. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  248. } else {
  249. pattern = tmp[0]
  250. }
  251. if len(tmp) == 2 {
  252. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  253. } else {
  254. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  255. }
  256. e.RuleBacks = append(e.RuleBacks, rinfo)
  257. }, func(err interface{}) {
  258. log.Debug(rinfo.Code, rinfo.Field, err)
  259. })
  260. }
  261. }
  262. }
  263. func (e *ExtractTask) InfoTypeList() {
  264. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  265. infolist := *infolist1
  266. for _, v := range infolist {
  267. e.InfoType = append(e.InfoType, v)
  268. }
  269. }
  270. //加载抽取规则
  271. func (e *ExtractTask) InitRuleCore() {
  272. defer qu.Catch()
  273. e.Fields = map[string]int{}
  274. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  275. e.RuleCores = make(map[string]map[string][]*RuleCore)
  276. for _, v := range *infolist {
  277. topclass := qu.ObjToString(v["topclass"])
  278. if v["subclass"] == nil {
  279. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  280. for attr, _ := range v["fields"].(map[string]interface{}) {
  281. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
  282. e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
  283. }
  284. } else {
  285. for ca, fs := range v["subclass"].(map[string]interface{}) {
  286. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  287. for field, _ := range fs.(map[string]interface{}) {
  288. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
  289. e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
  290. }
  291. }
  292. }
  293. }
  294. }
  295. func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
  296. maps := []*RuleCore{}
  297. if b, _ := vinfo["isuse"].(bool); !b {
  298. return nil
  299. }
  300. s_field := qu.ObjToString(vinfo["s_field"])
  301. pid := qu.BsonIdToSId(vinfo["_id"])
  302. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  303. for _, vv := range *list {
  304. if b, _ := vv["isuse"].(bool); !b {
  305. continue
  306. }
  307. rcore := &RuleCore{}
  308. rcore.Field = s_field
  309. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  310. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  311. //前置规则
  312. rulePres := []*RegLuaInfo{}
  313. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  314. for _, v := range *plist {
  315. rinfo := &RegLuaInfo{
  316. Field: qu.ObjToString(v["s_field"]),
  317. Code: v["s_code"].(string),
  318. Name: v["s_name"].(string),
  319. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  320. }
  321. if rinfo.IsLua {
  322. rinfo.RuleText = v["s_luascript"].(string)
  323. rulePres = append(rulePres, rinfo)
  324. } else {
  325. qu.Try(func() {
  326. rinfo.RuleText = v["s_rule"].(string)
  327. tmp := strings.Split(rinfo.RuleText, "__")
  328. var pattern string
  329. if strings.Contains(tmp[0], "\\u") {
  330. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  331. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  332. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  333. } else {
  334. pattern = tmp[0]
  335. }
  336. if len(tmp) == 2 {
  337. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  338. } else {
  339. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  340. }
  341. rulePres = append(rulePres, rinfo)
  342. }, func(err interface{}) {
  343. log.Debug(rinfo.Code, rinfo.Field, err)
  344. })
  345. }
  346. }
  347. rcore.RulePres = rulePres
  348. //后置规则
  349. ruleBacks := []*RegLuaInfo{}
  350. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  351. for _, v := range *blist {
  352. rinfo := &RegLuaInfo{
  353. Field: qu.ObjToString(v["s_field"]),
  354. Code: v["s_code"].(string),
  355. Name: v["s_name"].(string),
  356. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  357. }
  358. if rinfo.IsLua {
  359. rinfo.RuleText = v["s_luascript"].(string)
  360. ruleBacks = append(ruleBacks, rinfo)
  361. } else {
  362. qu.Try(func() {
  363. rinfo.RuleText = v["s_rule"].(string)
  364. tmp := strings.Split(rinfo.RuleText, "__")
  365. var pattern string
  366. if strings.Contains(tmp[0], "\\u") {
  367. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  368. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  369. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  370. } else {
  371. pattern = tmp[0]
  372. }
  373. if len(tmp) == 2 {
  374. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  375. } else {
  376. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  377. }
  378. ruleBacks = append(ruleBacks, rinfo)
  379. }, func(err interface{}) {
  380. log.Debug(rinfo.Code, rinfo.Field, err)
  381. })
  382. }
  383. }
  384. rcore.RuleBacks = ruleBacks
  385. //抽取规则
  386. ruleCores := []*RegLuaInfo{}
  387. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  388. for _, v := range *clist {
  389. if b, _ := v["isuse"].(bool); !b {
  390. continue
  391. }
  392. field := qu.ObjToString(v["s_field"])
  393. e.Fields[field] = 1 //加入抽取属性组备用
  394. rinfo := &RegLuaInfo{
  395. Field: field,
  396. Code: v["s_code"].(string),
  397. Name: v["s_name"].(string),
  398. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  399. }
  400. if rinfo.IsLua {
  401. rinfo.RuleText = v["s_luascript"].(string)
  402. //提取全部属性
  403. rinfo.LFields = getALLFields()
  404. ruleCores = append(ruleCores, rinfo)
  405. } else {
  406. qu.Try(func() {
  407. rinfo.RuleText = v["s_rule"].(string)
  408. tmp := strings.Split(rinfo.RuleText, "__")
  409. var pattern string
  410. if strings.Contains(tmp[0], "\\u") {
  411. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  412. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  413. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  414. } else {
  415. pattern = tmp[0]
  416. }
  417. if len(tmp) == 2 {
  418. epos := strings.Split(tmp[1], ",")
  419. posm := map[string]int{}
  420. for _, v := range epos {
  421. ks := strings.Split(v, ":")
  422. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  423. posm[ks[1]] = qu.IntAll(ks[0])
  424. } else { //(.*)招标公告__2
  425. posm[rinfo.Field] = qu.IntAll(ks[0])
  426. }
  427. }
  428. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  429. } else {
  430. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  431. }
  432. ruleCores = append(ruleCores, rinfo)
  433. }, func(err interface{}) {
  434. log.Debug(rinfo.Code, rinfo.Field, err)
  435. })
  436. }
  437. }
  438. rcore.RuleCores = ruleCores
  439. //
  440. maps = append(maps, rcore)
  441. }
  442. return maps
  443. }
  444. //加载分包抽取规则
  445. func (e *ExtractTask) InitPkgCore() {
  446. defer qu.Catch()
  447. e.PkgRuleCores = []*RuleCore{}
  448. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  449. for _, pkginfo := range *pkginfos {
  450. if b, _ := pkginfo["isuse"].(bool); !b {
  451. continue
  452. }
  453. s_field := qu.ObjToString(pkginfo["s_field"])
  454. pid := qu.BsonIdToSId(pkginfo["_id"])
  455. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  456. for _, vv := range *logicList {
  457. if b, _ := vv["isuse"].(bool); !b {
  458. continue
  459. }
  460. rcore := &RuleCore{}
  461. rcore.Field = s_field
  462. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  463. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  464. //后置规则
  465. ruleBacks := []*RegLuaInfo{}
  466. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  467. for _, v := range *blist {
  468. rinfo := &RegLuaInfo{
  469. Field: qu.ObjToString(v["s_field"]),
  470. Code: v["s_code"].(string),
  471. Name: v["s_name"].(string),
  472. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  473. }
  474. if rinfo.IsLua {
  475. rinfo.RuleText = v["s_luascript"].(string)
  476. ruleBacks = append(ruleBacks, rinfo)
  477. } else {
  478. qu.Try(func() {
  479. rinfo.RuleText = v["s_rule"].(string)
  480. tmp := strings.Split(rinfo.RuleText, "__")
  481. var pattern string
  482. if strings.Contains(tmp[0], "\\u") {
  483. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  484. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  485. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  486. } else {
  487. pattern = tmp[0]
  488. }
  489. if len(tmp) == 2 {
  490. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  491. } else {
  492. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  493. }
  494. ruleBacks = append(ruleBacks, rinfo)
  495. }, func(err interface{}) {
  496. log.Debug(rinfo.Code, rinfo.Field, err)
  497. })
  498. }
  499. }
  500. rcore.RuleBacks = ruleBacks
  501. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  502. }
  503. }
  504. }
  505. //加载标签库
  506. func (e *ExtractTask) InitTag() {
  507. defer qu.Catch()
  508. e.Tag = map[string][]*Tag{}
  509. //字符串标签库
  510. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  511. for _, v := range *list {
  512. field := qu.ObjToString(v["s_field"])
  513. if tmp, ok := v["content"].([]interface{}); ok {
  514. fname := qu.ObjToString(v["s_name"])
  515. tab := ju.TagFile{Name: fname} //用于表格kv
  516. tab.Items = make([]*ju.Tag, len(tmp))
  517. for k, key := range tmp {
  518. tag := &Tag{Type: "string", Key: key.(string)}
  519. e.Tag[field] = append(e.Tag[field], tag)
  520. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  521. }
  522. sort.Sort(tab.Items)
  523. ju.TagdbTable[fname] = &tab
  524. }
  525. }
  526. //正则标签库
  527. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  528. for _, v := range *list {
  529. field := qu.ObjToString(v["s_field"])
  530. if tmp, ok := v["content"].([]interface{}); ok {
  531. fname := qu.ObjToString(v["s_name"])
  532. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  533. tab.Items = make([]*ju.Tag, len(tmp))
  534. for k, key := range tmp {
  535. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  536. e.Tag[field] = append(e.Tag[field], tag)
  537. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  538. }
  539. sort.Sort(tab.Items)
  540. ju.TagdbTable[fname+"_reg"] = &tab
  541. }
  542. }
  543. }
  544. //获取fields
  545. func getALLFields() map[string]string {
  546. fields := map[string]string{}
  547. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  548. for _, v := range *list {
  549. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  550. }
  551. return fields
  552. }
  553. //加载clear函数
  554. func (e *ExtractTask) InitClearFn() {
  555. defer qu.Catch()
  556. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  557. fn := map[string][]string{}
  558. for _, tmp := range *list {
  559. field := tmp["s_field"].(string)
  560. fns := tmp["clear"].([]interface{})
  561. if fn[field] == nil {
  562. fn[field] = []string{}
  563. }
  564. for _, v := range fns {
  565. fn[field] = append(fn[field], v.(string))
  566. }
  567. }
  568. e.ClearFn = fn
  569. }
  570. //加载省份
  571. func InitProvince(version string) map[string]interface{} {
  572. defer qu.Catch()
  573. fn := map[string]interface{}{}
  574. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  575. for _, v := range *list {
  576. name := qu.ObjToString(v["s_name"])
  577. content := v["content"]
  578. switch content.(type) {
  579. case string:
  580. fn[name] = []interface{}{content.(string)}
  581. case []interface{}:
  582. fn[name] = content
  583. }
  584. }
  585. return fn
  586. }
  587. //加载城市简称
  588. func InitCitySim(version string) map[string]map[string]interface{} {
  589. defer qu.Catch()
  590. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  591. fn := map[string]map[string]interface{}{}
  592. for _, v := range *list {
  593. name := qu.ObjToString(v["s_name"])
  594. tmp := v["content"].(map[string]interface{})
  595. fn[name] = tmp
  596. }
  597. return fn
  598. }
  599. //加载城市全称
  600. func InitCityAll(version string) map[string]map[string]interface{} {
  601. defer qu.Catch()
  602. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  603. fn := map[string]map[string]interface{}{}
  604. for _, v := range *list {
  605. name := qu.ObjToString(v["s_name"])
  606. tmp := v["content"].(map[string]interface{})
  607. fn[name] = tmp
  608. }
  609. return fn
  610. }
  611. //初始化城市省份敏感词
  612. func (e *ExtractTask) InitCityDFA() {
  613. defer qu.Catch()
  614. e.CityAllGet = &ju.DFA{}
  615. e.CitySimGet = &ju.DFA{}
  616. e.DistrictAllGet = &ju.DFA{}
  617. e.DistrictSimGet = &ju.DFA{}
  618. e.ProvinceAllGet = &ju.DFA{}
  619. e.ProvinceSimGet = &ju.DFA{}
  620. e.StreetGet = &ju.DFA{}
  621. //初始化map
  622. if e.ProvinceMap == nil {
  623. e.ProvinceMap = make(map[string]string)
  624. }
  625. if e.CityMap == nil {
  626. e.CityMap = make(map[string]string)
  627. }
  628. if e.DistrictSimAndAll == nil {
  629. e.DistrictSimAndAll = make(map[string]string)
  630. }
  631. if e.CityBriefMap == nil {
  632. e.CityBriefMap = make(map[string]*City)
  633. }
  634. if e.CityFullMap == nil {
  635. e.CityFullMap = make(map[string]*City)
  636. }
  637. if e.ProvinceBriefMap == nil {
  638. e.ProvinceBriefMap = make(map[string]*Province)
  639. }
  640. if e.DistrictCityMap == nil {
  641. e.DistrictCityMap = make(map[string]*City)
  642. }
  643. if e.StreetDistrictMap == nil {
  644. e.StreetDistrictMap = make(map[string]*District)
  645. }
  646. //初始化省
  647. fn1 := InitProvince(e.TaskInfo.Version)
  648. for k, v := range fn1 {
  649. for _, p := range v.([]interface{}) {
  650. p1, _ := p.(string)
  651. e.ProvinceAllGet.AddWord(p1) //华中科技大学
  652. e.ProvinceMap[p1] = k //华中科技大学:湖北
  653. }
  654. }
  655. //初始化城市全称
  656. fn2 := InitCityAll(e.TaskInfo.Version)
  657. for k, v := range fn2 {
  658. //加载省信息
  659. e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
  660. p := &Province{}
  661. p.Name = k //省全称:浙江省
  662. p.Brief = v["brief"].(string) //省简称:浙江
  663. e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
  664. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  665. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  666. p.Cap = v["captial"].(string) //省会(杭州)
  667. //加载市信息
  668. city, _ := v["city"].(map[string]interface{})
  669. for k1, v1 := range city {
  670. e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
  671. v1m, _ := v1.(map[string]interface{})
  672. c := &City{}
  673. c.Name = k1 //市全称:杭州市
  674. c.Brief = v1m["brief"].(string) //市简称:杭州
  675. e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
  676. e.CityMap[k1] = c.Brief //杭州市:杭州
  677. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  678. e.CityFullMap[k1] = c //杭州市:市信息{}
  679. c.P = p
  680. if c.Name == p.Cap {
  681. p.Captial = c //加载province中的省会市信息{}
  682. }
  683. //区县
  684. districtmap := v1m["area"].(map[string]interface{}) //区或县
  685. for district, streetarr := range districtmap {
  686. d := &District{}
  687. d.Name = district
  688. d.C = c
  689. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  690. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
  691. e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
  692. ctmp := e.DistrictCityMap[district]
  693. if ctmp == nil {
  694. e.DistrictCityMap[district] = c
  695. }
  696. //街道
  697. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  698. e.StreetGet.AddWord(s) //加入街道敏感词
  699. dtmp := e.StreetDistrictMap[s]
  700. if dtmp == nil {
  701. e.StreetDistrictMap[s] = d
  702. }
  703. }
  704. }
  705. }
  706. }
  707. //初始化城市简称
  708. fn3 := InitCitySim(e.TaskInfo.Version)
  709. for _, v := range fn3 {
  710. city, _ := v["city"].(map[string]interface{})
  711. for _, v1 := range city {
  712. v1m, _ := v1.(map[string]interface{})
  713. cb := v1m["brief"].(string) //市简称
  714. arr := v1m["area"].(map[string]interface{}) //区或县简称
  715. for districtsim, districtall := range arr {
  716. e.DistrictSimAndAll[districtsim] = districtall.(string)
  717. d := &District{}
  718. d.Name = districtsim
  719. d.C = e.CityBriefMap[cb]
  720. e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
  721. ctmp := e.DistrictCityMap[districtsim]
  722. if ctmp == nil {
  723. e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
  724. }
  725. }
  726. }
  727. }
  728. }
  729. //初始化邮编库
  730. func (e *ExtractTask) InitPostCode() {
  731. defer qu.Catch()
  732. if e.PostCodeMap == nil {
  733. e.PostCodeMap = make(map[string]*PostCode)
  734. }
  735. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  736. for _, l := range *list {
  737. pc := &PostCode{}
  738. pc.Code = qu.ObjToString(l["code"])
  739. pc.P = qu.ObjToString(l["province"])
  740. pc.C = qu.ObjToString(l["city"])
  741. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  742. e.PostCodeMap[pc.Code] = pc
  743. }
  744. }
  745. //初始化区号库
  746. func (e *ExtractTask) InitAreaCode() {
  747. defer qu.Catch()
  748. if e.AreaCodeMap == nil {
  749. e.AreaCodeMap = make(map[string]*AreaCode)
  750. }
  751. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  752. for _, l := range *list {
  753. ac := &AreaCode{}
  754. ac.Code = qu.ObjToString(l["code"])
  755. ac.P = qu.ObjToString(l["province"])
  756. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  757. e.AreaCodeMap[ac.Code] = ac
  758. }
  759. }
  760. //初始化城市省份敏感词
  761. //func (e *ExtractTask) InitCityDFA() {
  762. // defer qu.Catch()
  763. // e.CityAllGet = &ju.DFA{}
  764. // e.DistrictGet = &ju.DFA{}
  765. // e.AreaProvinceGet = &ju.DFA{}
  766. // e.StreetGet = &ju.DFA{}
  767. // //初始化map
  768. // if e.ProvinceMap == nil {
  769. // e.ProvinceMap = make(map[string]string)
  770. // }
  771. // if e.CityBriefMap == nil {
  772. // e.CityBriefMap = make(map[string]*City)
  773. // }
  774. // if e.ProvinceBriefMap == nil {
  775. // e.ProvinceBriefMap = make(map[string]*Province)
  776. // }
  777. // if e.AreaToCityMap == nil {
  778. // e.AreaToCityMap = make(map[string][]*City)
  779. // }
  780. // if e.DistrictCityMap == nil {
  781. // e.DistrictCityMap = make(map[string]*City)
  782. // }
  783. // if e.StreetDistrictMap == nil {
  784. // e.StreetDistrictMap = make(map[string]*District)
  785. // }
  786. // //初始化省
  787. // fn1 := InitProvince(e.TaskInfo.Version)
  788. // for k, v := range fn1 {
  789. // for _, p := range v.([]interface{}) {
  790. // p1, _ := p.(string)
  791. // e.AreaProvinceGet.AddWord(p1) //华中科技大学
  792. // e.ProvinceMap[p1] = k //华中科技大学:湖北
  793. // }
  794. // }
  795. // //初始化城市全称
  796. // fn2 := InitCityAll(e.TaskInfo.Version)
  797. // for k, v := range fn2 {
  798. // e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
  799. // p := &Province{}
  800. // p.Name = k //省全称
  801. // p.Brief = v["brief"].(string) //省简称
  802. // e.ProvinceMap[k] = p.Brief //浙江省:浙江
  803. // e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
  804. // p.Cap = v["captial"].(string) //省会(杭州)
  805. // city, _ := v["city"].(map[string]interface{})
  806. // //
  807. // for k1, v1 := range city {
  808. // v1m, _ := v1.(map[string]interface{})
  809. // c := &City{}
  810. // c.Name = k1
  811. // c.Brief = v1m["brief"].(string)
  812. // e.CityBriefMap[c.Brief] = c
  813. // c.P = p
  814. // if c.Brief == p.Cap {
  815. // p.Captial = c
  816. // }
  817. // //加入到城市map中
  818. // //
  819. // cs := e.AreaToCityMap[k1]
  820. // e.CityAllGet.AddWord(k1) //市全称
  821. // if cs != nil {
  822. // cs = append(cs, c)
  823. // } else {
  824. // cs = []*City{c}
  825. // }
  826. // e.AreaToCityMap[k1] = cs
  827. // //区县
  828. // districtmap := v1m["area"].(map[string]interface{}) //区或县
  829. // for district, streetarr := range districtmap {
  830. // d := &District{}
  831. // d.Name = district
  832. // d.C = c
  833. // e.DistrictGet.AddWord(district) //加入区或县敏感词
  834. // ctmp := e.DistrictCityMap[district]
  835. // if ctmp == nil {
  836. // e.DistrictCityMap[district] = c
  837. // }
  838. // //街道
  839. // for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  840. // e.StreetGet.AddWord(s) //加入街道敏感词
  841. // dtmp := e.StreetDistrictMap[s]
  842. // if dtmp == nil {
  843. // e.StreetDistrictMap[s] = d
  844. // }
  845. // }
  846. // }
  847. // }
  848. // }
  849. // //初始化城市简称
  850. // fn3 := InitCitySim(e.TaskInfo.Version)
  851. // e.CitySimGet = &ju.DFA{}
  852. // for k, v := range fn3 {
  853. // pb := v["brief"].(string)
  854. // p := e.ProvinceBriefMap[pb]
  855. // //加载
  856. // for _, ss := range []string{k, pb} { //省全称和省简称
  857. // cs := e.AreaToCityMap[ss]
  858. // if cs != nil {
  859. // cs = append(cs, p.Captial)
  860. // } else {
  861. // cs = []*City{p.Captial}
  862. // }
  863. // e.AreaToCityMap[ss] = cs
  864. // e.CitySimGet.AddWord(ss)
  865. // }
  866. // city, _ := v["city"].(map[string]interface{})
  867. // for k1, v1 := range city {
  868. // v1m, _ := v1.(map[string]interface{})
  869. // if v1m["brief"] == nil {
  870. // }
  871. // cb := v1m["brief"].(string)
  872. // c := e.AreaToCityMap[k1][0]
  873. // //加入到城市map中
  874. // for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  875. // e.CitySimGet.AddWord(ss)
  876. // cs := e.AreaToCityMap[ss]
  877. // if cs != nil {
  878. // cs = append(cs, c)
  879. // } else {
  880. // cs = []*City{c}
  881. // }
  882. // e.AreaToCityMap[ss] = cs
  883. // }
  884. // arr := v1m["area"].([]interface{})
  885. // for _, k2 := range arr {
  886. // s := k2.(string)
  887. // for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  888. // cs := e.AreaToCityMap[ss]
  889. // e.CitySimGet.AddWord(ss)
  890. // if cs != nil {
  891. // cs = append(cs, c)
  892. // } else {
  893. // cs = []*City{c}
  894. // }
  895. // e.AreaToCityMap[ss] = cs
  896. // //只加入简称
  897. // if n == 0 {
  898. // d := &District{}
  899. // d.Name = ss
  900. // d.C = c
  901. // e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
  902. // ctmp := e.DistrictCityMap[ss]
  903. // if ctmp == nil {
  904. // e.DistrictCityMap[ss] = c
  905. // }
  906. // }
  907. // }
  908. // }
  909. // }
  910. // }
  911. //}
  912. //保存抽取详情数据
  913. func (e *ExtractTask) ResultSave(init bool) {
  914. defer qu.Catch()
  915. if e.ResultArr == nil {
  916. e.ResultArr = [][]map[string]interface{}{}
  917. }
  918. if init {
  919. go func() {
  920. for {
  921. if len(e.ResultArr) > 500 {
  922. arr := e.ResultArr[:500]
  923. qu.Try(func() {
  924. db.Mgo.UpSertBulk("extract_result", arr...)
  925. }, func(err interface{}) {
  926. log.Debug(err)
  927. })
  928. e.ResultArr = e.ResultArr[500:]
  929. } else {
  930. arr := e.ResultArr
  931. qu.Try(func() {
  932. db.Mgo.UpSertBulk("extract_result", arr...)
  933. }, func(err interface{}) {
  934. log.Debug(err)
  935. })
  936. e.ResultArr = [][]map[string]interface{}{}
  937. }
  938. time.Sleep(10 * time.Second)
  939. }
  940. }()
  941. } else {
  942. arr := e.ResultArr
  943. qu.Try(func() {
  944. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  945. }, func(err interface{}) {
  946. log.Debug(err)
  947. })
  948. e.ResultArr = [][]map[string]interface{}{}
  949. }
  950. }
  951. //保存抽取数据
  952. func (e *ExtractTask) BidSave(init bool) {
  953. defer qu.Catch()
  954. if e.BidArr == nil {
  955. e.BidArr = [][]map[string]interface{}{}
  956. }
  957. if init {
  958. go func() {
  959. for {
  960. if len(e.BidArr) > 500 {
  961. arr := e.BidArr[:500]
  962. qu.Try(func() {
  963. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  964. }, func(err interface{}) {
  965. log.Debug(err)
  966. })
  967. e.BidArr = e.BidArr[500:]
  968. } else {
  969. arr := e.BidArr
  970. qu.Try(func() {
  971. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  972. }, func(err interface{}) {
  973. log.Debug(err)
  974. })
  975. e.BidArr = [][]map[string]interface{}{}
  976. }
  977. time.Sleep(10 * time.Second)
  978. }
  979. }()
  980. } else {
  981. arr := e.BidArr
  982. qu.Try(func() {
  983. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  984. }, func(err interface{}) {
  985. log.Debug(err)
  986. })
  987. e.BidArr = [][]map[string]interface{}{}
  988. time.Sleep(1 * time.Second)
  989. }
  990. }
  991. func (e *ExtractTask) InitAuditRecogField() {
  992. defer qu.Catch()
  993. e.RecogFieldMap = make(map[string]map[string]interface{})
  994. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  995. for _, f := range *recogFieldList {
  996. field := qu.ObjToString(f["s_recogfield"])
  997. e.RecogFieldMap[field] = f
  998. }
  999. }
  1000. func (e *ExtractTask) InitAuditClass() {
  1001. defer qu.Catch()
  1002. e.FidClassMap = make(map[string][]map[string]interface{})
  1003. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1004. for _, c := range *class {
  1005. classList := []map[string]interface{}{}
  1006. fid := qu.ObjToString(c["s_fid"])
  1007. if len(e.FidClassMap[fid]) > 0 { //追加
  1008. classList = e.FidClassMap[fid]
  1009. }
  1010. classList = append(classList, c)
  1011. e.FidClassMap[fid] = classList
  1012. }
  1013. }
  1014. //加载规则
  1015. func (e *ExtractTask) InitAuditRule() {
  1016. defer qu.Catch()
  1017. var rureg *regexp.Regexp
  1018. var rs []rune
  1019. var ru string
  1020. var err error
  1021. e.CidRuleMap = make(map[string][]map[string]interface{})
  1022. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1023. for _, v := range *rule {
  1024. i_rule := []interface{}{}
  1025. ss, _ := (v["s_rule"].([]interface{}))
  1026. for _, r := range qu.ObjArrToStringArr(ss) {
  1027. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1028. rs = []rune(r)
  1029. ru = string(rs[1 : len(rs)-1])
  1030. rureg, err = regexp.Compile(ru)
  1031. if err != nil {
  1032. log.Debug("error---rule:", r)
  1033. continue
  1034. }
  1035. i_rule = append(i_rule, []interface{}{rureg}...)
  1036. } else { //规则
  1037. i_rule = append(i_rule, r)
  1038. }
  1039. }
  1040. v["rule"] = i_rule
  1041. ruleList := []map[string]interface{}{}
  1042. classid := qu.ObjToString(v["s_classid"])
  1043. if len(e.CidRuleMap[classid]) > 0 { //追加
  1044. ruleList = e.CidRuleMap[classid]
  1045. }
  1046. ruleList = append(ruleList, v)
  1047. e.CidRuleMap[classid] = ruleList
  1048. }
  1049. }
  1050. //
  1051. func (e *ExtractTask) InitAuditFields() {
  1052. if len(e.AuditFields) == 0 {
  1053. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1054. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1055. vid := qu.BsonIdToSId((*v)["_id"])
  1056. query := map[string]interface{}{
  1057. "isaudit": true,
  1058. "delete": false,
  1059. "vid": vid,
  1060. }
  1061. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1062. for _, d := range *data {
  1063. field := qu.ObjToString(d["s_field"])
  1064. e.AuditFields = append(e.AuditFields, field)
  1065. }
  1066. }
  1067. }
  1068. }
  1069. //加载附件抽取
  1070. func (e *ExtractTask) InitFile() {
  1071. defer qu.Catch()
  1072. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1073. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1074. //ve, _ := db.Mgo.FindOne("version", query)
  1075. if ve == nil {
  1076. return
  1077. }
  1078. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1079. e.IsFileField = true
  1080. }
  1081. syscefiled := new(sync.Map)
  1082. if (*ve)["s_filefileds"] != nil {
  1083. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1084. syscefiled.Store(vff.(string), 1)
  1085. }
  1086. }
  1087. e.FileFields = syscefiled
  1088. }
  1089. //加载清理任务信息
  1090. func (c *ClearTask) InitClearTaskInfo() {
  1091. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1092. if len(*cleartask) > 1 {
  1093. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1094. c.ClearTaskInfo = &ClearTaskInfo{
  1095. Name: (*cleartask)["s_taskname"].(string),
  1096. Version: (*cleartask)["s_version"].(string),
  1097. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1098. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1099. FromDB: (*cleartask)["s_mgodb"].(string),
  1100. FromColl: (*cleartask)["s_mgocoll"].(string),
  1101. IsCltLog: ju.Config["iscltlog"].(bool),
  1102. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1103. }
  1104. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1105. } else {
  1106. return
  1107. }
  1108. }
  1109. //加载清理脚本
  1110. func (c *ClearTask) InitClearLuas() {
  1111. defer qu.Catch()
  1112. c.ClearLuas = make(map[string][]*ClearLua)
  1113. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1114. for _, l := range *list {
  1115. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1116. continue
  1117. }
  1118. s_field := qu.ObjToString(l["s_field"])
  1119. pid := qu.BsonIdToSId(l["_id"])
  1120. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1121. for _, vv := range *luas {
  1122. if b, _ := vv["isuse"].(bool); !b {
  1123. continue
  1124. }
  1125. clearLua := &ClearLua{
  1126. Field: s_field,
  1127. Code: vv["s_code"].(string),
  1128. Name: vv["s_name"].(string),
  1129. LuaText: vv["s_luascript"].(string),
  1130. LFields: getALLFields(),
  1131. }
  1132. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1133. }
  1134. }
  1135. }
  1136. //加载分块规则
  1137. func (e *ExtractTask) InitBlockRule() {
  1138. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1139. "vid": e.TaskInfo.VersionId,
  1140. "delete": false,
  1141. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1142. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1143. for _, v := range *datas {
  1144. block_reg, _ := v["block_reg"].(string)
  1145. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1146. title_reg, _ := v["title_reg"].(string)
  1147. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1148. if block_reg == "" || title_reg == "" {
  1149. continue
  1150. }
  1151. b_reg, b_err := regexp.Compile(block_reg)
  1152. t_reg, t_err := regexp.Compile(title_reg)
  1153. if b_err != nil || t_err != nil {
  1154. continue
  1155. }
  1156. brs = append(brs, b_reg)
  1157. trs = append(trs, t_reg)
  1158. }
  1159. e.RuleBlock = &ju.RuleBlock{
  1160. BlockRegs: brs,
  1161. TitleRegs: trs,
  1162. Classify: e.InitBlockClassify(),
  1163. }
  1164. }
  1165. //加载分块规则
  1166. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1167. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1168. "vid": e.TaskInfo.VersionId,
  1169. "delete": false,
  1170. }, nil, `{"name":1}`, false, -1, -1)
  1171. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1172. "vid": e.TaskInfo.VersionId,
  1173. "delete": false,
  1174. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1175. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1176. "vid": e.TaskInfo.VersionId,
  1177. "delete": false,
  1178. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1179. tag_map := map[string]ju.Tags{}
  1180. for _, v := range *classify_tag {
  1181. pid := qu.ObjToString(v["pid"])
  1182. tag_map[pid] = append(tag_map[pid], &ju.Tag{Value: qu.ObjToString(v["name"])})
  1183. }
  1184. //
  1185. info_map := map[string][]*ju.NameCode{}
  1186. info_tag := map[string]*ju.TagFile{}
  1187. for _, v := range *classify_info {
  1188. pid := qu.ObjToString(v["pid"])
  1189. _id := qu.BsonIdToSId(v["_id"])
  1190. name := qu.ObjToString(v["name"])
  1191. info_tag[name] = &ju.TagFile{
  1192. Name: name,
  1193. Items: tag_map[_id],
  1194. }
  1195. info_map[pid] = append(info_map[pid], &ju.NameCode{
  1196. Name: name,
  1197. Code: qu.ObjToString(v["code"]),
  1198. })
  1199. }
  1200. classify_map := map[string][]*ju.NameCode{}
  1201. for _, v := range *classify {
  1202. _id := qu.BsonIdToSId(v["_id"])
  1203. if info_map[_id] == nil {
  1204. continue
  1205. }
  1206. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1207. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1208. }
  1209. }
  1210. return &ju.BlockClassify{
  1211. Type: classify_map,
  1212. Classify: info_tag,
  1213. }
  1214. }