extractInit.go 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. log "github.com/donnie4w/go-logger/logger"
  14. )
  15. type RegLuaInfo struct { //正则或脚本信息
  16. Code, Name, Field string //
  17. RuleText string //
  18. IsLua bool //
  19. RegPreBac *ExtReg //
  20. RegCore *ExtReg //
  21. LFields map[string]string //lua抽取字段属性组
  22. }
  23. type ExtReg struct {
  24. Reg *regexp.Regexp
  25. Replace string
  26. Bextract bool
  27. ExtractPos map[string]int
  28. NumSign int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
  29. }
  30. type RuleCore struct {
  31. Field string //逻辑字段
  32. LuaLogic string //进入逻辑
  33. ExtFrom string //从哪个字段抽取
  34. RulePres []*RegLuaInfo //抽取前置规则
  35. RuleBacks []*RegLuaInfo //抽取后置规则
  36. RuleCores []*RegLuaInfo //抽取规则
  37. }
  38. type Tag struct {
  39. Type string //标签类型 string 字符串、regexp 正则
  40. Key string //
  41. Reg *regexp.Regexp //
  42. }
  43. type TaskInfo struct {
  44. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  45. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  46. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  47. TestColl, LastExtId string //测试结果表、上次抽取信息id
  48. FDB *db.Pool //数据库连接池
  49. TDB *db.Pool //数据库连接池
  50. IsEtxLog bool //是否开启抽取日志
  51. ProcessPool chan bool //任务进程池
  52. TestLua bool //检查测试用
  53. }
  54. type ExtractTask struct {
  55. Id string //任务id
  56. IsRun bool //是否启动
  57. Content string //信息内容
  58. TaskInfo *TaskInfo //任务信息
  59. RulePres []*RegLuaInfo //通用前置规则
  60. RuleBacks []*RegLuaInfo //通用后置规则
  61. RuleBlock *ju.RuleBlock
  62. //RuleCores []*RuleCore //抽取规则
  63. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  64. PkgRuleCores []*RuleCore //分包抽取规则
  65. Tag map[string][]*Tag //标签库
  66. ClearFn map[string][]string //清理函数
  67. IsExtractCity bool //是否开启城市抽取
  68. Fields map[string]int //抽取属性组
  69. IsFileField bool //是否开启附件抽取
  70. FileFields *sync.Map //抽取附件属性组
  71. ResultChanel chan bool //抽取结果详情
  72. ResultArr [][]map[string]interface{} //抽取结果详情
  73. BidChanel chan bool //抽取结果
  74. BidArr [][]map[string]interface{} //抽取结果
  75. BidTotal int //结果数量
  76. RecogFieldMap map[string]map[string]interface{} //识别字段
  77. FidClassMap map[string][]map[string]interface{} //分类
  78. CidRuleMap map[string][]map[string]interface{} //规则
  79. AuditFields []string //需要审核的字段名称
  80. ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
  81. ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
  82. CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
  83. CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
  84. CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
  85. DistrictCityMap map[string]*City //区或县对应的city
  86. DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
  87. StreetDistrictMap map[string]*District //街道对应的区或县
  88. ProvinceAllGet *ju.DFA //省全称
  89. ProvinceSimGet *ju.DFA //省简称
  90. CityAllGet *ju.DFA //市全称
  91. CitySimGet *ju.DFA //市简称
  92. DistrictAllGet *ju.DFA //区或县全称
  93. DistrictSimGet *ju.DFA //区或县简称
  94. StreetGet *ju.DFA //街道
  95. PostCodeMap map[string]*PostCode //邮编
  96. AreaCodeMap map[string]*AreaCode //区号
  97. InfoType []map[string]interface{}
  98. }
  99. type ClearTaskInfo struct {
  100. Name, Version, VersionId string //名称、版本、版本id
  101. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  102. FDB *db.Pool //数据库连接池
  103. TDB *db.Pool //数据库连接池
  104. IsCltLog bool //是否开启清理日志
  105. ProcessPool chan bool //任务进程池
  106. }
  107. type ClearLua struct {
  108. Field string //字段字段
  109. Code string //代码
  110. Name string //名称
  111. LuaText string
  112. //LuaLogic string //进入逻辑
  113. //ExtFrom string //从哪个字段抽取
  114. LFields map[string]string //lua抽取字段属性组
  115. }
  116. type ClearTask struct {
  117. Id string //任务id
  118. Content string //信息内容
  119. ClearTaskInfo *ClearTaskInfo //任务信息
  120. ClearLuas map[string][]*ClearLua //清理脚本
  121. UpdateResult [][]map[string]interface{} //清理后结果
  122. ClearChannel chan bool
  123. }
  124. func init() {
  125. TaskList = make(map[string]*ExtractTask)
  126. ClearTaskList = make(map[string]*ClearTask)
  127. go SaveExtLog()
  128. go SaveCltLog() //保存清理日志
  129. }
  130. //加载任务信息
  131. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  132. task, _ := db.Mgo.FindById("task", e.Id, nil)
  133. if len(*task) > 1 {
  134. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  135. e.TaskInfo = &TaskInfo{
  136. Name: (*task)["s_taskname"].(string),
  137. Version: (*task)["s_version"].(string),
  138. VersionId: qu.BsonIdToSId((*v)["_id"]),
  139. TrackColl: trackcoll,
  140. FromDbAddr: (*task)["s_mgoaddr"].(string),
  141. FromDB: (*task)["s_mgodb"].(string),
  142. FromColl: (*task)["s_mgocoll"].(string),
  143. TestColl: resultcoll,
  144. IsEtxLog: true,
  145. ProcessPool: make(chan bool, 1),
  146. }
  147. if (*v)["isextractcity"] != nil {
  148. e.IsExtractCity = (*v)["isextractcity"].(bool)
  149. }
  150. } else {
  151. return
  152. }
  153. }
  154. //加载任务信息
  155. func (e *ExtractTask) InitTaskInfo() {
  156. task, _ := db.Mgo.FindById("task", e.Id, nil)
  157. log.Debug("task", task)
  158. if len(*task) > 1 {
  159. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  160. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  161. log.Debug("s_mgosavecoll", strs)
  162. if len(strs) < 3 {
  163. return
  164. } else {
  165. e.TaskInfo = &TaskInfo{
  166. Name: (*task)["s_taskname"].(string),
  167. Version: (*task)["s_version"].(string),
  168. VersionId: qu.BsonIdToSId((*v)["_id"]),
  169. //TrackColl: (*task)["s_trackcoll"].(string),
  170. FromDbAddr: (*task)["s_mgoaddr"].(string),
  171. FromDB: (*task)["s_mgodb"].(string),
  172. FromColl: (*task)["s_mgocoll"].(string),
  173. ToDbAddr: strs[0],
  174. ToDB: strs[1],
  175. ToColl: strs[2],
  176. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  177. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  178. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  179. }
  180. if (*v)["isextractcity"] != nil {
  181. e.IsExtractCity = (*v)["isextractcity"].(bool)
  182. }
  183. }
  184. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  185. } else {
  186. return
  187. }
  188. }
  189. //加载通用前置规则
  190. func (e *ExtractTask) InitRulePres() {
  191. defer qu.Catch()
  192. e.RulePres = []*RegLuaInfo{}
  193. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  194. for _, v := range *list {
  195. rinfo := &RegLuaInfo{
  196. Code: v["s_code"].(string),
  197. Name: v["s_name"].(string),
  198. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  199. }
  200. if rinfo.IsLua {
  201. rinfo.RuleText = v["s_luascript"].(string)
  202. e.RulePres = append(e.RulePres, rinfo)
  203. } else {
  204. qu.Try(func() {
  205. rinfo.RuleText = v["s_rule"].(string)
  206. tmp := strings.Split(rinfo.RuleText, "__")
  207. var pattern string
  208. if strings.Contains(tmp[0], "\\u") {
  209. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  210. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  211. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  212. } else {
  213. pattern = tmp[0]
  214. }
  215. if len(tmp) == 2 {
  216. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  217. } else {
  218. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  219. }
  220. e.RulePres = append(e.RulePres, rinfo)
  221. }, func(err interface{}) {
  222. log.Debug(rinfo.Code, rinfo.Field, err)
  223. })
  224. }
  225. }
  226. }
  227. //加载通用后置规则
  228. func (e *ExtractTask) InitRuleBacks() {
  229. defer qu.Catch()
  230. e.RuleBacks = []*RegLuaInfo{}
  231. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  232. for _, v := range *list {
  233. rinfo := &RegLuaInfo{
  234. Code: v["s_code"].(string),
  235. Name: v["s_name"].(string),
  236. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  237. }
  238. if rinfo.IsLua {
  239. rinfo.RuleText = v["s_luascript"].(string)
  240. e.RuleBacks = append(e.RuleBacks, rinfo)
  241. } else {
  242. qu.Try(func() {
  243. rinfo.RuleText = v["s_rule"].(string)
  244. tmp := strings.Split(rinfo.RuleText, "__")
  245. var pattern string
  246. if strings.Contains(tmp[0], "\\u") {
  247. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  248. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  249. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  250. } else {
  251. pattern = tmp[0]
  252. }
  253. if len(tmp) == 2 {
  254. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  255. } else {
  256. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  257. }
  258. e.RuleBacks = append(e.RuleBacks, rinfo)
  259. }, func(err interface{}) {
  260. log.Debug(rinfo.Code, rinfo.Field, err)
  261. })
  262. }
  263. }
  264. }
  265. func (e *ExtractTask) InfoTypeList() {
  266. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  267. infolist := *infolist1
  268. for _, v := range infolist {
  269. e.InfoType = append(e.InfoType, v)
  270. }
  271. }
  272. //加载抽取规则
  273. func (e *ExtractTask) InitRuleCore() {
  274. defer qu.Catch()
  275. e.Fields = map[string]int{}
  276. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  277. e.RuleCores = make(map[string]map[string][]*RuleCore)
  278. for _, v := range *infolist {
  279. topclass := qu.ObjToString(v["topclass"])
  280. if v["subclass"] == nil {
  281. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  282. for attr, _ := range v["fields"].(map[string]interface{}) {
  283. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
  284. e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
  285. }
  286. } else {
  287. for ca, fs := range v["subclass"].(map[string]interface{}) {
  288. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  289. for field, _ := range fs.(map[string]interface{}) {
  290. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
  291. e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
  292. }
  293. }
  294. }
  295. }
  296. }
  297. func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
  298. maps := []*RuleCore{}
  299. if b, _ := vinfo["isuse"].(bool); !b {
  300. return nil
  301. }
  302. s_field := qu.ObjToString(vinfo["s_field"])
  303. pid := qu.BsonIdToSId(vinfo["_id"])
  304. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  305. for _, vv := range *list {
  306. if b, _ := vv["isuse"].(bool); !b {
  307. continue
  308. }
  309. rcore := &RuleCore{}
  310. rcore.Field = s_field
  311. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  312. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  313. //前置规则
  314. rulePres := []*RegLuaInfo{}
  315. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  316. for _, v := range *plist {
  317. rinfo := &RegLuaInfo{
  318. Field: qu.ObjToString(v["s_field"]),
  319. Code: v["s_code"].(string),
  320. Name: v["s_name"].(string),
  321. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  322. }
  323. if rinfo.IsLua {
  324. rinfo.RuleText = v["s_luascript"].(string)
  325. rulePres = append(rulePres, rinfo)
  326. } else {
  327. qu.Try(func() {
  328. rinfo.RuleText = v["s_rule"].(string)
  329. tmp := strings.Split(rinfo.RuleText, "__")
  330. var pattern string
  331. if strings.Contains(tmp[0], "\\u") {
  332. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  333. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  334. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  335. } else {
  336. pattern = tmp[0]
  337. }
  338. if len(tmp) == 2 {
  339. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  340. } else {
  341. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  342. }
  343. rulePres = append(rulePres, rinfo)
  344. }, func(err interface{}) {
  345. log.Debug(rinfo.Code, rinfo.Field, err)
  346. })
  347. }
  348. }
  349. rcore.RulePres = rulePres
  350. //后置规则
  351. ruleBacks := []*RegLuaInfo{}
  352. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  353. for _, v := range *blist {
  354. rinfo := &RegLuaInfo{
  355. Field: qu.ObjToString(v["s_field"]),
  356. Code: v["s_code"].(string),
  357. Name: v["s_name"].(string),
  358. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  359. }
  360. if rinfo.IsLua {
  361. rinfo.RuleText = v["s_luascript"].(string)
  362. ruleBacks = append(ruleBacks, rinfo)
  363. } else {
  364. qu.Try(func() {
  365. rinfo.RuleText = v["s_rule"].(string)
  366. tmp := strings.Split(rinfo.RuleText, "__")
  367. var pattern string
  368. if strings.Contains(tmp[0], "\\u") {
  369. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  370. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  371. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  372. } else {
  373. pattern = tmp[0]
  374. }
  375. if len(tmp) == 2 {
  376. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  377. } else {
  378. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  379. }
  380. ruleBacks = append(ruleBacks, rinfo)
  381. }, func(err interface{}) {
  382. log.Debug(rinfo.Code, rinfo.Field, err)
  383. })
  384. }
  385. }
  386. rcore.RuleBacks = ruleBacks
  387. //抽取规则
  388. ruleCores := []*RegLuaInfo{}
  389. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  390. for _, v := range *clist {
  391. if b, _ := v["isuse"].(bool); !b {
  392. continue
  393. }
  394. field := qu.ObjToString(v["s_field"])
  395. e.Fields[field] = 1 //加入抽取属性组备用
  396. rinfo := &RegLuaInfo{
  397. Field: field,
  398. Code: v["s_code"].(string),
  399. Name: v["s_name"].(string),
  400. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  401. }
  402. if rinfo.IsLua {
  403. rinfo.RuleText = v["s_luascript"].(string)
  404. //提取全部属性
  405. rinfo.LFields = getALLFields()
  406. ruleCores = append(ruleCores, rinfo)
  407. } else {
  408. qu.Try(func() {
  409. rinfo.RuleText = v["s_rule"].(string)
  410. ptmp := strings.Split(rinfo.RuleText, "#")
  411. sign := 0
  412. if len(ptmp) == 2 {
  413. if ptmp[1] == "正" {
  414. sign = 1
  415. } else if ptmp[1] == "负" {
  416. sign = -1
  417. }
  418. }
  419. tmp := strings.Split(ptmp[0], "__")
  420. var pattern string
  421. if strings.Contains(tmp[0], "\\u") {
  422. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  423. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  424. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  425. } else {
  426. pattern = tmp[0]
  427. }
  428. if len(tmp) == 2 {
  429. epos := strings.Split(tmp[1], ",")
  430. posm := map[string]int{}
  431. for _, v := range epos {
  432. ks := strings.Split(v, ":")
  433. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  434. posm[ks[1]] = qu.IntAll(ks[0])
  435. } else { //(.*)招标公告__2
  436. posm[rinfo.Field] = qu.IntAll(ks[0])
  437. }
  438. }
  439. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
  440. } else {
  441. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  442. }
  443. ruleCores = append(ruleCores, rinfo)
  444. }, func(err interface{}) {
  445. log.Debug(rinfo.Code, rinfo.Field, err)
  446. })
  447. }
  448. }
  449. rcore.RuleCores = ruleCores
  450. //
  451. maps = append(maps, rcore)
  452. }
  453. return maps
  454. }
  455. //加载分包抽取规则
  456. func (e *ExtractTask) InitPkgCore() {
  457. defer qu.Catch()
  458. e.PkgRuleCores = []*RuleCore{}
  459. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  460. for _, pkginfo := range *pkginfos {
  461. if b, _ := pkginfo["isuse"].(bool); !b {
  462. continue
  463. }
  464. s_field := qu.ObjToString(pkginfo["s_field"])
  465. pid := qu.BsonIdToSId(pkginfo["_id"])
  466. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  467. for _, vv := range *logicList {
  468. if b, _ := vv["isuse"].(bool); !b {
  469. continue
  470. }
  471. rcore := &RuleCore{}
  472. rcore.Field = s_field
  473. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  474. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  475. //后置规则
  476. ruleBacks := []*RegLuaInfo{}
  477. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  478. for _, v := range *blist {
  479. rinfo := &RegLuaInfo{
  480. Field: qu.ObjToString(v["s_field"]),
  481. Code: v["s_code"].(string),
  482. Name: v["s_name"].(string),
  483. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  484. }
  485. if rinfo.IsLua {
  486. rinfo.RuleText = v["s_luascript"].(string)
  487. ruleBacks = append(ruleBacks, rinfo)
  488. } else {
  489. qu.Try(func() {
  490. rinfo.RuleText = v["s_rule"].(string)
  491. tmp := strings.Split(rinfo.RuleText, "__")
  492. var pattern string
  493. if strings.Contains(tmp[0], "\\u") {
  494. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  495. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  496. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  497. } else {
  498. pattern = tmp[0]
  499. }
  500. if len(tmp) == 2 {
  501. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  502. } else {
  503. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  504. }
  505. ruleBacks = append(ruleBacks, rinfo)
  506. }, func(err interface{}) {
  507. log.Debug(rinfo.Code, rinfo.Field, err)
  508. })
  509. }
  510. }
  511. rcore.RuleBacks = ruleBacks
  512. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  513. }
  514. }
  515. }
  516. //加载标签库
  517. func (e *ExtractTask) InitTag() {
  518. defer qu.Catch()
  519. e.Tag = map[string][]*Tag{}
  520. //字符串标签库
  521. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  522. for _, v := range *list {
  523. field := qu.ObjToString(v["s_field"])
  524. if tmp, ok := v["content"].([]interface{}); ok {
  525. fname := qu.ObjToString(v["s_name"])
  526. tab := ju.TagFile{Name: fname} //用于表格kv
  527. tab.Items = make([]*ju.Tag, len(tmp))
  528. for k, key := range tmp {
  529. tag := &Tag{Type: "string", Key: key.(string)}
  530. e.Tag[field] = append(e.Tag[field], tag)
  531. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  532. }
  533. sort.Sort(tab.Items)
  534. ju.TagdbTable[fname] = &tab
  535. }
  536. }
  537. //正则标签库
  538. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  539. for _, v := range *list {
  540. field := qu.ObjToString(v["s_field"])
  541. if tmp, ok := v["content"].([]interface{}); ok {
  542. fname := qu.ObjToString(v["s_name"])
  543. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  544. tab.Items = make([]*ju.Tag, len(tmp))
  545. for k, key := range tmp {
  546. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  547. e.Tag[field] = append(e.Tag[field], tag)
  548. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  549. }
  550. sort.Sort(tab.Items)
  551. ju.TagdbTable[fname+"_reg"] = &tab
  552. }
  553. }
  554. }
  555. //获取fields
  556. func getALLFields() map[string]string {
  557. fields := map[string]string{}
  558. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  559. for _, v := range *list {
  560. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  561. }
  562. return fields
  563. }
  564. //加载clear函数
  565. func (e *ExtractTask) InitClearFn() {
  566. defer qu.Catch()
  567. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  568. fn := map[string][]string{}
  569. for _, tmp := range *list {
  570. field := tmp["s_field"].(string)
  571. fns := tmp["clear"].([]interface{})
  572. if fn[field] == nil {
  573. fn[field] = []string{}
  574. }
  575. for _, v := range fns {
  576. fn[field] = append(fn[field], v.(string))
  577. }
  578. }
  579. e.ClearFn = fn
  580. }
  581. //加载省份
  582. func InitProvince(version string) map[string]interface{} {
  583. defer qu.Catch()
  584. fn := map[string]interface{}{}
  585. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  586. for _, v := range *list {
  587. name := qu.ObjToString(v["s_name"])
  588. content := v["content"]
  589. switch content.(type) {
  590. case string:
  591. fn[name] = []interface{}{content.(string)}
  592. case []interface{}:
  593. fn[name] = content
  594. }
  595. }
  596. return fn
  597. }
  598. //加载城市简称
  599. func InitCitySim(version string) map[string]map[string]interface{} {
  600. defer qu.Catch()
  601. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  602. fn := map[string]map[string]interface{}{}
  603. for _, v := range *list {
  604. name := qu.ObjToString(v["s_name"])
  605. tmp := v["content"].(map[string]interface{})
  606. fn[name] = tmp
  607. }
  608. return fn
  609. }
  610. //加载城市全称
  611. func InitCityAll(version string) map[string]map[string]interface{} {
  612. defer qu.Catch()
  613. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  614. fn := map[string]map[string]interface{}{}
  615. for _, v := range *list {
  616. name := qu.ObjToString(v["s_name"])
  617. tmp := v["content"].(map[string]interface{})
  618. fn[name] = tmp
  619. }
  620. return fn
  621. }
  622. //初始化城市省份敏感词
  623. func (e *ExtractTask) InitCityDFA() {
  624. defer qu.Catch()
  625. e.CityAllGet = &ju.DFA{}
  626. e.CitySimGet = &ju.DFA{}
  627. e.DistrictAllGet = &ju.DFA{}
  628. e.DistrictSimGet = &ju.DFA{}
  629. e.ProvinceAllGet = &ju.DFA{}
  630. e.ProvinceSimGet = &ju.DFA{}
  631. e.StreetGet = &ju.DFA{}
  632. //初始化map
  633. if e.ProvinceMap == nil {
  634. e.ProvinceMap = make(map[string]string)
  635. }
  636. if e.CityMap == nil {
  637. e.CityMap = make(map[string]string)
  638. }
  639. if e.DistrictSimAndAll == nil {
  640. e.DistrictSimAndAll = make(map[string]string)
  641. }
  642. if e.CityBriefMap == nil {
  643. e.CityBriefMap = make(map[string]*City)
  644. }
  645. if e.CityFullMap == nil {
  646. e.CityFullMap = make(map[string]*City)
  647. }
  648. if e.ProvinceBriefMap == nil {
  649. e.ProvinceBriefMap = make(map[string]*Province)
  650. }
  651. if e.DistrictCityMap == nil {
  652. e.DistrictCityMap = make(map[string]*City)
  653. }
  654. if e.StreetDistrictMap == nil {
  655. e.StreetDistrictMap = make(map[string]*District)
  656. }
  657. //初始化省
  658. fn1 := InitProvince(e.TaskInfo.Version)
  659. for k, v := range fn1 {
  660. for _, p := range v.([]interface{}) {
  661. p1, _ := p.(string)
  662. e.ProvinceAllGet.AddWord(p1) //华中科技大学
  663. e.ProvinceMap[p1] = k //华中科技大学:湖北
  664. }
  665. }
  666. //初始化城市全称
  667. fn2 := InitCityAll(e.TaskInfo.Version)
  668. for k, v := range fn2 {
  669. //加载省信息
  670. e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
  671. p := &Province{}
  672. p.Name = k //省全称:浙江省
  673. p.Brief = v["brief"].(string) //省简称:浙江
  674. e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
  675. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  676. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  677. p.Cap = v["captial"].(string) //省会(杭州)
  678. //加载市信息
  679. city, _ := v["city"].(map[string]interface{})
  680. for k1, v1 := range city {
  681. e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
  682. v1m, _ := v1.(map[string]interface{})
  683. c := &City{}
  684. c.Name = k1 //市全称:杭州市
  685. c.Brief = v1m["brief"].(string) //市简称:杭州
  686. e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
  687. e.CityMap[k1] = c.Brief //杭州市:杭州
  688. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  689. e.CityFullMap[k1] = c //杭州市:市信息{}
  690. c.P = p
  691. if c.Name == p.Cap {
  692. p.Captial = c //加载province中的省会市信息{}
  693. }
  694. //区县
  695. districtmap := v1m["area"].(map[string]interface{}) //区或县
  696. for district, streetarr := range districtmap {
  697. d := &District{}
  698. d.Name = district
  699. d.C = c
  700. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  701. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
  702. e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
  703. ctmp := e.DistrictCityMap[district]
  704. if ctmp == nil {
  705. e.DistrictCityMap[district] = c
  706. }
  707. //街道
  708. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  709. e.StreetGet.AddWord(s) //加入街道敏感词
  710. dtmp := e.StreetDistrictMap[s]
  711. if dtmp == nil {
  712. e.StreetDistrictMap[s] = d
  713. }
  714. }
  715. }
  716. }
  717. }
  718. //初始化城市简称
  719. fn3 := InitCitySim(e.TaskInfo.Version)
  720. for _, v := range fn3 {
  721. city, _ := v["city"].(map[string]interface{})
  722. for _, v1 := range city {
  723. v1m, _ := v1.(map[string]interface{})
  724. cb := v1m["brief"].(string) //市简称
  725. arr := v1m["area"].(map[string]interface{}) //区或县简称
  726. for districtsim, districtall := range arr {
  727. e.DistrictSimAndAll[districtsim] = districtall.(string)
  728. d := &District{}
  729. d.Name = districtsim
  730. d.C = e.CityBriefMap[cb]
  731. e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
  732. ctmp := e.DistrictCityMap[districtsim]
  733. if ctmp == nil {
  734. e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
  735. }
  736. }
  737. }
  738. }
  739. }
  740. //初始化邮编库
  741. func (e *ExtractTask) InitPostCode() {
  742. defer qu.Catch()
  743. if e.PostCodeMap == nil {
  744. e.PostCodeMap = make(map[string]*PostCode)
  745. }
  746. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  747. for _, l := range *list {
  748. pc := &PostCode{}
  749. pc.Code = qu.ObjToString(l["code"])
  750. pc.P = qu.ObjToString(l["province"])
  751. pc.C = qu.ObjToString(l["city"])
  752. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  753. e.PostCodeMap[pc.Code] = pc
  754. }
  755. }
  756. //初始化区号库
  757. func (e *ExtractTask) InitAreaCode() {
  758. defer qu.Catch()
  759. if e.AreaCodeMap == nil {
  760. e.AreaCodeMap = make(map[string]*AreaCode)
  761. }
  762. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  763. for _, l := range *list {
  764. ac := &AreaCode{}
  765. ac.Code = qu.ObjToString(l["code"])
  766. ac.P = qu.ObjToString(l["province"])
  767. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  768. e.AreaCodeMap[ac.Code] = ac
  769. }
  770. }
  771. //初始化城市省份敏感词
  772. //func (e *ExtractTask) InitCityDFA() {
  773. // defer qu.Catch()
  774. // e.CityAllGet = &ju.DFA{}
  775. // e.DistrictGet = &ju.DFA{}
  776. // e.AreaProvinceGet = &ju.DFA{}
  777. // e.StreetGet = &ju.DFA{}
  778. // //初始化map
  779. // if e.ProvinceMap == nil {
  780. // e.ProvinceMap = make(map[string]string)
  781. // }
  782. // if e.CityBriefMap == nil {
  783. // e.CityBriefMap = make(map[string]*City)
  784. // }
  785. // if e.ProvinceBriefMap == nil {
  786. // e.ProvinceBriefMap = make(map[string]*Province)
  787. // }
  788. // if e.AreaToCityMap == nil {
  789. // e.AreaToCityMap = make(map[string][]*City)
  790. // }
  791. // if e.DistrictCityMap == nil {
  792. // e.DistrictCityMap = make(map[string]*City)
  793. // }
  794. // if e.StreetDistrictMap == nil {
  795. // e.StreetDistrictMap = make(map[string]*District)
  796. // }
  797. // //初始化省
  798. // fn1 := InitProvince(e.TaskInfo.Version)
  799. // for k, v := range fn1 {
  800. // for _, p := range v.([]interface{}) {
  801. // p1, _ := p.(string)
  802. // e.AreaProvinceGet.AddWord(p1) //华中科技大学
  803. // e.ProvinceMap[p1] = k //华中科技大学:湖北
  804. // }
  805. // }
  806. // //初始化城市全称
  807. // fn2 := InitCityAll(e.TaskInfo.Version)
  808. // for k, v := range fn2 {
  809. // e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
  810. // p := &Province{}
  811. // p.Name = k //省全称
  812. // p.Brief = v["brief"].(string) //省简称
  813. // e.ProvinceMap[k] = p.Brief //浙江省:浙江
  814. // e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
  815. // p.Cap = v["captial"].(string) //省会(杭州)
  816. // city, _ := v["city"].(map[string]interface{})
  817. // //
  818. // for k1, v1 := range city {
  819. // v1m, _ := v1.(map[string]interface{})
  820. // c := &City{}
  821. // c.Name = k1
  822. // c.Brief = v1m["brief"].(string)
  823. // e.CityBriefMap[c.Brief] = c
  824. // c.P = p
  825. // if c.Brief == p.Cap {
  826. // p.Captial = c
  827. // }
  828. // //加入到城市map中
  829. // //
  830. // cs := e.AreaToCityMap[k1]
  831. // e.CityAllGet.AddWord(k1) //市全称
  832. // if cs != nil {
  833. // cs = append(cs, c)
  834. // } else {
  835. // cs = []*City{c}
  836. // }
  837. // e.AreaToCityMap[k1] = cs
  838. // //区县
  839. // districtmap := v1m["area"].(map[string]interface{}) //区或县
  840. // for district, streetarr := range districtmap {
  841. // d := &District{}
  842. // d.Name = district
  843. // d.C = c
  844. // e.DistrictGet.AddWord(district) //加入区或县敏感词
  845. // ctmp := e.DistrictCityMap[district]
  846. // if ctmp == nil {
  847. // e.DistrictCityMap[district] = c
  848. // }
  849. // //街道
  850. // for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  851. // e.StreetGet.AddWord(s) //加入街道敏感词
  852. // dtmp := e.StreetDistrictMap[s]
  853. // if dtmp == nil {
  854. // e.StreetDistrictMap[s] = d
  855. // }
  856. // }
  857. // }
  858. // }
  859. // }
  860. // //初始化城市简称
  861. // fn3 := InitCitySim(e.TaskInfo.Version)
  862. // e.CitySimGet = &ju.DFA{}
  863. // for k, v := range fn3 {
  864. // pb := v["brief"].(string)
  865. // p := e.ProvinceBriefMap[pb]
  866. // //加载
  867. // for _, ss := range []string{k, pb} { //省全称和省简称
  868. // cs := e.AreaToCityMap[ss]
  869. // if cs != nil {
  870. // cs = append(cs, p.Captial)
  871. // } else {
  872. // cs = []*City{p.Captial}
  873. // }
  874. // e.AreaToCityMap[ss] = cs
  875. // e.CitySimGet.AddWord(ss)
  876. // }
  877. // city, _ := v["city"].(map[string]interface{})
  878. // for k1, v1 := range city {
  879. // v1m, _ := v1.(map[string]interface{})
  880. // if v1m["brief"] == nil {
  881. // }
  882. // cb := v1m["brief"].(string)
  883. // c := e.AreaToCityMap[k1][0]
  884. // //加入到城市map中
  885. // for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  886. // e.CitySimGet.AddWord(ss)
  887. // cs := e.AreaToCityMap[ss]
  888. // if cs != nil {
  889. // cs = append(cs, c)
  890. // } else {
  891. // cs = []*City{c}
  892. // }
  893. // e.AreaToCityMap[ss] = cs
  894. // }
  895. // arr := v1m["area"].([]interface{})
  896. // for _, k2 := range arr {
  897. // s := k2.(string)
  898. // for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  899. // cs := e.AreaToCityMap[ss]
  900. // e.CitySimGet.AddWord(ss)
  901. // if cs != nil {
  902. // cs = append(cs, c)
  903. // } else {
  904. // cs = []*City{c}
  905. // }
  906. // e.AreaToCityMap[ss] = cs
  907. // //只加入简称
  908. // if n == 0 {
  909. // d := &District{}
  910. // d.Name = ss
  911. // d.C = c
  912. // e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
  913. // ctmp := e.DistrictCityMap[ss]
  914. // if ctmp == nil {
  915. // e.DistrictCityMap[ss] = c
  916. // }
  917. // }
  918. // }
  919. // }
  920. // }
  921. // }
  922. //}
  923. //保存抽取详情数据
  924. func (e *ExtractTask) ResultSave(init bool) {
  925. defer qu.Catch()
  926. if e.ResultArr == nil {
  927. e.ResultArr = [][]map[string]interface{}{}
  928. }
  929. if init {
  930. go func() {
  931. for {
  932. if len(e.ResultArr) > 500 {
  933. arr := e.ResultArr[:500]
  934. qu.Try(func() {
  935. db.Mgo.UpSertBulk("extract_result", arr...)
  936. }, func(err interface{}) {
  937. log.Debug(err)
  938. })
  939. e.ResultArr = e.ResultArr[500:]
  940. } else {
  941. arr := e.ResultArr
  942. qu.Try(func() {
  943. db.Mgo.UpSertBulk("extract_result", arr...)
  944. }, func(err interface{}) {
  945. log.Debug(err)
  946. })
  947. e.ResultArr = [][]map[string]interface{}{}
  948. }
  949. time.Sleep(10 * time.Second)
  950. }
  951. }()
  952. } else {
  953. arr := e.ResultArr
  954. qu.Try(func() {
  955. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  956. }, func(err interface{}) {
  957. log.Debug(err)
  958. })
  959. e.ResultArr = [][]map[string]interface{}{}
  960. }
  961. }
  962. //保存抽取数据
  963. func (e *ExtractTask) BidSave(init bool) {
  964. defer qu.Catch()
  965. if e.BidArr == nil {
  966. e.BidArr = [][]map[string]interface{}{}
  967. }
  968. if init {
  969. go func() {
  970. for {
  971. if len(e.BidArr) > 500 {
  972. arr := e.BidArr[:500]
  973. qu.Try(func() {
  974. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  975. }, func(err interface{}) {
  976. log.Debug(err)
  977. })
  978. e.BidArr = e.BidArr[500:]
  979. } else {
  980. arr := e.BidArr
  981. qu.Try(func() {
  982. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  983. }, func(err interface{}) {
  984. log.Debug(err)
  985. })
  986. e.BidArr = [][]map[string]interface{}{}
  987. }
  988. time.Sleep(10 * time.Second)
  989. }
  990. }()
  991. } else {
  992. arr := e.BidArr
  993. qu.Try(func() {
  994. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  995. }, func(err interface{}) {
  996. log.Debug(err)
  997. })
  998. e.BidArr = [][]map[string]interface{}{}
  999. time.Sleep(1 * time.Second)
  1000. }
  1001. }
  1002. func (e *ExtractTask) InitAuditRecogField() {
  1003. defer qu.Catch()
  1004. e.RecogFieldMap = make(map[string]map[string]interface{})
  1005. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1006. for _, f := range *recogFieldList {
  1007. field := qu.ObjToString(f["s_recogfield"])
  1008. e.RecogFieldMap[field] = f
  1009. }
  1010. }
  1011. func (e *ExtractTask) InitAuditClass() {
  1012. defer qu.Catch()
  1013. e.FidClassMap = make(map[string][]map[string]interface{})
  1014. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1015. for _, c := range *class {
  1016. classList := []map[string]interface{}{}
  1017. fid := qu.ObjToString(c["s_fid"])
  1018. if len(e.FidClassMap[fid]) > 0 { //追加
  1019. classList = e.FidClassMap[fid]
  1020. }
  1021. classList = append(classList, c)
  1022. e.FidClassMap[fid] = classList
  1023. }
  1024. }
  1025. //加载规则
  1026. func (e *ExtractTask) InitAuditRule() {
  1027. defer qu.Catch()
  1028. var rureg *regexp.Regexp
  1029. var rs []rune
  1030. var ru string
  1031. var err error
  1032. e.CidRuleMap = make(map[string][]map[string]interface{})
  1033. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1034. for _, v := range *rule {
  1035. i_rule := []interface{}{}
  1036. ss, _ := (v["s_rule"].([]interface{}))
  1037. for _, r := range qu.ObjArrToStringArr(ss) {
  1038. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1039. rs = []rune(r)
  1040. ru = string(rs[1 : len(rs)-1])
  1041. rureg, err = regexp.Compile(ru)
  1042. if err != nil {
  1043. log.Debug("error---rule:", r)
  1044. continue
  1045. }
  1046. i_rule = append(i_rule, []interface{}{rureg}...)
  1047. } else { //规则
  1048. i_rule = append(i_rule, r)
  1049. }
  1050. }
  1051. v["rule"] = i_rule
  1052. ruleList := []map[string]interface{}{}
  1053. classid := qu.ObjToString(v["s_classid"])
  1054. if len(e.CidRuleMap[classid]) > 0 { //追加
  1055. ruleList = e.CidRuleMap[classid]
  1056. }
  1057. ruleList = append(ruleList, v)
  1058. e.CidRuleMap[classid] = ruleList
  1059. }
  1060. }
  1061. //
  1062. func (e *ExtractTask) InitAuditFields() {
  1063. if len(e.AuditFields) == 0 {
  1064. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1065. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1066. vid := qu.BsonIdToSId((*v)["_id"])
  1067. query := map[string]interface{}{
  1068. "isaudit": true,
  1069. "delete": false,
  1070. "vid": vid,
  1071. }
  1072. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1073. for _, d := range *data {
  1074. field := qu.ObjToString(d["s_field"])
  1075. e.AuditFields = append(e.AuditFields, field)
  1076. }
  1077. }
  1078. }
  1079. }
  1080. //加载附件抽取
  1081. func (e *ExtractTask) InitFile() {
  1082. defer qu.Catch()
  1083. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1084. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1085. //ve, _ := db.Mgo.FindOne("version", query)
  1086. if ve == nil {
  1087. return
  1088. }
  1089. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1090. e.IsFileField = true
  1091. }
  1092. syscefiled := new(sync.Map)
  1093. if (*ve)["s_filefileds"] != nil {
  1094. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1095. syscefiled.Store(vff.(string), 1)
  1096. }
  1097. }
  1098. e.FileFields = syscefiled
  1099. }
  1100. //加载清理任务信息
  1101. func (c *ClearTask) InitClearTaskInfo() {
  1102. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1103. if len(*cleartask) > 1 {
  1104. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1105. c.ClearTaskInfo = &ClearTaskInfo{
  1106. Name: (*cleartask)["s_taskname"].(string),
  1107. Version: (*cleartask)["s_version"].(string),
  1108. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1109. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1110. FromDB: (*cleartask)["s_mgodb"].(string),
  1111. FromColl: (*cleartask)["s_mgocoll"].(string),
  1112. IsCltLog: ju.Config["iscltlog"].(bool),
  1113. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1114. }
  1115. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1116. } else {
  1117. return
  1118. }
  1119. }
  1120. //加载清理脚本
  1121. func (c *ClearTask) InitClearLuas() {
  1122. defer qu.Catch()
  1123. c.ClearLuas = make(map[string][]*ClearLua)
  1124. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1125. for _, l := range *list {
  1126. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1127. continue
  1128. }
  1129. s_field := qu.ObjToString(l["s_field"])
  1130. pid := qu.BsonIdToSId(l["_id"])
  1131. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1132. for _, vv := range *luas {
  1133. if b, _ := vv["isuse"].(bool); !b {
  1134. continue
  1135. }
  1136. clearLua := &ClearLua{
  1137. Field: s_field,
  1138. Code: vv["s_code"].(string),
  1139. Name: vv["s_name"].(string),
  1140. LuaText: vv["s_luascript"].(string),
  1141. LFields: getALLFields(),
  1142. }
  1143. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1144. }
  1145. }
  1146. }
  1147. //加载分块规则
  1148. func (e *ExtractTask) InitBlockRule() {
  1149. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1150. "vid": e.TaskInfo.VersionId,
  1151. "delete": false,
  1152. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1153. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1154. for _, v := range *datas {
  1155. block_reg, _ := v["block_reg"].(string)
  1156. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1157. title_reg, _ := v["title_reg"].(string)
  1158. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1159. if block_reg == "" || title_reg == "" {
  1160. continue
  1161. }
  1162. b_reg, b_err := regexp.Compile(block_reg)
  1163. t_reg, t_err := regexp.Compile(title_reg)
  1164. if b_err != nil || t_err != nil {
  1165. continue
  1166. }
  1167. brs = append(brs, b_reg)
  1168. trs = append(trs, t_reg)
  1169. }
  1170. e.RuleBlock = &ju.RuleBlock{
  1171. BlockRegs: brs,
  1172. TitleRegs: trs,
  1173. Classify: e.InitBlockClassify(),
  1174. }
  1175. }
  1176. //加载分块规则
  1177. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1178. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1179. "vid": e.TaskInfo.VersionId,
  1180. "delete": false,
  1181. }, nil, `{"name":1}`, false, -1, -1)
  1182. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1183. "vid": e.TaskInfo.VersionId,
  1184. "delete": false,
  1185. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1186. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1187. "vid": e.TaskInfo.VersionId,
  1188. "delete": false,
  1189. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1190. tag_map := map[string]ju.Tags{}
  1191. for _, v := range *classify_tag {
  1192. pid := qu.ObjToString(v["pid"])
  1193. tag_map[pid] = append(tag_map[pid], &ju.Tag{Value: qu.ObjToString(v["name"])})
  1194. }
  1195. //
  1196. info_map := map[string][]*ju.NameCode{}
  1197. info_tag := map[string]*ju.TagFile{}
  1198. for _, v := range *classify_info {
  1199. pid := qu.ObjToString(v["pid"])
  1200. _id := qu.BsonIdToSId(v["_id"])
  1201. name := qu.ObjToString(v["name"])
  1202. info_tag[name] = &ju.TagFile{
  1203. Name: name,
  1204. Items: tag_map[_id],
  1205. }
  1206. info_map[pid] = append(info_map[pid], &ju.NameCode{
  1207. Name: name,
  1208. Code: qu.ObjToString(v["code"]),
  1209. })
  1210. }
  1211. classify_map := map[string][]*ju.NameCode{}
  1212. for _, v := range *classify {
  1213. _id := qu.BsonIdToSId(v["_id"])
  1214. if info_map[_id] == nil {
  1215. continue
  1216. }
  1217. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1218. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1219. }
  1220. }
  1221. return &ju.BlockClassify{
  1222. Type: classify_map,
  1223. Classify: info_tag,
  1224. }
  1225. }