extractInit.go 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. log "github.com/donnie4w/go-logger/logger"
  14. )
  15. type RegLuaInfo struct { //正则或脚本信息
  16. Code, Name, Field string //
  17. RuleText string //
  18. IsLua bool //
  19. RegPreBac *ExtReg //
  20. RegCore *ExtReg //
  21. LFields map[string]string //lua抽取字段属性组
  22. }
  // ExtReg is a compiled regexp rule together with what to do on a match.
  type ExtReg struct {
  	// Reg is the compiled pattern.
  	Reg *regexp.Regexp
  	// Replace is the replacement text of a pre/post rule.
  	Replace string
  	// Bextract is true when the rule carries extraction positions.
  	Bextract bool
  	// ExtractPos maps a field name to the position number written in the rule.
  	ExtractPos map[string]int
  }
  29. type RuleCore struct {
  30. Field string //逻辑字段
  31. LuaLogic string //进入逻辑
  32. ExtFrom string //从哪个字段抽取
  33. RulePres []*RegLuaInfo //抽取前置规则
  34. RuleBacks []*RegLuaInfo //抽取后置规则
  35. RuleCores []*RegLuaInfo //抽取规则
  36. }
  // Tag is one entry of the tag library.
  type Tag struct {
  	// Type is the tag kind: "string" for a literal, "regexp" for a pattern.
  	Type string
  	// Key is the literal text or the pattern source.
  	Key string
  	// Reg is the compiled pattern; it is left nil for string tags.
  	Reg *regexp.Regexp
  }
  42. type TaskInfo struct {
  43. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  44. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  45. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  46. TestColl, LastExtId string //测试结果表、上次抽取信息id
  47. FDB *db.Pool //数据库连接池
  48. TDB *db.Pool //数据库连接池
  49. IsEtxLog bool //是否开启抽取日志
  50. ProcessPool chan bool //任务进程池
  51. TestLua bool //检查测试用
  52. }
  53. type ExtractTask struct {
  54. Id string //任务id
  55. IsRun bool //是否启动
  56. Content string //信息内容
  57. TaskInfo *TaskInfo //任务信息
  58. RulePres []*RegLuaInfo //通用前置规则
  59. RuleBacks []*RegLuaInfo //通用后置规则
  60. RuleBlock *ju.RuleBlock
  61. //RuleCores []*RuleCore //抽取规则
  62. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  63. PkgRuleCores []*RuleCore //分包抽取规则
  64. Tag map[string][]*Tag //标签库
  65. ClearFn map[string][]string //清理函数
  66. IsExtractCity bool //是否开启城市抽取
  67. Fields map[string]int //抽取属性组
  68. IsFileField bool //是否开启附件抽取
  69. FileFields *sync.Map //抽取附件属性组
  70. ResultChanel chan bool //抽取结果详情
  71. ResultArr [][]map[string]interface{} //抽取结果详情
  72. BidChanel chan bool //抽取结果
  73. BidArr [][]map[string]interface{} //抽取结果
  74. RecogFieldMap map[string]map[string]interface{} //识别字段
  75. FidClassMap map[string][]map[string]interface{} //分类
  76. CidRuleMap map[string][]map[string]interface{} //规则
  77. AuditFields []string //需要审核的字段名称
  78. ProvinceMap map[string]string
  79. CityBrief map[string]*City //只加载一次即可
  80. ProvinceBrief map[string]*Province //只加载一次
  81. AreaToCity map[string][]*City //两个文件共用
  82. DistrictCityMap map[string]*City
  83. StreetDistrictMap map[string]*District
  84. AreaGet *ju.DFA //市全称
  85. AreaDistrict *ju.DFA //区或县
  86. AreaProvinceGet *ju.DFA //省
  87. AreaSimGet *ju.DFA //市简称
  88. AreaStreet *ju.DFA //街道
  89. InfoType []map[string]interface{}
  90. }
  91. type ClearTaskInfo struct {
  92. Name, Version, VersionId string //名称、版本、版本id
  93. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  94. FDB *db.Pool //数据库连接池
  95. TDB *db.Pool //数据库连接池
  96. IsCltLog bool //是否开启清理日志
  97. ProcessPool chan bool //任务进程池
  98. }
  // ClearLua describes one Lua cleanup script.
  type ClearLua struct {
  	// Field is the field the script applies to.
  	Field string
  	// Code is the script's code.
  	Code string
  	// Name is the script's display name.
  	Name string
  	// LuaText is the script source.
  	LuaText string
  	// Disabled fields kept from the original listing:
  	//   LuaLogic string // entry logic
  	//   ExtFrom  string // field to extract from

  	// LFields is the field attribute set handed to the Lua script.
  	LFields map[string]string
  }
  108. type ClearTask struct {
  109. Id string //任务id
  110. Content string //信息内容
  111. ClearTaskInfo *ClearTaskInfo //任务信息
  112. ClearLuas map[string][]*ClearLua //清理脚本
  113. UpdateResult [][]map[string]interface{} //清理后结果
  114. ClearChannel chan bool
  115. }
  116. func init() {
  117. TaskList = make(map[string]*ExtractTask)
  118. ClearTaskList = make(map[string]*ClearTask)
  119. go SaveExtLog()
  120. go SaveCltLog() //保存清理日志
  121. }
  122. //加载任务信息
  123. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  124. task, _ := db.Mgo.FindById("task", e.Id, nil)
  125. if len(*task) > 1 {
  126. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  127. e.TaskInfo = &TaskInfo{
  128. Name: (*task)["s_taskname"].(string),
  129. Version: (*task)["s_version"].(string),
  130. VersionId: qu.BsonIdToSId((*v)["_id"]),
  131. TrackColl: trackcoll,
  132. FromDbAddr: (*task)["s_mgoaddr"].(string),
  133. FromDB: (*task)["s_mgodb"].(string),
  134. FromColl: (*task)["s_mgocoll"].(string),
  135. TestColl: resultcoll,
  136. IsEtxLog: true,
  137. ProcessPool: make(chan bool, 1),
  138. }
  139. if (*v)["isextractcity"] != nil {
  140. e.IsExtractCity = (*v)["isextractcity"].(bool)
  141. }
  142. } else {
  143. return
  144. }
  145. }
  146. //加载任务信息
  147. func (e *ExtractTask) InitTaskInfo() {
  148. task, _ := db.Mgo.FindById("task", e.Id, nil)
  149. log.Debug("task", task)
  150. if len(*task) > 1 {
  151. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  152. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  153. log.Debug("s_mgosavecoll", strs)
  154. if len(strs) < 3 {
  155. return
  156. } else {
  157. e.TaskInfo = &TaskInfo{
  158. Name: (*task)["s_taskname"].(string),
  159. Version: (*task)["s_version"].(string),
  160. VersionId: qu.BsonIdToSId((*v)["_id"]),
  161. //TrackColl: (*task)["s_trackcoll"].(string),
  162. FromDbAddr: (*task)["s_mgoaddr"].(string),
  163. FromDB: (*task)["s_mgodb"].(string),
  164. FromColl: (*task)["s_mgocoll"].(string),
  165. ToDbAddr: strs[0],
  166. ToDB: strs[1],
  167. ToColl: strs[2],
  168. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  169. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  170. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  171. }
  172. if (*v)["isextractcity"] != nil {
  173. e.IsExtractCity = (*v)["isextractcity"].(bool)
  174. }
  175. }
  176. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  177. } else {
  178. return
  179. }
  180. }
  181. //加载通用前置规则
  182. func (e *ExtractTask) InitRulePres() {
  183. defer qu.Catch()
  184. e.RulePres = []*RegLuaInfo{}
  185. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  186. for _, v := range *list {
  187. rinfo := &RegLuaInfo{
  188. Code: v["s_code"].(string),
  189. Name: v["s_name"].(string),
  190. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  191. }
  192. if rinfo.IsLua {
  193. rinfo.RuleText = v["s_luascript"].(string)
  194. e.RulePres = append(e.RulePres, rinfo)
  195. } else {
  196. qu.Try(func() {
  197. rinfo.RuleText = v["s_rule"].(string)
  198. tmp := strings.Split(rinfo.RuleText, "__")
  199. var pattern string
  200. if strings.Contains(tmp[0], "\\u") {
  201. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  202. } else {
  203. pattern = tmp[0]
  204. }
  205. if len(tmp) == 2 {
  206. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  207. } else {
  208. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  209. }
  210. e.RulePres = append(e.RulePres, rinfo)
  211. }, func(err interface{}) {
  212. log.Debug(rinfo.Code, rinfo.Field, err)
  213. })
  214. }
  215. }
  216. }
  217. //加载通用后置规则
  218. func (e *ExtractTask) InitRuleBacks() {
  219. defer qu.Catch()
  220. e.RuleBacks = []*RegLuaInfo{}
  221. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  222. for _, v := range *list {
  223. rinfo := &RegLuaInfo{
  224. Code: v["s_code"].(string),
  225. Name: v["s_name"].(string),
  226. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  227. }
  228. if rinfo.IsLua {
  229. rinfo.RuleText = v["s_luascript"].(string)
  230. e.RuleBacks = append(e.RuleBacks, rinfo)
  231. } else {
  232. qu.Try(func() {
  233. rinfo.RuleText = v["s_rule"].(string)
  234. tmp := strings.Split(rinfo.RuleText, "__")
  235. var pattern string
  236. if strings.Contains(tmp[0], "\\u") {
  237. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  238. } else {
  239. pattern = tmp[0]
  240. }
  241. if len(tmp) == 2 {
  242. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  243. } else {
  244. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  245. }
  246. e.RuleBacks = append(e.RuleBacks, rinfo)
  247. }, func(err interface{}) {
  248. log.Debug(rinfo.Code, rinfo.Field, err)
  249. })
  250. }
  251. }
  252. }
  253. func (e *ExtractTask) InfoTypeList() {
  254. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  255. infolist := *infolist1
  256. for _, v := range infolist {
  257. e.InfoType = append(e.InfoType, v)
  258. }
  259. }
  260. //加载抽取规则
  261. func (e *ExtractTask) InitRuleCore() {
  262. defer qu.Catch()
  263. e.Fields = map[string]int{}
  264. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  265. e.RuleCores = make(map[string]map[string][]*RuleCore)
  266. for _, v := range *infolist {
  267. topclass := qu.ObjToString(v["topclass"])
  268. if v["subclass"] == nil {
  269. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  270. for attr, _ := range v["fields"].(map[string]interface{}) {
  271. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
  272. e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
  273. }
  274. } else {
  275. for ca, fs := range v["subclass"].(map[string]interface{}) {
  276. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  277. for field, _ := range fs.(map[string]interface{}) {
  278. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
  279. e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
  280. }
  281. }
  282. }
  283. }
  284. }
  285. func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
  286. maps := []*RuleCore{}
  287. if b, _ := vinfo["isuse"].(bool); !b {
  288. return nil
  289. }
  290. s_field := qu.ObjToString(vinfo["s_field"])
  291. pid := qu.BsonIdToSId(vinfo["_id"])
  292. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  293. for _, vv := range *list {
  294. if b, _ := vv["isuse"].(bool); !b {
  295. continue
  296. }
  297. rcore := &RuleCore{}
  298. rcore.Field = s_field
  299. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  300. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  301. //前置规则
  302. rulePres := []*RegLuaInfo{}
  303. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  304. for _, v := range *plist {
  305. rinfo := &RegLuaInfo{
  306. Field: qu.ObjToString(v["s_field"]),
  307. Code: v["s_code"].(string),
  308. Name: v["s_name"].(string),
  309. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  310. }
  311. if rinfo.IsLua {
  312. rinfo.RuleText = v["s_luascript"].(string)
  313. rulePres = append(rulePres, rinfo)
  314. } else {
  315. qu.Try(func() {
  316. rinfo.RuleText = v["s_rule"].(string)
  317. tmp := strings.Split(rinfo.RuleText, "__")
  318. var pattern string
  319. if strings.Contains(tmp[0], "\\u") {
  320. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  321. } else {
  322. pattern = tmp[0]
  323. }
  324. if len(tmp) == 2 {
  325. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  326. } else {
  327. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  328. }
  329. rulePres = append(rulePres, rinfo)
  330. }, func(err interface{}) {
  331. log.Debug(rinfo.Code, rinfo.Field, err)
  332. })
  333. }
  334. }
  335. rcore.RulePres = rulePres
  336. //后置规则
  337. ruleBacks := []*RegLuaInfo{}
  338. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  339. for _, v := range *blist {
  340. rinfo := &RegLuaInfo{
  341. Field: qu.ObjToString(v["s_field"]),
  342. Code: v["s_code"].(string),
  343. Name: v["s_name"].(string),
  344. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  345. }
  346. if rinfo.IsLua {
  347. rinfo.RuleText = v["s_luascript"].(string)
  348. ruleBacks = append(ruleBacks, rinfo)
  349. } else {
  350. qu.Try(func() {
  351. rinfo.RuleText = v["s_rule"].(string)
  352. tmp := strings.Split(rinfo.RuleText, "__")
  353. var pattern string
  354. if strings.Contains(tmp[0], "\\u") {
  355. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  356. } else {
  357. pattern = tmp[0]
  358. }
  359. if len(tmp) == 2 {
  360. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  361. } else {
  362. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  363. }
  364. ruleBacks = append(ruleBacks, rinfo)
  365. }, func(err interface{}) {
  366. log.Debug(rinfo.Code, rinfo.Field, err)
  367. })
  368. }
  369. }
  370. rcore.RuleBacks = ruleBacks
  371. //抽取规则
  372. ruleCores := []*RegLuaInfo{}
  373. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  374. for _, v := range *clist {
  375. if b, _ := v["isuse"].(bool); !b {
  376. continue
  377. }
  378. field := qu.ObjToString(v["s_field"])
  379. e.Fields[field] = 1 //加入抽取属性组备用
  380. rinfo := &RegLuaInfo{
  381. Field: field,
  382. Code: v["s_code"].(string),
  383. Name: v["s_name"].(string),
  384. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  385. }
  386. if rinfo.IsLua {
  387. rinfo.RuleText = v["s_luascript"].(string)
  388. //提取全部属性
  389. rinfo.LFields = getALLFields()
  390. ruleCores = append(ruleCores, rinfo)
  391. } else {
  392. qu.Try(func() {
  393. rinfo.RuleText = v["s_rule"].(string)
  394. tmp := strings.Split(rinfo.RuleText, "__")
  395. var pattern string
  396. if strings.Contains(tmp[0], "\\u") {
  397. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  398. } else {
  399. pattern = tmp[0]
  400. }
  401. if len(tmp) == 2 {
  402. epos := strings.Split(tmp[1], ",")
  403. posm := map[string]int{}
  404. for _, v := range epos {
  405. ks := strings.Split(v, ":")
  406. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  407. posm[ks[1]] = qu.IntAll(ks[0])
  408. } else { //(.*)招标公告__2
  409. posm[rinfo.Field] = qu.IntAll(ks[0])
  410. }
  411. }
  412. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  413. } else {
  414. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  415. }
  416. ruleCores = append(ruleCores, rinfo)
  417. }, func(err interface{}) {
  418. log.Debug(rinfo.Code, rinfo.Field, err)
  419. })
  420. }
  421. }
  422. rcore.RuleCores = ruleCores
  423. //
  424. maps = append(maps, rcore)
  425. }
  426. return maps
  427. }
  428. //加载分包抽取规则
  429. func (e *ExtractTask) InitPkgCore() {
  430. defer qu.Catch()
  431. e.PkgRuleCores = []*RuleCore{}
  432. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  433. for _, pkginfo := range *pkginfos {
  434. if b, _ := pkginfo["isuse"].(bool); !b {
  435. continue
  436. }
  437. s_field := qu.ObjToString(pkginfo["s_field"])
  438. pid := qu.BsonIdToSId(pkginfo["_id"])
  439. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  440. for _, vv := range *logicList {
  441. if b, _ := vv["isuse"].(bool); !b {
  442. continue
  443. }
  444. rcore := &RuleCore{}
  445. rcore.Field = s_field
  446. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  447. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  448. //后置规则
  449. ruleBacks := []*RegLuaInfo{}
  450. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  451. for _, v := range *blist {
  452. rinfo := &RegLuaInfo{
  453. Field: qu.ObjToString(v["s_field"]),
  454. Code: v["s_code"].(string),
  455. Name: v["s_name"].(string),
  456. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  457. }
  458. if rinfo.IsLua {
  459. rinfo.RuleText = v["s_luascript"].(string)
  460. ruleBacks = append(ruleBacks, rinfo)
  461. } else {
  462. qu.Try(func() {
  463. rinfo.RuleText = v["s_rule"].(string)
  464. tmp := strings.Split(rinfo.RuleText, "__")
  465. var pattern string
  466. if strings.Contains(tmp[0], "\\u") {
  467. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  468. } else {
  469. pattern = tmp[0]
  470. }
  471. if len(tmp) == 2 {
  472. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  473. } else {
  474. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  475. }
  476. ruleBacks = append(ruleBacks, rinfo)
  477. }, func(err interface{}) {
  478. log.Debug(rinfo.Code, rinfo.Field, err)
  479. })
  480. }
  481. }
  482. rcore.RuleBacks = ruleBacks
  483. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  484. }
  485. }
  486. }
  487. //加载标签库
  488. func (e *ExtractTask) InitTag() {
  489. defer qu.Catch()
  490. e.Tag = map[string][]*Tag{}
  491. //字符串标签库
  492. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  493. for _, v := range *list {
  494. field := qu.ObjToString(v["s_field"])
  495. if tmp, ok := v["content"].([]interface{}); ok {
  496. fname := qu.ObjToString(v["s_name"])
  497. tab := ju.TagFile{Name: fname} //用于表格kv
  498. tab.Items = make([]*ju.Tag, len(tmp))
  499. for k, key := range tmp {
  500. tag := &Tag{Type: "string", Key: key.(string)}
  501. e.Tag[field] = append(e.Tag[field], tag)
  502. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  503. }
  504. sort.Sort(tab.Items)
  505. ju.TagdbTable[fname] = &tab
  506. }
  507. }
  508. //正则标签库
  509. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  510. for _, v := range *list {
  511. field := qu.ObjToString(v["s_field"])
  512. if tmp, ok := v["content"].([]interface{}); ok {
  513. fname := qu.ObjToString(v["s_name"])
  514. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  515. tab.Items = make([]*ju.Tag, len(tmp))
  516. for k, key := range tmp {
  517. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  518. e.Tag[field] = append(e.Tag[field], tag)
  519. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  520. }
  521. sort.Sort(tab.Items)
  522. ju.TagdbTable[fname+"_reg"] = &tab
  523. }
  524. }
  525. }
  526. //获取fields
  527. func getALLFields() map[string]string {
  528. fields := map[string]string{}
  529. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  530. for _, v := range *list {
  531. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  532. }
  533. return fields
  534. }
  535. //加载clear函数
  536. func (e *ExtractTask) InitClearFn() {
  537. defer qu.Catch()
  538. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  539. fn := map[string][]string{}
  540. for _, tmp := range *list {
  541. field := tmp["s_field"].(string)
  542. fns := tmp["clear"].([]interface{})
  543. if fn[field] == nil {
  544. fn[field] = []string{}
  545. }
  546. for _, v := range fns {
  547. fn[field] = append(fn[field], v.(string))
  548. }
  549. }
  550. e.ClearFn = fn
  551. }
  552. //加载省份
  553. func InitProvince(version string) map[string]interface{} {
  554. defer qu.Catch()
  555. fn := map[string]interface{}{}
  556. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  557. for _, v := range *list {
  558. name := qu.ObjToString(v["s_name"])
  559. content := v["content"]
  560. switch content.(type) {
  561. case string:
  562. fn[name] = []interface{}{content.(string)}
  563. case []interface{}:
  564. fn[name] = content
  565. }
  566. }
  567. return fn
  568. }
  569. //加载城市简称
  570. func InitCitySim(version string) map[string]map[string]interface{} {
  571. defer qu.Catch()
  572. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  573. fn := map[string]map[string]interface{}{}
  574. for _, v := range *list {
  575. name := qu.ObjToString(v["s_name"])
  576. tmp := v["content"].(map[string]interface{})
  577. fn[name] = tmp
  578. }
  579. return fn
  580. }
  581. //加载城市全称
  582. func InitCityAll(version string) map[string]map[string]interface{} {
  583. defer qu.Catch()
  584. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  585. fn := map[string]map[string]interface{}{}
  586. for _, v := range *list {
  587. name := qu.ObjToString(v["s_name"])
  588. tmp := v["content"].(map[string]interface{})
  589. fn[name] = tmp
  590. }
  591. return fn
  592. }
  593. //初始化城市省份敏感词
  594. func (e *ExtractTask) InitDFA() {
  595. defer qu.Catch()
  596. e.AreaGet = &ju.DFA{}
  597. e.AreaDistrict = &ju.DFA{}
  598. e.AreaProvinceGet = &ju.DFA{}
  599. e.AreaStreet = &ju.DFA{}
  600. //初始化map
  601. if e.ProvinceMap == nil {
  602. e.ProvinceMap = make(map[string]string)
  603. }
  604. if e.CityBrief == nil {
  605. e.CityBrief = make(map[string]*City)
  606. }
  607. if e.ProvinceBrief == nil {
  608. e.ProvinceBrief = make(map[string]*Province)
  609. }
  610. if e.AreaToCity == nil {
  611. e.AreaToCity = make(map[string][]*City)
  612. }
  613. if e.DistrictCityMap == nil {
  614. e.DistrictCityMap = make(map[string]*City)
  615. }
  616. if e.StreetDistrictMap == nil {
  617. e.StreetDistrictMap = make(map[string]*District)
  618. }
  619. //初始化省
  620. fn1 := InitProvince(e.TaskInfo.Version)
  621. for k, v := range fn1 {
  622. for _, p := range v.([]interface{}) {
  623. p1, _ := p.(string)
  624. e.AreaProvinceGet.AddWord(p1)
  625. e.ProvinceMap[p1] = k
  626. }
  627. }
  628. //初始化城市全称
  629. fn2 := InitCityAll(e.TaskInfo.Version)
  630. for k, v := range fn2 {
  631. e.AreaProvinceGet.AddWord(k) //省全称
  632. p := &Province{}
  633. p.Name = k
  634. p.Brief = v["brief"].(string)
  635. e.ProvinceMap[k] = p.Brief
  636. //
  637. e.ProvinceBrief[p.Brief] = p
  638. p.Cap = v["captial"].(string)
  639. city, _ := v["city"].(map[string]interface{})
  640. for k1, v1 := range city {
  641. v1m, _ := v1.(map[string]interface{})
  642. c := &City{}
  643. c.Name = k1
  644. // if v1m["brief"] == nil {
  645. // }
  646. c.Brief = v1m["brief"].(string)
  647. //
  648. e.CityBrief[c.Brief] = c
  649. c.P = p
  650. if c.Brief == p.Cap {
  651. p.Captial = c
  652. }
  653. //加入到城市map中
  654. //
  655. cs := e.AreaToCity[k1]
  656. e.AreaGet.AddWord(k1) //市全称
  657. if cs != nil {
  658. cs = append(cs, c)
  659. } else {
  660. cs = []*City{c}
  661. }
  662. e.AreaToCity[k1] = cs
  663. //区县
  664. districtmap := v1m["area"].(map[string]interface{}) //区或县
  665. for district, streetarr := range districtmap {
  666. d := &District{}
  667. d.Name = district
  668. d.C = c
  669. e.AreaDistrict.AddWord(district) //加入区或县敏感词
  670. ctmp := e.DistrictCityMap[district]
  671. if ctmp == nil {
  672. e.DistrictCityMap[district] = c
  673. }
  674. //街道
  675. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  676. e.AreaStreet.AddWord(s) //加入街道敏感词
  677. dtmp := e.StreetDistrictMap[s]
  678. if dtmp == nil {
  679. e.StreetDistrictMap[s] = d
  680. }
  681. }
  682. }
  683. }
  684. }
  685. //初始化城市简称
  686. fn3 := InitCitySim(e.TaskInfo.Version)
  687. e.AreaSimGet = &ju.DFA{}
  688. for k, v := range fn3 {
  689. pb := v["brief"].(string)
  690. p := e.ProvinceBrief[pb]
  691. //加载
  692. for _, ss := range []string{k, pb} {
  693. cs := e.AreaToCity[ss]
  694. if cs != nil {
  695. cs = append(cs, p.Captial)
  696. } else {
  697. cs = []*City{p.Captial}
  698. }
  699. e.AreaToCity[ss] = cs
  700. e.AreaSimGet.AddWord(ss) //省全称和省简称
  701. }
  702. city, _ := v["city"].(map[string]interface{})
  703. for k1, v1 := range city {
  704. v1m, _ := v1.(map[string]interface{})
  705. if v1m["brief"] == nil {
  706. }
  707. cb := v1m["brief"].(string)
  708. c := e.AreaToCity[k1][0]
  709. //加入到城市map中
  710. for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  711. e.AreaSimGet.AddWord(ss)
  712. cs := e.AreaToCity[ss]
  713. if cs != nil {
  714. cs = append(cs, c)
  715. } else {
  716. cs = []*City{c}
  717. }
  718. e.AreaToCity[ss] = cs
  719. }
  720. arr := v1m["area"].([]interface{})
  721. for _, k2 := range arr {
  722. s := k2.(string)
  723. for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  724. cs := e.AreaToCity[ss]
  725. e.AreaSimGet.AddWord(ss)
  726. if cs != nil {
  727. cs = append(cs, c)
  728. } else {
  729. cs = []*City{c}
  730. }
  731. e.AreaToCity[ss] = cs
  732. //只加入简称
  733. if n == 0 {
  734. d := &District{}
  735. d.Name = ss
  736. d.C = c
  737. e.AreaDistrict.AddWord(ss) //加入区或县简称敏感词
  738. ctmp := e.DistrictCityMap[ss]
  739. if ctmp == nil {
  740. e.DistrictCityMap[ss] = c
  741. }
  742. }
  743. }
  744. }
  745. }
  746. }
  747. }
  748. //保存抽取详情数据
  749. func (e *ExtractTask) ResultSave(init bool) {
  750. defer qu.Catch()
  751. if e.ResultArr == nil {
  752. e.ResultArr = [][]map[string]interface{}{}
  753. }
  754. if init {
  755. go func() {
  756. for {
  757. if len(e.ResultArr) > 500 {
  758. arr := e.ResultArr[:500]
  759. qu.Try(func() {
  760. db.Mgo.UpSertBulk("extract_result", arr...)
  761. }, func(err interface{}) {
  762. log.Debug(err)
  763. })
  764. e.ResultArr = e.ResultArr[500:]
  765. } else {
  766. arr := e.ResultArr
  767. qu.Try(func() {
  768. db.Mgo.UpSertBulk("extract_result", arr...)
  769. }, func(err interface{}) {
  770. log.Debug(err)
  771. })
  772. e.ResultArr = [][]map[string]interface{}{}
  773. }
  774. time.Sleep(10 * time.Second)
  775. }
  776. }()
  777. } else {
  778. arr := e.ResultArr
  779. qu.Try(func() {
  780. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  781. }, func(err interface{}) {
  782. log.Debug(err)
  783. })
  784. e.ResultArr = [][]map[string]interface{}{}
  785. }
  786. }
  787. //保存抽取数据
  788. func (e *ExtractTask) BidSave(init bool) {
  789. defer qu.Catch()
  790. if e.BidArr == nil {
  791. e.BidArr = [][]map[string]interface{}{}
  792. }
  793. if init {
  794. go func() {
  795. for {
  796. if len(e.BidArr) > 500 {
  797. arr := e.BidArr[:500]
  798. qu.Try(func() {
  799. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  800. }, func(err interface{}) {
  801. log.Debug(err)
  802. })
  803. e.BidArr = e.BidArr[500:]
  804. } else {
  805. arr := e.BidArr
  806. qu.Try(func() {
  807. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  808. }, func(err interface{}) {
  809. log.Debug(err)
  810. })
  811. e.BidArr = [][]map[string]interface{}{}
  812. }
  813. time.Sleep(10 * time.Second)
  814. }
  815. }()
  816. } else {
  817. arr := e.BidArr
  818. qu.Try(func() {
  819. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  820. }, func(err interface{}) {
  821. log.Debug(err)
  822. })
  823. e.BidArr = [][]map[string]interface{}{}
  824. time.Sleep(1 * time.Second)
  825. }
  826. }
  827. func (e *ExtractTask) InitAuditRecogField() {
  828. defer qu.Catch()
  829. e.RecogFieldMap = make(map[string]map[string]interface{})
  830. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  831. for _, f := range *recogFieldList {
  832. field := qu.ObjToString(f["s_recogfield"])
  833. e.RecogFieldMap[field] = f
  834. }
  835. }
  836. func (e *ExtractTask) InitAuditClass() {
  837. defer qu.Catch()
  838. e.FidClassMap = make(map[string][]map[string]interface{})
  839. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  840. for _, c := range *class {
  841. classList := []map[string]interface{}{}
  842. fid := qu.ObjToString(c["s_fid"])
  843. if len(e.FidClassMap[fid]) > 0 { //追加
  844. classList = e.FidClassMap[fid]
  845. }
  846. classList = append(classList, c)
  847. e.FidClassMap[fid] = classList
  848. }
  849. }
  850. //加载规则
  851. func (e *ExtractTask) InitAuditRule() {
  852. defer qu.Catch()
  853. var rureg *regexp.Regexp
  854. var rs []rune
  855. var ru string
  856. var err error
  857. e.CidRuleMap = make(map[string][]map[string]interface{})
  858. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  859. for _, v := range *rule {
  860. i_rule := []interface{}{}
  861. ss, _ := (v["s_rule"].([]interface{}))
  862. for _, r := range qu.ObjArrToStringArr(ss) {
  863. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  864. rs = []rune(r)
  865. ru = string(rs[1 : len(rs)-1])
  866. rureg, err = regexp.Compile(ru)
  867. if err != nil {
  868. log.Debug("error---rule:", r)
  869. continue
  870. }
  871. i_rule = append(i_rule, []interface{}{rureg}...)
  872. } else { //规则
  873. i_rule = append(i_rule, r)
  874. }
  875. }
  876. v["rule"] = i_rule
  877. ruleList := []map[string]interface{}{}
  878. classid := qu.ObjToString(v["s_classid"])
  879. if len(e.CidRuleMap[classid]) > 0 { //追加
  880. ruleList = e.CidRuleMap[classid]
  881. }
  882. ruleList = append(ruleList, v)
  883. e.CidRuleMap[classid] = ruleList
  884. }
  885. }
  886. //
  887. func (e *ExtractTask) InitAuditFields() {
  888. if len(e.AuditFields) == 0 {
  889. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  890. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  891. vid := qu.BsonIdToSId((*v)["_id"])
  892. query := map[string]interface{}{
  893. "isaudit": true,
  894. "delete": false,
  895. "vid": vid,
  896. }
  897. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  898. for _, d := range *data {
  899. field := qu.ObjToString(d["s_field"])
  900. e.AuditFields = append(e.AuditFields, field)
  901. }
  902. }
  903. }
  904. }
  905. //加载附件抽取
  906. func (e *ExtractTask) InitFile() {
  907. defer qu.Catch()
  908. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  909. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  910. //ve, _ := db.Mgo.FindOne("version", query)
  911. if ve == nil {
  912. return
  913. }
  914. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  915. e.IsFileField = true
  916. }
  917. syscefiled := new(sync.Map)
  918. if (*ve)["s_filefileds"] != nil {
  919. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  920. syscefiled.Store(vff.(string), 1)
  921. }
  922. }
  923. e.FileFields = syscefiled
  924. }
  925. //加载清理任务信息
  926. func (c *ClearTask) InitClearTaskInfo() {
  927. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  928. if len(*cleartask) > 1 {
  929. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  930. c.ClearTaskInfo = &ClearTaskInfo{
  931. Name: (*cleartask)["s_taskname"].(string),
  932. Version: (*cleartask)["s_version"].(string),
  933. VersionId: qu.BsonIdToSId((*v)["_id"]),
  934. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  935. FromDB: (*cleartask)["s_mgodb"].(string),
  936. FromColl: (*cleartask)["s_mgocoll"].(string),
  937. IsCltLog: ju.Config["iscltlog"].(bool),
  938. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  939. }
  940. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  941. } else {
  942. return
  943. }
  944. }
  945. //加载清理脚本
  946. func (c *ClearTask) InitClearLuas() {
  947. defer qu.Catch()
  948. c.ClearLuas = make(map[string][]*ClearLua)
  949. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  950. for _, l := range *list {
  951. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  952. continue
  953. }
  954. s_field := qu.ObjToString(l["s_field"])
  955. pid := qu.BsonIdToSId(l["_id"])
  956. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  957. for _, vv := range *luas {
  958. if b, _ := vv["isuse"].(bool); !b {
  959. continue
  960. }
  961. clearLua := &ClearLua{
  962. Field: s_field,
  963. Code: vv["s_code"].(string),
  964. Name: vv["s_name"].(string),
  965. LuaText: vv["s_luascript"].(string),
  966. LFields: getALLFields(),
  967. }
  968. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  969. }
  970. }
  971. }
  972. //加载分块规则
  973. func (e *ExtractTask) InitBlockRule() {
  974. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  975. "vid": e.TaskInfo.VersionId,
  976. "delete": false,
  977. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  978. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  979. for _, v := range *datas {
  980. block_reg, _ := v["block_reg"].(string)
  981. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  982. title_reg, _ := v["title_reg"].(string)
  983. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  984. if block_reg == "" || title_reg == "" {
  985. continue
  986. }
  987. b_reg, b_err := regexp.Compile(block_reg)
  988. t_reg, t_err := regexp.Compile(title_reg)
  989. if b_err != nil || t_err != nil {
  990. continue
  991. }
  992. brs = append(brs, b_reg)
  993. trs = append(trs, t_reg)
  994. }
  995. e.RuleBlock = &ju.RuleBlock{
  996. BlockRegs: brs,
  997. TitleRegs: trs,
  998. }
  999. }