// extractInit
package extract

import (
	"log"
	"regexp"
	"strings"

	db "jy/mongodbutil"
	qu "qfw/util"
)
  10. type RegLuaInfo struct { //正则或脚本信息
  11. Code, Name, Field string //
  12. RuleText string //
  13. IsLua, IsHasFields bool //IsHasFields正则配置有属性字段
  14. RegPreBac *ExtReg //
  15. RegCore *ExtReg //
  16. LFields []interface{} //lua抽取字段属性组
  17. }
  18. type ExtReg struct {
  19. Reg *regexp.Regexp
  20. Replace string
  21. Bextract bool
  22. ExtractPos map[string]int
  23. }
  24. type RuleCore struct {
  25. Field string //逻辑字段
  26. LuaLogic string //进入逻辑
  27. ExtFrom string //从哪个字段抽取
  28. RulePres []*RegLuaInfo //抽取前置规则
  29. RuleBacks []*RegLuaInfo //抽取后置规则
  30. RuleCores []*RegLuaInfo //抽取规则
  31. }
  32. type TaskInfo struct {
  33. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  34. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  35. SaveColl, TestColl, LastExtId string //抽取结果表、测试结果表、上次抽取信息id
  36. DB *db.Pool //数据库连接池
  37. IsEtxLog bool //是否开启抽取日志
  38. ProcessPool chan bool //任务进程池
  39. TestLua bool //检查测试用
  40. }
  41. type Tag struct {
  42. Type string //标签类型 string 字符串、regexp 正则
  43. Key string //
  44. Reg *regexp.Regexp //
  45. }
  46. type ExtractTask struct {
  47. Id string //任务id
  48. IsRun bool //是否启动
  49. Content string //信息内容
  50. TaskInfo *TaskInfo //任务信息
  51. RulePres []*RegLuaInfo //通用前置规则
  52. RuleBacks []*RegLuaInfo //通用后置规则
  53. RuleCores []*RuleCore //抽取规则
  54. Tag map[string][]*Tag //标签库
  55. ClearFn map[string][]string //清理函数
  56. }
  57. func init() {
  58. TaskList = make(map[string]*ExtractTask)
  59. go SaveExtLog()
  60. }
  61. //加载任务信息
  62. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  63. task, _ := db.Mgo.FindById("task", e.Id, nil)
  64. if len(*task) > 1 {
  65. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
  66. e.TaskInfo = &TaskInfo{
  67. Name: (*task)["s_taskname"].(string),
  68. Version: (*task)["s_version"].(string),
  69. VersionId: qu.BsonIdToSId((*v)["_id"]),
  70. TrackColl: trackcoll,
  71. FromDbAddr: (*task)["s_mgoaddr"].(string),
  72. FromDB: (*task)["s_mgodb"].(string),
  73. FromColl: (*task)["s_mgocoll"].(string),
  74. TestColl: resultcoll,
  75. IsEtxLog: true,
  76. ProcessPool: make(chan bool, 1),
  77. }
  78. } else {
  79. return
  80. }
  81. }
  82. //加载任务信息
  83. func (e *ExtractTask) InitTaskInfo() {
  84. task, _ := db.Mgo.FindById("task", e.Id, nil)
  85. log.Println("task", task)
  86. if len(*task) > 1 {
  87. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
  88. e.TaskInfo = &TaskInfo{
  89. Name: (*task)["s_taskname"].(string),
  90. Version: (*task)["s_version"].(string),
  91. VersionId: qu.BsonIdToSId((*v)["_id"]),
  92. //TrackColl: (*task)["s_trackcoll"].(string),
  93. FromDbAddr: (*task)["s_mgoaddr"].(string),
  94. FromDB: (*task)["s_mgodb"].(string),
  95. FromColl: (*task)["s_mgocoll"].(string),
  96. SaveColl: (*task)["s_mgosavecoll"].(string),
  97. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  98. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  99. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  100. }
  101. log.Println(e.TaskInfo.Name, e.TaskInfo.ProcessPool)
  102. } else {
  103. return
  104. }
  105. }
  106. //加载通用前置规则
  107. func (e *ExtractTask) InitRulePres() {
  108. defer qu.Catch()
  109. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
  110. for _, v := range *list {
  111. rinfo := &RegLuaInfo{
  112. Code: v["s_code"].(string),
  113. Name: v["s_name"].(string),
  114. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  115. }
  116. if rinfo.IsLua {
  117. rinfo.RuleText = v["s_luascript"].(string)
  118. } else {
  119. qu.Try(func() {
  120. rinfo.RuleText = v["s_rule"].(string)
  121. tmp := strings.Split(rinfo.RuleText, "__")
  122. if len(tmp) == 2 {
  123. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
  124. } else {
  125. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
  126. }
  127. }, func(err interface{}) {
  128. log.Println(rinfo.Code, rinfo.Field, err)
  129. })
  130. }
  131. e.RulePres = append(e.RulePres, rinfo)
  132. }
  133. }
  134. //加载通用后置规则
  135. func (e *ExtractTask) InitRuleBacks() {
  136. defer qu.Catch()
  137. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
  138. for _, v := range *list {
  139. rinfo := &RegLuaInfo{
  140. Code: v["s_code"].(string),
  141. Name: v["s_name"].(string),
  142. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  143. }
  144. if rinfo.IsLua {
  145. rinfo.RuleText = v["s_luascript"].(string)
  146. } else {
  147. qu.Try(func() {
  148. rinfo.RuleText = v["s_rule"].(string)
  149. tmp := strings.Split(rinfo.RuleText, "__")
  150. if len(tmp) == 2 {
  151. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
  152. } else {
  153. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
  154. }
  155. }, func(err interface{}) {
  156. log.Println(rinfo.Code, rinfo.Field, err)
  157. })
  158. }
  159. e.RuleBacks = append(e.RuleBacks, rinfo)
  160. }
  161. }
  162. //加载抽取规则
  163. func (e *ExtractTask) InitRuleCore() {
  164. defer qu.Catch()
  165. vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`"}`, nil, nil, false, -1, -1)
  166. for _, vinfo := range *vinfos {
  167. if b, _ := vinfo["isuse"].(bool); !b {
  168. continue
  169. }
  170. pid := qu.BsonIdToSId(vinfo["_id"])
  171. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`"}`, nil, nil, false, -1, -1)
  172. for _, vv := range *list {
  173. if b, _ := vv["isuse"].(bool); !b {
  174. continue
  175. }
  176. rcore := &RuleCore{}
  177. rcore.Field = vinfo["s_field"].(string)
  178. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  179. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  180. //前置规则
  181. rulePres := []*RegLuaInfo{}
  182. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
  183. for _, v := range *plist {
  184. rinfo := &RegLuaInfo{
  185. Code: v["s_code"].(string),
  186. Name: v["s_name"].(string),
  187. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  188. }
  189. if rinfo.IsLua {
  190. rinfo.RuleText = v["s_luascript"].(string)
  191. } else {
  192. qu.Try(func() {
  193. rinfo.RuleText = v["s_rule"].(string)
  194. rinfo.Field = v["s_field"].(string)
  195. tmp := strings.Split(rinfo.RuleText, "__")
  196. if len(tmp) == 2 {
  197. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
  198. } else {
  199. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
  200. }
  201. }, func(err interface{}) {
  202. log.Println(rinfo.Code, rinfo.Field, err)
  203. })
  204. }
  205. rulePres = append(rulePres, rinfo)
  206. }
  207. rcore.RulePres = rulePres
  208. //后置规则
  209. ruleBacks := []*RegLuaInfo{}
  210. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
  211. for _, v := range *blist {
  212. rinfo := &RegLuaInfo{
  213. Code: v["s_code"].(string),
  214. Name: v["s_name"].(string),
  215. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  216. }
  217. if rinfo.IsLua {
  218. rinfo.RuleText = v["s_luascript"].(string)
  219. } else {
  220. qu.Try(func() {
  221. rinfo.RuleText = v["s_rule"].(string)
  222. rinfo.Field = v["s_field"].(string)
  223. tmp := strings.Split(rinfo.RuleText, "__")
  224. if len(tmp) == 2 {
  225. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
  226. } else {
  227. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
  228. }
  229. }, func(err interface{}) {
  230. log.Println(rinfo.Code, rinfo.Field, err)
  231. })
  232. }
  233. ruleBacks = append(ruleBacks, rinfo)
  234. }
  235. rcore.RuleBacks = ruleBacks
  236. //抽取规则
  237. ruleCores := []*RegLuaInfo{}
  238. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
  239. for _, v := range *clist {
  240. if b, _ := v["isuse"].(bool); !b {
  241. continue
  242. }
  243. rinfo := &RegLuaInfo{
  244. Code: v["s_code"].(string),
  245. Name: v["s_name"].(string),
  246. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  247. }
  248. if rinfo.IsLua {
  249. rinfo.RuleText = v["s_luascript"].(string)
  250. //暂时提取全部属性
  251. rinfo.LFields = getALLFields()
  252. rinfo.IsHasFields = true
  253. /*rinfo.LFields, _ = v["s_fields"].([]interface{})
  254. if len(rinfo.LFields) > 0 {
  255. rinfo.IsHasFields = true
  256. }*/
  257. } else {
  258. qu.Try(func() {
  259. rinfo.RuleText = v["s_rule"].(string)
  260. rinfo.Field = v["s_field"].(string)
  261. tmp := strings.Split(rinfo.RuleText, "__")
  262. if len(tmp) == 2 {
  263. epos := strings.Split(tmp[1], ",")
  264. posm := map[string]int{}
  265. for _, v := range epos {
  266. ks := strings.Split(v, ":")
  267. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  268. posm[ks[1]] = qu.IntAll(ks[0])
  269. } else { //(.*)招标公告__2
  270. posm[rinfo.Field] = qu.IntAll(ks[0])
  271. }
  272. }
  273. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
  274. } else {
  275. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
  276. }
  277. }, func(err interface{}) {
  278. log.Println(rinfo.Code, rinfo.Field, err)
  279. })
  280. }
  281. ruleCores = append(ruleCores, rinfo)
  282. }
  283. rcore.RuleCores = ruleCores
  284. //
  285. e.RuleCores = append(e.RuleCores, rcore)
  286. }
  287. }
  288. }
  289. //加载标签库
  290. func (e *ExtractTask) InitTag() {
  291. defer qu.Catch()
  292. e.Tag = map[string][]*Tag{}
  293. //字符串标签库
  294. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"字符串","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
  295. for _, v := range *list {
  296. field := qu.ObjToString(v["s_field"])
  297. if tmp, ok := v["content"].([]interface{}); ok {
  298. for _, key := range tmp {
  299. tag := &Tag{Type: "string", Key: key.(string)}
  300. e.Tag[field] = append(e.Tag[field], tag)
  301. }
  302. }
  303. }
  304. //正则标签库
  305. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"正则","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
  306. for _, v := range *list {
  307. field := qu.ObjToString(v["s_field"])
  308. if tmp, ok := v["content"].([]interface{}); ok {
  309. for _, key := range tmp {
  310. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  311. e.Tag[field] = append(e.Tag[field], tag)
  312. }
  313. }
  314. }
  315. }
  316. //获取fields
  317. func getALLFields() []interface{} {
  318. fields := []interface{}{}
  319. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1}`, false, -1, -1)
  320. for _, v := range *list {
  321. fields = append(fields, v["s_field"])
  322. }
  323. return fields
  324. }
  325. //加载clear函数
  326. func (e *ExtractTask) InitClearFn() {
  327. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
  328. fn := map[string][]string{}
  329. for _, tmp := range *list {
  330. field := tmp["s_field"].(string)
  331. fns := tmp["clear"].([]interface{})
  332. if fn[field] == nil {
  333. fn[field] = []string{}
  334. }
  335. for _, v := range fns {
  336. fn[field] = append(fn[field], v.(string))
  337. }
  338. }
  339. e.ClearFn = fn
  340. }