// extractInit.go (31 KB)
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. "log"
  7. qu "qfw/util"
  8. "regexp"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. "time"
  13. )
  14. type RegLuaInfo struct { //正则或脚本信息
  15. Code, Name, Field string //
  16. RuleText string //
  17. IsLua bool //
  18. RegPreBac *ExtReg //
  19. RegCore *ExtReg //
  20. LFields map[string]string //lua抽取字段属性组
  21. }
  // ExtReg is a compiled regular-expression rule together with the options
  // parsed from the part of the rule text that follows "__".
  type ExtReg struct {
  	// Reg is the compiled pattern.
  	Reg *regexp.Regexp
  	// Replace is the replacement text used by pre/post rules.
  	Replace string
  	// Bextract is true when capture-group positions were configured.
  	Bextract bool
  	// ExtractPos maps a field name to a capture-group index.
  	ExtractPos map[string]int
  }
  28. type RuleCore struct {
  29. Field string //逻辑字段
  30. LuaLogic string //进入逻辑
  31. ExtFrom string //从哪个字段抽取
  32. RulePres []*RegLuaInfo //抽取前置规则
  33. RuleBacks []*RegLuaInfo //抽取后置规则
  34. RuleCores []*RegLuaInfo //抽取规则
  35. }
  // Tag is one entry of the tag library.
  type Tag struct {
  	// Type is the tag kind: "string" for a literal, "regexp" for a pattern.
  	Type string
  	// Key is the literal text or the pattern source.
  	Key string
  	// Reg is the compiled pattern; nil for "string" tags.
  	Reg *regexp.Regexp
  }
  41. type TaskInfo struct {
  42. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  43. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  44. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  45. TestColl, LastExtId string //测试结果表、上次抽取信息id
  46. FDB *db.Pool //数据库连接池
  47. TDB *db.Pool //数据库连接池
  48. IsEtxLog bool //是否开启抽取日志
  49. ProcessPool chan bool //任务进程池
  50. TestLua bool //检查测试用
  51. }
  52. type ExtractTask struct {
  53. Id string //任务id
  54. IsRun bool //是否启动
  55. Content string //信息内容
  56. TaskInfo *TaskInfo //任务信息
  57. RulePres []*RegLuaInfo //通用前置规则
  58. RuleBacks []*RegLuaInfo //通用后置规则
  59. RuleCores []*RuleCore //抽取规则
  60. PkgRuleCores []*RuleCore //分包抽取规则
  61. RuleBlock *ju.RuleBlock
  62. Tag map[string][]*Tag //标签库
  63. ClearFn map[string][]string //清理函数
  64. IsExtractCity bool //是否开启城市抽取
  65. Fields map[string]int //抽取属性组
  66. IsFileField bool //是否开启附件抽取
  67. FileFields map[string]int //抽取附件属性组
  68. ResultChanel chan bool //抽取结果详情
  69. ResultArr [][]map[string]interface{} //抽取结果详情
  70. BidChanel chan bool //抽取结果
  71. BidArr [][]map[string]interface{} //抽取结果
  72. RecogFieldMap map[string]map[string]interface{} //识别字段
  73. FidClassMap map[string][]map[string]interface{} //分类
  74. CidRuleMap map[string][]map[string]interface{} //规则
  75. AuditFields []string //需要审核的字段名称
  76. ProvinceMap map[string]string
  77. CityBrief map[string]*City //只加载一次即可
  78. ProvinceBrief map[string]*Province //只加载一次
  79. AreaToCity map[string][]*City //两个文件共用
  80. DistrictCityMap map[string]*City
  81. StreetDistrictMap map[string]*District
  82. AreaGet *ju.DFA //市全称
  83. AreaDistrict *ju.DFA //区或县
  84. AreaProvinceGet *ju.DFA //省
  85. AreaSimGet *ju.DFA //市简称
  86. AreaStreet *ju.DFA //街道
  87. }
  88. type ClearTaskInfo struct {
  89. Name, Version, VersionId string //名称、版本、版本id
  90. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  91. FDB *db.Pool //数据库连接池
  92. TDB *db.Pool //数据库连接池
  93. IsCltLog bool //是否开启清理日志
  94. ProcessPool chan bool //任务进程池
  95. }
  // ClearLua is one Lua cleanup script bound to a field.
  type ClearLua struct {
  	// Field is the field the script cleans.
  	Field string
  	// Code is the script code.
  	Code string
  	// Name is the script name.
  	Name string
  	// LuaText is the Lua source.
  	LuaText string
  	// LFields is the field attribute group handed to the script.
  	LFields map[string]string
  }
  105. type ClearTask struct {
  106. Id string //任务id
  107. Content string //信息内容
  108. ClearTaskInfo *ClearTaskInfo //任务信息
  109. ClearLuas map[string][]*ClearLua //清理脚本
  110. UpdateResult [][]map[string]interface{} //清理后结果
  111. ClearChannel chan bool
  112. }
  113. func init() {
  114. TaskList = make(map[string]*ExtractTask)
  115. ClearTaskList = make(map[string]*ClearTask)
  116. go SaveExtLog()
  117. go SaveCltLog() //保存清理日志
  118. }
  119. //加载任务信息
  120. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  121. task, _ := db.Mgo.FindById("task", e.Id, nil)
  122. if len(*task) > 1 {
  123. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  124. e.TaskInfo = &TaskInfo{
  125. Name: (*task)["s_taskname"].(string),
  126. Version: (*task)["s_version"].(string),
  127. VersionId: qu.BsonIdToSId((*v)["_id"]),
  128. TrackColl: trackcoll,
  129. FromDbAddr: (*task)["s_mgoaddr"].(string),
  130. FromDB: (*task)["s_mgodb"].(string),
  131. FromColl: (*task)["s_mgocoll"].(string),
  132. TestColl: resultcoll,
  133. IsEtxLog: true,
  134. ProcessPool: make(chan bool, 1),
  135. }
  136. if (*v)["isextractcity"] != nil {
  137. e.IsExtractCity = (*v)["isextractcity"].(bool)
  138. }
  139. } else {
  140. return
  141. }
  142. }
  143. //加载任务信息
  144. func (e *ExtractTask) InitTaskInfo() {
  145. task, _ := db.Mgo.FindById("task", e.Id, nil)
  146. log.Println("task", task)
  147. if len(*task) > 1 {
  148. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  149. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  150. log.Println("s_mgosavecoll", strs)
  151. if len(strs) < 3 {
  152. return
  153. } else {
  154. e.TaskInfo = &TaskInfo{
  155. Name: (*task)["s_taskname"].(string),
  156. Version: (*task)["s_version"].(string),
  157. VersionId: qu.BsonIdToSId((*v)["_id"]),
  158. //TrackColl: (*task)["s_trackcoll"].(string),
  159. FromDbAddr: (*task)["s_mgoaddr"].(string),
  160. FromDB: (*task)["s_mgodb"].(string),
  161. FromColl: (*task)["s_mgocoll"].(string),
  162. ToDbAddr: strs[0],
  163. ToDB: strs[1],
  164. ToColl: strs[2],
  165. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  166. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  167. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  168. }
  169. if (*v)["isextractcity"] != nil {
  170. e.IsExtractCity = (*v)["isextractcity"].(bool)
  171. }
  172. }
  173. log.Println(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  174. } else {
  175. return
  176. }
  177. }
  178. //加载通用前置规则
  179. func (e *ExtractTask) InitRulePres() {
  180. defer qu.Catch()
  181. e.RulePres = []*RegLuaInfo{}
  182. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  183. for _, v := range *list {
  184. rinfo := &RegLuaInfo{
  185. Code: v["s_code"].(string),
  186. Name: v["s_name"].(string),
  187. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  188. }
  189. if rinfo.IsLua {
  190. rinfo.RuleText = v["s_luascript"].(string)
  191. e.RulePres = append(e.RulePres, rinfo)
  192. } else {
  193. qu.Try(func() {
  194. rinfo.RuleText = v["s_rule"].(string)
  195. tmp := strings.Split(rinfo.RuleText, "__")
  196. var pattern string
  197. if strings.Contains(tmp[0], "\\u") {
  198. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  199. } else {
  200. pattern = tmp[0]
  201. }
  202. if len(tmp) == 2 {
  203. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  204. } else {
  205. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  206. }
  207. e.RulePres = append(e.RulePres, rinfo)
  208. }, func(err interface{}) {
  209. log.Println(rinfo.Code, rinfo.Field, err)
  210. })
  211. }
  212. }
  213. }
  214. //加载通用后置规则
  215. func (e *ExtractTask) InitRuleBacks() {
  216. defer qu.Catch()
  217. e.RuleBacks = []*RegLuaInfo{}
  218. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  219. for _, v := range *list {
  220. rinfo := &RegLuaInfo{
  221. Code: v["s_code"].(string),
  222. Name: v["s_name"].(string),
  223. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  224. }
  225. if rinfo.IsLua {
  226. rinfo.RuleText = v["s_luascript"].(string)
  227. e.RuleBacks = append(e.RuleBacks, rinfo)
  228. } else {
  229. qu.Try(func() {
  230. rinfo.RuleText = v["s_rule"].(string)
  231. tmp := strings.Split(rinfo.RuleText, "__")
  232. var pattern string
  233. if strings.Contains(tmp[0], "\\u") {
  234. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  235. } else {
  236. pattern = tmp[0]
  237. }
  238. if len(tmp) == 2 {
  239. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  240. } else {
  241. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  242. }
  243. e.RuleBacks = append(e.RuleBacks, rinfo)
  244. }, func(err interface{}) {
  245. log.Println(rinfo.Code, rinfo.Field, err)
  246. })
  247. }
  248. }
  249. }
  250. //加载抽取规则
  251. func (e *ExtractTask) InitRuleCore() {
  252. defer qu.Catch()
  253. e.Fields = map[string]int{}
  254. e.RuleCores = []*RuleCore{}
  255. vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  256. for _, vinfo := range *vinfos {
  257. if b, _ := vinfo["isuse"].(bool); !b {
  258. continue
  259. }
  260. s_field := qu.ObjToString(vinfo["s_field"])
  261. pid := qu.BsonIdToSId(vinfo["_id"])
  262. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  263. for _, vv := range *list {
  264. if b, _ := vv["isuse"].(bool); !b {
  265. continue
  266. }
  267. rcore := &RuleCore{}
  268. rcore.Field = s_field
  269. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  270. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  271. //前置规则
  272. rulePres := []*RegLuaInfo{}
  273. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  274. for _, v := range *plist {
  275. rinfo := &RegLuaInfo{
  276. Field: qu.ObjToString(v["s_field"]),
  277. Code: v["s_code"].(string),
  278. Name: v["s_name"].(string),
  279. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  280. }
  281. if rinfo.IsLua {
  282. rinfo.RuleText = v["s_luascript"].(string)
  283. rulePres = append(rulePres, rinfo)
  284. } else {
  285. qu.Try(func() {
  286. rinfo.RuleText = v["s_rule"].(string)
  287. tmp := strings.Split(rinfo.RuleText, "__")
  288. var pattern string
  289. if strings.Contains(tmp[0], "\\u") {
  290. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  291. } else {
  292. pattern = tmp[0]
  293. }
  294. if len(tmp) == 2 {
  295. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  296. } else {
  297. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  298. }
  299. rulePres = append(rulePres, rinfo)
  300. }, func(err interface{}) {
  301. log.Println(rinfo.Code, rinfo.Field, err)
  302. })
  303. }
  304. }
  305. rcore.RulePres = rulePres
  306. //后置规则
  307. ruleBacks := []*RegLuaInfo{}
  308. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  309. for _, v := range *blist {
  310. rinfo := &RegLuaInfo{
  311. Field: qu.ObjToString(v["s_field"]),
  312. Code: v["s_code"].(string),
  313. Name: v["s_name"].(string),
  314. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  315. }
  316. if rinfo.IsLua {
  317. rinfo.RuleText = v["s_luascript"].(string)
  318. ruleBacks = append(ruleBacks, rinfo)
  319. } else {
  320. qu.Try(func() {
  321. rinfo.RuleText = v["s_rule"].(string)
  322. tmp := strings.Split(rinfo.RuleText, "__")
  323. var pattern string
  324. if strings.Contains(tmp[0], "\\u") {
  325. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  326. } else {
  327. pattern = tmp[0]
  328. }
  329. if len(tmp) == 2 {
  330. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  331. } else {
  332. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  333. }
  334. ruleBacks = append(ruleBacks, rinfo)
  335. }, func(err interface{}) {
  336. log.Println(rinfo.Code, rinfo.Field, err)
  337. })
  338. }
  339. }
  340. rcore.RuleBacks = ruleBacks
  341. //抽取规则
  342. ruleCores := []*RegLuaInfo{}
  343. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  344. for _, v := range *clist {
  345. if b, _ := v["isuse"].(bool); !b {
  346. continue
  347. }
  348. field := qu.ObjToString(v["s_field"])
  349. e.Fields[field] = 1 //加入抽取属性组备用
  350. rinfo := &RegLuaInfo{
  351. Field: field,
  352. Code: v["s_code"].(string),
  353. Name: v["s_name"].(string),
  354. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  355. }
  356. if rinfo.IsLua {
  357. rinfo.RuleText = v["s_luascript"].(string)
  358. //提取全部属性
  359. rinfo.LFields = getALLFields()
  360. ruleCores = append(ruleCores, rinfo)
  361. } else {
  362. qu.Try(func() {
  363. rinfo.RuleText = v["s_rule"].(string)
  364. tmp := strings.Split(rinfo.RuleText, "__")
  365. var pattern string
  366. if strings.Contains(tmp[0], "\\u") {
  367. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  368. } else {
  369. pattern = tmp[0]
  370. }
  371. if len(tmp) == 2 {
  372. epos := strings.Split(tmp[1], ",")
  373. posm := map[string]int{}
  374. for _, v := range epos {
  375. ks := strings.Split(v, ":")
  376. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  377. posm[ks[1]] = qu.IntAll(ks[0])
  378. } else { //(.*)招标公告__2
  379. posm[rinfo.Field] = qu.IntAll(ks[0])
  380. }
  381. }
  382. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  383. } else {
  384. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  385. }
  386. ruleCores = append(ruleCores, rinfo)
  387. }, func(err interface{}) {
  388. log.Println(rinfo.Code, rinfo.Field, err)
  389. })
  390. }
  391. }
  392. rcore.RuleCores = ruleCores
  393. //
  394. e.RuleCores = append(e.RuleCores, rcore)
  395. }
  396. }
  397. }
  398. //加载分包抽取规则
  399. func (e *ExtractTask) InitPkgCore() {
  400. defer qu.Catch()
  401. e.PkgRuleCores = []*RuleCore{}
  402. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  403. for _, pkginfo := range *pkginfos {
  404. if b, _ := pkginfo["isuse"].(bool); !b {
  405. continue
  406. }
  407. s_field := qu.ObjToString(pkginfo["s_field"])
  408. pid := qu.BsonIdToSId(pkginfo["_id"])
  409. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  410. for _, vv := range *logicList {
  411. if b, _ := vv["isuse"].(bool); !b {
  412. continue
  413. }
  414. rcore := &RuleCore{}
  415. rcore.Field = s_field
  416. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  417. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  418. //后置规则
  419. ruleBacks := []*RegLuaInfo{}
  420. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  421. for _, v := range *blist {
  422. rinfo := &RegLuaInfo{
  423. Field: qu.ObjToString(v["s_field"]),
  424. Code: v["s_code"].(string),
  425. Name: v["s_name"].(string),
  426. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  427. }
  428. if rinfo.IsLua {
  429. rinfo.RuleText = v["s_luascript"].(string)
  430. ruleBacks = append(ruleBacks, rinfo)
  431. } else {
  432. qu.Try(func() {
  433. rinfo.RuleText = v["s_rule"].(string)
  434. tmp := strings.Split(rinfo.RuleText, "__")
  435. var pattern string
  436. if strings.Contains(tmp[0], "\\u") {
  437. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  438. } else {
  439. pattern = tmp[0]
  440. }
  441. if len(tmp) == 2 {
  442. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  443. } else {
  444. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  445. }
  446. ruleBacks = append(ruleBacks, rinfo)
  447. }, func(err interface{}) {
  448. log.Println(rinfo.Code, rinfo.Field, err)
  449. })
  450. }
  451. }
  452. rcore.RuleBacks = ruleBacks
  453. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  454. }
  455. }
  456. }
  457. //加载标签库
  458. func (e *ExtractTask) InitTag() {
  459. defer qu.Catch()
  460. e.Tag = map[string][]*Tag{}
  461. //字符串标签库
  462. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  463. for _, v := range *list {
  464. field := qu.ObjToString(v["s_field"])
  465. if tmp, ok := v["content"].([]interface{}); ok {
  466. fname := qu.ObjToString(v["s_name"])
  467. tab := ju.TagFile{Name: fname} //用于表格kv
  468. tab.Items = make([]*ju.Tag, len(tmp))
  469. for k, key := range tmp {
  470. tag := &Tag{Type: "string", Key: key.(string)}
  471. e.Tag[field] = append(e.Tag[field], tag)
  472. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  473. }
  474. sort.Sort(tab.Items)
  475. ju.TagdbTable[fname] = &tab
  476. }
  477. }
  478. //正则标签库
  479. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  480. for _, v := range *list {
  481. field := qu.ObjToString(v["s_field"])
  482. if tmp, ok := v["content"].([]interface{}); ok {
  483. fname := qu.ObjToString(v["s_name"])
  484. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  485. tab.Items = make([]*ju.Tag, len(tmp))
  486. for k, key := range tmp {
  487. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  488. e.Tag[field] = append(e.Tag[field], tag)
  489. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  490. }
  491. sort.Sort(tab.Items)
  492. ju.TagdbTable[fname+"_reg"] = &tab
  493. }
  494. }
  495. }
  496. //获取fields
  497. func getALLFields() map[string]string {
  498. fields := map[string]string{}
  499. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  500. for _, v := range *list {
  501. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  502. }
  503. return fields
  504. }
  505. //加载clear函数
  506. func (e *ExtractTask) InitClearFn() {
  507. defer qu.Catch()
  508. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  509. fn := map[string][]string{}
  510. for _, tmp := range *list {
  511. field := tmp["s_field"].(string)
  512. fns := tmp["clear"].([]interface{})
  513. if fn[field] == nil {
  514. fn[field] = []string{}
  515. }
  516. for _, v := range fns {
  517. fn[field] = append(fn[field], v.(string))
  518. }
  519. }
  520. e.ClearFn = fn
  521. }
  522. //加载省份
  523. func InitProvince(version string) map[string]interface{} {
  524. defer qu.Catch()
  525. fn := map[string]interface{}{}
  526. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  527. for _, v := range *list {
  528. name := qu.ObjToString(v["s_name"])
  529. content := v["content"]
  530. switch content.(type) {
  531. case string:
  532. fn[name] = []interface{}{content.(string)}
  533. case []interface{}:
  534. fn[name] = content
  535. }
  536. }
  537. return fn
  538. }
  539. //加载城市简称
  540. func InitCitySim(version string) map[string]map[string]interface{} {
  541. defer qu.Catch()
  542. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  543. fn := map[string]map[string]interface{}{}
  544. for _, v := range *list {
  545. name := qu.ObjToString(v["s_name"])
  546. tmp := v["content"].(map[string]interface{})
  547. fn[name] = tmp
  548. }
  549. return fn
  550. }
  551. //加载城市全称
  552. func InitCityAll(version string) map[string]map[string]interface{} {
  553. defer qu.Catch()
  554. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  555. fn := map[string]map[string]interface{}{}
  556. for _, v := range *list {
  557. name := qu.ObjToString(v["s_name"])
  558. tmp := v["content"].(map[string]interface{})
  559. fn[name] = tmp
  560. }
  561. return fn
  562. }
  563. //初始化城市省份敏感词
  564. func (e *ExtractTask) InitDFA() {
  565. defer qu.Catch()
  566. e.AreaGet = &ju.DFA{}
  567. e.AreaDistrict = &ju.DFA{}
  568. e.AreaProvinceGet = &ju.DFA{}
  569. e.AreaStreet = &ju.DFA{}
  570. //初始化map
  571. if e.ProvinceMap == nil {
  572. e.ProvinceMap = make(map[string]string)
  573. }
  574. if e.CityBrief == nil {
  575. e.CityBrief = make(map[string]*City)
  576. }
  577. if e.ProvinceBrief == nil {
  578. e.ProvinceBrief = make(map[string]*Province)
  579. }
  580. if e.AreaToCity == nil {
  581. e.AreaToCity = make(map[string][]*City)
  582. }
  583. if e.DistrictCityMap == nil {
  584. e.DistrictCityMap = make(map[string]*City)
  585. }
  586. if e.StreetDistrictMap == nil {
  587. e.StreetDistrictMap = make(map[string]*District)
  588. }
  589. //初始化省
  590. fn1 := InitProvince(e.TaskInfo.Version)
  591. for k, v := range fn1 {
  592. for _, p := range v.([]interface{}) {
  593. p1, _ := p.(string)
  594. e.AreaProvinceGet.AddWord(p1)
  595. e.ProvinceMap[p1] = k
  596. }
  597. }
  598. //初始化城市全称
  599. fn2 := InitCityAll(e.TaskInfo.Version)
  600. for k, v := range fn2 {
  601. e.AreaProvinceGet.AddWord(k) //省全称
  602. p := &Province{}
  603. p.Name = k
  604. p.Brief = v["brief"].(string)
  605. e.ProvinceMap[k] = p.Brief
  606. //
  607. e.ProvinceBrief[p.Brief] = p
  608. p.Cap = v["captial"].(string)
  609. city, _ := v["city"].(map[string]interface{})
  610. for k1, v1 := range city {
  611. v1m, _ := v1.(map[string]interface{})
  612. c := &City{}
  613. c.Name = k1
  614. // if v1m["brief"] == nil {
  615. // }
  616. c.Brief = v1m["brief"].(string)
  617. //
  618. e.CityBrief[c.Brief] = c
  619. c.P = p
  620. if c.Brief == p.Cap {
  621. p.Captial = c
  622. }
  623. //加入到城市map中
  624. //
  625. cs := e.AreaToCity[k1]
  626. e.AreaGet.AddWord(k1) //市全称
  627. if cs != nil {
  628. cs = append(cs, c)
  629. } else {
  630. cs = []*City{c}
  631. }
  632. e.AreaToCity[k1] = cs
  633. //区县
  634. districtmap := v1m["area"].(map[string]interface{}) //区或县
  635. for district, streetarr := range districtmap {
  636. d := &District{}
  637. d.Name = district
  638. d.C = c
  639. e.AreaDistrict.AddWord(district) //加入区或县敏感词
  640. ctmp := e.DistrictCityMap[district]
  641. if ctmp == nil {
  642. e.DistrictCityMap[district] = c
  643. }
  644. //街道
  645. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  646. e.AreaStreet.AddWord(s) //加入街道敏感词
  647. dtmp := e.StreetDistrictMap[s]
  648. if dtmp == nil {
  649. e.StreetDistrictMap[s] = d
  650. }
  651. }
  652. }
  653. }
  654. }
  655. //初始化城市简称
  656. fn3 := InitCitySim(e.TaskInfo.Version)
  657. e.AreaSimGet = &ju.DFA{}
  658. for k, v := range fn3 {
  659. pb := v["brief"].(string)
  660. p := e.ProvinceBrief[pb]
  661. //加载
  662. for _, ss := range []string{k, pb} {
  663. cs := e.AreaToCity[ss]
  664. if cs != nil {
  665. cs = append(cs, p.Captial)
  666. } else {
  667. cs = []*City{p.Captial}
  668. }
  669. e.AreaToCity[ss] = cs
  670. e.AreaSimGet.AddWord(ss) //省全称和省简称
  671. }
  672. city, _ := v["city"].(map[string]interface{})
  673. for k1, v1 := range city {
  674. v1m, _ := v1.(map[string]interface{})
  675. if v1m["brief"] == nil {
  676. }
  677. cb := v1m["brief"].(string)
  678. c := e.AreaToCity[k1][0]
  679. //加入到城市map中
  680. for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  681. e.AreaSimGet.AddWord(ss)
  682. cs := e.AreaToCity[ss]
  683. if cs != nil {
  684. cs = append(cs, c)
  685. } else {
  686. cs = []*City{c}
  687. }
  688. e.AreaToCity[ss] = cs
  689. }
  690. arr := v1m["area"].([]interface{})
  691. for _, k2 := range arr {
  692. s := k2.(string)
  693. for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  694. cs := e.AreaToCity[ss]
  695. e.AreaSimGet.AddWord(ss)
  696. if cs != nil {
  697. cs = append(cs, c)
  698. } else {
  699. cs = []*City{c}
  700. }
  701. e.AreaToCity[ss] = cs
  702. //只加入简称
  703. if n == 0 {
  704. d := &District{}
  705. d.Name = ss
  706. d.C = c
  707. e.AreaDistrict.AddWord(ss) //加入区或县简称敏感词
  708. ctmp := e.DistrictCityMap[ss]
  709. if ctmp == nil {
  710. e.DistrictCityMap[ss] = c
  711. }
  712. }
  713. }
  714. }
  715. }
  716. }
  717. }
  718. //保存抽取详情数据
  719. func (e *ExtractTask) ResultSave() {
  720. defer qu.Catch()
  721. e.ResultChanel = make(chan bool, 5)
  722. e.ResultArr = [][]map[string]interface{}{}
  723. for {
  724. if len(e.ResultArr) > 500 {
  725. e.ResultChanel <- true
  726. arr := e.ResultArr[:500]
  727. go func(tmp *[][]map[string]interface{}) {
  728. qu.Try(func() {
  729. db.Mgo.UpSertBulk("extract_result", *tmp...)
  730. <-e.ResultChanel
  731. }, func(err interface{}) {
  732. log.Println(err)
  733. <-e.ResultChanel
  734. })
  735. }(&arr)
  736. e.ResultArr = e.ResultArr[500:]
  737. } else {
  738. e.ResultChanel <- true
  739. arr := e.ResultArr
  740. go func(tmp *[][]map[string]interface{}) {
  741. qu.Try(func() {
  742. db.Mgo.UpSertBulk("extract_result", *tmp...)
  743. <-e.ResultChanel
  744. }, func(err interface{}) {
  745. log.Println(err)
  746. <-e.ResultChanel
  747. })
  748. }(&arr)
  749. e.ResultArr = [][]map[string]interface{}{}
  750. time.Sleep(10 * time.Second)
  751. }
  752. if !e.IsRun {
  753. break
  754. }
  755. }
  756. }
  757. //保存抽取数据
  758. func (e *ExtractTask) BidSave() {
  759. defer qu.Catch()
  760. e.BidChanel = make(chan bool, 5)
  761. e.BidArr = [][]map[string]interface{}{}
  762. for {
  763. if len(e.BidArr) > 500 {
  764. e.BidChanel <- true
  765. arr := e.BidArr[:500]
  766. go func(tmp *[][]map[string]interface{}) {
  767. qu.Try(func() {
  768. db.Mgo.UpSertBulk(e.TaskInfo.ToColl, *tmp...)
  769. <-e.BidChanel
  770. }, func(err interface{}) {
  771. log.Println(err)
  772. <-e.BidChanel
  773. })
  774. }(&arr)
  775. e.BidArr = e.BidArr[500:]
  776. } else {
  777. e.BidChanel <- true
  778. arr := e.BidArr
  779. go func(tmp *[][]map[string]interface{}) {
  780. qu.Try(func() {
  781. db.Mgo.UpSertBulk(e.TaskInfo.ToColl, *tmp...)
  782. <-e.BidChanel
  783. }, func(err interface{}) {
  784. log.Println(err)
  785. <-e.BidChanel
  786. })
  787. }(&arr)
  788. e.BidArr = [][]map[string]interface{}{}
  789. }
  790. if !e.IsRun {
  791. break
  792. }
  793. time.Sleep(10 * time.Second)
  794. }
  795. }
  796. func (e *ExtractTask) InitAuditRecogField() {
  797. defer qu.Catch()
  798. e.RecogFieldMap = make(map[string]map[string]interface{})
  799. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  800. for _, f := range *recogFieldList {
  801. field := qu.ObjToString(f["s_recogfield"])
  802. e.RecogFieldMap[field] = f
  803. }
  804. }
  805. func (e *ExtractTask) InitAuditClass() {
  806. defer qu.Catch()
  807. e.FidClassMap = make(map[string][]map[string]interface{})
  808. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  809. for _, c := range *class {
  810. classList := []map[string]interface{}{}
  811. fid := qu.ObjToString(c["s_fid"])
  812. if len(e.FidClassMap[fid]) > 0 { //追加
  813. classList = e.FidClassMap[fid]
  814. }
  815. classList = append(classList, c)
  816. e.FidClassMap[fid] = classList
  817. }
  818. }
  819. //加载规则
  820. func (e *ExtractTask) InitAuditRule() {
  821. defer qu.Catch()
  822. var rureg *regexp.Regexp
  823. var rs []rune
  824. var ru string
  825. var err error
  826. e.CidRuleMap = make(map[string][]map[string]interface{})
  827. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  828. for _, v := range *rule {
  829. i_rule := []interface{}{}
  830. ss, _ := (v["s_rule"].([]interface{}))
  831. for _, r := range qu.ObjArrToStringArr(ss) {
  832. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  833. rs = []rune(r)
  834. ru = string(rs[1 : len(rs)-1])
  835. rureg, err = regexp.Compile(ru)
  836. if err != nil {
  837. log.Println("error---rule:", r)
  838. continue
  839. }
  840. i_rule = append(i_rule, []interface{}{rureg}...)
  841. } else { //规则
  842. i_rule = append(i_rule, r)
  843. }
  844. }
  845. v["rule"] = i_rule
  846. ruleList := []map[string]interface{}{}
  847. classid := qu.ObjToString(v["s_classid"])
  848. if len(e.CidRuleMap[classid]) > 0 { //追加
  849. ruleList = e.CidRuleMap[classid]
  850. }
  851. ruleList = append(ruleList, v)
  852. e.CidRuleMap[classid] = ruleList
  853. }
  854. }
  855. //
  856. func (e *ExtractTask) InitAuditFields() {
  857. if len(e.AuditFields) == 0 {
  858. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  859. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  860. vid := qu.BsonIdToSId((*v)["_id"])
  861. query := map[string]interface{}{
  862. "isaudit": true,
  863. "delete": false,
  864. "vid": vid,
  865. }
  866. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  867. for _, d := range *data {
  868. field := qu.ObjToString(d["s_field"])
  869. e.AuditFields = append(e.AuditFields, field)
  870. }
  871. }
  872. }
  873. }
  874. //加载附件抽取
  875. func (e *ExtractTask) InitFile() {
  876. defer qu.Catch()
  877. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  878. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  879. //ve, _ := db.Mgo.FindOne("version", query)
  880. if ve == nil {
  881. return
  882. }
  883. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  884. e.IsFileField = true
  885. }
  886. efiled := make(map[string]int, 0)
  887. if (*ve)["s_filefileds"] != nil {
  888. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  889. efiled[vff.(string)] = 1
  890. }
  891. }
  892. e.FileFields = efiled
  893. }
  894. //加载清理任务信息
  895. func (c *ClearTask) InitClearTaskInfo() {
  896. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  897. if len(*cleartask) > 1 {
  898. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  899. c.ClearTaskInfo = &ClearTaskInfo{
  900. Name: (*cleartask)["s_taskname"].(string),
  901. Version: (*cleartask)["s_version"].(string),
  902. VersionId: qu.BsonIdToSId((*v)["_id"]),
  903. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  904. FromDB: (*cleartask)["s_mgodb"].(string),
  905. FromColl: (*cleartask)["s_mgocoll"].(string),
  906. IsCltLog: ju.Config["iscltlog"].(bool),
  907. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  908. }
  909. log.Println(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  910. } else {
  911. return
  912. }
  913. }
  914. //加载清理脚本
  915. func (c *ClearTask) InitClearLuas() {
  916. defer qu.Catch()
  917. c.ClearLuas = make(map[string][]*ClearLua)
  918. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  919. for _, l := range *list {
  920. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  921. continue
  922. }
  923. s_field := qu.ObjToString(l["s_field"])
  924. pid := qu.BsonIdToSId(l["_id"])
  925. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  926. for _, vv := range *luas {
  927. if b, _ := vv["isuse"].(bool); !b {
  928. continue
  929. }
  930. clearLua := &ClearLua{
  931. Field: s_field,
  932. Code: vv["s_code"].(string),
  933. Name: vv["s_name"].(string),
  934. LuaText: vv["s_luascript"].(string),
  935. LFields: getALLFields(),
  936. }
  937. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  938. }
  939. }
  940. }
  941. //加载分块规则
  942. func (e *ExtractTask) InitBlockRule() {
  943. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  944. "vid": e.TaskInfo.VersionId,
  945. "delete": false,
  946. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  947. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  948. for _, v := range *datas {
  949. block_reg, _ := v["block_reg"].(string)
  950. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  951. title_reg, _ := v["title_reg"].(string)
  952. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  953. if block_reg == "" || title_reg == "" {
  954. continue
  955. }
  956. b_reg, b_err := regexp.Compile(block_reg)
  957. t_reg, t_err := regexp.Compile(title_reg)
  958. log.Println(block_reg, title_reg, b_err, t_err)
  959. if b_err != nil || t_err != nil {
  960. continue
  961. }
  962. brs = append(brs, b_reg)
  963. trs = append(trs, t_reg)
  964. }
  965. e.RuleBlock = &ju.RuleBlock{
  966. BlockRegs: brs,
  967. TitleRegs: trs,
  968. }
  969. }