extractInit.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. log "github.com/donnie4w/go-logger/logger"
  14. )
  15. type RegLuaInfo struct { //正则或脚本信息
  16. Code, Name, Field string //
  17. RuleText string //
  18. IsLua bool //
  19. RegPreBac *ExtReg //
  20. RegCore *ExtReg //
  21. LFields map[string]string //lua抽取字段属性组
  22. }
  // ExtReg is a compiled regular-expression rule together with its options.
  type ExtReg struct {
  	// Reg is the compiled pattern.
  	Reg *regexp.Regexp
  	// Replace is the replacement text of a pre-/post-processing rule.
  	Replace string
  	// Bextract is true when the rule declares capture groups to extract.
  	Bextract bool
  	// ExtractPos maps a field name to the capture-group index it is read from.
  	ExtractPos map[string]int
  	// NumSign is a sign correction marker, e.g. for a floating rate
  	// (upward: 1, downward: -1).
  	NumSign int
  }
  30. type RuleCore struct {
  31. Field string //逻辑字段
  32. LuaLogic string //进入逻辑
  33. ExtFrom string //从哪个字段抽取
  34. RulePres []*RegLuaInfo //抽取前置规则
  35. RuleBacks []*RegLuaInfo //抽取后置规则
  36. RuleCores []*RegLuaInfo //抽取规则
  37. }
  // Tag is one entry of the tag library.
  type Tag struct {
  	// Type is the tag kind: "string" for a literal, "regexp" for a regular expression.
  	Type string
  	// Key is the tag text (the literal or the regex source).
  	Key string
  	// Reg is the compiled expression of a "regexp" tag.
  	Reg *regexp.Regexp
  }
  43. type TaskInfo struct {
  44. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  45. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  46. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  47. TestColl, LastExtId string //测试结果表、上次抽取信息id
  48. FDB *db.Pool //数据库连接池
  49. TDB *db.Pool //数据库连接池
  50. IsEtxLog bool //是否开启抽取日志
  51. ProcessPool chan bool //任务进程池
  52. TestLua bool //检查测试用
  53. }
  54. type ExtractTask struct {
  55. Id string //任务id
  56. IsRun bool //是否启动
  57. Content string //信息内容
  58. TaskInfo *TaskInfo //任务信息
  59. RulePres []*RegLuaInfo //通用前置规则
  60. RuleBacks []*RegLuaInfo //通用后置规则
  61. RuleBlock *ju.RuleBlock
  62. //RuleCores []*RuleCore //抽取规则
  63. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  64. PkgRuleCores []*RuleCore //分包抽取规则
  65. Tag map[string][]*Tag //标签库
  66. ClearFn map[string][]string //清理函数
  67. IsExtractCity bool //是否开启城市抽取
  68. Fields map[string]int //抽取属性组
  69. IsFileField bool //是否开启附件抽取
  70. FileFields *sync.Map //抽取附件属性组
  71. ResultChanel chan bool //抽取结果详情
  72. ResultArr [][]map[string]interface{} //抽取结果详情
  73. BidChanel chan bool //抽取结果
  74. BidArr [][]map[string]interface{} //抽取结果
  75. BidTotal int //结果数量
  76. RecogFieldMap map[string]map[string]interface{} //识别字段
  77. FidClassMap map[string][]map[string]interface{} //分类
  78. CidRuleMap map[string][]map[string]interface{} //规则
  79. AuditFields []string //需要审核的字段名称
  80. ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
  81. ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
  82. CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
  83. CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
  84. CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
  85. DistrictCityMap map[string]*City //区或县对应的city
  86. DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
  87. StreetDistrictMap map[string]*District //街道对应的区或县
  88. ProvinceAllGet *ju.DFA //省全称
  89. ProvinceSimGet *ju.DFA //省简称
  90. CityAllGet *ju.DFA //市全称
  91. CitySimGet *ju.DFA //市简称
  92. DistrictAllGet *ju.DFA //区或县全称
  93. DistrictSimGet *ju.DFA //区或县简称
  94. StreetGet *ju.DFA //街道
  95. PostCodeMap map[string]*PostCode //邮编
  96. AreaCodeMap map[string]*AreaCode //区号
  97. InfoType []map[string]interface{}
  98. }
  99. type ClearTaskInfo struct {
  100. Name, Version, VersionId string //名称、版本、版本id
  101. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  102. FDB *db.Pool //数据库连接池
  103. TDB *db.Pool //数据库连接池
  104. IsCltLog bool //是否开启清理日志
  105. ProcessPool chan bool //任务进程池
  106. }
  // ClearLua is one Lua clean-up script.
  type ClearLua struct {
  	// Field is the field the script applies to.
  	Field string
  	// Code is the script code.
  	Code string
  	// Name is the script name.
  	Name string
  	LuaText string
  	// LuaLogic string // entry logic
  	// ExtFrom string // which field to extract from
  	// LFields is the field attribute set used by Lua extraction.
  	LFields map[string]string
  }
  116. type ClearTask struct {
  117. Id string //任务id
  118. Content string //信息内容
  119. ClearTaskInfo *ClearTaskInfo //任务信息
  120. ClearLuas map[string][]*ClearLua //清理脚本
  121. UpdateResult [][]map[string]interface{} //清理后结果
  122. ClearChannel chan bool
  123. }
  124. func init() {
  125. TaskList = make(map[string]*ExtractTask)
  126. ClearTaskList = make(map[string]*ClearTask)
  127. go SaveExtLog()
  128. go SaveCltLog() //保存清理日志
  129. }
  130. //加载任务信息
  131. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  132. task, _ := db.Mgo.FindById("task", e.Id, nil)
  133. if len(*task) > 1 {
  134. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  135. e.TaskInfo = &TaskInfo{
  136. Name: (*task)["s_taskname"].(string),
  137. Version: (*task)["s_version"].(string),
  138. VersionId: qu.BsonIdToSId((*v)["_id"]),
  139. TrackColl: trackcoll,
  140. FromDbAddr: (*task)["s_mgoaddr"].(string),
  141. FromDB: (*task)["s_mgodb"].(string),
  142. FromColl: (*task)["s_mgocoll"].(string),
  143. TestColl: resultcoll,
  144. IsEtxLog: true,
  145. ProcessPool: make(chan bool, 1),
  146. }
  147. if (*v)["isextractcity"] != nil {
  148. e.IsExtractCity = (*v)["isextractcity"].(bool)
  149. }
  150. } else {
  151. return
  152. }
  153. }
  154. //加载任务信息
  155. func (e *ExtractTask) InitTaskInfo() {
  156. task, _ := db.Mgo.FindById("task", e.Id, nil)
  157. log.Debug("task", task)
  158. if len(*task) > 1 {
  159. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  160. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  161. log.Debug("s_mgosavecoll", strs)
  162. if len(strs) < 3 {
  163. return
  164. } else {
  165. e.TaskInfo = &TaskInfo{
  166. Name: (*task)["s_taskname"].(string),
  167. Version: (*task)["s_version"].(string),
  168. VersionId: qu.BsonIdToSId((*v)["_id"]),
  169. //TrackColl: (*task)["s_trackcoll"].(string),
  170. FromDbAddr: (*task)["s_mgoaddr"].(string),
  171. FromDB: (*task)["s_mgodb"].(string),
  172. FromColl: (*task)["s_mgocoll"].(string),
  173. ToDbAddr: strs[0],
  174. ToDB: strs[1],
  175. ToColl: strs[2],
  176. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  177. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  178. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  179. }
  180. if (*v)["isextractcity"] != nil {
  181. e.IsExtractCity = (*v)["isextractcity"].(bool)
  182. }
  183. }
  184. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  185. } else {
  186. return
  187. }
  188. }
  189. //加载通用前置规则
  190. func (e *ExtractTask) InitRulePres() {
  191. defer qu.Catch()
  192. e.RulePres = []*RegLuaInfo{}
  193. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  194. for _, v := range *list {
  195. rinfo := &RegLuaInfo{
  196. Code: v["s_code"].(string),
  197. Name: v["s_name"].(string),
  198. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  199. }
  200. if rinfo.IsLua {
  201. rinfo.RuleText = v["s_luascript"].(string)
  202. e.RulePres = append(e.RulePres, rinfo)
  203. } else {
  204. qu.Try(func() {
  205. rinfo.RuleText = v["s_rule"].(string)
  206. tmp := strings.Split(rinfo.RuleText, "__")
  207. var pattern string
  208. if strings.Contains(tmp[0], "\\u") {
  209. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  210. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  211. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  212. } else {
  213. pattern = tmp[0]
  214. }
  215. if len(tmp) == 2 {
  216. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  217. } else {
  218. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  219. }
  220. e.RulePres = append(e.RulePres, rinfo)
  221. }, func(err interface{}) {
  222. log.Debug(rinfo.Code, rinfo.Field, err)
  223. })
  224. }
  225. }
  226. }
  227. //加载通用后置规则
  228. func (e *ExtractTask) InitRuleBacks() {
  229. defer qu.Catch()
  230. e.RuleBacks = []*RegLuaInfo{}
  231. list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  232. for _, v := range *list {
  233. rinfo := &RegLuaInfo{
  234. Code: v["s_code"].(string),
  235. Name: v["s_name"].(string),
  236. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  237. }
  238. if rinfo.IsLua {
  239. rinfo.RuleText = v["s_luascript"].(string)
  240. e.RuleBacks = append(e.RuleBacks, rinfo)
  241. } else {
  242. qu.Try(func() {
  243. rinfo.RuleText = v["s_rule"].(string)
  244. tmp := strings.Split(rinfo.RuleText, "__")
  245. var pattern string
  246. if strings.Contains(tmp[0], "\\u") {
  247. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  248. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  249. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  250. } else {
  251. pattern = tmp[0]
  252. }
  253. if len(tmp) == 2 {
  254. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  255. } else {
  256. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  257. }
  258. e.RuleBacks = append(e.RuleBacks, rinfo)
  259. }, func(err interface{}) {
  260. log.Debug(rinfo.Code, rinfo.Field, err)
  261. })
  262. }
  263. }
  264. }
  265. func (e *ExtractTask) InfoTypeList() {
  266. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  267. infolist := *infolist1
  268. for _, v := range infolist {
  269. e.InfoType = append(e.InfoType, v)
  270. }
  271. }
  272. //加载抽取规则
  273. func (e *ExtractTask) InitRuleCore() {
  274. defer qu.Catch()
  275. e.Fields = map[string]int{}
  276. e.RuleCores = make(map[string]map[string][]*RuleCore)
  277. fieldrules := map[string][]*RuleCore{}
  278. vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  279. for _, vinfo := range *vinfos {
  280. if b, _ := vinfo["isuse"].(bool); !b {
  281. continue
  282. }
  283. s_field := qu.ObjToString(vinfo["s_field"])
  284. pid := qu.BsonIdToSId(vinfo["_id"])
  285. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  286. for _, vv := range *list {
  287. if b, _ := vv["isuse"].(bool); !b {
  288. continue
  289. }
  290. rcore := &RuleCore{}
  291. rcore.Field = s_field
  292. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  293. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  294. //前置规则
  295. rulePres := []*RegLuaInfo{}
  296. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  297. for _, v := range *plist {
  298. rinfo := &RegLuaInfo{
  299. Field: qu.ObjToString(v["s_field"]),
  300. Code: v["s_code"].(string),
  301. Name: v["s_name"].(string),
  302. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  303. }
  304. if rinfo.IsLua {
  305. rinfo.RuleText = v["s_luascript"].(string)
  306. rulePres = append(rulePres, rinfo)
  307. } else {
  308. qu.Try(func() {
  309. rinfo.RuleText = v["s_rule"].(string)
  310. tmp := strings.Split(rinfo.RuleText, "__")
  311. var pattern string
  312. if strings.Contains(tmp[0], "\\u") {
  313. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  314. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  315. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  316. } else {
  317. pattern = tmp[0]
  318. }
  319. if len(tmp) == 2 {
  320. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  321. } else {
  322. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  323. }
  324. rulePres = append(rulePres, rinfo)
  325. }, func(err interface{}) {
  326. log.Debug(rinfo.Code, rinfo.Field, err)
  327. })
  328. }
  329. }
  330. rcore.RulePres = rulePres
  331. //后置规则
  332. ruleBacks := []*RegLuaInfo{}
  333. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  334. for _, v := range *blist {
  335. rinfo := &RegLuaInfo{
  336. Field: qu.ObjToString(v["s_field"]),
  337. Code: v["s_code"].(string),
  338. Name: v["s_name"].(string),
  339. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  340. }
  341. if rinfo.IsLua {
  342. rinfo.RuleText = v["s_luascript"].(string)
  343. ruleBacks = append(ruleBacks, rinfo)
  344. } else {
  345. qu.Try(func() {
  346. rinfo.RuleText = v["s_rule"].(string)
  347. tmp := strings.Split(rinfo.RuleText, "__")
  348. var pattern string
  349. if strings.Contains(tmp[0], "\\u") {
  350. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  351. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  352. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  353. } else {
  354. pattern = tmp[0]
  355. }
  356. if len(tmp) == 2 {
  357. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  358. } else {
  359. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  360. }
  361. ruleBacks = append(ruleBacks, rinfo)
  362. }, func(err interface{}) {
  363. log.Debug(rinfo.Code, rinfo.Field, err)
  364. })
  365. }
  366. }
  367. rcore.RuleBacks = ruleBacks
  368. //抽取规则
  369. ruleCores := []*RegLuaInfo{}
  370. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  371. for _, v := range *clist {
  372. if b, _ := v["isuse"].(bool); !b {
  373. continue
  374. }
  375. field := qu.ObjToString(v["s_field"])
  376. e.Fields[field] = 1 //加入抽取属性组备用
  377. rinfo := &RegLuaInfo{
  378. Field: field,
  379. Code: v["s_code"].(string),
  380. Name: v["s_name"].(string),
  381. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  382. }
  383. if rinfo.IsLua {
  384. rinfo.RuleText = v["s_luascript"].(string)
  385. //提取全部属性
  386. rinfo.LFields = getALLFields()
  387. ruleCores = append(ruleCores, rinfo)
  388. } else {
  389. qu.Try(func() {
  390. rinfo.RuleText = v["s_rule"].(string)
  391. tmp := strings.Split(rinfo.RuleText, "__")
  392. var pattern string
  393. if strings.Contains(tmp[0], "\\u") {
  394. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  395. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  396. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  397. } else {
  398. pattern = tmp[0]
  399. }
  400. if len(tmp) == 2 {
  401. epos := strings.Split(tmp[1], ",")
  402. posm := map[string]int{}
  403. for _, v := range epos {
  404. ks := strings.Split(v, ":")
  405. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  406. posm[ks[1]] = qu.IntAll(ks[0])
  407. } else { //(.*)招标公告__2
  408. posm[rinfo.Field] = qu.IntAll(ks[0])
  409. }
  410. }
  411. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  412. } else {
  413. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  414. }
  415. ruleCores = append(ruleCores, rinfo)
  416. }, func(err interface{}) {
  417. log.Debug(rinfo.Code, rinfo.Field, err)
  418. })
  419. }
  420. }
  421. rcore.RuleCores = ruleCores
  422. //
  423. if fieldrules[s_field] == nil {
  424. fieldrules[s_field] = []*RuleCore{}
  425. }
  426. fieldrules[s_field] = append(fieldrules[s_field], rcore)
  427. }
  428. }
  429. //属性配置
  430. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  431. for _, v := range *infolist {
  432. topclass := qu.ObjToString(v["topclass"])
  433. if v["subclass"] == nil {
  434. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  435. for attr, _ := range v["fields"].(map[string]interface{}) {
  436. e.RuleCores[topclass][attr] = fieldrules[attr]
  437. }
  438. } else {
  439. for ca, fs := range v["subclass"].(map[string]interface{}) {
  440. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  441. for field, _ := range fs.(map[string]interface{}) {
  442. e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
  443. }
  444. }
  445. }
  446. }
  447. }
  448. //加载抽取规则
  449. func (e *ExtractTask) InitRuleCore2() {
  450. defer qu.Catch()
  451. e.Fields = map[string]int{}
  452. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  453. e.RuleCores = make(map[string]map[string][]*RuleCore)
  454. for _, v := range *infolist {
  455. topclass := qu.ObjToString(v["topclass"])
  456. if v["subclass"] == nil {
  457. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  458. for attr, _ := range v["fields"].(map[string]interface{}) {
  459. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
  460. e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
  461. }
  462. } else {
  463. for ca, fs := range v["subclass"].(map[string]interface{}) {
  464. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  465. for field, _ := range fs.(map[string]interface{}) {
  466. vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
  467. e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
  468. }
  469. }
  470. }
  471. }
  472. }
  473. func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
  474. maps := []*RuleCore{}
  475. if b, _ := vinfo["isuse"].(bool); !b {
  476. return nil
  477. }
  478. s_field := qu.ObjToString(vinfo["s_field"])
  479. pid := qu.BsonIdToSId(vinfo["_id"])
  480. list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  481. for _, vv := range *list {
  482. if b, _ := vv["isuse"].(bool); !b {
  483. continue
  484. }
  485. rcore := &RuleCore{}
  486. rcore.Field = s_field
  487. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  488. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  489. //前置规则
  490. rulePres := []*RegLuaInfo{}
  491. plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  492. for _, v := range *plist {
  493. rinfo := &RegLuaInfo{
  494. Field: qu.ObjToString(v["s_field"]),
  495. Code: v["s_code"].(string),
  496. Name: v["s_name"].(string),
  497. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  498. }
  499. if rinfo.IsLua {
  500. rinfo.RuleText = v["s_luascript"].(string)
  501. rulePres = append(rulePres, rinfo)
  502. } else {
  503. qu.Try(func() {
  504. rinfo.RuleText = v["s_rule"].(string)
  505. tmp := strings.Split(rinfo.RuleText, "__")
  506. var pattern string
  507. if strings.Contains(tmp[0], "\\u") {
  508. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  509. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  510. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  511. } else {
  512. pattern = tmp[0]
  513. }
  514. if len(tmp) == 2 {
  515. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  516. } else {
  517. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  518. }
  519. rulePres = append(rulePres, rinfo)
  520. }, func(err interface{}) {
  521. log.Debug(rinfo.Code, rinfo.Field, err)
  522. })
  523. }
  524. }
  525. rcore.RulePres = rulePres
  526. //后置规则
  527. ruleBacks := []*RegLuaInfo{}
  528. blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  529. for _, v := range *blist {
  530. rinfo := &RegLuaInfo{
  531. Field: qu.ObjToString(v["s_field"]),
  532. Code: v["s_code"].(string),
  533. Name: v["s_name"].(string),
  534. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  535. }
  536. if rinfo.IsLua {
  537. rinfo.RuleText = v["s_luascript"].(string)
  538. ruleBacks = append(ruleBacks, rinfo)
  539. } else {
  540. qu.Try(func() {
  541. rinfo.RuleText = v["s_rule"].(string)
  542. tmp := strings.Split(rinfo.RuleText, "__")
  543. var pattern string
  544. if strings.Contains(tmp[0], "\\u") {
  545. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  546. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  547. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  548. } else {
  549. pattern = tmp[0]
  550. }
  551. if len(tmp) == 2 {
  552. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  553. } else {
  554. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  555. }
  556. ruleBacks = append(ruleBacks, rinfo)
  557. }, func(err interface{}) {
  558. log.Debug(rinfo.Code, rinfo.Field, err)
  559. })
  560. }
  561. }
  562. rcore.RuleBacks = ruleBacks
  563. //抽取规则
  564. ruleCores := []*RegLuaInfo{}
  565. clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  566. for _, v := range *clist {
  567. if b, _ := v["isuse"].(bool); !b {
  568. continue
  569. }
  570. field := qu.ObjToString(v["s_field"])
  571. e.Fields[field] = 1 //加入抽取属性组备用
  572. rinfo := &RegLuaInfo{
  573. Field: field,
  574. Code: v["s_code"].(string),
  575. Name: v["s_name"].(string),
  576. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  577. }
  578. if rinfo.IsLua {
  579. rinfo.RuleText = v["s_luascript"].(string)
  580. //提取全部属性
  581. rinfo.LFields = getALLFields()
  582. ruleCores = append(ruleCores, rinfo)
  583. } else {
  584. qu.Try(func() {
  585. rinfo.RuleText = v["s_rule"].(string)
  586. ptmp := strings.Split(rinfo.RuleText, "#")
  587. sign := 0
  588. if len(ptmp) == 2 {
  589. if ptmp[1] == "正" {
  590. sign = 1
  591. } else if ptmp[1] == "负" {
  592. sign = -1
  593. }
  594. }
  595. tmp := strings.Split(ptmp[0], "__")
  596. var pattern string
  597. if strings.Contains(tmp[0], "\\u") {
  598. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  599. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  600. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  601. } else {
  602. pattern = tmp[0]
  603. }
  604. if len(tmp) == 2 {
  605. epos := strings.Split(tmp[1], ",")
  606. posm := map[string]int{}
  607. for _, v := range epos {
  608. ks := strings.Split(v, ":")
  609. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  610. posm[ks[1]] = qu.IntAll(ks[0])
  611. } else { //(.*)招标公告__2
  612. posm[rinfo.Field] = qu.IntAll(ks[0])
  613. }
  614. }
  615. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
  616. } else {
  617. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  618. }
  619. ruleCores = append(ruleCores, rinfo)
  620. }, func(err interface{}) {
  621. log.Debug(rinfo.Code, rinfo.Field, err)
  622. })
  623. }
  624. }
  625. rcore.RuleCores = ruleCores
  626. //
  627. maps = append(maps, rcore)
  628. }
  629. return maps
  630. }
  631. //加载分包抽取规则
  632. func (e *ExtractTask) InitPkgCore() {
  633. defer qu.Catch()
  634. e.PkgRuleCores = []*RuleCore{}
  635. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  636. for _, pkginfo := range *pkginfos {
  637. if b, _ := pkginfo["isuse"].(bool); !b {
  638. continue
  639. }
  640. s_field := qu.ObjToString(pkginfo["s_field"])
  641. pid := qu.BsonIdToSId(pkginfo["_id"])
  642. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  643. for _, vv := range *logicList {
  644. if b, _ := vv["isuse"].(bool); !b {
  645. continue
  646. }
  647. rcore := &RuleCore{}
  648. rcore.Field = s_field
  649. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  650. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  651. //后置规则
  652. ruleBacks := []*RegLuaInfo{}
  653. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  654. for _, v := range *blist {
  655. rinfo := &RegLuaInfo{
  656. Field: qu.ObjToString(v["s_field"]),
  657. Code: v["s_code"].(string),
  658. Name: v["s_name"].(string),
  659. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  660. }
  661. if rinfo.IsLua {
  662. rinfo.RuleText = v["s_luascript"].(string)
  663. ruleBacks = append(ruleBacks, rinfo)
  664. } else {
  665. qu.Try(func() {
  666. rinfo.RuleText = v["s_rule"].(string)
  667. tmp := strings.Split(rinfo.RuleText, "__")
  668. var pattern string
  669. if strings.Contains(tmp[0], "\\u") {
  670. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  671. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  672. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  673. } else {
  674. pattern = tmp[0]
  675. }
  676. if len(tmp) == 2 {
  677. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  678. } else {
  679. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  680. }
  681. ruleBacks = append(ruleBacks, rinfo)
  682. }, func(err interface{}) {
  683. log.Debug(rinfo.Code, rinfo.Field, err)
  684. })
  685. }
  686. }
  687. rcore.RuleBacks = ruleBacks
  688. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  689. }
  690. }
  691. }
  692. //加载标签库
  693. func (e *ExtractTask) InitTag() {
  694. defer qu.Catch()
  695. e.Tag = map[string][]*Tag{}
  696. //字符串标签库
  697. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  698. for _, v := range *list {
  699. field := qu.ObjToString(v["s_field"])
  700. if tmp, ok := v["content"].([]interface{}); ok {
  701. fname := qu.ObjToString(v["s_name"])
  702. tab := ju.TagFile{Name: fname} //用于表格kv
  703. tab.Items = make([]*ju.Tag, len(tmp))
  704. for k, key := range tmp {
  705. tag := &Tag{Type: "string", Key: key.(string)}
  706. e.Tag[field] = append(e.Tag[field], tag)
  707. tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
  708. }
  709. sort.Sort(tab.Items)
  710. ju.TagdbTable[fname] = &tab
  711. }
  712. }
  713. //正则标签库
  714. list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  715. for _, v := range *list {
  716. field := qu.ObjToString(v["s_field"])
  717. if tmp, ok := v["content"].([]interface{}); ok {
  718. fname := qu.ObjToString(v["s_name"])
  719. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  720. tab.Items = make([]*ju.Tag, len(tmp))
  721. for k, key := range tmp {
  722. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  723. e.Tag[field] = append(e.Tag[field], tag)
  724. tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
  725. }
  726. sort.Sort(tab.Items)
  727. ju.TagdbTable[fname+"_reg"] = &tab
  728. }
  729. }
  730. }
  731. //获取fields
  732. func getALLFields() map[string]string {
  733. fields := map[string]string{}
  734. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  735. for _, v := range *list {
  736. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  737. }
  738. return fields
  739. }
  740. //加载clear函数
  741. func (e *ExtractTask) InitClearFn() {
  742. defer qu.Catch()
  743. list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  744. fn := map[string][]string{}
  745. for _, tmp := range *list {
  746. field := tmp["s_field"].(string)
  747. fns := tmp["clear"].([]interface{})
  748. if fn[field] == nil {
  749. fn[field] = []string{}
  750. }
  751. for _, v := range fns {
  752. fn[field] = append(fn[field], v.(string))
  753. }
  754. }
  755. e.ClearFn = fn
  756. }
  757. //加载省份
  758. func InitProvince(version string) map[string]interface{} {
  759. defer qu.Catch()
  760. fn := map[string]interface{}{}
  761. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  762. for _, v := range *list {
  763. name := qu.ObjToString(v["s_name"])
  764. content := v["content"]
  765. switch content.(type) {
  766. case string:
  767. fn[name] = []interface{}{content.(string)}
  768. case []interface{}:
  769. fn[name] = content
  770. }
  771. }
  772. return fn
  773. }
  774. //加载城市简称
  775. func InitCitySim(version string) map[string]map[string]interface{} {
  776. defer qu.Catch()
  777. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  778. fn := map[string]map[string]interface{}{}
  779. for _, v := range *list {
  780. name := qu.ObjToString(v["s_name"])
  781. tmp := v["content"].(map[string]interface{})
  782. fn[name] = tmp
  783. }
  784. return fn
  785. }
  786. //加载城市全称
  787. func InitCityAll(version string) map[string]map[string]interface{} {
  788. defer qu.Catch()
  789. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  790. fn := map[string]map[string]interface{}{}
  791. for _, v := range *list {
  792. name := qu.ObjToString(v["s_name"])
  793. tmp := v["content"].(map[string]interface{})
  794. fn[name] = tmp
  795. }
  796. return fn
  797. }
  798. //初始化城市省份敏感词
  799. func (e *ExtractTask) InitCityDFA() {
  800. defer qu.Catch()
  801. e.CityAllGet = &ju.DFA{}
  802. e.CitySimGet = &ju.DFA{}
  803. e.DistrictAllGet = &ju.DFA{}
  804. e.DistrictSimGet = &ju.DFA{}
  805. e.ProvinceAllGet = &ju.DFA{}
  806. e.ProvinceSimGet = &ju.DFA{}
  807. e.StreetGet = &ju.DFA{}
  808. //初始化map
  809. if e.ProvinceMap == nil {
  810. e.ProvinceMap = make(map[string]string)
  811. }
  812. if e.CityMap == nil {
  813. e.CityMap = make(map[string]string)
  814. }
  815. if e.DistrictSimAndAll == nil {
  816. e.DistrictSimAndAll = make(map[string]string)
  817. }
  818. if e.CityBriefMap == nil {
  819. e.CityBriefMap = make(map[string]*City)
  820. }
  821. if e.CityFullMap == nil {
  822. e.CityFullMap = make(map[string]*City)
  823. }
  824. if e.ProvinceBriefMap == nil {
  825. e.ProvinceBriefMap = make(map[string]*Province)
  826. }
  827. if e.DistrictCityMap == nil {
  828. e.DistrictCityMap = make(map[string]*City)
  829. }
  830. if e.StreetDistrictMap == nil {
  831. e.StreetDistrictMap = make(map[string]*District)
  832. }
  833. //初始化省
  834. fn1 := InitProvince(e.TaskInfo.Version)
  835. for k, v := range fn1 {
  836. for _, p := range v.([]interface{}) {
  837. p1, _ := p.(string)
  838. e.ProvinceAllGet.AddWord(p1) //华中科技大学
  839. e.ProvinceMap[p1] = k //华中科技大学:湖北
  840. }
  841. }
  842. //初始化城市全称
  843. fn2 := InitCityAll(e.TaskInfo.Version)
  844. for k, v := range fn2 {
  845. //加载省信息
  846. e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
  847. p := &Province{}
  848. p.Name = k //省全称:浙江省
  849. p.Brief = v["brief"].(string) //省简称:浙江
  850. e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
  851. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  852. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  853. p.Cap = v["captial"].(string) //省会(杭州)
  854. //加载市信息
  855. city, _ := v["city"].(map[string]interface{})
  856. for k1, v1 := range city {
  857. e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
  858. v1m, _ := v1.(map[string]interface{})
  859. c := &City{}
  860. c.Name = k1 //市全称:杭州市
  861. c.Brief = v1m["brief"].(string) //市简称:杭州
  862. e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
  863. e.CityMap[k1] = c.Brief //杭州市:杭州
  864. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  865. e.CityFullMap[k1] = c //杭州市:市信息{}
  866. c.P = p
  867. if c.Name == p.Cap {
  868. p.Captial = c //加载province中的省会市信息{}
  869. }
  870. //区县
  871. districtmap := v1m["area"].(map[string]interface{}) //区或县
  872. for district, streetarr := range districtmap {
  873. d := &District{}
  874. d.Name = district
  875. d.C = c
  876. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  877. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
  878. e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
  879. ctmp := e.DistrictCityMap[district]
  880. if ctmp == nil {
  881. e.DistrictCityMap[district] = c
  882. }
  883. //街道
  884. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  885. e.StreetGet.AddWord(s) //加入街道敏感词
  886. dtmp := e.StreetDistrictMap[s]
  887. if dtmp == nil {
  888. e.StreetDistrictMap[s] = d
  889. }
  890. }
  891. }
  892. }
  893. }
  894. //初始化城市简称
  895. fn3 := InitCitySim(e.TaskInfo.Version)
  896. for _, v := range fn3 {
  897. city, _ := v["city"].(map[string]interface{})
  898. for _, v1 := range city {
  899. v1m, _ := v1.(map[string]interface{})
  900. cb := v1m["brief"].(string) //市简称
  901. arr := v1m["area"].(map[string]interface{}) //区或县简称
  902. for districtsim, districtall := range arr {
  903. e.DistrictSimAndAll[districtsim] = districtall.(string)
  904. d := &District{}
  905. d.Name = districtsim
  906. d.C = e.CityBriefMap[cb]
  907. e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
  908. ctmp := e.DistrictCityMap[districtsim]
  909. if ctmp == nil {
  910. e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
  911. }
  912. }
  913. }
  914. }
  915. }
  916. //初始化邮编库
  917. func (e *ExtractTask) InitPostCode() {
  918. defer qu.Catch()
  919. if e.PostCodeMap == nil {
  920. e.PostCodeMap = make(map[string]*PostCode)
  921. }
  922. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  923. for _, l := range *list {
  924. pc := &PostCode{}
  925. pc.Code = qu.ObjToString(l["code"])
  926. pc.P = qu.ObjToString(l["province"])
  927. pc.C = qu.ObjToString(l["city"])
  928. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  929. e.PostCodeMap[pc.Code] = pc
  930. }
  931. }
  932. //初始化区号库
  933. func (e *ExtractTask) InitAreaCode() {
  934. defer qu.Catch()
  935. if e.AreaCodeMap == nil {
  936. e.AreaCodeMap = make(map[string]*AreaCode)
  937. }
  938. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  939. for _, l := range *list {
  940. ac := &AreaCode{}
  941. ac.Code = qu.ObjToString(l["code"])
  942. ac.P = qu.ObjToString(l["province"])
  943. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  944. e.AreaCodeMap[ac.Code] = ac
  945. }
  946. }
  947. //初始化城市省份敏感词
  948. //func (e *ExtractTask) InitCityDFA() {
  949. // defer qu.Catch()
  950. // e.CityAllGet = &ju.DFA{}
  951. // e.DistrictGet = &ju.DFA{}
  952. // e.AreaProvinceGet = &ju.DFA{}
  953. // e.StreetGet = &ju.DFA{}
  954. // //初始化map
  955. // if e.ProvinceMap == nil {
  956. // e.ProvinceMap = make(map[string]string)
  957. // }
  958. // if e.CityBriefMap == nil {
  959. // e.CityBriefMap = make(map[string]*City)
  960. // }
  961. // if e.ProvinceBriefMap == nil {
  962. // e.ProvinceBriefMap = make(map[string]*Province)
  963. // }
  964. // if e.AreaToCityMap == nil {
  965. // e.AreaToCityMap = make(map[string][]*City)
  966. // }
  967. // if e.DistrictCityMap == nil {
  968. // e.DistrictCityMap = make(map[string]*City)
  969. // }
  970. // if e.StreetDistrictMap == nil {
  971. // e.StreetDistrictMap = make(map[string]*District)
  972. // }
  973. // //初始化省
  974. // fn1 := InitProvince(e.TaskInfo.Version)
  975. // for k, v := range fn1 {
  976. // for _, p := range v.([]interface{}) {
  977. // p1, _ := p.(string)
  978. // e.AreaProvinceGet.AddWord(p1) //华中科技大学
  979. // e.ProvinceMap[p1] = k //华中科技大学:湖北
  980. // }
  981. // }
  982. // //初始化城市全称
  983. // fn2 := InitCityAll(e.TaskInfo.Version)
  984. // for k, v := range fn2 {
  985. // e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
  986. // p := &Province{}
  987. // p.Name = k //省全称
  988. // p.Brief = v["brief"].(string) //省简称
  989. // e.ProvinceMap[k] = p.Brief //浙江省:浙江
  990. // e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
  991. // p.Cap = v["captial"].(string) //省会(杭州)
  992. // city, _ := v["city"].(map[string]interface{})
  993. // //
  994. // for k1, v1 := range city {
  995. // v1m, _ := v1.(map[string]interface{})
  996. // c := &City{}
  997. // c.Name = k1
  998. // c.Brief = v1m["brief"].(string)
  999. // e.CityBriefMap[c.Brief] = c
  1000. // c.P = p
  1001. // if c.Brief == p.Cap {
  1002. // p.Captial = c
  1003. // }
  1004. // //加入到城市map中
  1005. // //
  1006. // cs := e.AreaToCityMap[k1]
  1007. // e.CityAllGet.AddWord(k1) //市全称
  1008. // if cs != nil {
  1009. // cs = append(cs, c)
  1010. // } else {
  1011. // cs = []*City{c}
  1012. // }
  1013. // e.AreaToCityMap[k1] = cs
  1014. // //区县
  1015. // districtmap := v1m["area"].(map[string]interface{}) //区或县
  1016. // for district, streetarr := range districtmap {
  1017. // d := &District{}
  1018. // d.Name = district
  1019. // d.C = c
  1020. // e.DistrictGet.AddWord(district) //加入区或县敏感词
  1021. // ctmp := e.DistrictCityMap[district]
  1022. // if ctmp == nil {
  1023. // e.DistrictCityMap[district] = c
  1024. // }
  1025. // //街道
  1026. // for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  1027. // e.StreetGet.AddWord(s) //加入街道敏感词
  1028. // dtmp := e.StreetDistrictMap[s]
  1029. // if dtmp == nil {
  1030. // e.StreetDistrictMap[s] = d
  1031. // }
  1032. // }
  1033. // }
  1034. // }
  1035. // }
  1036. // //初始化城市简称
  1037. // fn3 := InitCitySim(e.TaskInfo.Version)
  1038. // e.CitySimGet = &ju.DFA{}
  1039. // for k, v := range fn3 {
  1040. // pb := v["brief"].(string)
  1041. // p := e.ProvinceBriefMap[pb]
  1042. // //加载
  1043. // for _, ss := range []string{k, pb} { //省全称和省简称
  1044. // cs := e.AreaToCityMap[ss]
  1045. // if cs != nil {
  1046. // cs = append(cs, p.Captial)
  1047. // } else {
  1048. // cs = []*City{p.Captial}
  1049. // }
  1050. // e.AreaToCityMap[ss] = cs
  1051. // e.CitySimGet.AddWord(ss)
  1052. // }
  1053. // city, _ := v["city"].(map[string]interface{})
  1054. // for k1, v1 := range city {
  1055. // v1m, _ := v1.(map[string]interface{})
  1056. // if v1m["brief"] == nil {
  1057. // }
  1058. // cb := v1m["brief"].(string)
  1059. // c := e.AreaToCityMap[k1][0]
  1060. // //加入到城市map中
  1061. // for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
  1062. // e.CitySimGet.AddWord(ss)
  1063. // cs := e.AreaToCityMap[ss]
  1064. // if cs != nil {
  1065. // cs = append(cs, c)
  1066. // } else {
  1067. // cs = []*City{c}
  1068. // }
  1069. // e.AreaToCityMap[ss] = cs
  1070. // }
  1071. // arr := v1m["area"].([]interface{})
  1072. // for _, k2 := range arr {
  1073. // s := k2.(string)
  1074. // for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
  1075. // cs := e.AreaToCityMap[ss]
  1076. // e.CitySimGet.AddWord(ss)
  1077. // if cs != nil {
  1078. // cs = append(cs, c)
  1079. // } else {
  1080. // cs = []*City{c}
  1081. // }
  1082. // e.AreaToCityMap[ss] = cs
  1083. // //只加入简称
  1084. // if n == 0 {
  1085. // d := &District{}
  1086. // d.Name = ss
  1087. // d.C = c
  1088. // e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
  1089. // ctmp := e.DistrictCityMap[ss]
  1090. // if ctmp == nil {
  1091. // e.DistrictCityMap[ss] = c
  1092. // }
  1093. // }
  1094. // }
  1095. // }
  1096. // }
  1097. // }
  1098. //}
  1099. //保存抽取详情数据
  1100. func (e *ExtractTask) ResultSave(init bool) {
  1101. defer qu.Catch()
  1102. if e.ResultArr == nil {
  1103. e.ResultArr = [][]map[string]interface{}{}
  1104. }
  1105. if init {
  1106. go func() {
  1107. for {
  1108. if len(e.ResultArr) > 500 {
  1109. arr := e.ResultArr[:500]
  1110. e.ResultArr = e.ResultArr[500:]
  1111. qu.Try(func() {
  1112. db.Mgo.UpSertBulk("extract_result", arr...)
  1113. }, func(err interface{}) {
  1114. log.Debug(err)
  1115. })
  1116. } else {
  1117. arr := e.ResultArr
  1118. e.ResultArr = [][]map[string]interface{}{}
  1119. qu.Try(func() {
  1120. db.Mgo.UpSertBulk("extract_result", arr...)
  1121. }, func(err interface{}) {
  1122. log.Debug(err)
  1123. })
  1124. }
  1125. time.Sleep(10 * time.Second)
  1126. }
  1127. }()
  1128. } else {
  1129. arr := e.ResultArr
  1130. e.ResultArr = [][]map[string]interface{}{}
  1131. qu.Try(func() {
  1132. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1133. }, func(err interface{}) {
  1134. log.Debug(err)
  1135. })
  1136. }
  1137. }
  1138. //保存抽取数据
  1139. func (e *ExtractTask) BidSave(init bool) {
  1140. defer qu.Catch()
  1141. if e.BidArr == nil {
  1142. e.BidArr = [][]map[string]interface{}{}
  1143. }
  1144. if init {
  1145. go func() {
  1146. for {
  1147. if len(e.BidArr) > 500 {
  1148. arr := e.BidArr[:500]
  1149. e.BidArr = e.BidArr[500:]
  1150. qu.Try(func() {
  1151. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1152. }, func(err interface{}) {
  1153. log.Debug(err)
  1154. })
  1155. } else {
  1156. arr := e.BidArr
  1157. e.BidArr = [][]map[string]interface{}{}
  1158. qu.Try(func() {
  1159. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1160. }, func(err interface{}) {
  1161. log.Debug(err)
  1162. })
  1163. }
  1164. time.Sleep(10 * time.Second)
  1165. }
  1166. }()
  1167. } else {
  1168. arr := e.BidArr
  1169. e.BidArr = [][]map[string]interface{}{}
  1170. qu.Try(func() {
  1171. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1172. }, func(err interface{}) {
  1173. log.Debug(err)
  1174. })
  1175. time.Sleep(1 * time.Second)
  1176. }
  1177. }
  1178. func (e *ExtractTask) InitAuditRecogField() {
  1179. defer qu.Catch()
  1180. e.RecogFieldMap = make(map[string]map[string]interface{})
  1181. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1182. for _, f := range *recogFieldList {
  1183. field := qu.ObjToString(f["s_recogfield"])
  1184. e.RecogFieldMap[field] = f
  1185. }
  1186. }
  1187. func (e *ExtractTask) InitAuditClass() {
  1188. defer qu.Catch()
  1189. e.FidClassMap = make(map[string][]map[string]interface{})
  1190. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1191. for _, c := range *class {
  1192. classList := []map[string]interface{}{}
  1193. fid := qu.ObjToString(c["s_fid"])
  1194. if len(e.FidClassMap[fid]) > 0 { //追加
  1195. classList = e.FidClassMap[fid]
  1196. }
  1197. classList = append(classList, c)
  1198. e.FidClassMap[fid] = classList
  1199. }
  1200. }
  1201. //加载规则
  1202. func (e *ExtractTask) InitAuditRule() {
  1203. defer qu.Catch()
  1204. var rureg *regexp.Regexp
  1205. var rs []rune
  1206. var ru string
  1207. var err error
  1208. e.CidRuleMap = make(map[string][]map[string]interface{})
  1209. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1210. for _, v := range *rule {
  1211. i_rule := []interface{}{}
  1212. ss, _ := (v["s_rule"].([]interface{}))
  1213. for _, r := range qu.ObjArrToStringArr(ss) {
  1214. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1215. rs = []rune(r)
  1216. ru = string(rs[1 : len(rs)-1])
  1217. rureg, err = regexp.Compile(ru)
  1218. if err != nil {
  1219. log.Debug("error---rule:", r)
  1220. continue
  1221. }
  1222. i_rule = append(i_rule, []interface{}{rureg}...)
  1223. } else { //规则
  1224. i_rule = append(i_rule, r)
  1225. }
  1226. }
  1227. v["rule"] = i_rule
  1228. ruleList := []map[string]interface{}{}
  1229. classid := qu.ObjToString(v["s_classid"])
  1230. if len(e.CidRuleMap[classid]) > 0 { //追加
  1231. ruleList = e.CidRuleMap[classid]
  1232. }
  1233. ruleList = append(ruleList, v)
  1234. e.CidRuleMap[classid] = ruleList
  1235. }
  1236. }
  1237. //
  1238. func (e *ExtractTask) InitAuditFields() {
  1239. if len(e.AuditFields) == 0 {
  1240. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1241. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1242. vid := qu.BsonIdToSId((*v)["_id"])
  1243. query := map[string]interface{}{
  1244. "isaudit": true,
  1245. "delete": false,
  1246. "vid": vid,
  1247. }
  1248. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1249. for _, d := range *data {
  1250. field := qu.ObjToString(d["s_field"])
  1251. e.AuditFields = append(e.AuditFields, field)
  1252. }
  1253. }
  1254. }
  1255. }
  1256. //加载附件抽取
  1257. func (e *ExtractTask) InitFile() {
  1258. defer qu.Catch()
  1259. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1260. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1261. //ve, _ := db.Mgo.FindOne("version", query)
  1262. if ve == nil {
  1263. return
  1264. }
  1265. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1266. e.IsFileField = true
  1267. }
  1268. syscefiled := new(sync.Map)
  1269. if (*ve)["s_filefileds"] != nil {
  1270. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1271. syscefiled.Store(vff.(string), 1)
  1272. }
  1273. }
  1274. e.FileFields = syscefiled
  1275. }
  1276. //加载清理任务信息
  1277. func (c *ClearTask) InitClearTaskInfo() {
  1278. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1279. if len(*cleartask) > 1 {
  1280. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1281. c.ClearTaskInfo = &ClearTaskInfo{
  1282. Name: (*cleartask)["s_taskname"].(string),
  1283. Version: (*cleartask)["s_version"].(string),
  1284. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1285. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1286. FromDB: (*cleartask)["s_mgodb"].(string),
  1287. FromColl: (*cleartask)["s_mgocoll"].(string),
  1288. IsCltLog: ju.Config["iscltlog"].(bool),
  1289. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1290. }
  1291. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1292. } else {
  1293. return
  1294. }
  1295. }
  1296. //加载清理脚本
  1297. func (c *ClearTask) InitClearLuas() {
  1298. defer qu.Catch()
  1299. c.ClearLuas = make(map[string][]*ClearLua)
  1300. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1301. for _, l := range *list {
  1302. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1303. continue
  1304. }
  1305. s_field := qu.ObjToString(l["s_field"])
  1306. pid := qu.BsonIdToSId(l["_id"])
  1307. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1308. for _, vv := range *luas {
  1309. if b, _ := vv["isuse"].(bool); !b {
  1310. continue
  1311. }
  1312. clearLua := &ClearLua{
  1313. Field: s_field,
  1314. Code: vv["s_code"].(string),
  1315. Name: vv["s_name"].(string),
  1316. LuaText: vv["s_luascript"].(string),
  1317. LFields: getALLFields(),
  1318. }
  1319. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1320. }
  1321. }
  1322. }
  1323. //加载分块规则
  1324. func (e *ExtractTask) InitBlockRule() {
  1325. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1326. "vid": e.TaskInfo.VersionId,
  1327. "delete": false,
  1328. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1329. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1330. for _, v := range *datas {
  1331. block_reg, _ := v["block_reg"].(string)
  1332. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1333. title_reg, _ := v["title_reg"].(string)
  1334. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1335. if block_reg == "" || title_reg == "" {
  1336. continue
  1337. }
  1338. b_reg, b_err := regexp.Compile(block_reg)
  1339. t_reg, t_err := regexp.Compile(title_reg)
  1340. if b_err != nil || t_err != nil {
  1341. continue
  1342. }
  1343. brs = append(brs, b_reg)
  1344. trs = append(trs, t_reg)
  1345. }
  1346. e.RuleBlock = &ju.RuleBlock{
  1347. BlockRegs: brs,
  1348. TitleRegs: trs,
  1349. Classify: e.InitBlockClassify(),
  1350. }
  1351. }
  1352. //加载分块规则
  1353. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1354. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1355. "vid": e.TaskInfo.VersionId,
  1356. "delete": false,
  1357. }, nil, `{"name":1}`, false, -1, -1)
  1358. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1359. "vid": e.TaskInfo.VersionId,
  1360. "delete": false,
  1361. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1362. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1363. "vid": e.TaskInfo.VersionId,
  1364. "delete": false,
  1365. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1366. tag_map := map[string]ju.Tags{}
  1367. for _, v := range *classify_tag {
  1368. pid := qu.ObjToString(v["pid"])
  1369. tag_map[pid] = append(tag_map[pid], &ju.Tag{Value: qu.ObjToString(v["name"])})
  1370. }
  1371. //
  1372. info_map := map[string][]*ju.NameCode{}
  1373. info_tag := map[string]*ju.TagFile{}
  1374. for _, v := range *classify_info {
  1375. pid := qu.ObjToString(v["pid"])
  1376. _id := qu.BsonIdToSId(v["_id"])
  1377. name := qu.ObjToString(v["name"])
  1378. info_tag[name] = &ju.TagFile{
  1379. Name: name,
  1380. Items: tag_map[_id],
  1381. }
  1382. info_map[pid] = append(info_map[pid], &ju.NameCode{
  1383. Name: name,
  1384. Code: qu.ObjToString(v["code"]),
  1385. })
  1386. }
  1387. classify_map := map[string][]*ju.NameCode{}
  1388. for _, v := range *classify {
  1389. _id := qu.BsonIdToSId(v["_id"])
  1390. if info_map[_id] == nil {
  1391. continue
  1392. }
  1393. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1394. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1395. }
  1396. }
  1397. return &ju.BlockClassify{
  1398. Type: classify_map,
  1399. Classify: info_tag,
  1400. }
  1401. }