// extractInit.go (55 KB)
  1. // extractInit
  2. package extract
  3. import (
  4. "gopkg.in/mgo.v2/bson"
  5. db "jy/mongodbutil"
  6. ju "jy/util"
  7. qu "qfw/util"
  8. "regexp"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "time"
  14. log "github.com/donnie4w/go-logger/logger"
  15. "github.com/go-ego/gse"
  16. )
  17. type RegLuaInfo struct {
  18. //正则或脚本信息
  19. Code, Name, Field string //
  20. RuleText string //
  21. IsLua bool //
  22. RegPreBac *ExtReg //
  23. RegCore *ExtReg //
  24. }
  25. type ExtReg struct {
  26. Reg *regexp.Regexp
  27. Replace string
  28. Bextract bool
  29. ExtractPos map[string]int
  30. NumSign int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
  31. }
  32. type RuleCore struct {
  33. Id string //id
  34. Field string //逻辑字段
  35. LuaLogic string //进入逻辑
  36. ExtFrom string //从哪个字段抽取
  37. RulePres []*RegLuaInfo //抽取前置规则
  38. RuleBacks []*RegLuaInfo //抽取后置规则
  39. RuleCores []*RegLuaInfo //抽取规则
  40. LFields map[string]string //所有字段属性组
  41. }
  42. type Tag struct {
  43. Type string //标签类型 string 字符串、regexp 正则
  44. Key string //
  45. Reg *regexp.Regexp //
  46. }
  47. type TaskInfo struct {
  48. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  49. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  50. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  51. TestColl, LastExtId string //测试结果表、上次抽取信息id
  52. FDB *db.Pool //数据库连接池
  53. TDB *db.Pool //数据库连接池
  54. IsEtxLog bool //是否开启抽取日志
  55. ProcessPool chan bool //任务进程池
  56. TestLua bool //检查测试用
  57. }
  58. type ExtractTask struct {
  59. Id string //任务id
  60. IsRun bool //是否启动
  61. Content string //信息内容
  62. TaskInfo *TaskInfo //任务信息
  63. RulePres []*RegLuaInfo //通用前置规则
  64. RuleBacks []*RegLuaInfo //通用后置规则
  65. SiteRuleBacks []*RegLuaInfo //站点通用后置规则
  66. RuleBlock *ju.RuleBlock
  67. //RuleCores []*RuleCore //抽取规则
  68. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  69. SiteRuleCores map[string]map[string][]*RuleCore //站点分类抽取规则
  70. PkgRuleCores []*RuleCore //分包抽取规则
  71. Tag map[string][]*Tag //标签库
  72. SiteTag map[string][]*Tag //站点标签库
  73. ClearFn map[string][]string //清理函数
  74. SiteClearFn map[string][]string //站点清理函数
  75. IsExtractCity bool //是否开启城市抽取
  76. Fields map[string]int //抽取属性组
  77. IsFileField bool //是否开启附件抽取
  78. FileFields *sync.Map //抽取附件属性组
  79. ResultChanel chan bool //抽取结果详情
  80. sync.RWMutex
  81. ResultArr [][]map[string]interface{} //抽取结果详情
  82. BidChanel chan bool //抽取结果
  83. BidArr [][]map[string]interface{} //抽取结果
  84. BidTotal int //结果数量
  85. RecogFieldMap map[string]map[string]interface{} //识别字段
  86. FidClassMap map[string][]map[string]interface{} //分类
  87. CidRuleMap map[string][]map[string]interface{} //规则
  88. AuditFields []string //需要审核的字段名称
  89. SiteCityMap map[string]*SiteCity //站点对应的省市区
  90. ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
  91. ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
  92. CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
  93. CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
  94. CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
  95. DistrictCityMap map[string]*City
  96. NewDistrictCityMap map[string][]*City //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
  97. DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
  98. NewDistrictSimAndAll map[string][]map[string]*City //区或县(key:简称 val: 相同简称的区全称:所在市)
  99. StreetDistrictMap map[string]*District //街道对应的区或县
  100. NewStreetDistrictMap map[string][]*District //街道全称对应的区或县
  101. CommunityDistrictMap map[string][]*District //村、居委会对应的区或县
  102. ProvinceAllGet *ju.DFA //省全称
  103. ProvinceSimGet *ju.DFA //省简称
  104. CityAllGet *ju.DFA //市全称
  105. CitySimGet *ju.DFA //市简称
  106. DistrictAllGet *ju.DFA //区或县全称
  107. DistrictSimGet *ju.DFA //区或县简称
  108. StreetGet *ju.DFA //街道
  109. PostCodeMap map[string]*PostCode //邮编
  110. AreaCodeMap map[string]*AreaCode //区号
  111. InfoType []map[string]interface{}
  112. Trie_Full_Province *ju.Trie //省全称 省、直辖市、自治区
  113. Trie_Full_City *ju.Trie //市全称 地级市
  114. Trie_Full_District *ju.Trie //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
  115. Trie_Full_Street *ju.Trie //街道、乡镇全称 镇、乡、民族乡、县辖区、街道
  116. Trie_Full_Community *ju.Trie //村/委员会全称 村、居委会
  117. Trie_Sim_Province *ju.Trie //省简称
  118. Trie_Sim_City *ju.Trie //市简称
  119. Trie_Sim_District *ju.Trie //县简称
  120. Trie_Fulls []*ju.Trie //所有全称
  121. Trie_Sims []*ju.Trie //所有简称
  122. Seg_PCD *gse.Segmenter //分词
  123. Seg_SV *gse.Segmenter //分词
  124. Luacodes *sync.Map //站点规则
  125. SiteMerge *sync.Map //抽取合并
  126. }
  127. type SiteCity struct {
  128. P string //省简称
  129. C string //市全称
  130. D string //区全称
  131. }
  132. type ClearTaskInfo struct {
  133. Name, Version, VersionId string //名称、版本、版本id
  134. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  135. FDB *db.Pool //数据库连接池
  136. TDB *db.Pool //数据库连接池
  137. IsCltLog bool //是否开启清理日志
  138. ProcessPool chan bool //任务进程池
  139. }
  140. type ClearLua struct {
  141. Field string //字段字段
  142. Code string //代码
  143. Name string //名称
  144. LuaText string
  145. //LuaLogic string //进入逻辑
  146. //ExtFrom string //从哪个字段抽取
  147. LFields map[string]string //lua抽取字段属性组
  148. }
  149. type ClearTask struct {
  150. sync.RWMutex
  151. Id string //任务id
  152. Content string //信息内容
  153. ClearTaskInfo *ClearTaskInfo //任务信息
  154. ClearLuas map[string][]*ClearLua //清理脚本
  155. UpdateResult [][]map[string]interface{} //清理后结果
  156. //ClearChannel chan bool
  157. }
  158. func init() {
  159. TaskList = make(map[string]*ExtractTask)
  160. ClearTaskList = make(map[string]*ClearTask)
  161. go SaveExtLog()
  162. go SaveCltLog() //保存清理日志
  163. }
  164. //加载任务信息
  165. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  166. task, _ := db.Mgo.FindById("task", e.Id, nil)
  167. if len(*task) > 1 {
  168. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  169. e.TaskInfo = &TaskInfo{
  170. Name: (*task)["s_taskname"].(string),
  171. Version: (*task)["s_version"].(string),
  172. VersionId: qu.BsonIdToSId((*v)["_id"]),
  173. TrackColl: trackcoll,
  174. FromDbAddr: (*task)["s_mgoaddr"].(string),
  175. FromDB: (*task)["s_mgodb"].(string),
  176. FromColl: (*task)["s_mgocoll"].(string),
  177. TestColl: resultcoll,
  178. IsEtxLog: true,
  179. ProcessPool: make(chan bool, 1),
  180. }
  181. if (*v)["isextractcity"] != nil {
  182. e.IsExtractCity = (*v)["isextractcity"].(bool)
  183. }
  184. } else {
  185. return
  186. }
  187. }
  188. //加载任务信息
  189. func (e *ExtractTask) InitTaskInfo() {
  190. task, _ := db.Mgo.FindById("task", e.Id, nil)
  191. log.Debug("task", task)
  192. if len(*task) > 1 {
  193. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  194. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  195. log.Debug("s_mgosavecoll", strs)
  196. if len(strs) < 3 {
  197. return
  198. } else {
  199. e.TaskInfo = &TaskInfo{
  200. Name: (*task)["s_taskname"].(string),
  201. Version: (*task)["s_version"].(string),
  202. VersionId: qu.BsonIdToSId((*v)["_id"]),
  203. //TrackColl: (*task)["s_trackcoll"].(string),
  204. FromDbAddr: (*task)["s_mgoaddr"].(string),
  205. FromDB: (*task)["s_mgodb"].(string),
  206. FromColl: (*task)["s_mgocoll"].(string),
  207. ToDbAddr: strs[0],
  208. ToDB: strs[1],
  209. ToColl: strs[2],
  210. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  211. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  212. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  213. }
  214. if (*v)["isextractcity"] != nil {
  215. e.IsExtractCity = (*v)["isextractcity"].(bool)
  216. }
  217. }
  218. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  219. } else {
  220. return
  221. }
  222. }
  223. func (e *ExtractTask) InitSite() {
  224. e.Luacodes = &sync.Map{}
  225. e.SiteMerge = &sync.Map{}
  226. sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1)
  227. for _, v := range *sites {
  228. if vv, ok := v["site_script"].([]interface{}); ok {
  229. for _, vvv := range vv {
  230. e.Luacodes.Store(vvv, map[string]interface{}{})
  231. e.SiteMerge.Store(vvv,v["ismerge"].(bool))
  232. }
  233. } else if vv, ok := v["site_script"].(interface{}); ok {
  234. e.Luacodes.Store(vv, map[string]interface{}{})
  235. e.SiteMerge.Store(vv,v["ismerge"].(bool))
  236. }
  237. }
  238. }
  239. //加载通用前置规则
  240. func (e *ExtractTask) InitRulePres() {
  241. defer qu.Catch()
  242. e.RulePres = []*RegLuaInfo{}
  243. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  244. for _, v := range *list {
  245. rinfo := &RegLuaInfo{
  246. Code: v["s_code"].(string),
  247. Name: v["s_name"].(string),
  248. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  249. }
  250. if rinfo.IsLua {
  251. rinfo.RuleText = v["s_luascript"].(string)
  252. e.RulePres = append(e.RulePres, rinfo)
  253. } else {
  254. qu.Try(func() {
  255. rinfo.RuleText = v["s_rule"].(string)
  256. tmp := strings.Split(rinfo.RuleText, "__")
  257. var pattern string
  258. if strings.Contains(tmp[0], "\\u") {
  259. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  260. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  261. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  262. } else {
  263. pattern = tmp[0]
  264. }
  265. if len(tmp) == 2 {
  266. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  267. } else {
  268. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  269. }
  270. e.RulePres = append(e.RulePres, rinfo)
  271. }, func(err interface{}) {
  272. log.Debug(rinfo.Code, rinfo.Field, err)
  273. })
  274. }
  275. }
  276. }
  277. //加载通用后置规则
  278. func (e *ExtractTask) InitRuleBacks(isSite bool) {
  279. defer qu.Catch()
  280. cDB := ""
  281. eSiteRuleBacks := []*RegLuaInfo{}
  282. if isSite {
  283. cDB = "site_rule_back"
  284. e.SiteRuleBacks = []*RegLuaInfo{}
  285. } else {
  286. cDB = "rule_back"
  287. e.RuleBacks = []*RegLuaInfo{}
  288. }
  289. list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  290. for _, v := range *list {
  291. rinfo := &RegLuaInfo{
  292. Code: v["s_code"].(string),
  293. Name: v["s_name"].(string),
  294. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  295. }
  296. if rinfo.IsLua {
  297. rinfo.RuleText = v["s_luascript"].(string)
  298. if isSite {
  299. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  300. //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo)
  301. } else {
  302. e.RuleBacks = append(e.RuleBacks, rinfo)
  303. }
  304. } else {
  305. qu.Try(func() {
  306. rinfo.RuleText = v["s_rule"].(string)
  307. tmp := strings.Split(rinfo.RuleText, "__")
  308. var pattern string
  309. if strings.Contains(tmp[0], "\\u") {
  310. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  311. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  312. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  313. } else {
  314. pattern = tmp[0]
  315. }
  316. if len(tmp) == 2 {
  317. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  318. } else {
  319. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  320. }
  321. if isSite {
  322. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  323. } else {
  324. e.RuleBacks = append(e.RuleBacks, rinfo)
  325. }
  326. }, func(err interface{}) {
  327. log.Debug(rinfo.Code, rinfo.Field, err)
  328. })
  329. }
  330. if isSite {
  331. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  332. if (*sm) == nil || len(*sm) <= 0 {
  333. eSiteRuleBacks = []*RegLuaInfo{}
  334. continue
  335. }
  336. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  337. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  338. if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil {
  339. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks
  340. } else {
  341. if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 {
  342. tmplist = append(tmplist, eSiteRuleBacks...)
  343. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist
  344. }
  345. //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...)
  346. }
  347. e.Luacodes.Store(v2, mdpvalue)
  348. }
  349. }
  350. eSiteRuleBacks = []*RegLuaInfo{}
  351. }
  352. }
  353. }
  354. func (e *ExtractTask) InfoTypeList() {
  355. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  356. infolist := *infolist1
  357. for _, v := range infolist {
  358. e.InfoType = append(e.InfoType, v)
  359. }
  360. }
  361. //加载抽取规则
  362. func (e *ExtractTask) InitRuleCore(isSite bool) {
  363. defer qu.Catch()
  364. allFields := getALLFields()
  365. e.Fields = map[string]int{}
  366. var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb string
  367. eSiteRuleCores := make(map[string]map[string][]*RuleCore)
  368. if isSite {
  369. versioninfodb = "site_versioninfo"
  370. rule_logicdb = "site_rule_logic"
  371. rule_logicpredb = "site_rule_logicpre"
  372. rule_logicbackdb = "site_rule_logicback"
  373. rule_logicoredb = "site_rule_logicore"
  374. e.SiteRuleCores = make(map[string]map[string][]*RuleCore)
  375. } else {
  376. versioninfodb = "versioninfo"
  377. rule_logicdb = "rule_logic"
  378. rule_logicpredb = "rule_logicpre"
  379. rule_logicbackdb = "rule_logicback"
  380. rule_logicoredb = "rule_logicore"
  381. e.RuleCores = make(map[string]map[string][]*RuleCore)
  382. }
  383. fieldrules := map[string][]*RuleCore{}
  384. vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  385. for _, vinfo := range *vinfos {
  386. if b, _ := vinfo["isuse"].(bool); !b {
  387. continue
  388. }
  389. s_field := qu.ObjToString(vinfo["s_field"])
  390. pid := qu.BsonIdToSId(vinfo["_id"])
  391. list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  392. for _, vv := range *list {
  393. if b, _ := vv["isuse"].(bool); !b {
  394. continue
  395. }
  396. rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
  397. rcore.Field = s_field
  398. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  399. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  400. rcore.LFields = allFields
  401. //前置规则
  402. rulePres := []*RegLuaInfo{}
  403. plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  404. for _, v := range *plist {
  405. rinfo := &RegLuaInfo{
  406. Field: qu.ObjToString(v["s_field"]),
  407. Code: v["s_code"].(string),
  408. Name: v["s_name"].(string),
  409. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  410. }
  411. if rinfo.IsLua {
  412. rinfo.RuleText = v["s_luascript"].(string)
  413. rulePres = append(rulePres, rinfo)
  414. } else {
  415. qu.Try(func() {
  416. rinfo.RuleText = v["s_rule"].(string)
  417. tmp := strings.Split(rinfo.RuleText, "__")
  418. var pattern string
  419. if strings.Contains(tmp[0], "\\u") {
  420. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  421. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  422. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  423. } else {
  424. pattern = tmp[0]
  425. }
  426. if len(tmp) == 2 {
  427. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  428. } else {
  429. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  430. }
  431. rulePres = append(rulePres, rinfo)
  432. }, func(err interface{}) {
  433. log.Debug(rinfo.Code, rinfo.Field, err)
  434. })
  435. }
  436. }
  437. rcore.RulePres = rulePres
  438. //后置规则
  439. ruleBacks := []*RegLuaInfo{}
  440. blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  441. for _, v := range *blist {
  442. rinfo := &RegLuaInfo{
  443. Field: qu.ObjToString(v["s_field"]),
  444. Code: v["s_code"].(string),
  445. Name: v["s_name"].(string),
  446. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  447. }
  448. if rinfo.IsLua {
  449. rinfo.RuleText = v["s_luascript"].(string)
  450. ruleBacks = append(ruleBacks, rinfo)
  451. } else {
  452. qu.Try(func() {
  453. rinfo.RuleText = v["s_rule"].(string)
  454. tmp := strings.Split(rinfo.RuleText, "__")
  455. var pattern string
  456. if strings.Contains(tmp[0], "\\u") {
  457. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  458. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  459. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  460. } else {
  461. pattern = tmp[0]
  462. }
  463. if len(tmp) == 2 {
  464. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  465. } else {
  466. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  467. }
  468. ruleBacks = append(ruleBacks, rinfo)
  469. }, func(err interface{}) {
  470. log.Debug(rinfo.Code, rinfo.Field, err)
  471. })
  472. }
  473. }
  474. rcore.RuleBacks = ruleBacks
  475. //抽取规则
  476. ruleCores := []*RegLuaInfo{}
  477. clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  478. for _, v := range *clist {
  479. if b, _ := v["isuse"].(bool); !b {
  480. continue
  481. }
  482. field := qu.ObjToString(v["s_field"])
  483. e.Fields[field] = 1 //加入抽取属性组备用
  484. rinfo := &RegLuaInfo{
  485. Field: field,
  486. Code: v["s_code"].(string),
  487. Name: v["s_name"].(string),
  488. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  489. }
  490. if rinfo.IsLua {
  491. rinfo.RuleText = v["s_luascript"].(string)
  492. //提取全部属性
  493. ruleCores = append(ruleCores, rinfo)
  494. } else {
  495. qu.Try(func() {
  496. rinfo.RuleText = v["s_rule"].(string)
  497. tmp := strings.Split(rinfo.RuleText, "__")
  498. var pattern string
  499. if strings.Contains(tmp[0], "\\u") {
  500. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  501. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  502. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  503. } else {
  504. pattern = tmp[0]
  505. }
  506. if len(tmp) == 2 {
  507. epos := strings.Split(tmp[1], ",")
  508. posm := map[string]int{}
  509. for _, v := range epos {
  510. ks := strings.Split(v, ":")
  511. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  512. posm[ks[1]] = qu.IntAll(ks[0])
  513. } else { //(.*)招标公告__2
  514. posm[rinfo.Field] = qu.IntAll(ks[0])
  515. }
  516. }
  517. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  518. } else {
  519. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  520. }
  521. ruleCores = append(ruleCores, rinfo)
  522. }, func(err interface{}) {
  523. log.Debug(rinfo.Code, rinfo.Field, err)
  524. })
  525. }
  526. }
  527. rcore.RuleCores = ruleCores
  528. //
  529. if fieldrules[s_field] == nil {
  530. fieldrules[s_field] = []*RuleCore{}
  531. }
  532. fieldrules[s_field] = append(fieldrules[s_field], rcore)
  533. }
  534. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  535. for _, v := range *infolist {
  536. topclass := qu.ObjToString(v["topclass"])
  537. if v["subclass"] == nil {
  538. eSiteRuleCores[topclass] = make(map[string][]*RuleCore)
  539. for attr, _ := range v["fields"].(map[string]interface{}) {
  540. if fieldrules[attr] != nil {
  541. eSiteRuleCores[topclass][attr] = fieldrules[attr]
  542. }
  543. }
  544. } else {
  545. for ca, fs := range v["subclass"].(map[string]interface{}) {
  546. eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  547. for field, _ := range fs.(map[string]interface{}) {
  548. if fieldrules[field] != nil {
  549. eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field]
  550. }
  551. }
  552. }
  553. }
  554. }
  555. if isSite {
  556. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1})
  557. if (*sm) == nil || len(*sm) <= 0 {
  558. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  559. fieldrules = map[string][]*RuleCore{}
  560. continue
  561. }
  562. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  563. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  564. //属性配置
  565. if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil {
  566. mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores
  567. } else {
  568. for k2, v2 := range eSiteRuleCores {
  569. tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2]
  570. for kkkk, vvv := range v2 {
  571. tmpv[kkkk] = vvv
  572. }
  573. mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv
  574. }
  575. }
  576. e.Luacodes.Store(v2, mdpvalue)
  577. }
  578. }
  579. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  580. fieldrules = map[string][]*RuleCore{}
  581. }
  582. }
  583. if !isSite {
  584. //属性配置
  585. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  586. for _, v := range *infolist {
  587. topclass := qu.ObjToString(v["topclass"])
  588. if v["subclass"] == nil {
  589. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  590. for attr, _ := range v["fields"].(map[string]interface{}) {
  591. if fieldrules[attr] != nil {
  592. e.RuleCores[topclass][attr] = fieldrules[attr]
  593. }
  594. }
  595. } else {
  596. for ca, fs := range v["subclass"].(map[string]interface{}) {
  597. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  598. for field, _ := range fs.(map[string]interface{}) {
  599. if fieldrules[field] != nil {
  600. e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
  601. }
  602. }
  603. }
  604. }
  605. }
  606. }
  607. }
  608. //加载分包抽取规则
  609. func (e *ExtractTask) InitPkgCore() {
  610. defer qu.Catch()
  611. e.PkgRuleCores = []*RuleCore{}
  612. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  613. for _, pkginfo := range *pkginfos {
  614. if b, _ := pkginfo["isuse"].(bool); !b {
  615. continue
  616. }
  617. s_field := qu.ObjToString(pkginfo["s_field"])
  618. pid := qu.BsonIdToSId(pkginfo["_id"])
  619. logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  620. for _, vv := range *logicList {
  621. if b, _ := vv["isuse"].(bool); !b {
  622. continue
  623. }
  624. rcore := &RuleCore{}
  625. rcore.Field = s_field
  626. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  627. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  628. //后置规则
  629. ruleBacks := []*RegLuaInfo{}
  630. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  631. for _, v := range *blist {
  632. rinfo := &RegLuaInfo{
  633. Field: qu.ObjToString(v["s_field"]),
  634. Code: v["s_code"].(string),
  635. Name: v["s_name"].(string),
  636. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  637. }
  638. if rinfo.IsLua {
  639. rinfo.RuleText = v["s_luascript"].(string)
  640. ruleBacks = append(ruleBacks, rinfo)
  641. } else {
  642. qu.Try(func() {
  643. rinfo.RuleText = v["s_rule"].(string)
  644. tmp := strings.Split(rinfo.RuleText, "__")
  645. var pattern string
  646. if strings.Contains(tmp[0], "\\u") {
  647. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  648. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  649. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  650. } else {
  651. pattern = tmp[0]
  652. }
  653. if len(tmp) == 2 {
  654. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  655. } else {
  656. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  657. }
  658. ruleBacks = append(ruleBacks, rinfo)
  659. }, func(err interface{}) {
  660. log.Debug(rinfo.Code, rinfo.Field, err)
  661. })
  662. }
  663. }
  664. rcore.RuleBacks = ruleBacks
  665. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  666. }
  667. }
  668. }
  669. //加载标签库
  670. func (e *ExtractTask) InitTag(isSite bool) {
  671. defer qu.Catch()
  672. var tagdetailinfodb string
  673. eSiteTag := map[string][]*Tag{}
  674. if isSite {
  675. tagdetailinfodb = "site_tagdetailinfo"
  676. e.SiteTag = map[string][]*Tag{}
  677. } else {
  678. tagdetailinfodb = "tagdetailinfo"
  679. e.Tag = map[string][]*Tag{}
  680. }
  681. //字符串标签库
  682. list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  683. var tmpMap sync.Map
  684. for _, v := range *list {
  685. field := qu.ObjToString(v["s_field"])
  686. if tmp, ok := v["content"].([]interface{}); ok {
  687. fname := qu.ObjToString(v["s_name"])
  688. tab := ju.TagFile{Name: fname} //用于表格kv
  689. tab.Items = make([]*ju.Tag, len(tmp))
  690. for k, key := range tmp {
  691. tag := &Tag{Type: "string", Key: key.(string)}
  692. if isSite {
  693. eSiteTag[field] = append(eSiteTag[field], tag)
  694. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  695. } else {
  696. e.Tag[field] = append(e.Tag[field], tag)
  697. }
  698. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false}
  699. }
  700. sort.Sort(tab.Items)
  701. //ju.TagdbTable[fname] = &tab
  702. if isSite {
  703. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  704. if (*sm) == nil || len(*sm) <= 0 {
  705. eSiteTag = map[string][]*Tag{}
  706. continue
  707. }
  708. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  709. if v2 == nil || v2 == "" {
  710. continue
  711. }
  712. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  713. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  714. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  715. } else {
  716. for k2, v2 := range eSiteTag {
  717. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  718. }
  719. }
  720. e.Luacodes.Store(v2, mdpvalue)
  721. }
  722. tmpMap.Store(fname, &tab)
  723. ju.SiteTagdbTable.Store(v2, tmpMap)
  724. }
  725. //ju.SiteTagdbTable.Store(fname, &tab)
  726. eSiteTag = map[string][]*Tag{}
  727. } else {
  728. ju.TagdbTable.Store(fname, &tab)
  729. }
  730. }
  731. //if isSite {
  732. // sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  733. // for _, v2 := range (*sm)["site_script"].([]interface{}) {
  734. // if mdpvalue, ok := Luacodes.Load(v2); ok {
  735. // if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{
  736. // mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  737. // }else {
  738. // for k2,v2 := range eSiteTag{
  739. // mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  740. // }
  741. // }
  742. // Luacodes.Store(v2, mdpvalue)
  743. // }
  744. // }
  745. // eSiteTag = map[string][]*Tag{}
  746. //}
  747. }
  748. //正则标签库
  749. list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  750. for _, v := range *list {
  751. field := qu.ObjToString(v["s_field"])
  752. if tmp, ok := v["content"].([]interface{}); ok {
  753. fname := qu.ObjToString(v["s_name"])
  754. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  755. tab.Items = make([]*ju.Tag, len(tmp))
  756. for k, key := range tmp {
  757. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  758. if isSite {
  759. eSiteTag[field] = append(eSiteTag[field], tag)
  760. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  761. } else {
  762. e.Tag[field] = append(e.Tag[field], tag)
  763. }
  764. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false}
  765. }
  766. sort.Sort(tab.Items)
  767. //ju.TagdbTable[fname+"_reg"] = &tab
  768. if isSite {
  769. ju.SiteTagdbTable.Store(fname+"_reg", &tab)
  770. } else {
  771. ju.TagdbTable.Store(fname+"_reg", &tab)
  772. }
  773. }
  774. if isSite {
  775. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  776. if (*sm) == nil || len(*sm) <= 0 {
  777. eSiteTag = map[string][]*Tag{}
  778. continue
  779. }
  780. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  781. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  782. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  783. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  784. } else {
  785. for k2, v2 := range eSiteTag {
  786. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  787. }
  788. }
  789. e.Luacodes.Store(v2, mdpvalue)
  790. }
  791. }
  792. eSiteTag = map[string][]*Tag{}
  793. }
  794. }
  795. }
  796. //获取fields
  797. func getALLFields() map[string]string {
  798. fields := map[string]string{}
  799. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  800. for _, v := range *list {
  801. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  802. }
  803. return fields
  804. }
  805. //加载clear函数
  806. func (e *ExtractTask) InitClearFn(isSite bool) {
  807. defer qu.Catch()
  808. var cleanupdb string
  809. if isSite {
  810. cleanupdb = "site_cleanup"
  811. e.SiteClearFn = map[string][]string{}
  812. } else {
  813. cleanupdb = "cleanup"
  814. }
  815. list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  816. fn := map[string][]string{}
  817. for _, tmp := range *list {
  818. field := tmp["s_field"].(string)
  819. fns := tmp["clear"].([]interface{})
  820. if fn[field] == nil {
  821. fn[field] = []string{}
  822. }
  823. for _, v := range fns {
  824. fn[field] = append(fn[field], v.(string))
  825. }
  826. if isSite {
  827. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1})
  828. if (*sm) == nil || len(*sm) <= 0 {
  829. fn = map[string][]string{}
  830. continue
  831. }
  832. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  833. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  834. if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil {
  835. mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn
  836. } else {
  837. for k2, v2 := range fn {
  838. mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2
  839. }
  840. }
  841. e.Luacodes.Store(v2, mdpvalue)
  842. }
  843. }
  844. fn = map[string][]string{}
  845. }
  846. }
  847. if !isSite {
  848. e.ClearFn = fn
  849. }
  850. }
  851. //加载省份
  852. func InitProvince(version string) map[string]interface{} {
  853. defer qu.Catch()
  854. fn := map[string]interface{}{}
  855. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  856. for _, v := range *list {
  857. name := qu.ObjToString(v["s_name"])
  858. content := v["content"]
  859. switch content.(type) {
  860. case string:
  861. fn[name] = []interface{}{content.(string)}
  862. case []interface{}:
  863. fn[name] = content
  864. }
  865. }
  866. return fn
  867. }
  868. //加载城市简称
  869. func InitCitySim(version string) map[string]map[string]interface{} {
  870. defer qu.Catch()
  871. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  872. fn := map[string]map[string]interface{}{}
  873. for _, v := range *list {
  874. name := qu.ObjToString(v["s_name"])
  875. tmp := v["content"].(map[string]interface{})
  876. fn[name] = tmp
  877. }
  878. return fn
  879. }
  880. //加载城市全称
  881. func InitCityAll(version string) map[string]map[string]interface{} {
  882. defer qu.Catch()
  883. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  884. fn := map[string]map[string]interface{}{}
  885. for _, v := range *list {
  886. name := qu.ObjToString(v["s_name"])
  887. tmp := v["content"].(map[string]interface{})
  888. fn[name] = tmp
  889. }
  890. return fn
  891. }
  892. //加载站点库site城市信息
  893. func InitSite() []map[string]interface{} {
  894. defer qu.Catch()
  895. query := map[string]interface{}{
  896. "depttype": map[string]interface{}{
  897. "$ne": "代理机构",
  898. },
  899. }
  900. list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
  901. return *list
  902. }
  903. func (e *ExtractTask) InitCityInfo() {
  904. defer qu.Catch()
  905. e.InitVar() //初始化变量
  906. //site站点信息
  907. for _, v := range InitSite() {
  908. site, _ := v["site"].(string)
  909. area, _ := v["area"].(string)
  910. city, _ := v["city"].(string)
  911. district, _ := v["district"].(string)
  912. if area != "" && area != "全国" && site != "" {
  913. s := &SiteCity{
  914. P: area,
  915. C: city,
  916. D: district,
  917. }
  918. e.SiteCityMap[site] = s
  919. }
  920. }
  921. //初始化省信息
  922. fn1 := InitProvince(e.TaskInfo.Version)
  923. for k, v := range fn1 {
  924. for _, p := range v.([]interface{}) {
  925. p1, _ := p.(string)
  926. e.Trie_Full_Province.AddWords(p1) //华中科技大学
  927. e.ProvinceMap[p1] = k //华中科技大学:湖北
  928. }
  929. }
  930. //初始化城市全称
  931. fn2 := InitCityAll(e.TaskInfo.Version)
  932. for k, v := range fn2 {
  933. //加载省信息
  934. e.Trie_Full_Province.AddWords(k) //加入省全称Trie(k:浙江省)
  935. p := &Province{}
  936. p.Name = k //省全称:浙江省
  937. p.Brief = v["brief"].(string) //省简称:浙江
  938. e.Trie_Sim_Province.AddWords(p.Brief) //加入省简称Trie(k:浙江)
  939. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  940. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  941. p.Cap = v["captial"].(string) //省会(杭州)
  942. //加载市信息
  943. city, _ := v["city"].(map[string]interface{})
  944. for k1, v1 := range city {
  945. e.Trie_Full_City.AddWords(k1) //加入市全称Trie(k:杭州市)
  946. v1m, _ := v1.(map[string]interface{})
  947. c := &City{}
  948. c.Name = k1 //市全称:杭州市
  949. c.Brief = v1m["brief"].(string) //市简称:杭州
  950. e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州)
  951. e.CityMap[k1] = c.Brief //杭州市:杭州
  952. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  953. e.CityFullMap[k1] = c //杭州市:市信息{}
  954. c.P = p
  955. if c.Name == p.Cap {
  956. p.Captial = c //加载province中的省会市信息{}
  957. }
  958. //区县
  959. districtmap, _ := v1m["area"].(map[string]interface{}) //区或县
  960. for district, streets := range districtmap {
  961. d := &District{}
  962. d.Name = district
  963. d.C = c
  964. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  965. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级
  966. e.Trie_Full_District.AddWords(district) //加入区或县全称Trie
  967. ctmp := e.NewDistrictCityMap[district]
  968. if len(ctmp) == 0 {
  969. tmpcarr := []*City{c}
  970. e.NewDistrictCityMap[district] = tmpcarr
  971. } else {
  972. e.NewDistrictCityMap[district] = append(e.NewDistrictCityMap[district], c)
  973. }
  974. //街道
  975. streetmap, _ := streets.(map[string]interface{})
  976. for street, communitys := range streetmap {
  977. s := &Street{}
  978. s.Name = street
  979. s.D = d
  980. e.Trie_Full_Street.AddWords(street) //加入街道全称Trie
  981. dtmp := e.NewStreetDistrictMap[street]
  982. if len(dtmp) == 0 {
  983. tmpdarr := []*District{d}
  984. e.NewStreetDistrictMap[street] = tmpdarr
  985. } else {
  986. e.NewStreetDistrictMap[street] = append(e.NewStreetDistrictMap[street], d)
  987. }
  988. //村、居委会
  989. for _, ct := range qu.ObjArrToStringArr(communitys.([]interface{})) {
  990. e.Trie_Full_Community.AddWords(ct) //加入居委会、村全称Trie
  991. cttmp := e.CommunityDistrictMap[ct]
  992. if len(cttmp) == 0 {
  993. tmpdarr := []*District{d}
  994. e.CommunityDistrictMap[ct] = tmpdarr
  995. } else {
  996. e.CommunityDistrictMap[ct] = append(e.CommunityDistrictMap[ct], d)
  997. }
  998. }
  999. }
  1000. }
  1001. }
  1002. }
  1003. //初始化城市简称
  1004. fn3 := InitCitySim(e.TaskInfo.Version)
  1005. for _, v := range fn3 {
  1006. city, _ := v["city"].(map[string]interface{})
  1007. for _, v1 := range city {
  1008. v1m, _ := v1.(map[string]interface{})
  1009. cb := v1m["brief"].(string) //市简称
  1010. arr := v1m["area"].(map[string]interface{}) //区或县简称
  1011. for districtsim, districtall := range arr {
  1012. dfullstr, _ := districtall.(string)
  1013. e.Trie_Sim_District.AddWords(districtsim) //加入区或县简称Trie
  1014. c := e.CityBriefMap[cb]
  1015. dfullarr := e.NewDistrictSimAndAll[districtsim]
  1016. dfullcity := map[string]*City{dfullstr: c}
  1017. if len(dfullarr) == 0 {
  1018. tmparr := []map[string]*City{dfullcity}
  1019. e.NewDistrictSimAndAll[districtsim] = tmparr
  1020. } else {
  1021. e.NewDistrictSimAndAll[districtsim] = append(e.NewDistrictSimAndAll[districtsim], dfullcity)
  1022. }
  1023. }
  1024. }
  1025. }
  1026. e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community}
  1027. e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District}
  1028. }
  1029. func (e *ExtractTask) InitVar() {
  1030. defer qu.Catch()
  1031. //初始化Trie
  1032. //全称
  1033. e.Trie_Full_Province = &ju.Trie{}
  1034. e.Trie_Full_City = &ju.Trie{}
  1035. e.Trie_Full_District = &ju.Trie{}
  1036. e.Trie_Full_Street = &ju.Trie{}
  1037. e.Trie_Full_Community = &ju.Trie{}
  1038. //简称
  1039. e.Trie_Sim_Province = &ju.Trie{}
  1040. e.Trie_Sim_City = &ju.Trie{}
  1041. e.Trie_Sim_District = &ju.Trie{}
  1042. //初始化分词
  1043. e.Seg_PCD = &gse.Segmenter{}
  1044. e.Seg_SV = &gse.Segmenter{}
  1045. e.Seg_PCD.LoadDict("./res/pcd.txt")
  1046. e.Seg_SV.LoadDict("./res/sv.txt")
  1047. //初始化map
  1048. if e.SiteCityMap == nil {
  1049. e.SiteCityMap = make(map[string]*SiteCity)
  1050. }
  1051. if e.ProvinceMap == nil {
  1052. e.ProvinceMap = make(map[string]string)
  1053. }
  1054. if e.CityMap == nil {
  1055. e.CityMap = make(map[string]string)
  1056. }
  1057. if e.DistrictSimAndAll == nil {
  1058. e.DistrictSimAndAll = make(map[string]string)
  1059. }
  1060. if e.NewDistrictSimAndAll == nil {
  1061. e.NewDistrictSimAndAll = make(map[string][]map[string]*City)
  1062. }
  1063. if e.CityBriefMap == nil {
  1064. e.CityBriefMap = make(map[string]*City)
  1065. }
  1066. if e.CityFullMap == nil {
  1067. e.CityFullMap = make(map[string]*City)
  1068. }
  1069. if e.ProvinceBriefMap == nil {
  1070. e.ProvinceBriefMap = make(map[string]*Province)
  1071. }
  1072. if e.NewDistrictCityMap == nil {
  1073. e.NewDistrictCityMap = make(map[string][]*City)
  1074. }
  1075. if e.NewStreetDistrictMap == nil {
  1076. e.NewStreetDistrictMap = make(map[string][]*District)
  1077. }
  1078. if e.CommunityDistrictMap == nil {
  1079. e.CommunityDistrictMap = make(map[string][]*District)
  1080. }
  1081. }
  1082. //初始化城市省份敏感词
  1083. func (e *ExtractTask) InitCityDFA() {
  1084. defer qu.Catch()
  1085. e.CityAllGet = &ju.DFA{}
  1086. e.CitySimGet = &ju.DFA{}
  1087. e.DistrictAllGet = &ju.DFA{}
  1088. e.DistrictSimGet = &ju.DFA{}
  1089. e.ProvinceAllGet = &ju.DFA{}
  1090. e.ProvinceSimGet = &ju.DFA{}
  1091. e.StreetGet = &ju.DFA{}
  1092. //初始化map
  1093. if e.ProvinceMap == nil {
  1094. e.ProvinceMap = make(map[string]string)
  1095. }
  1096. if e.CityMap == nil {
  1097. e.CityMap = make(map[string]string)
  1098. }
  1099. if e.DistrictSimAndAll == nil {
  1100. e.DistrictSimAndAll = make(map[string]string)
  1101. }
  1102. if e.CityBriefMap == nil {
  1103. e.CityBriefMap = make(map[string]*City)
  1104. }
  1105. if e.CityFullMap == nil {
  1106. e.CityFullMap = make(map[string]*City)
  1107. }
  1108. if e.ProvinceBriefMap == nil {
  1109. e.ProvinceBriefMap = make(map[string]*Province)
  1110. }
  1111. if e.DistrictCityMap == nil {
  1112. e.DistrictCityMap = make(map[string]*City)
  1113. }
  1114. if e.StreetDistrictMap == nil {
  1115. e.StreetDistrictMap = make(map[string]*District)
  1116. }
  1117. //初始化省
  1118. fn1 := InitProvince(e.TaskInfo.Version)
  1119. for k, v := range fn1 {
  1120. for _, p := range v.([]interface{}) {
  1121. p1, _ := p.(string)
  1122. e.ProvinceAllGet.AddWord(p1) //华中科技大学
  1123. e.ProvinceMap[p1] = k //华中科技大学:湖北
  1124. }
  1125. }
  1126. //初始化城市全称
  1127. fn2 := InitCityAll(e.TaskInfo.Version)
  1128. for k, v := range fn2 {
  1129. //加载省信息
  1130. e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
  1131. p := &Province{}
  1132. p.Name = k //省全称:浙江省
  1133. p.Brief = v["brief"].(string) //省简称:浙江
  1134. e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
  1135. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  1136. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  1137. p.Cap = v["captial"].(string) //省会(杭州)
  1138. //加载市信息
  1139. city, _ := v["city"].(map[string]interface{})
  1140. for k1, v1 := range city {
  1141. e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
  1142. v1m, _ := v1.(map[string]interface{})
  1143. c := &City{}
  1144. c.Name = k1 //市全称:杭州市
  1145. c.Brief = v1m["brief"].(string) //市简称:杭州
  1146. e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
  1147. e.CityMap[k1] = c.Brief //杭州市:杭州
  1148. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  1149. e.CityFullMap[k1] = c //杭州市:市信息{}
  1150. c.P = p
  1151. if c.Name == p.Cap {
  1152. p.Captial = c //加载province中的省会市信息{}
  1153. }
  1154. //区县
  1155. districtmap := v1m["area"].(map[string]interface{}) //区或县
  1156. for district, streetarr := range districtmap {
  1157. d := &District{}
  1158. d.Name = district
  1159. d.C = c
  1160. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  1161. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
  1162. e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
  1163. ctmp := e.DistrictCityMap[district]
  1164. if ctmp == nil {
  1165. e.DistrictCityMap[district] = c
  1166. }
  1167. //街道
  1168. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  1169. e.StreetGet.AddWord(s) //加入街道敏感词
  1170. dtmp := e.StreetDistrictMap[s]
  1171. if dtmp == nil {
  1172. e.StreetDistrictMap[s] = d
  1173. }
  1174. }
  1175. }
  1176. }
  1177. }
  1178. //初始化城市简称
  1179. fn3 := InitCitySim(e.TaskInfo.Version)
  1180. for _, v := range fn3 {
  1181. city, _ := v["city"].(map[string]interface{})
  1182. for _, v1 := range city {
  1183. v1m, _ := v1.(map[string]interface{})
  1184. cb := v1m["brief"].(string) //市简称
  1185. arr := v1m["area"].(map[string]interface{}) //区或县简称
  1186. for districtsim, districtall := range arr {
  1187. e.DistrictSimAndAll[districtsim] = districtall.(string)
  1188. d := &District{}
  1189. d.Name = districtsim
  1190. d.C = e.CityBriefMap[cb]
  1191. e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
  1192. ctmp := e.DistrictCityMap[districtsim]
  1193. if ctmp == nil {
  1194. e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
  1195. }
  1196. }
  1197. }
  1198. }
  1199. }
  1200. //初始化邮编库
  1201. func (e *ExtractTask) InitPostCode() {
  1202. defer qu.Catch()
  1203. if e.PostCodeMap == nil {
  1204. e.PostCodeMap = make(map[string]*PostCode)
  1205. }
  1206. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  1207. for _, l := range *list {
  1208. pc := &PostCode{}
  1209. pc.Code = qu.ObjToString(l["code"])
  1210. pc.P = qu.ObjToString(l["province"])
  1211. pc.C = qu.ObjToString(l["city"])
  1212. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  1213. e.PostCodeMap[pc.Code] = pc
  1214. }
  1215. }
  1216. //初始化区号库
  1217. func (e *ExtractTask) InitAreaCode() {
  1218. defer qu.Catch()
  1219. if e.AreaCodeMap == nil {
  1220. e.AreaCodeMap = make(map[string]*AreaCode)
  1221. }
  1222. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  1223. for _, l := range *list {
  1224. ac := &AreaCode{}
  1225. ac.Code = qu.ObjToString(l["code"])
  1226. ac.P = qu.ObjToString(l["province"])
  1227. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  1228. e.AreaCodeMap[ac.Code] = ac
  1229. }
  1230. }
  1231. //保存抽取详情数据
  1232. func (e *ExtractTask) ResultSave(init bool) {
  1233. defer qu.Catch()
  1234. e.RWMutex.Lock()
  1235. if e.ResultArr == nil {
  1236. e.ResultArr = [][]map[string]interface{}{}
  1237. }
  1238. e.RWMutex.Unlock()
  1239. if init {
  1240. go func() {
  1241. for {
  1242. e.RWMutex.Lock()
  1243. if len(e.ResultArr) > saveLimit {
  1244. arr := e.ResultArr[:saveLimit]
  1245. e.ResultArr = e.ResultArr[saveLimit:]
  1246. e.RWMutex.Unlock()
  1247. qu.Try(func() {
  1248. db.Mgo.UpSertBulk("extract_result", arr...)
  1249. }, func(err interface{}) {
  1250. log.Debug(err)
  1251. })
  1252. } else {
  1253. arr := e.ResultArr
  1254. e.ResultArr = [][]map[string]interface{}{}
  1255. e.RWMutex.Unlock()
  1256. qu.Try(func() {
  1257. db.Mgo.UpSertBulk("extract_result", arr...)
  1258. }, func(err interface{}) {
  1259. log.Debug(err)
  1260. })
  1261. }
  1262. time.Sleep(3 * time.Second)
  1263. }
  1264. }()
  1265. } else {
  1266. e.RWMutex.Lock()
  1267. arr := e.ResultArr
  1268. e.ResultArr = [][]map[string]interface{}{}
  1269. e.RWMutex.Unlock()
  1270. qu.Try(func() {
  1271. lenarr := len(arr)
  1272. for {
  1273. if lenarr > saveLimit {
  1274. arr2 := arr[:saveLimit]
  1275. arr = arr[saveLimit:]
  1276. lenarr = len(arr)
  1277. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1278. } else {
  1279. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1280. break
  1281. }
  1282. }
  1283. }, func(err interface{}) {
  1284. defer e.RWMutex.Unlock()
  1285. log.Debug(err)
  1286. })
  1287. }
  1288. }
  1289. //保存抽取数据
  1290. func (e *ExtractTask) BidSave(init bool) {
  1291. defer qu.Catch()
  1292. e.RWMutex.Lock()
  1293. if e.BidArr == nil {
  1294. e.BidArr = [][]map[string]interface{}{}
  1295. }
  1296. e.RWMutex.Unlock()
  1297. if init {
  1298. go func() {
  1299. for {
  1300. e.RWMutex.Lock()
  1301. if len(e.BidArr) > saveLimit {
  1302. arr := e.BidArr[:saveLimit]
  1303. e.BidArr = e.BidArr[saveLimit:]
  1304. e.RWMutex.Unlock()
  1305. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1306. qu.Try(func() {
  1307. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1308. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1309. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1310. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1311. }, func(err interface{}) {
  1312. log.Debug(err)
  1313. })
  1314. } else {
  1315. arr := e.BidArr
  1316. e.BidArr = [][]map[string]interface{}{}
  1317. e.RWMutex.Unlock()
  1318. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1319. qu.Try(func() {
  1320. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1321. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1322. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1323. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1324. }, func(err interface{}) {
  1325. log.Debug(err)
  1326. })
  1327. }
  1328. time.Sleep(2 * time.Second)
  1329. }
  1330. }()
  1331. } else {
  1332. e.RWMutex.Lock()
  1333. arr := e.BidArr
  1334. e.BidArr = [][]map[string]interface{}{}
  1335. e.RWMutex.Unlock()
  1336. qu.Try(func() {
  1337. lenarr := len(arr)
  1338. for {
  1339. if lenarr > saveLimit {
  1340. arr2 := arr[:saveLimit]
  1341. arr = arr[saveLimit:]
  1342. lenarr = len(arr)
  1343. arr2, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr2)
  1344. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1345. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1346. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1347. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1348. } else {
  1349. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1350. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1351. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1352. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1353. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1354. break
  1355. }
  1356. }
  1357. }, func(err interface{}) {
  1358. log.Debug(err)
  1359. })
  1360. time.Sleep(1 * time.Second)
  1361. }
  1362. }
  // getFieldAllAndBlocks splits the bulky "blocks", "fieldall" and "fieldallf"
// values out of a batch of upsert pairs.
//
// Each element of a is a pair: v[0] carries the "_id" selector and v[1] the
// update whose "$set" value is a map. For every pair with such a map:
//   - a non-nil "blocks" value becomes {"_id", "blocks"} in blocks;
//   - a "fieldall" value of type map[string][]map[string]interface{} is
//     flattened, together with "_id", into one document in fieldalls;
//   - a "fieldallf" value of the same type is flattened likewise into
//     fieldallsf;
//   - the three keys are deleted from the map, and v[1] is replaced by that
//     map (i.e. without the "$set" wrapper).
//
// Pairs are modified in place and returned in arr in their original order.
// All four results are non-nil even for an empty input. A pair with fewer
// than two elements panics.
func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) {
	arr = [][]map[string]interface{}{}
	blocks = []map[string]interface{}{}
	fieldalls = []map[string]interface{}{}
	fieldallsf = []map[string]interface{}{}

	// flatten builds a side document holding the id plus every per-field list.
	flatten := func(id interface{}, perField map[string][]map[string]interface{}) map[string]interface{} {
		doc := map[string]interface{}{"_id": id}
		for field, values := range perField {
			doc[field] = values
		}
		return doc
	}

	for _, v := range a {
		id := v[0]["_id"]
		if set, ok := v[1]["$set"].(map[string]interface{}); ok {
			if set["blocks"] != nil {
				blocks = append(blocks, map[string]interface{}{
					"_id":    id,
					"blocks": set["blocks"],
				})
			}
			delete(set, "blocks")

			if all, ok := set["fieldall"].(map[string][]map[string]interface{}); ok {
				fieldalls = append(fieldalls, flatten(id, all))
			}
			delete(set, "fieldall")

			if allf, ok := set["fieldallf"].(map[string][]map[string]interface{}); ok {
				// Append to fieldallsf itself; the previous code appended to
				// fieldalls, mixing the two result sets.
				fieldallsf = append(fieldallsf, flatten(id, allf))
			}
			delete(set, "fieldallf")

			v[1] = set
		}
		arr = append(arr, v)
	}
	return arr, blocks, fieldalls, fieldallsf
}
  1405. func (e *ExtractTask) InitAuditRecogField() {
  1406. defer qu.Catch()
  1407. e.RecogFieldMap = make(map[string]map[string]interface{})
  1408. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1409. for _, f := range *recogFieldList {
  1410. field := qu.ObjToString(f["s_recogfield"])
  1411. e.RecogFieldMap[field] = f
  1412. }
  1413. }
  1414. func (e *ExtractTask) InitAuditClass() {
  1415. defer qu.Catch()
  1416. e.FidClassMap = make(map[string][]map[string]interface{})
  1417. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1418. for _, c := range *class {
  1419. classList := []map[string]interface{}{}
  1420. fid := qu.ObjToString(c["s_fid"])
  1421. if len(e.FidClassMap[fid]) > 0 { //追加
  1422. classList = e.FidClassMap[fid]
  1423. }
  1424. classList = append(classList, c)
  1425. e.FidClassMap[fid] = classList
  1426. }
  1427. }
  1428. //加载规则
  1429. func (e *ExtractTask) InitAuditRule() {
  1430. defer qu.Catch()
  1431. var rureg *regexp.Regexp
  1432. var rs []rune
  1433. var ru string
  1434. var err error
  1435. e.CidRuleMap = make(map[string][]map[string]interface{})
  1436. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1437. for _, v := range *rule {
  1438. i_rule := []interface{}{}
  1439. ss, _ := (v["s_rule"].([]interface{}))
  1440. for _, r := range qu.ObjArrToStringArr(ss) {
  1441. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1442. rs = []rune(r)
  1443. ru = string(rs[1 : len(rs)-1])
  1444. rureg, err = regexp.Compile(ru)
  1445. if err != nil {
  1446. log.Debug("error---rule:", r)
  1447. continue
  1448. }
  1449. i_rule = append(i_rule, []interface{}{rureg}...)
  1450. } else { //规则
  1451. i_rule = append(i_rule, r)
  1452. }
  1453. }
  1454. v["rule"] = i_rule
  1455. ruleList := []map[string]interface{}{}
  1456. classid := qu.ObjToString(v["s_classid"])
  1457. if len(e.CidRuleMap[classid]) > 0 { //追加
  1458. ruleList = e.CidRuleMap[classid]
  1459. }
  1460. ruleList = append(ruleList, v)
  1461. e.CidRuleMap[classid] = ruleList
  1462. }
  1463. }
  1464. //
  1465. func (e *ExtractTask) InitAuditFields() {
  1466. if len(e.AuditFields) == 0 {
  1467. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1468. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1469. vid := qu.BsonIdToSId((*v)["_id"])
  1470. query := map[string]interface{}{
  1471. "isaudit": true,
  1472. "delete": false,
  1473. "vid": vid,
  1474. }
  1475. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1476. for _, d := range *data {
  1477. field := qu.ObjToString(d["s_field"])
  1478. e.AuditFields = append(e.AuditFields, field)
  1479. }
  1480. }
  1481. }
  1482. }
  1483. //加载附件抽取
  1484. func (e *ExtractTask) InitFile() {
  1485. defer qu.Catch()
  1486. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1487. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1488. //ve, _ := db.Mgo.FindOne("version", query)
  1489. if ve == nil {
  1490. return
  1491. }
  1492. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1493. e.IsFileField = true
  1494. }
  1495. syscefiled := new(sync.Map)
  1496. if (*ve)["s_filefileds"] != nil {
  1497. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1498. syscefiled.Store(vff.(string), 1)
  1499. }
  1500. }
  1501. e.FileFields = syscefiled
  1502. }
  1503. //加载清理任务信息
  1504. func (c *ClearTask) InitClearTaskInfo() {
  1505. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1506. if len(*cleartask) > 1 {
  1507. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1508. c.ClearTaskInfo = &ClearTaskInfo{
  1509. Name: (*cleartask)["s_taskname"].(string),
  1510. Version: (*cleartask)["s_version"].(string),
  1511. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1512. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1513. FromDB: (*cleartask)["s_mgodb"].(string),
  1514. FromColl: (*cleartask)["s_mgocoll"].(string),
  1515. IsCltLog: ju.Config["iscltlog"].(bool),
  1516. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1517. }
  1518. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1519. } else {
  1520. return
  1521. }
  1522. }
  1523. //加载清理脚本
  1524. func (c *ClearTask) InitClearLuas() {
  1525. defer qu.Catch()
  1526. c.ClearLuas = make(map[string][]*ClearLua)
  1527. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1528. for _, l := range *list {
  1529. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1530. continue
  1531. }
  1532. s_field := qu.ObjToString(l["s_field"])
  1533. pid := qu.BsonIdToSId(l["_id"])
  1534. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1535. for _, vv := range *luas {
  1536. if b, _ := vv["isuse"].(bool); !b {
  1537. continue
  1538. }
  1539. clearLua := &ClearLua{
  1540. Field: s_field,
  1541. Code: vv["s_code"].(string),
  1542. Name: vv["s_name"].(string),
  1543. LuaText: vv["s_luascript"].(string),
  1544. LFields: getALLFields(),
  1545. }
  1546. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1547. }
  1548. }
  1549. }
  1550. //加载分块规则
  1551. func (e *ExtractTask) InitBlockRule() {
  1552. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1553. "vid": e.TaskInfo.VersionId,
  1554. "delete": false,
  1555. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1556. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1557. for _, v := range *datas {
  1558. block_reg, _ := v["block_reg"].(string)
  1559. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1560. title_reg, _ := v["title_reg"].(string)
  1561. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1562. if block_reg == "" || title_reg == "" {
  1563. continue
  1564. }
  1565. b_reg, b_err := regexp.Compile(block_reg)
  1566. t_reg, t_err := regexp.Compile(title_reg)
  1567. if b_err != nil || t_err != nil {
  1568. continue
  1569. }
  1570. brs = append(brs, b_reg)
  1571. trs = append(trs, t_reg)
  1572. }
  1573. e.RuleBlock = &ju.RuleBlock{
  1574. BlockRegs: brs,
  1575. TitleRegs: trs,
  1576. Classify: e.InitBlockClassify(),
  1577. }
  1578. }
  1579. //加载分块规则
  1580. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1581. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1582. "vid": e.TaskInfo.VersionId,
  1583. "delete": false,
  1584. }, nil, `{"name":1}`, false, -1, -1)
  1585. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1586. "vid": e.TaskInfo.VersionId,
  1587. "delete": false,
  1588. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1589. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1590. "vid": e.TaskInfo.VersionId,
  1591. "delete": false,
  1592. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1593. tag_map := map[string]ju.Tags{}
  1594. for _, v := range *classify_tag {
  1595. pid := qu.ObjToString(v["pid"])
  1596. name := qu.ObjToString(v["name"])
  1597. tag := &ju.Tag{Value: name}
  1598. if strings.HasPrefix(name, "reg__") {
  1599. tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__"))
  1600. }
  1601. tag_map[pid] = append(tag_map[pid], tag)
  1602. }
  1603. //
  1604. info_map := map[string][]*ju.NameCode{}
  1605. info_tag := map[string]*ju.TagFile{}
  1606. for _, v := range *classify_info {
  1607. pid := qu.ObjToString(v["pid"])
  1608. _id := qu.BsonIdToSId(v["_id"])
  1609. name := qu.ObjToString(v["name"])
  1610. info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
  1611. info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
  1612. }
  1613. classify_map := map[string][]*ju.NameCode{}
  1614. for _, v := range *classify {
  1615. _id := qu.BsonIdToSId(v["_id"])
  1616. if info_map[_id] == nil {
  1617. continue
  1618. }
  1619. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1620. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1621. }
  1622. }
  1623. return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
  1624. }