extractInit.go 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734
  1. // extractInit
  2. package extract
  3. import (
  4. "github.com/sensitive"
  5. db "jy/mongodbutil"
  6. ju "jy/util"
  7. qu "qfw/util"
  8. "regexp"
  9. "sort"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "time"
  14. "gopkg.in/mgo.v2/bson"
  15. log "github.com/donnie4w/go-logger/logger"
  16. "github.com/go-ego/gse"
  17. )
  18. type RegLuaInfo struct {
  19. //正则或脚本信息
  20. Code, Name, Field string //
  21. Score float64
  22. RuleText string //
  23. IsLua bool //
  24. RegPreBac *ExtReg //
  25. RegCore *ExtReg //
  26. }
// ExtReg is a compiled regular-expression rule together with the data that
// tells the caller how to use a match (replace text or extract groups).
type ExtReg struct {
	Reg        *regexp.Regexp // compiled pattern
	Replace    string         // replacement text for replace-style rules
	Bextract   bool           // true when ExtractPos describes groups to extract
	ExtractPos map[string]int // field name -> position parsed from the rule suffix (e.g. "2:projectname,4:area")
	NumSign    int            // sign correction marker, e.g. for a floating rate (+1 upward, -1 downward)
}
// RuleCore groups everything needed to extract one logical field: the entry
// script, the source field and the pre / post / core / KV rule lists.
type RuleCore struct {
	Id          string            // id
	Field       string            // logical field
	LuaLogic    string            // entry-logic Lua script
	ExtFrom     string            // which source field to extract from ("title" or "detail")
	RulePres    []*RegLuaInfo     // pre-extraction rules
	RuleBacks   []*RegLuaInfo     // post-extraction rules
	RuleCores   []*RegLuaInfo     // extraction rules
	KVRuleCores []*RegLuaInfo     // KV extraction cleanup rules
	LFields     map[string]string // attribute group of all fields
}
// Tag is one entry of the tag library.
type Tag struct {
	Type string         // tag type: "string" for a plain string, "regexp" for a regular expression
	Key  string         // tag key
	Reg  *regexp.Regexp // compiled expression (presumably only set for regexp tags — confirm against the loader)
}
  50. type TaskInfo struct {
  51. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  52. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  53. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  54. TestColl, LastExtId string //测试结果表、上次抽取信息id
  55. FDB *db.Pool //数据库连接池
  56. TDB *db.Pool //数据库连接池
  57. IsEtxLog bool //是否开启抽取日志
  58. ProcessPool chan bool //任务进程池
  59. TestLua bool //检查测试用
  60. }
// ExtractTask holds the complete state of one extraction task: its
// configuration, the loaded rule sets (common and per-site), result buffers
// and the geographic lookup structures used for city extraction.
type ExtractTask struct {
	Id string // task id
	IsRun bool // whether the task has been started
	Content string // message content
	TaskInfo *TaskInfo // task configuration
	RulePres []*RegLuaInfo // common pre-processing rules
	RuleBacks []*RegLuaInfo // common post-processing rules
	SiteRuleBacks []*RegLuaInfo // site-specific common post-processing rules
	RuleBlock *ju.RuleBlock
	RuleCores map[string]map[string][]*RuleCore // extraction rules by category, then by field
	SiteRuleCores map[string]map[string][]*RuleCore // site-specific extraction rules by category, then by field
	PkgRuleCores []*RuleCore // package (sub-lot) extraction rules
	Tag map[string][]*Tag // tag library
	SiteTag map[string][]*Tag // site-specific tag library
	ClearFn map[string][]string // cleanup functions
	SiteClearFn map[string][]string // site-specific cleanup functions
	IsExtractCity bool // whether city extraction is enabled
	Fields map[string]int // set of fields to extract
	SiteFields map[string]int // set of site-specific fields to extract
	IsFileField bool // whether attachment extraction is enabled
	FileFields *sync.Map // set of attachment fields to extract
	ResultChanel chan bool // extraction result details
	sync.RWMutex
	ResultArr [][]map[string]interface{} // extraction result details
	BidChanel chan bool // extraction results
	BidArr [][]map[string]interface{} // extraction results
	BidTotal int // number of results
	RecogFieldMap map[string]map[string]interface{} // recognised fields
	FidClassMap map[string][]map[string]interface{} // classifications
	CidRuleMap map[string][]map[string]interface{} // rules
	AuditFields []string // names of the fields that require auditing
	SiteCityMap map[string]*SiteCity // province / city / district of each site
	ProvinceMap map[string]string // province full name -> short name (key: 浙江省, val: 浙江)
	ProvinceBriefMap map[string]*Province // province short name -> province info (key: 浙江, val: &Province{})
	CityMap map[string]string // city full name -> short name (key: 杭州市, val: 杭州)
	CityBriefMap map[string]*City // city short name -> city info (key: 杭州, val: &City{})
	CityFullMap map[string]*City // city full name -> city info (key: 杭州市, val: &City{})
	DistrictCityMap map[string][]*City // district/county full name -> cities (names repeat nationwide, hence a slice)
	DistrictSimAndAll map[string][]map[string]*City // district/county short name -> cities (names repeat nationwide, hence a slice)
	StreetDistrictMap map[string][]*District // street full name -> districts/counties
	ProvinceAllGet *ju.DFA // province full names
	ProvinceSimGet *ju.DFA // province short names
	CityAllGet *ju.DFA // city full names
	CitySimGet *ju.DFA // city short names
	DistrictAllGet *ju.DFA // district/county full names
	DistrictSimGet *ju.DFA // district/county short names
	StreetGet *ju.DFA // streets
	PostCodeMap map[string]*PostCode // postal codes
	AreaCodeMap map[string]*AreaCode // telephone area codes
	XjbtCityArr []map[string]interface{} // data related to the Xinjiang Production and Construction Corps
	SensitiveFullCity *sensitive.Filter
	SensitiveSimCity *sensitive.Filter
	InfoType []map[string]interface{}
	Trie_Full_Province *ju.Trie // province full names: provinces, municipalities, autonomous regions
	Trie_Full_City *ju.Trie // city full names: prefecture-level cities
	Trie_Full_District *ju.Trie // county full names: municipal districts, counties (banners), county-level cities, autonomous counties (banners), special districts, forestry districts
	Trie_Full_Street *ju.Trie // street / township full names: towns, townships, ethnic townships, county-administered districts, sub-districts
	Trie_Full_Community *ju.Trie // village / committee full names: villages, residents' committees
	Trie_Sim_Province *ju.Trie // province short names
	Trie_Sim_City *ju.Trie // city short names
	Trie_Sim_District *ju.Trie // county short names
	Trie_Fulls []*ju.Trie // all full-name tries
	Trie_Sims []*ju.Trie // all short-name tries
	Seg_PCD *gse.Segmenter // word segmenter
	Seg_SV *gse.Segmenter // word segmenter
	Luacodes *sync.Map // site rules, keyed by site script code
	SiteMerge *sync.Map // extraction-merge flag, keyed by site script code
}
// SiteCity is the province / city / district a site belongs to.
type SiteCity struct {
	P string // province short name
	C string // city full name
	D string // district full name
}
  134. type ClearTaskInfo struct {
  135. Name, Version, VersionId string //名称、版本、版本id
  136. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  137. FDB *db.Pool //数据库连接池
  138. TDB *db.Pool //数据库连接池
  139. IsCltLog bool //是否开启清理日志
  140. ProcessPool chan bool //任务进程池
  141. }
// ClearLua is one Lua cleanup script bound to a field.
type ClearLua struct {
	Field   string            // field the script applies to
	Code    string            // script code
	Name    string            // script name
	LuaText string            // Lua source text
	LFields map[string]string // attribute group of the fields handled by the Lua script
	//LuaLogic string // entry logic
	//ExtFrom string // which field to extract from
}
// ClearTask holds the state of one cleanup task.
type ClearTask struct {
	sync.RWMutex
	Id            string                     // task id
	Content       string                     // message content
	ClearTaskInfo *ClearTaskInfo             // task configuration
	ClearLuas     map[string][]*ClearLua     // cleanup scripts
	UpdateResult  [][]map[string]interface{} // results after cleanup
	//ClearChannel chan bool
}
  160. func init() {
  161. TaskList = make(map[string]*ExtractTask)
  162. ClearTaskList = make(map[string]*ClearTask)
  163. go SaveExtLog()
  164. go SaveCltLog() //保存清理日志
  165. }
  166. //加载任务信息
  167. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  168. task, _ := db.Mgo.FindById("task", e.Id, nil)
  169. if len(*task) > 1 {
  170. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  171. e.TaskInfo = &TaskInfo{
  172. Name: (*task)["s_taskname"].(string),
  173. Version: (*task)["s_version"].(string),
  174. VersionId: qu.BsonIdToSId((*v)["_id"]),
  175. TrackColl: trackcoll,
  176. FromDbAddr: (*task)["s_mgoaddr"].(string),
  177. FromDB: (*task)["s_mgodb"].(string),
  178. FromColl: (*task)["s_mgocoll"].(string),
  179. TestColl: resultcoll,
  180. IsEtxLog: true,
  181. ProcessPool: make(chan bool, 1),
  182. }
  183. if (*v)["isextractcity"] != nil {
  184. e.IsExtractCity = (*v)["isextractcity"].(bool)
  185. }
  186. } else {
  187. return
  188. }
  189. }
  190. //加载任务信息
  191. func (e *ExtractTask) InitTaskInfo() {
  192. task, _ := db.Mgo.FindById("task", e.Id, nil)
  193. log.Debug("task", task, "~", e.Id)
  194. if len(*task) > 1 {
  195. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  196. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  197. log.Debug("s_mgosavecoll", strs)
  198. if len(strs) < 3 {
  199. return
  200. } else {
  201. e.TaskInfo = &TaskInfo{
  202. Name: (*task)["s_taskname"].(string),
  203. Version: (*task)["s_version"].(string),
  204. VersionId: qu.BsonIdToSId((*v)["_id"]),
  205. //TrackColl: (*task)["s_trackcoll"].(string),
  206. FromDbAddr: (*task)["s_mgoaddr"].(string),
  207. FromDB: (*task)["s_mgodb"].(string),
  208. FromColl: (*task)["s_mgocoll"].(string),
  209. ToDbAddr: strs[0],
  210. ToDB: strs[1],
  211. ToColl: strs[2],
  212. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  213. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  214. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  215. }
  216. if (*v)["isextractcity"] != nil {
  217. e.IsExtractCity = (*v)["isextractcity"].(bool)
  218. }
  219. }
  220. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  221. } else {
  222. return
  223. }
  224. }
  225. func (e *ExtractTask) InitSite() {
  226. e.Luacodes = &sync.Map{}
  227. e.SiteMerge = &sync.Map{}
  228. sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1)
  229. for _, v := range *sites {
  230. if vv, ok := v["site_script"].([]interface{}); ok {
  231. for _, vvv := range vv {
  232. e.Luacodes.Store(vvv, map[string]interface{}{})
  233. e.SiteMerge.Store(vvv, v["ismerge"].(bool))
  234. }
  235. } else if vv, ok := v["site_script"].(interface{}); ok {
  236. e.Luacodes.Store(vv, map[string]interface{}{})
  237. e.SiteMerge.Store(vv, v["ismerge"].(bool))
  238. }
  239. }
  240. }
  241. //加载通用前置规则
  242. func (e *ExtractTask) InitRulePres() {
  243. defer qu.Catch()
  244. e.RulePres = []*RegLuaInfo{}
  245. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  246. for _, v := range *list {
  247. rinfo := &RegLuaInfo{
  248. Code: v["s_code"].(string),
  249. Name: v["s_name"].(string),
  250. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  251. }
  252. if rinfo.IsLua {
  253. rinfo.RuleText = v["s_luascript"].(string)
  254. e.RulePres = append(e.RulePres, rinfo)
  255. } else {
  256. qu.Try(func() {
  257. rinfo.RuleText = v["s_rule"].(string)
  258. tmp := strings.Split(rinfo.RuleText, "__")
  259. var pattern string
  260. if strings.Contains(tmp[0], "\\u") {
  261. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  262. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  263. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  264. } else {
  265. pattern = tmp[0]
  266. }
  267. if len(tmp) == 2 {
  268. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  269. } else {
  270. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  271. }
  272. e.RulePres = append(e.RulePres, rinfo)
  273. }, func(err interface{}) {
  274. log.Debug(rinfo.Code, rinfo.Field, err)
  275. })
  276. }
  277. }
  278. }
  279. //加载通用后置规则
  280. func (e *ExtractTask) InitRuleBacks(isSite bool) {
  281. defer qu.Catch()
  282. cDB := ""
  283. eSiteRuleBacks := []*RegLuaInfo{}
  284. if isSite {
  285. cDB = "site_rule_back"
  286. e.SiteRuleBacks = []*RegLuaInfo{}
  287. } else {
  288. cDB = "rule_back"
  289. e.RuleBacks = []*RegLuaInfo{}
  290. }
  291. list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  292. for _, v := range *list {
  293. rinfo := &RegLuaInfo{
  294. Code: v["s_code"].(string),
  295. Name: v["s_name"].(string),
  296. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  297. }
  298. if rinfo.IsLua {
  299. rinfo.RuleText = v["s_luascript"].(string)
  300. if isSite {
  301. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  302. //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo)
  303. } else {
  304. e.RuleBacks = append(e.RuleBacks, rinfo)
  305. }
  306. } else {
  307. qu.Try(func() {
  308. rinfo.RuleText = v["s_rule"].(string)
  309. tmp := strings.Split(rinfo.RuleText, "__")
  310. var pattern string
  311. if strings.Contains(tmp[0], "\\u") {
  312. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  313. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  314. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  315. } else {
  316. pattern = tmp[0]
  317. }
  318. if len(tmp) == 2 {
  319. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  320. } else {
  321. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  322. }
  323. if isSite {
  324. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  325. } else {
  326. e.RuleBacks = append(e.RuleBacks, rinfo)
  327. }
  328. }, func(err interface{}) {
  329. log.Debug(rinfo.Code, rinfo.Field, err)
  330. })
  331. }
  332. if isSite {
  333. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  334. if (*sm) == nil || len(*sm) <= 0 {
  335. eSiteRuleBacks = []*RegLuaInfo{}
  336. continue
  337. }
  338. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  339. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  340. if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil {
  341. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks
  342. } else {
  343. if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 {
  344. tmplist = append(tmplist, eSiteRuleBacks...)
  345. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist
  346. }
  347. //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...)
  348. }
  349. e.Luacodes.Store(v2, mdpvalue)
  350. }
  351. }
  352. eSiteRuleBacks = []*RegLuaInfo{}
  353. }
  354. }
  355. }
  356. func (e *ExtractTask) InfoTypeList() {
  357. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  358. infolist := *infolist1
  359. for _, v := range infolist {
  360. e.InfoType = append(e.InfoType, v)
  361. }
  362. }
  363. //加载抽取规则
  364. func (e *ExtractTask) InitRuleCore(isSite bool) {
  365. defer qu.Catch()
  366. allFields := getALLFields()
  367. var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb, rule_logickvdb string
  368. eSiteRuleCores := make(map[string]map[string][]*RuleCore)
  369. if isSite {
  370. versioninfodb = "site_versioninfo"
  371. rule_logicdb = "site_rule_logic"
  372. rule_logicpredb = "site_rule_logicpre"
  373. rule_logicbackdb = "site_rule_logicback"
  374. rule_logicoredb = "site_rule_logicore"
  375. rule_logickvdb = "site_rule_logickv"
  376. e.SiteFields = map[string]int{}
  377. e.SiteRuleCores = make(map[string]map[string][]*RuleCore)
  378. } else {
  379. versioninfodb = "versioninfo"
  380. rule_logicdb = "rule_logic"
  381. rule_logicpredb = "rule_logicpre"
  382. rule_logicbackdb = "rule_logicback"
  383. rule_logicoredb = "rule_logicore"
  384. rule_logickvdb = "rule_logickv"
  385. e.Fields = map[string]int{}
  386. e.RuleCores = make(map[string]map[string][]*RuleCore)
  387. }
  388. fieldrules := map[string][]*RuleCore{}
  389. vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  390. for _, vinfo := range *vinfos {
  391. //fmt.Println("总计",len(*vinfos),"当前第N个",kkkk)
  392. if b, _ := vinfo["isuse"].(bool); !b {
  393. continue
  394. }
  395. s_field := qu.ObjToString(vinfo["s_field"])
  396. pid := qu.BsonIdToSId(vinfo["_id"])
  397. list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  398. for _, vv := range *list {
  399. if b, _ := vv["isuse"].(bool); !b {
  400. continue
  401. }
  402. rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
  403. rcore.Field = s_field
  404. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  405. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  406. rcore.LFields = allFields
  407. //前置规则
  408. rulePres := []*RegLuaInfo{}
  409. plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  410. for _, v := range *plist {
  411. rinfo := &RegLuaInfo{
  412. Field: qu.ObjToString(v["s_field"]),
  413. Code: v["s_code"].(string),
  414. Name: v["s_name"].(string),
  415. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  416. }
  417. if rinfo.IsLua {
  418. rinfo.RuleText = v["s_luascript"].(string)
  419. rulePres = append(rulePres, rinfo)
  420. } else {
  421. qu.Try(func() {
  422. rinfo.RuleText = v["s_rule"].(string)
  423. tmp := strings.Split(rinfo.RuleText, "__")
  424. var pattern string
  425. if strings.Contains(tmp[0], "\\u") {
  426. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  427. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  428. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  429. } else {
  430. pattern = tmp[0]
  431. }
  432. if len(tmp) == 2 {
  433. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  434. } else {
  435. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  436. }
  437. rulePres = append(rulePres, rinfo)
  438. }, func(err interface{}) {
  439. log.Debug(rinfo.Code, rinfo.Field, err)
  440. })
  441. }
  442. }
  443. rcore.RulePres = rulePres
  444. //后置规则
  445. ruleBacks := []*RegLuaInfo{}
  446. blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  447. for _, v := range *blist {
  448. rinfo := &RegLuaInfo{
  449. Field: qu.ObjToString(v["s_field"]),
  450. Code: v["s_code"].(string),
  451. Name: v["s_name"].(string),
  452. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  453. }
  454. if rinfo.IsLua {
  455. rinfo.RuleText = v["s_luascript"].(string)
  456. ruleBacks = append(ruleBacks, rinfo)
  457. } else {
  458. qu.Try(func() {
  459. rinfo.RuleText = v["s_rule"].(string)
  460. tmp := strings.Split(rinfo.RuleText, "__")
  461. var pattern string
  462. if strings.Contains(tmp[0], "\\u") {
  463. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  464. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  465. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  466. } else {
  467. pattern = tmp[0]
  468. }
  469. if len(tmp) == 2 {
  470. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  471. } else {
  472. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  473. }
  474. ruleBacks = append(ruleBacks, rinfo)
  475. }, func(err interface{}) {
  476. log.Debug(rinfo.Code, rinfo.Field, err)
  477. })
  478. }
  479. }
  480. rcore.RuleBacks = ruleBacks
  481. //抽取规则
  482. ruleCores := []*RegLuaInfo{}
  483. clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  484. for _, v := range *clist {
  485. if b, _ := v["isuse"].(bool); !b {
  486. continue
  487. }
  488. field := qu.ObjToString(v["s_field"])
  489. if isSite {
  490. e.SiteFields[field] = 1
  491. } else {
  492. e.Fields[field] = 1 //加入抽取属性组备用
  493. }
  494. rinfo := &RegLuaInfo{
  495. Field: field,
  496. Code: v["s_code"].(string),
  497. Name: v["s_name"].(string),
  498. Score: qu.Float64All(v["s_default_score"]),
  499. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  500. }
  501. if rinfo.IsLua {
  502. rinfo.RuleText = v["s_luascript"].(string)
  503. //提取全部属性
  504. ruleCores = append(ruleCores, rinfo)
  505. } else {
  506. qu.Try(func() {
  507. rinfo.RuleText = v["s_rule"].(string)
  508. tmp := strings.Split(rinfo.RuleText, "__")
  509. var pattern string
  510. if strings.Contains(tmp[0], "\\u") {
  511. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  512. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  513. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  514. } else {
  515. pattern = tmp[0]
  516. }
  517. if len(tmp) == 2 {
  518. epos := strings.Split(tmp[1], ",")
  519. posm := map[string]int{}
  520. for _, v := range epos {
  521. ks := strings.Split(v, ":")
  522. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  523. posm[ks[1]] = qu.IntAll(ks[0])
  524. } else { //(.*)招标公告__2
  525. posm[rinfo.Field] = qu.IntAll(ks[0])
  526. }
  527. }
  528. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  529. } else {
  530. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  531. }
  532. ruleCores = append(ruleCores, rinfo)
  533. }, func(err interface{}) {
  534. log.Debug(rinfo.Code, rinfo.Field, err)
  535. })
  536. }
  537. }
  538. rcore.RuleCores = ruleCores
  539. //kv规则
  540. kvRuleCores := []*RegLuaInfo{}
  541. kvlist, _ := db.Mgo.Find(rule_logickvdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  542. for _, v := range *kvlist {
  543. if b, _ := v["isuse"].(bool); !b {
  544. continue
  545. }
  546. field := qu.ObjToString(v["s_field"])
  547. if isSite {
  548. e.SiteFields[field] = 1
  549. } else {
  550. e.Fields[field] = 1 //加入抽取属性组备用
  551. }
  552. rinfo := &RegLuaInfo{
  553. Field: field,
  554. Code: v["s_code"].(string),
  555. Name: v["s_name"].(string),
  556. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  557. }
  558. qu.Try(func() {
  559. rinfo.RuleText = v["s_rule"].(string)
  560. tmp := strings.Split(rinfo.RuleText, "__")
  561. var pattern string
  562. if strings.Contains(tmp[0], "\\u") {
  563. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  564. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  565. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  566. } else {
  567. pattern = tmp[0]
  568. }
  569. if len(tmp) == 2 {
  570. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  571. } else {
  572. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  573. }
  574. kvRuleCores = append(kvRuleCores, rinfo)
  575. }, func(err interface{}) {
  576. log.Debug(rinfo.Code, rinfo.Field, err)
  577. })
  578. }
  579. rcore.KVRuleCores = kvRuleCores
  580. if fieldrules[s_field] == nil {
  581. fieldrules[s_field] = []*RuleCore{}
  582. }
  583. fieldrules[s_field] = append(fieldrules[s_field], rcore)
  584. }
  585. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  586. for _, v := range *infolist {
  587. topclass := qu.ObjToString(v["topclass"])
  588. if v["subclass"] == nil {
  589. eSiteRuleCores[topclass] = make(map[string][]*RuleCore)
  590. for attr, _ := range v["fields"].(map[string]interface{}) {
  591. if fieldrules[attr] != nil {
  592. eSiteRuleCores[topclass][attr] = fieldrules[attr]
  593. }
  594. }
  595. } else {
  596. for ca, fs := range v["subclass"].(map[string]interface{}) {
  597. eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  598. for field, _ := range fs.(map[string]interface{}) {
  599. if fieldrules[field] != nil {
  600. eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field]
  601. }
  602. }
  603. }
  604. }
  605. }
  606. if isSite {
  607. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1})
  608. if (*sm) == nil || len(*sm) <= 0 {
  609. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  610. fieldrules = map[string][]*RuleCore{}
  611. continue
  612. }
  613. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  614. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  615. //属性配置
  616. if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil {
  617. mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores
  618. } else {
  619. for k2, v2 := range eSiteRuleCores {
  620. tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2]
  621. for kkkk, vvv := range v2 {
  622. tmpv[kkkk] = vvv
  623. }
  624. mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv
  625. }
  626. }
  627. e.Luacodes.Store(v2, mdpvalue)
  628. }
  629. }
  630. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  631. fieldrules = map[string][]*RuleCore{}
  632. }
  633. }
  634. if !isSite {
  635. //属性配置
  636. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  637. for _, v := range *infolist {
  638. topclass := qu.ObjToString(v["topclass"])
  639. if v["subclass"] == nil {
  640. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  641. for attr, _ := range v["fields"].(map[string]interface{}) {
  642. if fieldrules[attr] != nil {
  643. e.RuleCores[topclass][attr] = fieldrules[attr]
  644. }
  645. }
  646. } else {
  647. for ca, fs := range v["subclass"].(map[string]interface{}) {
  648. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  649. for field, _ := range fs.(map[string]interface{}) {
  650. if fieldrules[field] != nil {
  651. e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
  652. }
  653. }
  654. }
  655. }
  656. }
  657. }
  658. }
  659. //加载分包抽取规则
  660. func (e *ExtractTask) InitPkgCore() {
  661. defer qu.Catch()
  662. e.PkgRuleCores = []*RuleCore{}
  663. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  664. for _, pkginfo := range *pkginfos {
  665. if b, _ := pkginfo["isuse"].(bool); !b {
  666. continue
  667. }
  668. s_field := qu.ObjToString(pkginfo["s_field"])
  669. sid := qu.BsonIdToSId(pkginfo["_id"])
  670. rcore := &RuleCore{}
  671. rcore.Field = s_field
  672. rcore.ExtFrom = "detail"
  673. //后置规则
  674. ruleBacks := []*RegLuaInfo{}
  675. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
  676. for _, v := range *blist {
  677. rinfo := &RegLuaInfo{
  678. Field: qu.ObjToString(v["s_field"]),
  679. Code: v["s_code"].(string),
  680. Name: v["s_name"].(string),
  681. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  682. }
  683. if rinfo.IsLua {
  684. rinfo.RuleText = v["s_luascript"].(string)
  685. ruleBacks = append(ruleBacks, rinfo)
  686. } else {
  687. qu.Try(func() {
  688. rinfo.RuleText = v["s_rule"].(string)
  689. tmp := strings.Split(rinfo.RuleText, "__")
  690. var pattern string
  691. if strings.Contains(tmp[0], "\\u") {
  692. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  693. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  694. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  695. } else {
  696. pattern = tmp[0]
  697. }
  698. if len(tmp) == 2 {
  699. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  700. } else {
  701. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  702. }
  703. ruleBacks = append(ruleBacks, rinfo)
  704. }, func(err interface{}) {
  705. log.Debug(rinfo.Code, rinfo.Field, err)
  706. })
  707. }
  708. }
  709. rcore.RuleBacks = ruleBacks
  710. //抽取规则
  711. ruleCores := []*RegLuaInfo{}
  712. clist, _ := db.Mgo.Find("pkg_logicore", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
  713. for _, v := range *clist {
  714. if b, _ := v["isuse"].(bool); !b {
  715. continue
  716. }
  717. field := qu.ObjToString(v["s_field"])
  718. e.Fields[field] = 1 //加入抽取属性组备用
  719. rinfo := &RegLuaInfo{
  720. Field: field,
  721. Code: v["s_code"].(string),
  722. Name: v["s_name"].(string),
  723. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  724. }
  725. if rinfo.IsLua {
  726. rinfo.RuleText = v["s_luascript"].(string)
  727. //提取全部属性
  728. ruleCores = append(ruleCores, rinfo)
  729. } else {
  730. qu.Try(func() {
  731. rinfo.RuleText = v["s_rule"].(string)
  732. tmp := strings.Split(rinfo.RuleText, "__")
  733. var pattern string
  734. if strings.Contains(tmp[0], "\\u") {
  735. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  736. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  737. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  738. } else {
  739. pattern = tmp[0]
  740. }
  741. if len(tmp) == 2 {
  742. epos := strings.Split(tmp[1], ",")
  743. posm := map[string]int{}
  744. for _, v := range epos {
  745. ks := strings.Split(v, ":")
  746. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  747. posm[ks[1]] = qu.IntAll(ks[0])
  748. } else { //(.*)招标公告__2
  749. posm[rinfo.Field] = qu.IntAll(ks[0])
  750. }
  751. }
  752. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  753. } else {
  754. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  755. }
  756. ruleCores = append(ruleCores, rinfo)
  757. }, func(err interface{}) {
  758. log.Debug(rinfo.Code, rinfo.Field, err)
  759. })
  760. }
  761. }
  762. rcore.RuleCores = ruleCores
  763. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  764. }
  765. }
  766. //加载标签库
  767. func (e *ExtractTask) InitTag(isSite bool) {
  768. defer qu.Catch()
  769. var tagdetailinfodb string
  770. eSiteTag := map[string][]*Tag{}
  771. if isSite {
  772. tagdetailinfodb = "site_tagdetailinfo"
  773. e.SiteTag = map[string][]*Tag{}
  774. } else {
  775. tagdetailinfodb = "tagdetailinfo"
  776. e.Tag = map[string][]*Tag{}
  777. }
  778. //字符串标签库
  779. list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  780. var tmpMap sync.Map
  781. for _, v := range *list {
  782. field := qu.ObjToString(v["s_field"])
  783. if tmp, ok := v["content"].([]interface{}); ok {
  784. fname := qu.ObjToString(v["s_name"])
  785. tab := ju.TagFile{Name: fname} //用于表格kv
  786. tab.Items = make([]*ju.Tag, len(tmp))
  787. for k, key := range tmp {
  788. tag := &Tag{Type: "string", Key: key.(string)}
  789. if isSite {
  790. eSiteTag[field] = append(eSiteTag[field], tag)
  791. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  792. } else {
  793. e.Tag[field] = append(e.Tag[field], tag)
  794. }
  795. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false}
  796. }
  797. sort.Sort(tab.Items)
  798. //ju.TagdbTable[fname] = &tab
  799. if isSite {
  800. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  801. if (*sm) == nil || len(*sm) <= 0 {
  802. eSiteTag = map[string][]*Tag{}
  803. continue
  804. }
  805. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  806. if v2 == nil || v2 == "" {
  807. continue
  808. }
  809. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  810. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  811. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  812. } else {
  813. for k2, v2 := range eSiteTag {
  814. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  815. }
  816. }
  817. e.Luacodes.Store(v2, mdpvalue)
  818. }
  819. tmpMap.Store(fname, &tab)
  820. ju.SiteTagdbTable.Store(v2, tmpMap)
  821. }
  822. //ju.SiteTagdbTable.Store(fname, &tab)
  823. eSiteTag = map[string][]*Tag{}
  824. } else {
  825. ju.TagdbTable.Store(fname, &tab)
  826. }
  827. }
  828. //if isSite {
  829. // sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  830. // for _, v2 := range (*sm)["site_script"].([]interface{}) {
  831. // if mdpvalue, ok := Luacodes.Load(v2); ok {
  832. // if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{
  833. // mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  834. // }else {
  835. // for k2,v2 := range eSiteTag{
  836. // mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  837. // }
  838. // }
  839. // Luacodes.Store(v2, mdpvalue)
  840. // }
  841. // }
  842. // eSiteTag = map[string][]*Tag{}
  843. //}
  844. }
  845. //正则标签库
  846. list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  847. for _, v := range *list {
  848. field := qu.ObjToString(v["s_field"])
  849. if tmp, ok := v["content"].([]interface{}); ok {
  850. fname := qu.ObjToString(v["s_name"])
  851. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  852. tab.Items = make([]*ju.Tag, len(tmp))
  853. for k, key := range tmp {
  854. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  855. if isSite {
  856. eSiteTag[field] = append(eSiteTag[field], tag)
  857. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  858. } else {
  859. e.Tag[field] = append(e.Tag[field], tag)
  860. }
  861. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false}
  862. }
  863. sort.Sort(tab.Items)
  864. //ju.TagdbTable[fname+"_reg"] = &tab
  865. if isSite {
  866. ju.SiteTagdbTable.Store(fname+"_reg", &tab)
  867. } else {
  868. ju.TagdbTable.Store(fname+"_reg", &tab)
  869. }
  870. }
  871. if isSite {
  872. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  873. if (*sm) == nil || len(*sm) <= 0 {
  874. eSiteTag = map[string][]*Tag{}
  875. continue
  876. }
  877. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  878. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  879. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  880. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  881. } else {
  882. for k2, v2 := range eSiteTag {
  883. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  884. }
  885. }
  886. e.Luacodes.Store(v2, mdpvalue)
  887. }
  888. }
  889. eSiteTag = map[string][]*Tag{}
  890. }
  891. }
  892. }
  893. //获取fields
  894. func getALLFields() map[string]string {
  895. fields := map[string]string{}
  896. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  897. for _, v := range *list {
  898. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  899. }
  900. return fields
  901. }
  902. //加载clear函数
  903. func (e *ExtractTask) InitClearFn(isSite bool) {
  904. defer qu.Catch()
  905. var cleanupdb string
  906. if isSite {
  907. cleanupdb = "site_cleanup"
  908. e.SiteClearFn = map[string][]string{}
  909. } else {
  910. cleanupdb = "cleanup"
  911. }
  912. list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  913. fn := map[string][]string{}
  914. for _, tmp := range *list {
  915. field := tmp["s_field"].(string)
  916. fns := tmp["clear"].([]interface{})
  917. if fn[field] == nil {
  918. fn[field] = []string{}
  919. }
  920. for _, v := range fns {
  921. fn[field] = append(fn[field], v.(string))
  922. }
  923. if isSite {
  924. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1})
  925. if (*sm) == nil || len(*sm) <= 0 {
  926. fn = map[string][]string{}
  927. continue
  928. }
  929. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  930. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  931. if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil {
  932. mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn
  933. } else {
  934. for k2, v2 := range fn {
  935. mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2
  936. }
  937. }
  938. e.Luacodes.Store(v2, mdpvalue)
  939. }
  940. }
  941. fn = map[string][]string{}
  942. }
  943. }
  944. if !isSite {
  945. e.ClearFn = fn
  946. }
  947. }
  948. //加载省份
  949. func InitProvince(version string) map[string]interface{} {
  950. defer qu.Catch()
  951. fn := map[string]interface{}{}
  952. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  953. for _, v := range *list {
  954. name := qu.ObjToString(v["s_name"])
  955. content := v["content"]
  956. switch content.(type) {
  957. case string:
  958. fn[name] = []interface{}{content.(string)}
  959. case []interface{}:
  960. fn[name] = content
  961. }
  962. }
  963. return fn
  964. }
  965. //加载所有
  966. func InitProvincesx() []map[string]interface{} {
  967. defer qu.Catch()
  968. provinces := make([]map[string]interface{}, 0)
  969. ju.AddrsSess.Find(map[string]interface{}{
  970. "Remarks": nil,
  971. }).All(&provinces)
  972. return provinces
  973. }
  974. //加载站点库site城市信息
  975. func InitSite() []map[string]interface{} {
  976. defer qu.Catch()
  977. query := map[string]interface{}{
  978. "site_type": map[string]interface{}{
  979. "$ne": "代理机构",
  980. },
  981. }
  982. list, _ := ju.Site_Mgo.Find("site", query, nil, map[string]interface{}{
  983. "site": 1,
  984. "area": 1,
  985. "city": 1,
  986. "district": 1,
  987. })
  988. return list
  989. //list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
  990. //return *list
  991. }
  992. //加载新疆兵团映射关系
  993. func (e *ExtractTask) InitXjbtCityInfo() {
  994. defer qu.Catch()
  995. //加载数据
  996. query := map[string]interface{}{}
  997. list, _ := db.Mgo.Find("area_xjbt", query, nil, nil, false, -1, -1)
  998. arr := []map[string]interface{}{}
  999. for _, v := range *list {
  1000. delete(v, "_id")
  1001. arr = append(arr, v)
  1002. }
  1003. e.XjbtCityArr = arr
  1004. }
  1005. //站点加载...
  1006. func (e *ExtractTask) InitUpdateSite() {
  1007. defer qu.Catch()
  1008. e.SiteCityMap = make(map[string]*SiteCity)
  1009. for _, v := range InitSite() {
  1010. site := qu.ObjToString(v["site"])
  1011. area := qu.ObjToString(v["area"])
  1012. city := qu.ObjToString(v["city"])
  1013. district := qu.ObjToString(v["district"])
  1014. if area != "" && area != "全国" && site != "" {
  1015. s := &SiteCity{
  1016. P: area,
  1017. C: city,
  1018. D: district,
  1019. }
  1020. e.SiteCityMap[site] = s
  1021. }
  1022. }
  1023. log.Debug("有效站点数量:", len(e.SiteCityMap))
  1024. }
  1025. func (e *ExtractTask) InitCityInfo() {
  1026. defer qu.Catch()
  1027. e.InitVar() //初始化变量
  1028. //新疆兵团数据
  1029. e.InitXjbtCityInfo()
  1030. //site站点信息
  1031. e.InitUpdateSite()
  1032. //初始化省信息
  1033. fn1 := InitProvince(e.TaskInfo.Version)
  1034. for k, v := range fn1 {
  1035. for _, p := range v.([]interface{}) {
  1036. p1, _ := p.(string)
  1037. e.Trie_Full_Province.AddWords(p1) //华中科技大学
  1038. e.ProvinceMap[p1] = k //华中科技大学:湖北
  1039. }
  1040. }
  1041. alldata := InitProvincesx()
  1042. fnx := make([]map[string]interface{}, 0)
  1043. citys_maps := make(map[string][]map[string]interface{}, 0)
  1044. districts_maps := make(map[string]map[string][]map[string]interface{}, 0)
  1045. towns_maps := make(map[string]map[string]map[string][]map[string]interface{}, 0)
  1046. jwhs_maps := make(map[string]map[string]map[string]map[string][]map[string]interface{}, 0)
  1047. for _, v := range alldata {
  1048. codenum := len(v["code"].(string))
  1049. province := qu.ObjToString(v["province"])
  1050. city := qu.ObjToString(v["city"])
  1051. district := qu.ObjToString(v["district"])
  1052. town := qu.ObjToString(v["town"])
  1053. if codenum == 2 {
  1054. fnx = append(fnx, v)
  1055. } else if codenum == 4 {
  1056. citys_maps[province] = append(citys_maps[province], v)
  1057. } else if codenum == 6 {
  1058. if districts_maps[province] == nil {
  1059. districts_maps[province] = make(map[string][]map[string]interface{}, 0)
  1060. }
  1061. districts_maps[province][city] = append(districts_maps[province][city], v)
  1062. } else if codenum == 9 {
  1063. if towns_maps[province] == nil {
  1064. towns_maps[province] = make(map[string]map[string][]map[string]interface{}, 0)
  1065. }
  1066. if towns_maps[province][city] == nil {
  1067. towns_maps[province][city] = make(map[string][]map[string]interface{}, 0)
  1068. }
  1069. towns_maps[province][city][district] = append(towns_maps[province][city][district], v)
  1070. } else if codenum == 12 {
  1071. if jwhs_maps[province] == nil {
  1072. jwhs_maps[province] = make(map[string]map[string]map[string][]map[string]interface{}, 0)
  1073. }
  1074. if jwhs_maps[province][city] == nil {
  1075. jwhs_maps[province][city] = make(map[string]map[string][]map[string]interface{}, 0)
  1076. }
  1077. if jwhs_maps[province][city][district] == nil {
  1078. jwhs_maps[province][city][district] = make(map[string][]map[string]interface{}, 0)
  1079. }
  1080. jwhs_maps[province][city][district][town] = append(jwhs_maps[province][city][district][town], v)
  1081. }
  1082. }
  1083. //初始化城市全称
  1084. for _, provinces := range fnx {
  1085. all_province := qu.ObjToString(provinces["all_province"]) //省全称
  1086. jc_province := qu.ObjToString(provinces["province"]) //省简称
  1087. //加载省信息
  1088. e.Trie_Full_Province.AddWords(all_province) //加入省全称Trie(k:浙江省)
  1089. p := &Province{}
  1090. p.Name = all_province //省全称:浙江省
  1091. p.Brief = jc_province //省简称:浙江
  1092. e.Trie_Sim_Province.AddWords(jc_province) //加入省简称Trie(k:浙江)
  1093. e.ProvinceMap[all_province] = jc_province //浙江省:浙江
  1094. e.ProvinceBriefMap[jc_province] = p //浙江:省信息{}
  1095. if province_alias, ok := provinces["province_alias"].([]interface{}); ok {
  1096. for _, vprovince_alias := range province_alias {
  1097. e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p
  1098. }
  1099. }
  1100. //加载市信息
  1101. citys := citys_maps[jc_province]
  1102. isok := make(map[string]bool)
  1103. for _, vcity := range citys {
  1104. qc_city := qu.ObjToString(vcity["city"])
  1105. jc_city := qu.ObjToString(vcity["brief_city"])
  1106. e.Trie_Full_City.AddWords(qc_city) //加入市全称Trie(k:杭州市)
  1107. e.SensitiveFullCity.AddWord(qc_city)
  1108. c := &City{}
  1109. c.Name = qc_city //市全称:杭州市
  1110. if jc_city != "" {
  1111. c.Brief = jc_city //市简称:杭州
  1112. e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州)
  1113. e.SensitiveSimCity.AddWord(c.Brief)
  1114. e.CityMap[qc_city] = c.Brief //杭州市:杭州
  1115. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  1116. e.CityFullMap[qc_city] = c //杭州市:市信息{}
  1117. }
  1118. c.P = p
  1119. if city_alias, ok := vcity["city_alias"].([]interface{}); ok {
  1120. for _, vcity_alias := range city_alias {
  1121. strvcity_alias := qu.ObjToString(vcity_alias)
  1122. if isok[jc_province+"_"+strvcity_alias] {
  1123. continue
  1124. }
  1125. e.CityBriefMap[strvcity_alias] = c
  1126. e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps)
  1127. isok[jc_province+"_"+strvcity_alias] = true
  1128. }
  1129. }
  1130. if isok[jc_province+"_"+qc_city] {
  1131. continue
  1132. }
  1133. e.initDistricts(jc_province, qc_city, c, jc_city, districts_maps, towns_maps, jwhs_maps)
  1134. }
  1135. }
  1136. e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community}
  1137. e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District}
  1138. }
  1139. //加载区县
  1140. func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
  1141. jc_city string, districts_maps map[string]map[string][]map[string]interface{},
  1142. towns_maps map[string]map[string]map[string][]map[string]interface{},
  1143. jwhs_maps map[string]map[string]map[string]map[string][]map[string]interface{}) {
  1144. districts := districts_maps[jc_province][qc_city]
  1145. for _, vdistricts := range districts {
  1146. qc_district := qu.ObjToString(vdistricts["district"])
  1147. jc_district := qu.ObjToString(vdistricts["brief_district"])
  1148. d := &District{}
  1149. d.Name = qc_district
  1150. d.C = c
  1151. e.Trie_Full_District.AddWords(qc_district) //加入区或县全称Trie
  1152. if jc_district != "" {
  1153. e.Trie_Sim_District.AddWords(jc_district) //加入区或县简称Trie
  1154. //初始化城市简称
  1155. c := e.CityBriefMap[jc_city]
  1156. dfullarr := e.DistrictSimAndAll[jc_district]
  1157. dfullcity := map[string]*City{qc_district: c}
  1158. if len(dfullarr) == 0 {
  1159. tmparr := []map[string]*City{dfullcity}
  1160. e.DistrictSimAndAll[jc_district] = tmparr
  1161. } else {
  1162. e.DistrictSimAndAll[jc_district] = append(e.DistrictSimAndAll[jc_district], dfullcity)
  1163. }
  1164. }
  1165. ctmp := e.DistrictCityMap[qc_district]
  1166. if len(ctmp) == 0 {
  1167. tmpcarr := []*City{c}
  1168. e.DistrictCityMap[qc_district] = tmpcarr
  1169. } else {
  1170. e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c)
  1171. }
  1172. if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
  1173. for _, vdistrict_alias := range district_alias {
  1174. strvdistrict_alias := qu.ObjToString(vdistrict_alias)
  1175. e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
  1176. c_tmp := e.DistrictCityMap[strvdistrict_alias]
  1177. if len(c_tmp) == 0 {
  1178. tmpcarr := []*City{c}
  1179. e.DistrictCityMap[strvdistrict_alias] = tmpcarr
  1180. } else {
  1181. e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
  1182. }
  1183. }
  1184. }
  1185. //街道
  1186. towns := towns_maps[jc_province][qc_city][qc_district]
  1187. for _, vtown := range towns {
  1188. strvtown := qu.ObjToString(vtown["town"])
  1189. s := &Street{}
  1190. s.Name = strvtown
  1191. s.D = d
  1192. e.Trie_Full_Street.AddWords(strvtown) //加入街道全称Trie
  1193. dtmp := e.StreetDistrictMap[strvtown]
  1194. if len(dtmp) == 0 {
  1195. tmpdarr := []*District{d}
  1196. e.StreetDistrictMap[strvtown] = tmpdarr
  1197. } else {
  1198. e.StreetDistrictMap[strvtown] = append(e.StreetDistrictMap[strvtown], d)
  1199. }
  1200. //村、居委会
  1201. //jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown]
  1202. //for _, vjwh := range jwhs {
  1203. // strvillage := qu.ObjToString(vjwh["village"])
  1204. // e.Trie_Full_Community.AddWords(strvillage) //加入居委会、村全称Trie
  1205. // cttmp := e.CommunityDistrictMap[strvillage]
  1206. // if len(cttmp) == 0 {
  1207. // tmpdarr := []*District{d}
  1208. // e.CommunityDistrictMap[strvillage] = tmpdarr
  1209. // } else {
  1210. // e.CommunityDistrictMap[strvillage] = append(e.CommunityDistrictMap[strvillage], d)
  1211. // }
  1212. //}
  1213. }
  1214. }
  1215. }
  1216. func (e *ExtractTask) InitVar() {
  1217. defer qu.Catch()
  1218. //初始化Trie
  1219. //全称
  1220. e.Trie_Full_Province = &ju.Trie{}
  1221. e.Trie_Full_City = &ju.Trie{}
  1222. e.Trie_Full_District = &ju.Trie{}
  1223. e.Trie_Full_Street = &ju.Trie{}
  1224. e.Trie_Full_Community = &ju.Trie{}
  1225. //简称
  1226. e.Trie_Sim_Province = &ju.Trie{}
  1227. e.Trie_Sim_City = &ju.Trie{}
  1228. e.Trie_Sim_District = &ju.Trie{}
  1229. //初始化分词
  1230. e.Seg_PCD = &gse.Segmenter{}
  1231. e.Seg_SV = &gse.Segmenter{}
  1232. e.Seg_PCD.LoadDict("./res/pcd.txt")
  1233. e.Seg_SV.LoadDict("./res/sv.txt")
  1234. //初始化城市相关
  1235. e.SiteCityMap = make(map[string]*SiteCity)
  1236. e.ProvinceMap = make(map[string]string)
  1237. e.CityMap = make(map[string]string)
  1238. e.DistrictSimAndAll = make(map[string][]map[string]*City)
  1239. e.CityBriefMap = make(map[string]*City)
  1240. e.CityFullMap = make(map[string]*City)
  1241. e.ProvinceBriefMap = make(map[string]*Province)
  1242. e.DistrictCityMap = make(map[string][]*City)
  1243. e.StreetDistrictMap = make(map[string][]*District)
  1244. //新疆兵团-数组
  1245. e.XjbtCityArr = make([]map[string]interface{}, 0)
  1246. //敏感词-筛选
  1247. e.SensitiveFullCity = sensitive.New()
  1248. e.SensitiveSimCity = sensitive.New()
  1249. }
  1250. //初始化邮编库
  1251. func (e *ExtractTask) InitPostCode() {
  1252. defer qu.Catch()
  1253. e.PostCodeMap = make(map[string]*PostCode)
  1254. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  1255. for _, l := range *list {
  1256. pc := &PostCode{}
  1257. pc.Code = qu.ObjToString(l["code"])
  1258. pc.P = qu.ObjToString(l["province"])
  1259. pc.C = qu.ObjToString(l["city"])
  1260. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  1261. e.PostCodeMap[pc.Code] = pc
  1262. }
  1263. }
  1264. //初始化区号库
  1265. func (e *ExtractTask) InitAreaCode() {
  1266. defer qu.Catch()
  1267. e.AreaCodeMap = make(map[string]*AreaCode)
  1268. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  1269. for _, l := range *list {
  1270. ac := &AreaCode{}
  1271. ac.Code = qu.ObjToString(l["code"])
  1272. ac.P = qu.ObjToString(l["province"])
  1273. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  1274. e.AreaCodeMap[ac.Code] = ac
  1275. }
  1276. }
  1277. //保存抽取详情数据
  1278. func (e *ExtractTask) ResultSave(init bool) {
  1279. defer qu.Catch()
  1280. e.RWMutex.Lock()
  1281. if e.ResultArr == nil {
  1282. e.ResultArr = [][]map[string]interface{}{}
  1283. }
  1284. e.RWMutex.Unlock()
  1285. if init {
  1286. go func() {
  1287. for {
  1288. e.RWMutex.Lock()
  1289. if len(e.ResultArr) > saveLimit {
  1290. arr := e.ResultArr[:saveLimit]
  1291. e.ResultArr = e.ResultArr[saveLimit:]
  1292. e.RWMutex.Unlock()
  1293. qu.Try(func() {
  1294. db.Mgo.UpSertBulk("extract_result", arr...)
  1295. }, func(err interface{}) {
  1296. log.Debug(err)
  1297. })
  1298. } else {
  1299. arr := e.ResultArr
  1300. e.ResultArr = [][]map[string]interface{}{}
  1301. e.RWMutex.Unlock()
  1302. qu.Try(func() {
  1303. db.Mgo.UpSertBulk("extract_result", arr...)
  1304. }, func(err interface{}) {
  1305. log.Debug(err)
  1306. })
  1307. }
  1308. time.Sleep(2 * time.Second)
  1309. }
  1310. }()
  1311. } else {
  1312. e.RWMutex.Lock()
  1313. arr := e.ResultArr
  1314. e.ResultArr = [][]map[string]interface{}{}
  1315. e.RWMutex.Unlock()
  1316. qu.Try(func() {
  1317. lenarr := len(arr)
  1318. for {
  1319. if lenarr > saveLimit {
  1320. arr2 := arr[:saveLimit]
  1321. arr = arr[saveLimit:]
  1322. lenarr = len(arr)
  1323. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1324. } else {
  1325. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1326. break
  1327. }
  1328. }
  1329. }, func(err interface{}) {
  1330. defer e.RWMutex.Unlock()
  1331. log.Debug(err)
  1332. })
  1333. }
  1334. }
  1335. //保存抽取数据
  1336. func (e *ExtractTask) BidSave(init bool) {
  1337. defer qu.Catch()
  1338. e.RWMutex.Lock()
  1339. if e.BidArr == nil {
  1340. e.BidArr = [][]map[string]interface{}{}
  1341. }
  1342. e.RWMutex.Unlock()
  1343. if init {
  1344. go func() {
  1345. for {
  1346. e.RWMutex.Lock()
  1347. if len(e.BidArr) > saveLimit {
  1348. arr := e.BidArr[:saveLimit]
  1349. e.BidArr = e.BidArr[saveLimit:]
  1350. e.RWMutex.Unlock()
  1351. //arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1352. arr, _, _, _ = getFieldAllAndBlocks(arr)
  1353. qu.Try(func() {
  1354. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1355. //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1356. //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1357. //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1358. }, func(err interface{}) {
  1359. log.Debug(err)
  1360. })
  1361. } else {
  1362. arr := e.BidArr
  1363. e.BidArr = [][]map[string]interface{}{}
  1364. e.RWMutex.Unlock()
  1365. arr, _, _, _ = getFieldAllAndBlocks(arr)
  1366. qu.Try(func() {
  1367. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1368. //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1369. //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1370. //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1371. }, func(err interface{}) {
  1372. log.Debug(err)
  1373. })
  1374. }
  1375. time.Sleep(2 * time.Second)
  1376. }
  1377. }()
  1378. } else {
  1379. e.RWMutex.Lock()
  1380. arr := e.BidArr
  1381. e.BidArr = [][]map[string]interface{}{}
  1382. e.RWMutex.Unlock()
  1383. qu.Try(func() {
  1384. lenarr := len(arr)
  1385. for {
  1386. if lenarr > saveLimit {
  1387. arr2 := arr[:saveLimit]
  1388. arr = arr[saveLimit:]
  1389. lenarr = len(arr)
  1390. arr2, _, _, _ = getFieldAllAndBlocks(arr2)
  1391. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1392. //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1393. //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1394. //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1395. } else {
  1396. arr, _, _, _ := getFieldAllAndBlocks(arr)
  1397. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1398. //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1399. //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1400. //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1401. break
  1402. }
  1403. }
  1404. }, func(err interface{}) {
  1405. log.Debug(err)
  1406. })
  1407. time.Sleep(1 * time.Second)
  1408. }
  1409. }
  1410. func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) {
  1411. arr = [][]map[string]interface{}{}
  1412. blocks = []map[string]interface{}{}
  1413. fieldalls = []map[string]interface{}{}
  1414. fieldallsf = []map[string]interface{}{}
  1415. for _, v := range a {
  1416. _id, _ := v[0]["_id"]
  1417. if tmp, ok := v[1]["$set"].(map[string]interface{}); ok {
  1418. if ju.SaveBlock {
  1419. if tmp["blocks"] != nil {
  1420. block := map[string]interface{}{
  1421. "_id": _id,
  1422. "blocks": tmp["blocks"],
  1423. }
  1424. blocks = append(blocks, block)
  1425. }
  1426. }
  1427. delete(tmp, "blocks")
  1428. if ju.FieldsFind {
  1429. if f, ok := tmp["fieldall"].(map[string][]map[string]interface{}); ok {
  1430. fieldall := map[string]interface{}{
  1431. "_id": _id,
  1432. }
  1433. for k, v := range f {
  1434. fieldall[k] = v
  1435. }
  1436. fieldalls = append(fieldalls, fieldall)
  1437. }
  1438. if ff, ok := tmp["fieldallf"].(map[string][]map[string]interface{}); ok {
  1439. fieldallf := map[string]interface{}{
  1440. "_id": _id,
  1441. }
  1442. for k, v := range ff {
  1443. fieldallf[k] = v
  1444. }
  1445. fieldallsf = append(fieldalls, fieldallf)
  1446. }
  1447. }
  1448. delete(tmp, "fieldall")
  1449. delete(tmp, "fieldallf")
  1450. v[1] = tmp //全部更新
  1451. //v[1]["$set"] = tmp //指定更新~针对指定projectname
  1452. }
  1453. arr = append(arr, v)
  1454. }
  1455. return arr, blocks, fieldalls, fieldallsf
  1456. }
  1457. func (e *ExtractTask) InitAuditRecogField() {
  1458. defer qu.Catch()
  1459. e.RecogFieldMap = make(map[string]map[string]interface{})
  1460. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1461. for _, f := range *recogFieldList {
  1462. field := qu.ObjToString(f["s_recogfield"])
  1463. e.RecogFieldMap[field] = f
  1464. }
  1465. }
  1466. func (e *ExtractTask) InitAuditClass() {
  1467. defer qu.Catch()
  1468. e.FidClassMap = make(map[string][]map[string]interface{})
  1469. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1470. for _, c := range *class {
  1471. classList := []map[string]interface{}{}
  1472. fid := qu.ObjToString(c["s_fid"])
  1473. if len(e.FidClassMap[fid]) > 0 { //追加
  1474. classList = e.FidClassMap[fid]
  1475. }
  1476. classList = append(classList, c)
  1477. e.FidClassMap[fid] = classList
  1478. }
  1479. }
  1480. //加载规则
  1481. func (e *ExtractTask) InitAuditRule() {
  1482. defer qu.Catch()
  1483. var rureg *regexp.Regexp
  1484. var rs []rune
  1485. var ru string
  1486. var err error
  1487. e.CidRuleMap = make(map[string][]map[string]interface{})
  1488. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1489. for _, v := range *rule {
  1490. i_rule := []interface{}{}
  1491. ss, _ := (v["s_rule"].([]interface{}))
  1492. for _, r := range qu.ObjArrToStringArr(ss) {
  1493. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1494. rs = []rune(r)
  1495. ru = string(rs[1 : len(rs)-1])
  1496. rureg, err = regexp.Compile(ru)
  1497. if err != nil {
  1498. log.Debug("error---rule:", r)
  1499. continue
  1500. }
  1501. i_rule = append(i_rule, []interface{}{rureg}...)
  1502. } else { //规则
  1503. i_rule = append(i_rule, r)
  1504. }
  1505. }
  1506. v["rule"] = i_rule
  1507. ruleList := []map[string]interface{}{}
  1508. classid := qu.ObjToString(v["s_classid"])
  1509. if len(e.CidRuleMap[classid]) > 0 { //追加
  1510. ruleList = e.CidRuleMap[classid]
  1511. }
  1512. ruleList = append(ruleList, v)
  1513. e.CidRuleMap[classid] = ruleList
  1514. }
  1515. }
  1516. //
  1517. func (e *ExtractTask) InitAuditFields() {
  1518. if len(e.AuditFields) == 0 {
  1519. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1520. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1521. vid := qu.BsonIdToSId((*v)["_id"])
  1522. query := map[string]interface{}{
  1523. "isaudit": true,
  1524. "delete": false,
  1525. "vid": vid,
  1526. }
  1527. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1528. for _, d := range *data {
  1529. field := qu.ObjToString(d["s_field"])
  1530. e.AuditFields = append(e.AuditFields, field)
  1531. }
  1532. }
  1533. }
  1534. }
  1535. //加载附件抽取
  1536. func (e *ExtractTask) InitFile() {
  1537. defer qu.Catch()
  1538. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1539. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1540. //ve, _ := db.Mgo.FindOne("version", query)
  1541. if ve == nil {
  1542. return
  1543. }
  1544. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1545. e.IsFileField = true
  1546. }
  1547. syscefiled := new(sync.Map)
  1548. if (*ve)["s_filefileds"] != nil {
  1549. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1550. syscefiled.Store(vff.(string), 1)
  1551. }
  1552. }
  1553. e.FileFields = syscefiled
  1554. ju.InitOss(ju.Config["istest"].(bool))
  1555. }
  1556. //加载清理任务信息
  1557. func (c *ClearTask) InitClearTaskInfo() {
  1558. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1559. if len(*cleartask) > 1 {
  1560. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1561. c.ClearTaskInfo = &ClearTaskInfo{
  1562. Name: (*cleartask)["s_taskname"].(string),
  1563. Version: (*cleartask)["s_version"].(string),
  1564. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1565. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1566. FromDB: (*cleartask)["s_mgodb"].(string),
  1567. FromColl: (*cleartask)["s_mgocoll"].(string),
  1568. IsCltLog: ju.Config["iscltlog"].(bool),
  1569. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1570. }
  1571. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1572. } else {
  1573. return
  1574. }
  1575. }
  1576. //加载清理脚本
  1577. func (c *ClearTask) InitClearLuas() {
  1578. defer qu.Catch()
  1579. c.ClearLuas = make(map[string][]*ClearLua)
  1580. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1581. for _, l := range *list {
  1582. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1583. continue
  1584. }
  1585. s_field := qu.ObjToString(l["s_field"])
  1586. pid := qu.BsonIdToSId(l["_id"])
  1587. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1588. for _, vv := range *luas {
  1589. if b, _ := vv["isuse"].(bool); !b {
  1590. continue
  1591. }
  1592. clearLua := &ClearLua{
  1593. Field: s_field,
  1594. Code: vv["s_code"].(string),
  1595. Name: vv["s_name"].(string),
  1596. LuaText: vv["s_luascript"].(string),
  1597. LFields: getALLFields(),
  1598. }
  1599. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1600. }
  1601. }
  1602. }
  1603. //加载分块规则
  1604. func (e *ExtractTask) InitBlockRule() {
  1605. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1606. "vid": e.TaskInfo.VersionId,
  1607. "delete": false,
  1608. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1609. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1610. for _, v := range *datas {
  1611. block_reg, _ := v["block_reg"].(string)
  1612. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1613. title_reg, _ := v["title_reg"].(string)
  1614. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1615. if block_reg == "" || title_reg == "" {
  1616. continue
  1617. }
  1618. b_reg, b_err := regexp.Compile(block_reg)
  1619. t_reg, t_err := regexp.Compile(title_reg)
  1620. if b_err != nil || t_err != nil {
  1621. continue
  1622. }
  1623. brs = append(brs, b_reg)
  1624. trs = append(trs, t_reg)
  1625. }
  1626. e.RuleBlock = &ju.RuleBlock{
  1627. BlockRegs: brs,
  1628. TitleRegs: trs,
  1629. Classify: e.InitBlockClassify(),
  1630. }
  1631. }
  1632. //加载分块规则
  1633. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1634. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1635. "vid": e.TaskInfo.VersionId,
  1636. "delete": false,
  1637. }, nil, `{"name":1}`, false, -1, -1)
  1638. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1639. "vid": e.TaskInfo.VersionId,
  1640. "delete": false,
  1641. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1642. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1643. "vid": e.TaskInfo.VersionId,
  1644. "delete": false,
  1645. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1646. tag_map := map[string]ju.Tags{}
  1647. for _, v := range *classify_tag {
  1648. pid := qu.ObjToString(v["pid"])
  1649. name := qu.ObjToString(v["name"])
  1650. tag := &ju.Tag{Value: name}
  1651. if strings.HasPrefix(name, "reg__") {
  1652. tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__"))
  1653. }
  1654. tag_map[pid] = append(tag_map[pid], tag)
  1655. }
  1656. //
  1657. info_map := map[string][]*ju.NameCode{}
  1658. info_tag := map[string]*ju.TagFile{}
  1659. for _, v := range *classify_info {
  1660. pid := qu.ObjToString(v["pid"])
  1661. _id := qu.BsonIdToSId(v["_id"])
  1662. name := qu.ObjToString(v["name"])
  1663. info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
  1664. info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
  1665. }
  1666. classify_map := map[string][]*ju.NameCode{}
  1667. for _, v := range *classify {
  1668. _id := qu.BsonIdToSId(v["_id"])
  1669. if info_map[_id] == nil {
  1670. continue
  1671. }
  1672. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1673. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1674. }
  1675. }
  1676. return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
  1677. }