extractInit.go 58 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. "gopkg.in/mgo.v2/bson"
  14. log "github.com/donnie4w/go-logger/logger"
  15. "github.com/go-ego/gse"
  16. )
  17. type RegLuaInfo struct {
  18. //正则或脚本信息
  19. Code, Name, Field string //
  20. RuleText string //
  21. IsLua bool //
  22. RegPreBac *ExtReg //
  23. RegCore *ExtReg //
  24. }
  25. type ExtReg struct {
  26. Reg *regexp.Regexp
  27. Replace string
  28. Bextract bool
  29. ExtractPos map[string]int
  30. NumSign int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
  31. }
  32. type RuleCore struct {
  33. Id string //id
  34. Field string //逻辑字段
  35. LuaLogic string //进入逻辑
  36. ExtFrom string //从哪个字段抽取
  37. RulePres []*RegLuaInfo //抽取前置规则
  38. RuleBacks []*RegLuaInfo //抽取后置规则
  39. RuleCores []*RegLuaInfo //抽取规则
  40. KVRuleCores []*RegLuaInfo //KV抽取清理规则
  41. LFields map[string]string //所有字段属性组
  42. }
  43. type Tag struct {
  44. Type string //标签类型 string 字符串、regexp 正则
  45. Key string //
  46. Reg *regexp.Regexp //
  47. }
  48. type TaskInfo struct {
  49. Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
  50. FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
  51. ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
  52. TestColl, LastExtId string //测试结果表、上次抽取信息id
  53. FDB *db.Pool //数据库连接池
  54. TDB *db.Pool //数据库连接池
  55. IsEtxLog bool //是否开启抽取日志
  56. ProcessPool chan bool //任务进程池
  57. TestLua bool //检查测试用
  58. }
  59. type ExtractTask struct {
  60. Id string //任务id
  61. IsRun bool //是否启动
  62. Content string //信息内容
  63. TaskInfo *TaskInfo //任务信息
  64. RulePres []*RegLuaInfo //通用前置规则
  65. RuleBacks []*RegLuaInfo //通用后置规则
  66. SiteRuleBacks []*RegLuaInfo //站点通用后置规则
  67. RuleBlock *ju.RuleBlock
  68. //RuleCores []*RuleCore //抽取规则
  69. RuleCores map[string]map[string][]*RuleCore //分类抽取规则
  70. SiteRuleCores map[string]map[string][]*RuleCore //站点分类抽取规则
  71. PkgRuleCores []*RuleCore //分包抽取规则
  72. Tag map[string][]*Tag //标签库
  73. SiteTag map[string][]*Tag //站点标签库
  74. ClearFn map[string][]string //清理函数
  75. SiteClearFn map[string][]string //站点清理函数
  76. IsExtractCity bool //是否开启城市抽取
  77. Fields map[string]int //抽取属性组
  78. SiteFields map[string]int //抽取站点属性组
  79. IsFileField bool //是否开启附件抽取
  80. FileFields *sync.Map //抽取附件属性组
  81. ResultChanel chan bool //抽取结果详情
  82. sync.RWMutex
  83. ResultArr [][]map[string]interface {
  84. } //抽取结果详情
  85. BidChanel chan bool //抽取结果
  86. BidArr [][]map[string]interface {
  87. } //抽取结果
  88. BidTotal int //结果数量
  89. RecogFieldMap map[string]map[string]interface {
  90. } //识别字段
  91. FidClassMap map[string][]map[string]interface {
  92. } //分类
  93. CidRuleMap map[string][]map[string]interface {
  94. } //规则
  95. AuditFields []string //需要审核的字段名称
  96. SiteCityMap map[string]*SiteCity //站点对应的省市区
  97. ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
  98. ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
  99. CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
  100. CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
  101. CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
  102. DistrictCityMap map[string]*City
  103. NewDistrictCityMap map[string][]*City //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
  104. DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
  105. NewDistrictSimAndAll map[string][]map[string]*City //区或县(key:简称 val: 相同简称的区全称:所在市)
  106. StreetDistrictMap map[string]*District //街道对应的区或县
  107. NewStreetDistrictMap map[string][]*District //街道全称对应的区或县
  108. CommunityDistrictMap map[string][]*District //村、居委会对应的区或县
  109. ProvinceAllGet *ju.DFA //省全称
  110. ProvinceSimGet *ju.DFA //省简称
  111. CityAllGet *ju.DFA //市全称
  112. CitySimGet *ju.DFA //市简称
  113. DistrictAllGet *ju.DFA //区或县全称
  114. DistrictSimGet *ju.DFA //区或县简称
  115. StreetGet *ju.DFA //街道
  116. PostCodeMap map[string]*PostCode //邮编
  117. AreaCodeMap map[string]*AreaCode //区号
  118. InfoType []map[string]interface {
  119. }
  120. Trie_Full_Province *ju.Trie //省全称 省、直辖市、自治区
  121. Trie_Full_City *ju.Trie //市全称 地级市
  122. Trie_Full_District *ju.Trie //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
  123. Trie_Full_Street *ju.Trie //街道、乡镇全称 镇、乡、民族乡、县辖区、街道
  124. Trie_Full_Community *ju.Trie //村/委员会全称 村、居委会
  125. Trie_Sim_Province *ju.Trie //省简称
  126. Trie_Sim_City *ju.Trie //市简称
  127. Trie_Sim_District *ju.Trie //县简称
  128. Trie_Fulls []*ju.Trie //所有全称
  129. Trie_Sims []*ju.Trie //所有简称
  130. Seg_PCD *gse.Segmenter //分词
  131. Seg_SV *gse.Segmenter //分词
  132. Luacodes *sync.Map //站点规则
  133. SiteMerge *sync.Map //抽取合并
  134. }
  135. type SiteCity struct {
  136. P string //省简称
  137. C string //市全称
  138. D string //区全称
  139. }
  140. type ClearTaskInfo struct {
  141. Name, Version, VersionId string //名称、版本、版本id
  142. FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
  143. FDB *db.Pool //数据库连接池
  144. TDB *db.Pool //数据库连接池
  145. IsCltLog bool //是否开启清理日志
  146. ProcessPool chan bool //任务进程池
  147. }
  148. type ClearLua struct {
  149. Field string //字段字段
  150. Code string //代码
  151. Name string //名称
  152. LuaText string
  153. //LuaLogic string //进入逻辑
  154. //ExtFrom string //从哪个字段抽取
  155. LFields map[string]string //lua抽取字段属性组
  156. }
  157. type ClearTask struct {
  158. sync.RWMutex
  159. Id string //任务id
  160. Content string //信息内容
  161. ClearTaskInfo *ClearTaskInfo //任务信息
  162. ClearLuas map[string][]*ClearLua //清理脚本
  163. UpdateResult [][]map[string]interface{} //清理后结果
  164. //ClearChannel chan bool
  165. }
  166. func init() {
  167. TaskList = make(map[string]*ExtractTask)
  168. ClearTaskList = make(map[string]*ClearTask)
  169. go SaveExtLog()
  170. go SaveCltLog() //保存清理日志
  171. }
  172. //加载任务信息
  173. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  174. task, _ := db.Mgo.FindById("task", e.Id, nil)
  175. if len(*task) > 1 {
  176. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  177. e.TaskInfo = &TaskInfo{
  178. Name: (*task)["s_taskname"].(string),
  179. Version: (*task)["s_version"].(string),
  180. VersionId: qu.BsonIdToSId((*v)["_id"]),
  181. TrackColl: trackcoll,
  182. FromDbAddr: (*task)["s_mgoaddr"].(string),
  183. FromDB: (*task)["s_mgodb"].(string),
  184. FromColl: (*task)["s_mgocoll"].(string),
  185. TestColl: resultcoll,
  186. IsEtxLog: true,
  187. ProcessPool: make(chan bool, 1),
  188. }
  189. if (*v)["isextractcity"] != nil {
  190. e.IsExtractCity = (*v)["isextractcity"].(bool)
  191. }
  192. } else {
  193. return
  194. }
  195. }
  196. //加载任务信息
  197. func (e *ExtractTask) InitTaskInfo() {
  198. task, _ := db.Mgo.FindById("task", e.Id, nil)
  199. log.Debug("task", task)
  200. if len(*task) > 1 {
  201. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  202. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  203. log.Debug("s_mgosavecoll", strs)
  204. if len(strs) < 3 {
  205. return
  206. } else {
  207. e.TaskInfo = &TaskInfo{
  208. Name: (*task)["s_taskname"].(string),
  209. Version: (*task)["s_version"].(string),
  210. VersionId: qu.BsonIdToSId((*v)["_id"]),
  211. //TrackColl: (*task)["s_trackcoll"].(string),
  212. FromDbAddr: (*task)["s_mgoaddr"].(string),
  213. FromDB: (*task)["s_mgodb"].(string),
  214. FromColl: (*task)["s_mgocoll"].(string),
  215. ToDbAddr: strs[0],
  216. ToDB: strs[1],
  217. ToColl: strs[2],
  218. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  219. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  220. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  221. }
  222. if (*v)["isextractcity"] != nil {
  223. e.IsExtractCity = (*v)["isextractcity"].(bool)
  224. }
  225. }
  226. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  227. } else {
  228. return
  229. }
  230. }
  231. func (e *ExtractTask) InitSite() {
  232. e.Luacodes = &sync.Map{}
  233. e.SiteMerge = &sync.Map{}
  234. sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1)
  235. for _, v := range *sites {
  236. if vv, ok := v["site_script"].([]interface{}); ok {
  237. for _, vvv := range vv {
  238. e.Luacodes.Store(vvv, map[string]interface{}{})
  239. e.SiteMerge.Store(vvv, v["ismerge"].(bool))
  240. }
  241. } else if vv, ok := v["site_script"].(interface{}); ok {
  242. e.Luacodes.Store(vv, map[string]interface{}{})
  243. e.SiteMerge.Store(vv, v["ismerge"].(bool))
  244. }
  245. }
  246. }
  247. //加载通用前置规则
  248. func (e *ExtractTask) InitRulePres() {
  249. defer qu.Catch()
  250. e.RulePres = []*RegLuaInfo{}
  251. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  252. for _, v := range *list {
  253. rinfo := &RegLuaInfo{
  254. Code: v["s_code"].(string),
  255. Name: v["s_name"].(string),
  256. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  257. }
  258. if rinfo.IsLua {
  259. rinfo.RuleText = v["s_luascript"].(string)
  260. e.RulePres = append(e.RulePres, rinfo)
  261. } else {
  262. qu.Try(func() {
  263. rinfo.RuleText = v["s_rule"].(string)
  264. tmp := strings.Split(rinfo.RuleText, "__")
  265. var pattern string
  266. if strings.Contains(tmp[0], "\\u") {
  267. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  268. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  269. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  270. } else {
  271. pattern = tmp[0]
  272. }
  273. if len(tmp) == 2 {
  274. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  275. } else {
  276. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  277. }
  278. e.RulePres = append(e.RulePres, rinfo)
  279. }, func(err interface{}) {
  280. log.Debug(rinfo.Code, rinfo.Field, err)
  281. })
  282. }
  283. }
  284. }
  285. //加载通用后置规则
  286. func (e *ExtractTask) InitRuleBacks(isSite bool) {
  287. defer qu.Catch()
  288. cDB := ""
  289. eSiteRuleBacks := []*RegLuaInfo{}
  290. if isSite {
  291. cDB = "site_rule_back"
  292. e.SiteRuleBacks = []*RegLuaInfo{}
  293. } else {
  294. cDB = "rule_back"
  295. e.RuleBacks = []*RegLuaInfo{}
  296. }
  297. list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  298. for _, v := range *list {
  299. rinfo := &RegLuaInfo{
  300. Code: v["s_code"].(string),
  301. Name: v["s_name"].(string),
  302. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  303. }
  304. if rinfo.IsLua {
  305. rinfo.RuleText = v["s_luascript"].(string)
  306. if isSite {
  307. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  308. //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo)
  309. } else {
  310. e.RuleBacks = append(e.RuleBacks, rinfo)
  311. }
  312. } else {
  313. qu.Try(func() {
  314. rinfo.RuleText = v["s_rule"].(string)
  315. tmp := strings.Split(rinfo.RuleText, "__")
  316. var pattern string
  317. if strings.Contains(tmp[0], "\\u") {
  318. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  319. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  320. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  321. } else {
  322. pattern = tmp[0]
  323. }
  324. if len(tmp) == 2 {
  325. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  326. } else {
  327. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  328. }
  329. if isSite {
  330. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  331. } else {
  332. e.RuleBacks = append(e.RuleBacks, rinfo)
  333. }
  334. }, func(err interface{}) {
  335. log.Debug(rinfo.Code, rinfo.Field, err)
  336. })
  337. }
  338. if isSite {
  339. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  340. if (*sm) == nil || len(*sm) <= 0 {
  341. eSiteRuleBacks = []*RegLuaInfo{}
  342. continue
  343. }
  344. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  345. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  346. if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil {
  347. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks
  348. } else {
  349. if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 {
  350. tmplist = append(tmplist, eSiteRuleBacks...)
  351. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist
  352. }
  353. //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...)
  354. }
  355. e.Luacodes.Store(v2, mdpvalue)
  356. }
  357. }
  358. eSiteRuleBacks = []*RegLuaInfo{}
  359. }
  360. }
  361. }
  362. func (e *ExtractTask) InfoTypeList() {
  363. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  364. infolist := *infolist1
  365. for _, v := range infolist {
  366. e.InfoType = append(e.InfoType, v)
  367. }
  368. }
  369. //加载抽取规则
  370. func (e *ExtractTask) InitRuleCore(isSite bool) {
  371. defer qu.Catch()
  372. allFields := getALLFields()
  373. var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb, rule_logickvdb string
  374. eSiteRuleCores := make(map[string]map[string][]*RuleCore)
  375. if isSite {
  376. versioninfodb = "site_versioninfo"
  377. rule_logicdb = "site_rule_logic"
  378. rule_logicpredb = "site_rule_logicpre"
  379. rule_logicbackdb = "site_rule_logicback"
  380. rule_logicoredb = "site_rule_logicore"
  381. rule_logickvdb = "site_rule_logickv"
  382. e.SiteFields = map[string]int{}
  383. e.SiteRuleCores = make(map[string]map[string][]*RuleCore)
  384. } else {
  385. versioninfodb = "versioninfo"
  386. rule_logicdb = "rule_logic"
  387. rule_logicpredb = "rule_logicpre"
  388. rule_logicbackdb = "rule_logicback"
  389. rule_logicoredb = "rule_logicore"
  390. rule_logickvdb = "rule_logickv"
  391. e.Fields = map[string]int{}
  392. e.RuleCores = make(map[string]map[string][]*RuleCore)
  393. }
  394. fieldrules := map[string][]*RuleCore{}
  395. vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  396. for _, vinfo := range *vinfos {
  397. if b, _ := vinfo["isuse"].(bool); !b {
  398. continue
  399. }
  400. s_field := qu.ObjToString(vinfo["s_field"])
  401. pid := qu.BsonIdToSId(vinfo["_id"])
  402. list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  403. for _, vv := range *list {
  404. if b, _ := vv["isuse"].(bool); !b {
  405. continue
  406. }
  407. rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
  408. rcore.Field = s_field
  409. rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
  410. rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
  411. rcore.LFields = allFields
  412. //前置规则
  413. rulePres := []*RegLuaInfo{}
  414. plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  415. for _, v := range *plist {
  416. rinfo := &RegLuaInfo{
  417. Field: qu.ObjToString(v["s_field"]),
  418. Code: v["s_code"].(string),
  419. Name: v["s_name"].(string),
  420. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  421. }
  422. if rinfo.IsLua {
  423. rinfo.RuleText = v["s_luascript"].(string)
  424. rulePres = append(rulePres, rinfo)
  425. } else {
  426. qu.Try(func() {
  427. rinfo.RuleText = v["s_rule"].(string)
  428. tmp := strings.Split(rinfo.RuleText, "__")
  429. var pattern string
  430. if strings.Contains(tmp[0], "\\u") {
  431. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  432. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  433. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  434. } else {
  435. pattern = tmp[0]
  436. }
  437. if len(tmp) == 2 {
  438. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  439. } else {
  440. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  441. }
  442. rulePres = append(rulePres, rinfo)
  443. }, func(err interface{}) {
  444. log.Debug(rinfo.Code, rinfo.Field, err)
  445. })
  446. }
  447. }
  448. rcore.RulePres = rulePres
  449. //后置规则
  450. ruleBacks := []*RegLuaInfo{}
  451. blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  452. for _, v := range *blist {
  453. rinfo := &RegLuaInfo{
  454. Field: qu.ObjToString(v["s_field"]),
  455. Code: v["s_code"].(string),
  456. Name: v["s_name"].(string),
  457. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  458. }
  459. if rinfo.IsLua {
  460. rinfo.RuleText = v["s_luascript"].(string)
  461. ruleBacks = append(ruleBacks, rinfo)
  462. } else {
  463. qu.Try(func() {
  464. rinfo.RuleText = v["s_rule"].(string)
  465. tmp := strings.Split(rinfo.RuleText, "__")
  466. var pattern string
  467. if strings.Contains(tmp[0], "\\u") {
  468. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  469. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  470. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  471. } else {
  472. pattern = tmp[0]
  473. }
  474. if len(tmp) == 2 {
  475. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  476. } else {
  477. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  478. }
  479. ruleBacks = append(ruleBacks, rinfo)
  480. }, func(err interface{}) {
  481. log.Debug(rinfo.Code, rinfo.Field, err)
  482. })
  483. }
  484. }
  485. rcore.RuleBacks = ruleBacks
  486. //抽取规则
  487. ruleCores := []*RegLuaInfo{}
  488. clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  489. for _, v := range *clist {
  490. if b, _ := v["isuse"].(bool); !b {
  491. continue
  492. }
  493. field := qu.ObjToString(v["s_field"])
  494. if isSite {
  495. e.SiteFields[field] = 1
  496. } else {
  497. e.Fields[field] = 1 //加入抽取属性组备用
  498. }
  499. rinfo := &RegLuaInfo{
  500. Field: field,
  501. Code: v["s_code"].(string),
  502. Name: v["s_name"].(string),
  503. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  504. }
  505. if rinfo.IsLua {
  506. rinfo.RuleText = v["s_luascript"].(string)
  507. //提取全部属性
  508. ruleCores = append(ruleCores, rinfo)
  509. } else {
  510. qu.Try(func() {
  511. rinfo.RuleText = v["s_rule"].(string)
  512. tmp := strings.Split(rinfo.RuleText, "__")
  513. var pattern string
  514. if strings.Contains(tmp[0], "\\u") {
  515. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  516. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  517. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  518. } else {
  519. pattern = tmp[0]
  520. }
  521. if len(tmp) == 2 {
  522. epos := strings.Split(tmp[1], ",")
  523. posm := map[string]int{}
  524. for _, v := range epos {
  525. ks := strings.Split(v, ":")
  526. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  527. posm[ks[1]] = qu.IntAll(ks[0])
  528. } else { //(.*)招标公告__2
  529. posm[rinfo.Field] = qu.IntAll(ks[0])
  530. }
  531. }
  532. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  533. } else {
  534. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  535. }
  536. ruleCores = append(ruleCores, rinfo)
  537. }, func(err interface{}) {
  538. log.Debug(rinfo.Code, rinfo.Field, err)
  539. })
  540. }
  541. }
  542. rcore.RuleCores = ruleCores
  543. //kv规则
  544. kvRuleCores := []*RegLuaInfo{}
  545. kvlist, _ := db.Mgo.Find(rule_logickvdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
  546. for _, v := range *kvlist {
  547. if b, _ := v["isuse"].(bool); !b {
  548. continue
  549. }
  550. field := qu.ObjToString(v["s_field"])
  551. if isSite {
  552. e.SiteFields[field] = 1
  553. } else {
  554. e.Fields[field] = 1 //加入抽取属性组备用
  555. }
  556. rinfo := &RegLuaInfo{
  557. Field: field,
  558. Code: v["s_code"].(string),
  559. Name: v["s_name"].(string),
  560. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  561. }
  562. qu.Try(func() {
  563. rinfo.RuleText = v["s_rule"].(string)
  564. tmp := strings.Split(rinfo.RuleText, "__")
  565. var pattern string
  566. if strings.Contains(tmp[0], "\\u") {
  567. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  568. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  569. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  570. } else {
  571. pattern = tmp[0]
  572. }
  573. if len(tmp) == 2 {
  574. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  575. } else {
  576. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  577. }
  578. kvRuleCores = append(kvRuleCores, rinfo)
  579. }, func(err interface{}) {
  580. log.Debug(rinfo.Code, rinfo.Field, err)
  581. })
  582. }
  583. rcore.KVRuleCores = kvRuleCores
  584. if fieldrules[s_field] == nil {
  585. fieldrules[s_field] = []*RuleCore{}
  586. }
  587. fieldrules[s_field] = append(fieldrules[s_field], rcore)
  588. }
  589. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  590. for _, v := range *infolist {
  591. topclass := qu.ObjToString(v["topclass"])
  592. if v["subclass"] == nil {
  593. eSiteRuleCores[topclass] = make(map[string][]*RuleCore)
  594. for attr, _ := range v["fields"].(map[string]interface{}) {
  595. if fieldrules[attr] != nil {
  596. eSiteRuleCores[topclass][attr] = fieldrules[attr]
  597. }
  598. }
  599. } else {
  600. for ca, fs := range v["subclass"].(map[string]interface{}) {
  601. eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  602. for field, _ := range fs.(map[string]interface{}) {
  603. if fieldrules[field] != nil {
  604. eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field]
  605. }
  606. }
  607. }
  608. }
  609. }
  610. if isSite {
  611. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1})
  612. if (*sm) == nil || len(*sm) <= 0 {
  613. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  614. fieldrules = map[string][]*RuleCore{}
  615. continue
  616. }
  617. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  618. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  619. //属性配置
  620. if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil {
  621. mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores
  622. } else {
  623. for k2, v2 := range eSiteRuleCores {
  624. tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2]
  625. for kkkk, vvv := range v2 {
  626. tmpv[kkkk] = vvv
  627. }
  628. mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv
  629. }
  630. }
  631. e.Luacodes.Store(v2, mdpvalue)
  632. }
  633. }
  634. eSiteRuleCores = make(map[string]map[string][]*RuleCore)
  635. fieldrules = map[string][]*RuleCore{}
  636. }
  637. }
  638. if !isSite {
  639. //属性配置
  640. infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  641. for _, v := range *infolist {
  642. topclass := qu.ObjToString(v["topclass"])
  643. if v["subclass"] == nil {
  644. e.RuleCores[topclass] = make(map[string][]*RuleCore)
  645. for attr, _ := range v["fields"].(map[string]interface{}) {
  646. if fieldrules[attr] != nil {
  647. e.RuleCores[topclass][attr] = fieldrules[attr]
  648. }
  649. }
  650. } else {
  651. for ca, fs := range v["subclass"].(map[string]interface{}) {
  652. e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
  653. for field, _ := range fs.(map[string]interface{}) {
  654. if fieldrules[field] != nil {
  655. e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
  656. }
  657. }
  658. }
  659. }
  660. }
  661. }
  662. }
  663. //加载分包抽取规则
  664. func (e *ExtractTask) InitPkgCore() {
  665. defer qu.Catch()
  666. e.PkgRuleCores = []*RuleCore{}
  667. pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  668. for _, pkginfo := range *pkginfos {
  669. if b, _ := pkginfo["isuse"].(bool); !b {
  670. continue
  671. }
  672. s_field := qu.ObjToString(pkginfo["s_field"])
  673. sid := qu.BsonIdToSId(pkginfo["_id"])
  674. rcore := &RuleCore{}
  675. rcore.Field = s_field
  676. rcore.ExtFrom = "detail"
  677. //后置规则
  678. ruleBacks := []*RegLuaInfo{}
  679. blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
  680. for _, v := range *blist {
  681. rinfo := &RegLuaInfo{
  682. Field: qu.ObjToString(v["s_field"]),
  683. Code: v["s_code"].(string),
  684. Name: v["s_name"].(string),
  685. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  686. }
  687. if rinfo.IsLua {
  688. rinfo.RuleText = v["s_luascript"].(string)
  689. ruleBacks = append(ruleBacks, rinfo)
  690. } else {
  691. qu.Try(func() {
  692. rinfo.RuleText = v["s_rule"].(string)
  693. tmp := strings.Split(rinfo.RuleText, "__")
  694. var pattern string
  695. if strings.Contains(tmp[0], "\\u") {
  696. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  697. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  698. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  699. } else {
  700. pattern = tmp[0]
  701. }
  702. if len(tmp) == 2 {
  703. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  704. } else {
  705. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  706. }
  707. ruleBacks = append(ruleBacks, rinfo)
  708. }, func(err interface{}) {
  709. log.Debug(rinfo.Code, rinfo.Field, err)
  710. })
  711. }
  712. }
  713. rcore.RuleBacks = ruleBacks
  714. //抽取规则
  715. ruleCores := []*RegLuaInfo{}
  716. clist, _ := db.Mgo.Find("pkg_logicore", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
  717. for _, v := range *clist {
  718. if b, _ := v["isuse"].(bool); !b {
  719. continue
  720. }
  721. field := qu.ObjToString(v["s_field"])
  722. e.Fields[field] = 1 //加入抽取属性组备用
  723. rinfo := &RegLuaInfo{
  724. Field: field,
  725. Code: v["s_code"].(string),
  726. Name: v["s_name"].(string),
  727. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  728. }
  729. if rinfo.IsLua {
  730. rinfo.RuleText = v["s_luascript"].(string)
  731. //提取全部属性
  732. ruleCores = append(ruleCores, rinfo)
  733. } else {
  734. qu.Try(func() {
  735. rinfo.RuleText = v["s_rule"].(string)
  736. tmp := strings.Split(rinfo.RuleText, "__")
  737. var pattern string
  738. if strings.Contains(tmp[0], "\\u") {
  739. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  740. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  741. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  742. } else {
  743. pattern = tmp[0]
  744. }
  745. if len(tmp) == 2 {
  746. epos := strings.Split(tmp[1], ",")
  747. posm := map[string]int{}
  748. for _, v := range epos {
  749. ks := strings.Split(v, ":")
  750. if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
  751. posm[ks[1]] = qu.IntAll(ks[0])
  752. } else { //(.*)招标公告__2
  753. posm[rinfo.Field] = qu.IntAll(ks[0])
  754. }
  755. }
  756. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
  757. } else {
  758. rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
  759. }
  760. ruleCores = append(ruleCores, rinfo)
  761. }, func(err interface{}) {
  762. log.Debug(rinfo.Code, rinfo.Field, err)
  763. })
  764. }
  765. }
  766. rcore.RuleCores = ruleCores
  767. e.PkgRuleCores = append(e.PkgRuleCores, rcore)
  768. }
  769. }
  770. //加载标签库
  771. func (e *ExtractTask) InitTag(isSite bool) {
  772. defer qu.Catch()
  773. var tagdetailinfodb string
  774. eSiteTag := map[string][]*Tag{}
  775. if isSite {
  776. tagdetailinfodb = "site_tagdetailinfo"
  777. e.SiteTag = map[string][]*Tag{}
  778. } else {
  779. tagdetailinfodb = "tagdetailinfo"
  780. e.Tag = map[string][]*Tag{}
  781. }
  782. //字符串标签库
  783. list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  784. var tmpMap sync.Map
  785. for _, v := range *list {
  786. field := qu.ObjToString(v["s_field"])
  787. if tmp, ok := v["content"].([]interface{}); ok {
  788. fname := qu.ObjToString(v["s_name"])
  789. tab := ju.TagFile{Name: fname} //用于表格kv
  790. tab.Items = make([]*ju.Tag, len(tmp))
  791. for k, key := range tmp {
  792. tag := &Tag{Type: "string", Key: key.(string)}
  793. if isSite {
  794. eSiteTag[field] = append(eSiteTag[field], tag)
  795. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  796. } else {
  797. e.Tag[field] = append(e.Tag[field], tag)
  798. }
  799. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false}
  800. }
  801. sort.Sort(tab.Items)
  802. //ju.TagdbTable[fname] = &tab
  803. if isSite {
  804. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  805. if (*sm) == nil || len(*sm) <= 0 {
  806. eSiteTag = map[string][]*Tag{}
  807. continue
  808. }
  809. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  810. if v2 == nil || v2 == "" {
  811. continue
  812. }
  813. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  814. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  815. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  816. } else {
  817. for k2, v2 := range eSiteTag {
  818. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  819. }
  820. }
  821. e.Luacodes.Store(v2, mdpvalue)
  822. }
  823. tmpMap.Store(fname, &tab)
  824. ju.SiteTagdbTable.Store(v2, tmpMap)
  825. }
  826. //ju.SiteTagdbTable.Store(fname, &tab)
  827. eSiteTag = map[string][]*Tag{}
  828. } else {
  829. ju.TagdbTable.Store(fname, &tab)
  830. }
  831. }
  832. //if isSite {
  833. // sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  834. // for _, v2 := range (*sm)["site_script"].([]interface{}) {
  835. // if mdpvalue, ok := Luacodes.Load(v2); ok {
  836. // if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{
  837. // mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  838. // }else {
  839. // for k2,v2 := range eSiteTag{
  840. // mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  841. // }
  842. // }
  843. // Luacodes.Store(v2, mdpvalue)
  844. // }
  845. // }
  846. // eSiteTag = map[string][]*Tag{}
  847. //}
  848. }
  849. //正则标签库
  850. list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  851. for _, v := range *list {
  852. field := qu.ObjToString(v["s_field"])
  853. if tmp, ok := v["content"].([]interface{}); ok {
  854. fname := qu.ObjToString(v["s_name"])
  855. tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
  856. tab.Items = make([]*ju.Tag, len(tmp))
  857. for k, key := range tmp {
  858. tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
  859. if isSite {
  860. eSiteTag[field] = append(eSiteTag[field], tag)
  861. //e.SiteTag[field] = append(e.SiteTag[field], tag)
  862. } else {
  863. e.Tag[field] = append(e.Tag[field], tag)
  864. }
  865. tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false}
  866. }
  867. sort.Sort(tab.Items)
  868. //ju.TagdbTable[fname+"_reg"] = &tab
  869. if isSite {
  870. ju.SiteTagdbTable.Store(fname+"_reg", &tab)
  871. } else {
  872. ju.TagdbTable.Store(fname+"_reg", &tab)
  873. }
  874. }
  875. if isSite {
  876. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  877. if (*sm) == nil || len(*sm) <= 0 {
  878. eSiteTag = map[string][]*Tag{}
  879. continue
  880. }
  881. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  882. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  883. if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
  884. mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
  885. } else {
  886. for k2, v2 := range eSiteTag {
  887. mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
  888. }
  889. }
  890. e.Luacodes.Store(v2, mdpvalue)
  891. }
  892. }
  893. eSiteTag = map[string][]*Tag{}
  894. }
  895. }
  896. }
  897. //获取fields
  898. func getALLFields() map[string]string {
  899. fields := map[string]string{}
  900. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  901. for _, v := range *list {
  902. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  903. }
  904. return fields
  905. }
  906. //加载clear函数
  907. func (e *ExtractTask) InitClearFn(isSite bool) {
  908. defer qu.Catch()
  909. var cleanupdb string
  910. if isSite {
  911. cleanupdb = "site_cleanup"
  912. e.SiteClearFn = map[string][]string{}
  913. } else {
  914. cleanupdb = "cleanup"
  915. }
  916. list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  917. fn := map[string][]string{}
  918. for _, tmp := range *list {
  919. field := tmp["s_field"].(string)
  920. fns := tmp["clear"].([]interface{})
  921. if fn[field] == nil {
  922. fn[field] = []string{}
  923. }
  924. for _, v := range fns {
  925. fn[field] = append(fn[field], v.(string))
  926. }
  927. if isSite {
  928. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1})
  929. if (*sm) == nil || len(*sm) <= 0 {
  930. fn = map[string][]string{}
  931. continue
  932. }
  933. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  934. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  935. if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil {
  936. mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn
  937. } else {
  938. for k2, v2 := range fn {
  939. mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2
  940. }
  941. }
  942. e.Luacodes.Store(v2, mdpvalue)
  943. }
  944. }
  945. fn = map[string][]string{}
  946. }
  947. }
  948. if !isSite {
  949. e.ClearFn = fn
  950. }
  951. }
  952. //加载省份
  953. func InitProvince(version string) map[string]interface{} {
  954. defer qu.Catch()
  955. fn := map[string]interface{}{}
  956. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  957. for _, v := range *list {
  958. name := qu.ObjToString(v["s_name"])
  959. content := v["content"]
  960. switch content.(type) {
  961. case string:
  962. fn[name] = []interface{}{content.(string)}
  963. case []interface{}:
  964. fn[name] = content
  965. }
  966. }
  967. return fn
  968. }
  969. //加载城市简称
  970. func InitCitySim(version string) map[string]map[string]interface{} {
  971. defer qu.Catch()
  972. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  973. fn := map[string]map[string]interface{}{}
  974. for _, v := range *list {
  975. name := qu.ObjToString(v["s_name"])
  976. tmp := v["content"].(map[string]interface{})
  977. fn[name] = tmp
  978. }
  979. return fn
  980. }
  981. //加载城市全称
  982. func InitCityAll(version string) map[string]map[string]interface{} {
  983. defer qu.Catch()
  984. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  985. fn := map[string]map[string]interface{}{}
  986. for _, v := range *list {
  987. name := qu.ObjToString(v["s_name"])
  988. tmp := v["content"].(map[string]interface{})
  989. fn[name] = tmp
  990. }
  991. return fn
  992. }
  993. //加载站点库site城市信息
  994. func InitSite() []map[string]interface{} {
  995. defer qu.Catch()
  996. query := map[string]interface{}{
  997. "depttype": map[string]interface{}{
  998. "$ne": "代理机构",
  999. },
  1000. }
  1001. list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
  1002. return *list
  1003. }
  1004. func (e *ExtractTask) InitCityInfo() {
  1005. defer qu.Catch()
  1006. e.InitVar() //初始化变量
  1007. //site站点信息
  1008. for _, v := range InitSite() {
  1009. site, _ := v["site"].(string)
  1010. area, _ := v["area"].(string)
  1011. city, _ := v["city"].(string)
  1012. district, _ := v["district"].(string)
  1013. if area != "" && area != "全国" && site != "" {
  1014. s := &SiteCity{
  1015. P: area,
  1016. C: city,
  1017. D: district,
  1018. }
  1019. e.SiteCityMap[site] = s
  1020. }
  1021. }
  1022. //初始化省信息
  1023. fn1 := InitProvince(e.TaskInfo.Version)
  1024. for k, v := range fn1 {
  1025. for _, p := range v.([]interface{}) {
  1026. p1, _ := p.(string)
  1027. e.Trie_Full_Province.AddWords(p1) //华中科技大学
  1028. e.ProvinceMap[p1] = k //华中科技大学:湖北
  1029. }
  1030. }
  1031. //初始化城市全称
  1032. fn2 := InitCityAll(e.TaskInfo.Version)
  1033. for k, v := range fn2 {
  1034. //加载省信息
  1035. e.Trie_Full_Province.AddWords(k) //加入省全称Trie(k:浙江省)
  1036. p := &Province{}
  1037. p.Name = k //省全称:浙江省
  1038. p.Brief = v["brief"].(string) //省简称:浙江
  1039. e.Trie_Sim_Province.AddWords(p.Brief) //加入省简称Trie(k:浙江)
  1040. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  1041. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  1042. p.Cap = v["captial"].(string) //省会(杭州)
  1043. //加载市信息
  1044. city, _ := v["city"].(map[string]interface{})
  1045. for k1, v1 := range city {
  1046. e.Trie_Full_City.AddWords(k1) //加入市全称Trie(k:杭州市)
  1047. v1m, _ := v1.(map[string]interface{})
  1048. c := &City{}
  1049. c.Name = k1 //市全称:杭州市
  1050. c.Brief = v1m["brief"].(string) //市简称:杭州
  1051. e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州)
  1052. e.CityMap[k1] = c.Brief //杭州市:杭州
  1053. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  1054. e.CityFullMap[k1] = c //杭州市:市信息{}
  1055. c.P = p
  1056. if c.Name == p.Cap {
  1057. p.Captial = c //加载province中的省会市信息{}
  1058. }
  1059. //区县
  1060. districtmap, _ := v1m["area"].(map[string]interface{}) //区或县
  1061. for district, streets := range districtmap {
  1062. d := &District{}
  1063. d.Name = district
  1064. d.C = c
  1065. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  1066. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级
  1067. e.Trie_Full_District.AddWords(district) //加入区或县全称Trie
  1068. ctmp := e.NewDistrictCityMap[district]
  1069. if len(ctmp) == 0 {
  1070. tmpcarr := []*City{c}
  1071. e.NewDistrictCityMap[district] = tmpcarr
  1072. } else {
  1073. e.NewDistrictCityMap[district] = append(e.NewDistrictCityMap[district], c)
  1074. }
  1075. //街道
  1076. streetmap, _ := streets.(map[string]interface{})
  1077. for street, communitys := range streetmap {
  1078. s := &Street{}
  1079. s.Name = street
  1080. s.D = d
  1081. e.Trie_Full_Street.AddWords(street) //加入街道全称Trie
  1082. dtmp := e.NewStreetDistrictMap[street]
  1083. if len(dtmp) == 0 {
  1084. tmpdarr := []*District{d}
  1085. e.NewStreetDistrictMap[street] = tmpdarr
  1086. } else {
  1087. e.NewStreetDistrictMap[street] = append(e.NewStreetDistrictMap[street], d)
  1088. }
  1089. //村、居委会
  1090. for _, ct := range qu.ObjArrToStringArr(communitys.([]interface{})) {
  1091. e.Trie_Full_Community.AddWords(ct) //加入居委会、村全称Trie
  1092. cttmp := e.CommunityDistrictMap[ct]
  1093. if len(cttmp) == 0 {
  1094. tmpdarr := []*District{d}
  1095. e.CommunityDistrictMap[ct] = tmpdarr
  1096. } else {
  1097. e.CommunityDistrictMap[ct] = append(e.CommunityDistrictMap[ct], d)
  1098. }
  1099. }
  1100. }
  1101. }
  1102. }
  1103. }
  1104. //初始化城市简称
  1105. fn3 := InitCitySim(e.TaskInfo.Version)
  1106. for _, v := range fn3 {
  1107. city, _ := v["city"].(map[string]interface{})
  1108. for _, v1 := range city {
  1109. v1m, _ := v1.(map[string]interface{})
  1110. cb := v1m["brief"].(string) //市简称
  1111. arr := v1m["area"].(map[string]interface{}) //区或县简称
  1112. for districtsim, districtall := range arr {
  1113. dfullstr, _ := districtall.(string)
  1114. e.Trie_Sim_District.AddWords(districtsim) //加入区或县简称Trie
  1115. c := e.CityBriefMap[cb]
  1116. dfullarr := e.NewDistrictSimAndAll[districtsim]
  1117. dfullcity := map[string]*City{dfullstr: c}
  1118. if len(dfullarr) == 0 {
  1119. tmparr := []map[string]*City{dfullcity}
  1120. e.NewDistrictSimAndAll[districtsim] = tmparr
  1121. } else {
  1122. e.NewDistrictSimAndAll[districtsim] = append(e.NewDistrictSimAndAll[districtsim], dfullcity)
  1123. }
  1124. }
  1125. }
  1126. }
  1127. e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community}
  1128. e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District}
  1129. }
  1130. func (e *ExtractTask) InitVar() {
  1131. defer qu.Catch()
  1132. //初始化Trie
  1133. //全称
  1134. e.Trie_Full_Province = &ju.Trie{}
  1135. e.Trie_Full_City = &ju.Trie{}
  1136. e.Trie_Full_District = &ju.Trie{}
  1137. e.Trie_Full_Street = &ju.Trie{}
  1138. e.Trie_Full_Community = &ju.Trie{}
  1139. //简称
  1140. e.Trie_Sim_Province = &ju.Trie{}
  1141. e.Trie_Sim_City = &ju.Trie{}
  1142. e.Trie_Sim_District = &ju.Trie{}
  1143. //初始化分词
  1144. e.Seg_PCD = &gse.Segmenter{}
  1145. e.Seg_SV = &gse.Segmenter{}
  1146. e.Seg_PCD.LoadDict("./res/pcd.txt")
  1147. e.Seg_SV.LoadDict("./res/sv.txt")
  1148. //初始化map
  1149. if e.SiteCityMap == nil {
  1150. e.SiteCityMap = make(map[string]*SiteCity)
  1151. }
  1152. if e.ProvinceMap == nil {
  1153. e.ProvinceMap = make(map[string]string)
  1154. }
  1155. if e.CityMap == nil {
  1156. e.CityMap = make(map[string]string)
  1157. }
  1158. if e.DistrictSimAndAll == nil {
  1159. e.DistrictSimAndAll = make(map[string]string)
  1160. }
  1161. if e.NewDistrictSimAndAll == nil {
  1162. e.NewDistrictSimAndAll = make(map[string][]map[string]*City)
  1163. }
  1164. if e.CityBriefMap == nil {
  1165. e.CityBriefMap = make(map[string]*City)
  1166. }
  1167. if e.CityFullMap == nil {
  1168. e.CityFullMap = make(map[string]*City)
  1169. }
  1170. if e.ProvinceBriefMap == nil {
  1171. e.ProvinceBriefMap = make(map[string]*Province)
  1172. }
  1173. if e.NewDistrictCityMap == nil {
  1174. e.NewDistrictCityMap = make(map[string][]*City)
  1175. }
  1176. if e.NewStreetDistrictMap == nil {
  1177. e.NewStreetDistrictMap = make(map[string][]*District)
  1178. }
  1179. if e.CommunityDistrictMap == nil {
  1180. e.CommunityDistrictMap = make(map[string][]*District)
  1181. }
  1182. }
  1183. //初始化城市省份敏感词
  1184. func (e *ExtractTask) InitCityDFA() {
  1185. defer qu.Catch()
  1186. e.CityAllGet = &ju.DFA{}
  1187. e.CitySimGet = &ju.DFA{}
  1188. e.DistrictAllGet = &ju.DFA{}
  1189. e.DistrictSimGet = &ju.DFA{}
  1190. e.ProvinceAllGet = &ju.DFA{}
  1191. e.ProvinceSimGet = &ju.DFA{}
  1192. e.StreetGet = &ju.DFA{}
  1193. //初始化map
  1194. if e.ProvinceMap == nil {
  1195. e.ProvinceMap = make(map[string]string)
  1196. }
  1197. if e.CityMap == nil {
  1198. e.CityMap = make(map[string]string)
  1199. }
  1200. if e.DistrictSimAndAll == nil {
  1201. e.DistrictSimAndAll = make(map[string]string)
  1202. }
  1203. if e.CityBriefMap == nil {
  1204. e.CityBriefMap = make(map[string]*City)
  1205. }
  1206. if e.CityFullMap == nil {
  1207. e.CityFullMap = make(map[string]*City)
  1208. }
  1209. if e.ProvinceBriefMap == nil {
  1210. e.ProvinceBriefMap = make(map[string]*Province)
  1211. }
  1212. if e.DistrictCityMap == nil {
  1213. e.DistrictCityMap = make(map[string]*City)
  1214. }
  1215. if e.StreetDistrictMap == nil {
  1216. e.StreetDistrictMap = make(map[string]*District)
  1217. }
  1218. //初始化省
  1219. fn1 := InitProvince(e.TaskInfo.Version)
  1220. for k, v := range fn1 {
  1221. for _, p := range v.([]interface{}) {
  1222. p1, _ := p.(string)
  1223. e.ProvinceAllGet.AddWord(p1) //华中科技大学
  1224. e.ProvinceMap[p1] = k //华中科技大学:湖北
  1225. }
  1226. }
  1227. //初始化城市全称
  1228. fn2 := InitCityAll(e.TaskInfo.Version)
  1229. for k, v := range fn2 {
  1230. //加载省信息
  1231. e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
  1232. p := &Province{}
  1233. p.Name = k //省全称:浙江省
  1234. p.Brief = v["brief"].(string) //省简称:浙江
  1235. e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
  1236. e.ProvinceMap[k] = p.Brief //浙江省:浙江
  1237. e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
  1238. p.Cap = v["captial"].(string) //省会(杭州)
  1239. //加载市信息
  1240. city, _ := v["city"].(map[string]interface{})
  1241. for k1, v1 := range city {
  1242. e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
  1243. v1m, _ := v1.(map[string]interface{})
  1244. c := &City{}
  1245. c.Name = k1 //市全称:杭州市
  1246. c.Brief = v1m["brief"].(string) //市简称:杭州
  1247. e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
  1248. e.CityMap[k1] = c.Brief //杭州市:杭州
  1249. e.CityBriefMap[c.Brief] = c //杭州:市信息{}
  1250. e.CityFullMap[k1] = c //杭州市:市信息{}
  1251. c.P = p
  1252. if c.Name == p.Cap {
  1253. p.Captial = c //加载province中的省会市信息{}
  1254. }
  1255. //区县
  1256. districtmap := v1m["area"].(map[string]interface{}) //区或县
  1257. for district, streetarr := range districtmap {
  1258. d := &District{}
  1259. d.Name = district
  1260. d.C = c
  1261. //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
  1262. //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
  1263. e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
  1264. ctmp := e.DistrictCityMap[district]
  1265. if ctmp == nil {
  1266. e.DistrictCityMap[district] = c
  1267. }
  1268. //街道
  1269. for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
  1270. e.StreetGet.AddWord(s) //加入街道敏感词
  1271. dtmp := e.StreetDistrictMap[s]
  1272. if dtmp == nil {
  1273. e.StreetDistrictMap[s] = d
  1274. }
  1275. }
  1276. }
  1277. }
  1278. }
  1279. //初始化城市简称
  1280. fn3 := InitCitySim(e.TaskInfo.Version)
  1281. for _, v := range fn3 {
  1282. city, _ := v["city"].(map[string]interface{})
  1283. for _, v1 := range city {
  1284. v1m, _ := v1.(map[string]interface{})
  1285. cb := v1m["brief"].(string) //市简称
  1286. arr := v1m["area"].(map[string]interface{}) //区或县简称
  1287. for districtsim, districtall := range arr {
  1288. e.DistrictSimAndAll[districtsim] = districtall.(string)
  1289. d := &District{}
  1290. d.Name = districtsim
  1291. d.C = e.CityBriefMap[cb]
  1292. e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
  1293. ctmp := e.DistrictCityMap[districtsim]
  1294. if ctmp == nil {
  1295. e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
  1296. }
  1297. }
  1298. }
  1299. }
  1300. }
  1301. //初始化邮编库
  1302. func (e *ExtractTask) InitPostCode() {
  1303. defer qu.Catch()
  1304. if e.PostCodeMap == nil {
  1305. e.PostCodeMap = make(map[string]*PostCode)
  1306. }
  1307. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  1308. for _, l := range *list {
  1309. pc := &PostCode{}
  1310. pc.Code = qu.ObjToString(l["code"])
  1311. pc.P = qu.ObjToString(l["province"])
  1312. pc.C = qu.ObjToString(l["city"])
  1313. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  1314. e.PostCodeMap[pc.Code] = pc
  1315. }
  1316. }
  1317. //初始化区号库
  1318. func (e *ExtractTask) InitAreaCode() {
  1319. defer qu.Catch()
  1320. if e.AreaCodeMap == nil {
  1321. e.AreaCodeMap = make(map[string]*AreaCode)
  1322. }
  1323. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  1324. for _, l := range *list {
  1325. ac := &AreaCode{}
  1326. ac.Code = qu.ObjToString(l["code"])
  1327. ac.P = qu.ObjToString(l["province"])
  1328. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  1329. e.AreaCodeMap[ac.Code] = ac
  1330. }
  1331. }
  1332. //保存抽取详情数据
  1333. func (e *ExtractTask) ResultSave(init bool) {
  1334. defer qu.Catch()
  1335. e.RWMutex.Lock()
  1336. if e.ResultArr == nil {
  1337. e.ResultArr = [][]map[string]interface{}{}
  1338. }
  1339. e.RWMutex.Unlock()
  1340. if init {
  1341. go func() {
  1342. for {
  1343. e.RWMutex.Lock()
  1344. if len(e.ResultArr) > saveLimit {
  1345. arr := e.ResultArr[:saveLimit]
  1346. e.ResultArr = e.ResultArr[saveLimit:]
  1347. e.RWMutex.Unlock()
  1348. qu.Try(func() {
  1349. db.Mgo.UpSertBulk("extract_result", arr...)
  1350. }, func(err interface{}) {
  1351. log.Debug(err)
  1352. })
  1353. } else {
  1354. arr := e.ResultArr
  1355. e.ResultArr = [][]map[string]interface{}{}
  1356. e.RWMutex.Unlock()
  1357. qu.Try(func() {
  1358. db.Mgo.UpSertBulk("extract_result", arr...)
  1359. }, func(err interface{}) {
  1360. log.Debug(err)
  1361. })
  1362. }
  1363. time.Sleep(3 * time.Second)
  1364. }
  1365. }()
  1366. } else {
  1367. e.RWMutex.Lock()
  1368. arr := e.ResultArr
  1369. e.ResultArr = [][]map[string]interface{}{}
  1370. e.RWMutex.Unlock()
  1371. qu.Try(func() {
  1372. lenarr := len(arr)
  1373. for {
  1374. if lenarr > saveLimit {
  1375. arr2 := arr[:saveLimit]
  1376. arr = arr[saveLimit:]
  1377. lenarr = len(arr)
  1378. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1379. } else {
  1380. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1381. break
  1382. }
  1383. }
  1384. }, func(err interface{}) {
  1385. defer e.RWMutex.Unlock()
  1386. log.Debug(err)
  1387. })
  1388. }
  1389. }
  1390. //保存抽取数据
  1391. func (e *ExtractTask) BidSave(init bool) {
  1392. defer qu.Catch()
  1393. e.RWMutex.Lock()
  1394. if e.BidArr == nil {
  1395. e.BidArr = [][]map[string]interface{}{}
  1396. }
  1397. e.RWMutex.Unlock()
  1398. if init {
  1399. go func() {
  1400. for {
  1401. e.RWMutex.Lock()
  1402. if len(e.BidArr) > saveLimit {
  1403. arr := e.BidArr[:saveLimit]
  1404. e.BidArr = e.BidArr[saveLimit:]
  1405. e.RWMutex.Unlock()
  1406. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1407. qu.Try(func() {
  1408. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1409. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1410. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1411. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1412. }, func(err interface{}) {
  1413. log.Debug(err)
  1414. })
  1415. } else {
  1416. arr := e.BidArr
  1417. e.BidArr = [][]map[string]interface{}{}
  1418. e.RWMutex.Unlock()
  1419. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1420. qu.Try(func() {
  1421. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1422. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1423. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1424. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1425. }, func(err interface{}) {
  1426. log.Debug(err)
  1427. })
  1428. }
  1429. time.Sleep(2 * time.Second)
  1430. }
  1431. }()
  1432. } else {
  1433. e.RWMutex.Lock()
  1434. arr := e.BidArr
  1435. e.BidArr = [][]map[string]interface{}{}
  1436. e.RWMutex.Unlock()
  1437. qu.Try(func() {
  1438. lenarr := len(arr)
  1439. for {
  1440. if lenarr > saveLimit {
  1441. arr2 := arr[:saveLimit]
  1442. arr = arr[saveLimit:]
  1443. lenarr = len(arr)
  1444. arr2, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr2)
  1445. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1446. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1447. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1448. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1449. } else {
  1450. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1451. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1452. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1453. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1454. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1455. break
  1456. }
  1457. }
  1458. }, func(err interface{}) {
  1459. log.Debug(err)
  1460. })
  1461. time.Sleep(1 * time.Second)
  1462. }
  1463. }
  1464. func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) {
  1465. arr = [][]map[string]interface{}{}
  1466. blocks = []map[string]interface{}{}
  1467. fieldalls = []map[string]interface{}{}
  1468. fieldallsf = []map[string]interface{}{}
  1469. for _, v := range a {
  1470. _id, _ := v[0]["_id"]
  1471. if tmp, ok := v[1]["$set"].(map[string]interface{}); ok {
  1472. if ju.SaveBlock {
  1473. if tmp["blocks"] != nil {
  1474. block := map[string]interface{}{
  1475. "_id": _id,
  1476. "blocks": tmp["blocks"],
  1477. }
  1478. blocks = append(blocks, block)
  1479. }
  1480. }
  1481. delete(tmp, "blocks")
  1482. if ju.FieldsFind {
  1483. if f, ok := tmp["fieldall"].(map[string][]map[string]interface{}); ok {
  1484. fieldall := map[string]interface{}{
  1485. "_id": _id,
  1486. }
  1487. for k, v := range f {
  1488. fieldall[k] = v
  1489. }
  1490. fieldalls = append(fieldalls, fieldall)
  1491. }
  1492. if ff, ok := tmp["fieldallf"].(map[string][]map[string]interface{}); ok {
  1493. fieldallf := map[string]interface{}{
  1494. "_id": _id,
  1495. }
  1496. for k, v := range ff {
  1497. fieldallf[k] = v
  1498. }
  1499. fieldallsf = append(fieldalls, fieldallf)
  1500. }
  1501. }
  1502. delete(tmp, "fieldall")
  1503. delete(tmp, "fieldallf")
  1504. v[1] = tmp
  1505. }
  1506. arr = append(arr, v)
  1507. }
  1508. return arr, blocks, fieldalls, fieldallsf
  1509. }
  1510. func (e *ExtractTask) InitAuditRecogField() {
  1511. defer qu.Catch()
  1512. e.RecogFieldMap = make(map[string]map[string]interface{})
  1513. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1514. for _, f := range *recogFieldList {
  1515. field := qu.ObjToString(f["s_recogfield"])
  1516. e.RecogFieldMap[field] = f
  1517. }
  1518. }
  1519. func (e *ExtractTask) InitAuditClass() {
  1520. defer qu.Catch()
  1521. e.FidClassMap = make(map[string][]map[string]interface{})
  1522. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1523. for _, c := range *class {
  1524. classList := []map[string]interface{}{}
  1525. fid := qu.ObjToString(c["s_fid"])
  1526. if len(e.FidClassMap[fid]) > 0 { //追加
  1527. classList = e.FidClassMap[fid]
  1528. }
  1529. classList = append(classList, c)
  1530. e.FidClassMap[fid] = classList
  1531. }
  1532. }
  1533. //加载规则
  1534. func (e *ExtractTask) InitAuditRule() {
  1535. defer qu.Catch()
  1536. var rureg *regexp.Regexp
  1537. var rs []rune
  1538. var ru string
  1539. var err error
  1540. e.CidRuleMap = make(map[string][]map[string]interface{})
  1541. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1542. for _, v := range *rule {
  1543. i_rule := []interface{}{}
  1544. ss, _ := (v["s_rule"].([]interface{}))
  1545. for _, r := range qu.ObjArrToStringArr(ss) {
  1546. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1547. rs = []rune(r)
  1548. ru = string(rs[1 : len(rs)-1])
  1549. rureg, err = regexp.Compile(ru)
  1550. if err != nil {
  1551. log.Debug("error---rule:", r)
  1552. continue
  1553. }
  1554. i_rule = append(i_rule, []interface{}{rureg}...)
  1555. } else { //规则
  1556. i_rule = append(i_rule, r)
  1557. }
  1558. }
  1559. v["rule"] = i_rule
  1560. ruleList := []map[string]interface{}{}
  1561. classid := qu.ObjToString(v["s_classid"])
  1562. if len(e.CidRuleMap[classid]) > 0 { //追加
  1563. ruleList = e.CidRuleMap[classid]
  1564. }
  1565. ruleList = append(ruleList, v)
  1566. e.CidRuleMap[classid] = ruleList
  1567. }
  1568. }
  1569. //
  1570. func (e *ExtractTask) InitAuditFields() {
  1571. if len(e.AuditFields) == 0 {
  1572. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1573. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1574. vid := qu.BsonIdToSId((*v)["_id"])
  1575. query := map[string]interface{}{
  1576. "isaudit": true,
  1577. "delete": false,
  1578. "vid": vid,
  1579. }
  1580. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1581. for _, d := range *data {
  1582. field := qu.ObjToString(d["s_field"])
  1583. e.AuditFields = append(e.AuditFields, field)
  1584. }
  1585. }
  1586. }
  1587. }
  1588. //加载附件抽取
  1589. func (e *ExtractTask) InitFile() {
  1590. defer qu.Catch()
  1591. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1592. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1593. //ve, _ := db.Mgo.FindOne("version", query)
  1594. if ve == nil {
  1595. return
  1596. }
  1597. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1598. e.IsFileField = true
  1599. }
  1600. syscefiled := new(sync.Map)
  1601. if (*ve)["s_filefileds"] != nil {
  1602. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1603. syscefiled.Store(vff.(string), 1)
  1604. }
  1605. }
  1606. e.FileFields = syscefiled
  1607. }
  1608. //加载清理任务信息
  1609. func (c *ClearTask) InitClearTaskInfo() {
  1610. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1611. if len(*cleartask) > 1 {
  1612. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1613. c.ClearTaskInfo = &ClearTaskInfo{
  1614. Name: (*cleartask)["s_taskname"].(string),
  1615. Version: (*cleartask)["s_version"].(string),
  1616. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1617. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1618. FromDB: (*cleartask)["s_mgodb"].(string),
  1619. FromColl: (*cleartask)["s_mgocoll"].(string),
  1620. IsCltLog: ju.Config["iscltlog"].(bool),
  1621. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1622. }
  1623. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1624. } else {
  1625. return
  1626. }
  1627. }
  1628. //加载清理脚本
  1629. func (c *ClearTask) InitClearLuas() {
  1630. defer qu.Catch()
  1631. c.ClearLuas = make(map[string][]*ClearLua)
  1632. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1633. for _, l := range *list {
  1634. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1635. continue
  1636. }
  1637. s_field := qu.ObjToString(l["s_field"])
  1638. pid := qu.BsonIdToSId(l["_id"])
  1639. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1640. for _, vv := range *luas {
  1641. if b, _ := vv["isuse"].(bool); !b {
  1642. continue
  1643. }
  1644. clearLua := &ClearLua{
  1645. Field: s_field,
  1646. Code: vv["s_code"].(string),
  1647. Name: vv["s_name"].(string),
  1648. LuaText: vv["s_luascript"].(string),
  1649. LFields: getALLFields(),
  1650. }
  1651. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1652. }
  1653. }
  1654. }
  1655. //加载分块规则
  1656. func (e *ExtractTask) InitBlockRule() {
  1657. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1658. "vid": e.TaskInfo.VersionId,
  1659. "delete": false,
  1660. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1661. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1662. for _, v := range *datas {
  1663. block_reg, _ := v["block_reg"].(string)
  1664. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1665. title_reg, _ := v["title_reg"].(string)
  1666. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1667. if block_reg == "" || title_reg == "" {
  1668. continue
  1669. }
  1670. b_reg, b_err := regexp.Compile(block_reg)
  1671. t_reg, t_err := regexp.Compile(title_reg)
  1672. if b_err != nil || t_err != nil {
  1673. continue
  1674. }
  1675. brs = append(brs, b_reg)
  1676. trs = append(trs, t_reg)
  1677. }
  1678. e.RuleBlock = &ju.RuleBlock{
  1679. BlockRegs: brs,
  1680. TitleRegs: trs,
  1681. Classify: e.InitBlockClassify(),
  1682. }
  1683. }
  1684. //加载分块规则
  1685. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1686. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1687. "vid": e.TaskInfo.VersionId,
  1688. "delete": false,
  1689. }, nil, `{"name":1}`, false, -1, -1)
  1690. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1691. "vid": e.TaskInfo.VersionId,
  1692. "delete": false,
  1693. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1694. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1695. "vid": e.TaskInfo.VersionId,
  1696. "delete": false,
  1697. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1698. tag_map := map[string]ju.Tags{}
  1699. for _, v := range *classify_tag {
  1700. pid := qu.ObjToString(v["pid"])
  1701. name := qu.ObjToString(v["name"])
  1702. tag := &ju.Tag{Value: name}
  1703. if strings.HasPrefix(name, "reg__") {
  1704. tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__"))
  1705. }
  1706. tag_map[pid] = append(tag_map[pid], tag)
  1707. }
  1708. //
  1709. info_map := map[string][]*ju.NameCode{}
  1710. info_tag := map[string]*ju.TagFile{}
  1711. for _, v := range *classify_info {
  1712. pid := qu.ObjToString(v["pid"])
  1713. _id := qu.BsonIdToSId(v["_id"])
  1714. name := qu.ObjToString(v["name"])
  1715. info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
  1716. info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
  1717. }
  1718. classify_map := map[string][]*ju.NameCode{}
  1719. for _, v := range *classify {
  1720. _id := qu.BsonIdToSId(v["_id"])
  1721. if info_map[_id] == nil {
  1722. continue
  1723. }
  1724. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1725. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1726. }
  1727. }
  1728. return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
  1729. }