extractInit.go 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734
  1. // extractInit
  2. package extract
  3. import (
  4. db "jy/mongodbutil"
  5. ju "jy/util"
  6. qu "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. "gopkg.in/mgo.v2/bson"
  14. log "github.com/donnie4w/go-logger/logger"
  15. "github.com/go-ego/gse"
  16. )
// RegLuaInfo holds one rule, which is either a regular expression or a Lua
// script.
type RegLuaInfo struct {
	// Regex or script information.
	// Code, Name and Field are filled from the s_code, s_name and s_field
	// values of the rule document by the loaders in this file.
	Code, Name, Field string //
	// Score is filled from s_default_score for core extraction rules.
	Score float64
	// RuleText is the raw rule: the Lua script (s_luascript) when IsLua is
	// set, otherwise the regex rule text (s_rule).
	RuleText string //
	// IsLua is true when the stored s_type equals "1".
	IsLua bool //
	// RegPreBac is the compiled regex used by pre-, post- and KV rules.
	RegPreBac *ExtReg //
	// RegCore is the compiled regex used by core extraction rules.
	RegCore *ExtReg //
}
// ExtReg is a compiled regular expression together with the data that says
// how its matches are used.
type ExtReg struct {
	// Reg is the compiled pattern.
	Reg *regexp.Regexp
	// Replace is the text after "__" in a replace-style rule ("" when the
	// rule has no such part).
	Replace string
	// Bextract is set by the core-rule loaders when the rule carries
	// extraction positions after "__".
	Bextract bool
	// ExtractPos maps a field name to the integer parsed from the
	// "position:field" pairs of the rule.
	ExtractPos map[string]int
	// NumSign is a sign-correction marker, e.g. for a floating rate
	// (upward +1, downward -1).
	NumSign int
}
// RuleCore groups all rules attached to one extraction logic entry.
type RuleCore struct {
	// Id is the logic entry id.
	Id string
	// Field is the logical field this entry belongs to.
	Field string
	// LuaLogic is the entry ("should this logic run") script.
	LuaLogic string
	// ExtFrom names the field to extract from; the loaders in this file set
	// it to "title" or "detail".
	ExtFrom string
	// RulePres are the rules applied before extraction.
	RulePres []*RegLuaInfo
	// RuleBacks are the rules applied after extraction.
	RuleBacks []*RegLuaInfo
	// RuleCores are the extraction rules themselves.
	RuleCores []*RegLuaInfo
	// KVRuleCores are the KV extraction cleaning rules.
	KVRuleCores []*RegLuaInfo
	// LFields is the attribute set of all fields.
	LFields map[string]string
}
// Tag is one entry of a tag library.
type Tag struct {
	// Type is the tag type: "string" for a plain string, "regexp" for a
	// regular expression.
	Type string
	// Key is the tag key.
	Key string //
	// Reg is a compiled regex for the tag.
	// NOTE(review): presumably only set for Type "regexp" - confirm at the loader.
	Reg *regexp.Regexp //
}
// TaskInfo carries the configuration of one extraction task.
type TaskInfo struct {
	// Task name, version, version id and tracking-record collection.
	Name, Version, VersionId, TrackColl string
	// Source database address, database name and collection name.
	FromDbAddr, FromDB, FromColl string
	// Result database address, database name and collection name.
	ToDbAddr, ToDB, ToColl string
	// Test-result collection and id of the last extracted record.
	TestColl, LastExtId string
	// FDB is a database connection pool.
	FDB *db.Pool
	// TDB is a database connection pool.
	TDB *db.Pool
	// IsEtxLog reports whether extraction logging is enabled.
	IsEtxLog bool
	// ProcessPool is the task process pool (a buffered channel).
	ProcessPool chan bool
	// TestLua is used for checking/testing.
	TestLua bool
}
// ExtractTask is the full runtime state of one extraction task: its
// configuration, the loaded rules, result buffers and the geographic lookup
// tables.
type ExtractTask struct {
	// Id is the task id.
	Id string
	// IsRun reports whether the task has been started.
	IsRun bool
	// Content is the information content.
	Content string
	// TaskInfo is the task configuration.
	TaskInfo *TaskInfo
	// RulePres are the common pre-processing rules.
	RulePres []*RegLuaInfo
	// RuleBacks are the common post-processing rules.
	RuleBacks []*RegLuaInfo
	// SiteRuleBacks are the site-level common post-processing rules.
	SiteRuleBacks []*RegLuaInfo
	// RuleBlock is a rule-block configuration (type declared in jy/util).
	RuleBlock *ju.RuleBlock
	//RuleCores []*RuleCore // extraction rules
	// RuleCores are the extraction rules grouped by category and then field.
	RuleCores map[string]map[string][]*RuleCore
	// SiteRuleCores are the site extraction rules grouped by category.
	SiteRuleCores map[string]map[string][]*RuleCore
	// PkgRuleCores are the package (lot) extraction rules.
	PkgRuleCores []*RuleCore
	// Tag is the tag library.
	Tag map[string][]*Tag
	// SiteTag is the site tag library.
	SiteTag map[string][]*Tag
	// ClearFn holds the cleaning functions.
	ClearFn map[string][]string
	// SiteClearFn holds the site cleaning functions.
	SiteClearFn map[string][]string
	// IsExtractCity reports whether city extraction is enabled.
	IsExtractCity bool
	// Fields is the set of extracted fields; the rule loaders set an entry
	// to 1 for every field that has an enabled rule.
	Fields map[string]int
	// SiteFields is the set of extracted site fields.
	SiteFields map[string]int
	// IsFileField reports whether attachment extraction is enabled.
	IsFileField bool
	// FileFields is the set of attachment extraction fields.
	FileFields *sync.Map
	// ResultChanel relates to the extraction result details.
	ResultChanel chan bool
	// Embedded read/write lock.
	// NOTE(review): which fields it guards is not visible here - confirm.
	sync.RWMutex
	// ResultArr buffers the extraction result details.
	ResultArr [][]map[string]interface{}
	// BidChanel relates to the extraction results.
	BidChanel chan bool
	// BidArr buffers the extraction results.
	BidArr [][]map[string]interface{}
	// BidTotal is the number of results.
	BidTotal int
	// RecogFieldMap holds the recognised fields.
	RecogFieldMap map[string]map[string]interface{}
	// FidClassMap holds the classifications.
	FidClassMap map[string][]map[string]interface{}
	// CidRuleMap holds the rules.
	CidRuleMap map[string][]map[string]interface{}
	// AuditFields lists the names of the fields that need auditing.
	AuditFields []string
	// SiteCityMap maps a site to its province/city/district.
	SiteCityMap map[string]*SiteCity
	// ProvinceMap maps a province full name to its short name
	// (key "浙江省", value "浙江").
	ProvinceMap map[string]string
	// ProvinceBriefMap maps a province short name to its province record
	// (key "浙江", value &Province{}).
	ProvinceBriefMap map[string]*Province
	// CityMap maps a city full name to its short name
	// (key "杭州市", value "杭州").
	CityMap map[string]string
	// CityBriefMap maps a city short name to its city record
	// (key "杭州", value &City{}).
	CityBriefMap map[string]*City
	// CityFullMap maps a city full name to its city record
	// (key "杭州市", value &City{}).
	CityFullMap map[string]*City
	// NOTE(review): presumably district name -> city; no comment in the
	// original - confirm.
	DistrictCityMap map[string]*City
	// NewDistrictCityMap maps a district/county full name to its cities;
	// the same district name exists in several places, hence a slice.
	NewDistrictCityMap map[string][]*City
	// DistrictSimAndAll maps a district/county short name to its full name.
	DistrictSimAndAll map[string]string
	// NewDistrictSimAndAll maps a district/county short name to the full
	// names sharing that short name, each paired with its city.
	NewDistrictSimAndAll map[string][]map[string]*City
	// StreetDistrictMap maps a street to its district or county.
	StreetDistrictMap map[string]*District
	// NewStreetDistrictMap maps a street full name to its districts or
	// counties.
	NewStreetDistrictMap map[string][]*District
	// CommunityDistrictMap maps a village / residents' committee to its
	// districts or counties.
	CommunityDistrictMap map[string][]*District
	// ProvinceAllGet matches province full names.
	ProvinceAllGet *ju.DFA
	// ProvinceSimGet matches province short names.
	ProvinceSimGet *ju.DFA
	// CityAllGet matches city full names.
	CityAllGet *ju.DFA
	// CitySimGet matches city short names.
	CitySimGet *ju.DFA
	// DistrictAllGet matches district/county full names.
	DistrictAllGet *ju.DFA
	// DistrictSimGet matches district/county short names.
	DistrictSimGet *ju.DFA
	// StreetGet matches streets.
	StreetGet *ju.DFA
	// PostCodeMap holds postal codes.
	PostCodeMap map[string]*PostCode
	// AreaCodeMap holds telephone area codes.
	AreaCodeMap map[string]*AreaCode
	// InfoType holds the documents of the "infotype" collection (appended
	// by InfoTypeList).
	InfoType []map[string]interface{}
	// Province full names: provinces, municipalities, autonomous regions.
	Trie_Full_Province *ju.Trie
	// City full names: prefecture-level cities.
	Trie_Full_City *ju.Trie
	// County full names: municipal districts, counties (banners),
	// county-level cities, autonomous counties (banners), special
	// districts, forestry districts.
	Trie_Full_District *ju.Trie
	// Street / township full names: towns, townships, ethnic townships,
	// county-administered districts, sub-districts.
	Trie_Full_Street *ju.Trie
	// Village / committee full names: villages, residents' committees.
	Trie_Full_Community *ju.Trie
	// Province short names.
	Trie_Sim_Province *ju.Trie
	// City short names.
	Trie_Sim_City *ju.Trie
	// County short names.
	Trie_Sim_District *ju.Trie
	// All full-name tries.
	Trie_Fulls []*ju.Trie
	// All short-name tries.
	Trie_Sims []*ju.Trie
	// Word segmenter.
	Seg_PCD *gse.Segmenter
	// Word segmenter.
	Seg_SV *gse.Segmenter
	// Luacodes holds the site rules: keyed by site script code, each value
	// is a map[string]interface{} that the site loaders fill under the keys
	// "e.SiteRuleBacks" and "e.SiteRuleCores".
	Luacodes *sync.Map
	// SiteMerge holds the extraction-merge flag (ismerge) per site script
	// code.
	SiteMerge *sync.Map
}
// SiteCity is the province/city/district assigned to a site.
type SiteCity struct {
	// P is the province short name.
	P string
	// C is the city full name.
	C string
	// D is the district full name.
	D string
}
// ClearTaskInfo carries the configuration of one cleaning task.
type ClearTaskInfo struct {
	// Task name, version and version id.
	Name, Version, VersionId string
	// Address, database name and collection name of the data to clean.
	FromDbAddr, FromDB, FromColl string
	// FDB is a database connection pool.
	FDB *db.Pool
	// TDB is a database connection pool.
	TDB *db.Pool
	// IsCltLog reports whether cleaning logging is enabled.
	IsCltLog bool
	// ProcessPool is the task process pool (a buffered channel).
	ProcessPool chan bool
}
// ClearLua is one Lua cleaning script.
type ClearLua struct {
	// Field is the field the script applies to.
	Field string
	// Code is the script code.
	Code string
	// Name is the script name.
	Name string
	// LuaText is the Lua source text.
	LuaText string
	//LuaLogic string // entry logic
	//ExtFrom string // which field to extract from
	// LFields is the attribute set of the fields handled by the Lua script.
	LFields map[string]string
}
// ClearTask is the runtime state of one cleaning task.
type ClearTask struct {
	// Embedded read/write lock.
	// NOTE(review): which fields it guards is not visible here - confirm.
	sync.RWMutex
	// Id is the task id.
	Id string
	// Content is the information content.
	Content string
	// ClearTaskInfo is the task configuration.
	ClearTaskInfo *ClearTaskInfo
	// ClearLuas holds the cleaning scripts.
	ClearLuas map[string][]*ClearLua
	// UpdateResult buffers the results after cleaning.
	UpdateResult [][]map[string]interface{}
	//ClearChannel chan bool
}
  167. func init() {
  168. TaskList = make(map[string]*ExtractTask)
  169. ClearTaskList = make(map[string]*ClearTask)
  170. go SaveExtLog()
  171. go SaveCltLog() //保存清理日志
  172. }
  173. //加载任务信息
  174. func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
  175. task, _ := db.Mgo.FindById("task", e.Id, nil)
  176. if len(*task) > 1 {
  177. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  178. e.TaskInfo = &TaskInfo{
  179. Name: (*task)["s_taskname"].(string),
  180. Version: (*task)["s_version"].(string),
  181. VersionId: qu.BsonIdToSId((*v)["_id"]),
  182. TrackColl: trackcoll,
  183. FromDbAddr: (*task)["s_mgoaddr"].(string),
  184. FromDB: (*task)["s_mgodb"].(string),
  185. FromColl: (*task)["s_mgocoll"].(string),
  186. TestColl: resultcoll,
  187. IsEtxLog: true,
  188. ProcessPool: make(chan bool, 1),
  189. }
  190. if (*v)["isextractcity"] != nil {
  191. e.IsExtractCity = (*v)["isextractcity"].(bool)
  192. }
  193. } else {
  194. return
  195. }
  196. }
  197. //加载任务信息
  198. func (e *ExtractTask) InitTaskInfo() {
  199. task, _ := db.Mgo.FindById("task", e.Id, nil)
  200. log.Debug("task", task)
  201. if len(*task) > 1 {
  202. v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
  203. strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
  204. log.Debug("s_mgosavecoll", strs)
  205. if len(strs) < 3 {
  206. return
  207. } else {
  208. e.TaskInfo = &TaskInfo{
  209. Name: (*task)["s_taskname"].(string),
  210. Version: (*task)["s_version"].(string),
  211. VersionId: qu.BsonIdToSId((*v)["_id"]),
  212. //TrackColl: (*task)["s_trackcoll"].(string),
  213. FromDbAddr: (*task)["s_mgoaddr"].(string),
  214. FromDB: (*task)["s_mgodb"].(string),
  215. FromColl: (*task)["s_mgocoll"].(string),
  216. ToDbAddr: strs[0],
  217. ToDB: strs[1],
  218. ToColl: strs[2],
  219. IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
  220. LastExtId: qu.ObjToString((*task)["s_extlastid"]),
  221. ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
  222. }
  223. if (*v)["isextractcity"] != nil {
  224. e.IsExtractCity = (*v)["isextractcity"].(bool)
  225. }
  226. }
  227. log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
  228. } else {
  229. return
  230. }
  231. }
  232. func (e *ExtractTask) InitSite() {
  233. e.Luacodes = &sync.Map{}
  234. e.SiteMerge = &sync.Map{}
  235. sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1)
  236. for _, v := range *sites {
  237. if vv, ok := v["site_script"].([]interface{}); ok {
  238. for _, vvv := range vv {
  239. e.Luacodes.Store(vvv, map[string]interface{}{})
  240. e.SiteMerge.Store(vvv, v["ismerge"].(bool))
  241. }
  242. } else if vv, ok := v["site_script"].(interface{}); ok {
  243. e.Luacodes.Store(vv, map[string]interface{}{})
  244. e.SiteMerge.Store(vv, v["ismerge"].(bool))
  245. }
  246. }
  247. }
  248. //加载通用前置规则
  249. func (e *ExtractTask) InitRulePres() {
  250. defer qu.Catch()
  251. e.RulePres = []*RegLuaInfo{}
  252. list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  253. for _, v := range *list {
  254. rinfo := &RegLuaInfo{
  255. Code: v["s_code"].(string),
  256. Name: v["s_name"].(string),
  257. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  258. }
  259. if rinfo.IsLua {
  260. rinfo.RuleText = v["s_luascript"].(string)
  261. e.RulePres = append(e.RulePres, rinfo)
  262. } else {
  263. qu.Try(func() {
  264. rinfo.RuleText = v["s_rule"].(string)
  265. tmp := strings.Split(rinfo.RuleText, "__")
  266. var pattern string
  267. if strings.Contains(tmp[0], "\\u") {
  268. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  269. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  270. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  271. } else {
  272. pattern = tmp[0]
  273. }
  274. if len(tmp) == 2 {
  275. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  276. } else {
  277. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  278. }
  279. e.RulePres = append(e.RulePres, rinfo)
  280. }, func(err interface{}) {
  281. log.Debug(rinfo.Code, rinfo.Field, err)
  282. })
  283. }
  284. }
  285. }
  286. //加载通用后置规则
  287. func (e *ExtractTask) InitRuleBacks(isSite bool) {
  288. defer qu.Catch()
  289. cDB := ""
  290. eSiteRuleBacks := []*RegLuaInfo{}
  291. if isSite {
  292. cDB = "site_rule_back"
  293. e.SiteRuleBacks = []*RegLuaInfo{}
  294. } else {
  295. cDB = "rule_back"
  296. e.RuleBacks = []*RegLuaInfo{}
  297. }
  298. list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  299. for _, v := range *list {
  300. rinfo := &RegLuaInfo{
  301. Code: v["s_code"].(string),
  302. Name: v["s_name"].(string),
  303. IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
  304. }
  305. if rinfo.IsLua {
  306. rinfo.RuleText = v["s_luascript"].(string)
  307. if isSite {
  308. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  309. //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo)
  310. } else {
  311. e.RuleBacks = append(e.RuleBacks, rinfo)
  312. }
  313. } else {
  314. qu.Try(func() {
  315. rinfo.RuleText = v["s_rule"].(string)
  316. tmp := strings.Split(rinfo.RuleText, "__")
  317. var pattern string
  318. if strings.Contains(tmp[0], "\\u") {
  319. tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
  320. tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
  321. pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
  322. } else {
  323. pattern = tmp[0]
  324. }
  325. if len(tmp) == 2 {
  326. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
  327. } else {
  328. rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
  329. }
  330. if isSite {
  331. eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
  332. } else {
  333. e.RuleBacks = append(e.RuleBacks, rinfo)
  334. }
  335. }, func(err interface{}) {
  336. log.Debug(rinfo.Code, rinfo.Field, err)
  337. })
  338. }
  339. if isSite {
  340. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
  341. if (*sm) == nil || len(*sm) <= 0 {
  342. eSiteRuleBacks = []*RegLuaInfo{}
  343. continue
  344. }
  345. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  346. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  347. if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil {
  348. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks
  349. } else {
  350. if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 {
  351. tmplist = append(tmplist, eSiteRuleBacks...)
  352. mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist
  353. }
  354. //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...)
  355. }
  356. e.Luacodes.Store(v2, mdpvalue)
  357. }
  358. }
  359. eSiteRuleBacks = []*RegLuaInfo{}
  360. }
  361. }
  362. }
  363. func (e *ExtractTask) InfoTypeList() {
  364. infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
  365. infolist := *infolist1
  366. for _, v := range infolist {
  367. e.InfoType = append(e.InfoType, v)
  368. }
  369. }
// InitRuleCore loads the extraction rules.
//
// With isSite false it reads the "versioninfo"/"rule_logic*" collections and
// fills e.Fields and e.RuleCores. With isSite true it reads the "site_*"
// counterparts, fills e.SiteFields, and attaches the rules under the key
// "e.SiteRuleCores" to the e.Luacodes entry of every script code of the
// owning site; e.SiteRuleCores itself is only reset to an empty map.
//
// For every enabled version-info document and every enabled logic document
// below it, one RuleCore is built with its pre rules, post rules, core
// extraction rules and KV rules. The RuleCores are grouped by field and then
// mapped onto the categories of the "infotype" collection: key "topclass" for
// a type without subclasses, "topclass_subclass" otherwise.
//
// Regex rules have the form "pattern" or "pattern__suffix". For pre, post and
// KV rules the suffix is the replacement text; for core rules it is a
// comma-separated list of "position:field" pairs (or a bare position, which
// applies to the rule's own field).
//
// NOTE(review): qu.Catch is presumably a recover helper and qu.Try presumably
// runs the first function and passes a panic value to the second - confirm in
// qfw/util.
// NOTE(review): the error of strconv.Unquote is discarded everywhere below; on
// failure the pattern is "" and compiles to a regex that matches everywhere.
func (e *ExtractTask) InitRuleCore(isSite bool) {
	defer qu.Catch()
	allFields := getALLFields()
	var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb, rule_logickvdb string
	// Per-version-info category map; only consumed in site mode.
	eSiteRuleCores := make(map[string]map[string][]*RuleCore)
	if isSite {
		versioninfodb = "site_versioninfo"
		rule_logicdb = "site_rule_logic"
		rule_logicpredb = "site_rule_logicpre"
		rule_logicbackdb = "site_rule_logicback"
		rule_logicoredb = "site_rule_logicore"
		rule_logickvdb = "site_rule_logickv"
		e.SiteFields = map[string]int{}
		e.SiteRuleCores = make(map[string]map[string][]*RuleCore)
	} else {
		versioninfodb = "versioninfo"
		rule_logicdb = "rule_logic"
		rule_logicpredb = "rule_logicpre"
		rule_logicbackdb = "rule_logicback"
		rule_logicoredb = "rule_logicore"
		rule_logickvdb = "rule_logickv"
		e.Fields = map[string]int{}
		e.RuleCores = make(map[string]map[string][]*RuleCore)
	}
	// RuleCores grouped by field. In site mode the map is reset after every
	// version-info document; otherwise it accumulates over all of them.
	fieldrules := map[string][]*RuleCore{}
	vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
	for _, vinfo := range *vinfos {
		// Skip disabled entries (a missing "isuse" counts as disabled).
		if b, _ := vinfo["isuse"].(bool); !b {
			continue
		}
		s_field := qu.ObjToString(vinfo["s_field"])
		pid := qu.BsonIdToSId(vinfo["_id"])
		list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
		for _, vv := range *list {
			if b, _ := vv["isuse"].(bool); !b {
				continue
			}
			rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
			rcore.Field = s_field
			rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) // entry-logic script
			// NOTE(review): unchecked assertion - a logic document without a
			// boolean "extfrom" panics here.
			rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
			rcore.LFields = allFields
			// Pre rules.
			rulePres := []*RegLuaInfo{}
			plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
			for _, v := range *plist {
				rinfo := &RegLuaInfo{
					Field: qu.ObjToString(v["s_field"]),
					Code:  v["s_code"].(string),
					Name:  v["s_name"].(string),
					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
				}
				if rinfo.IsLua {
					rinfo.RuleText = v["s_luascript"].(string)
					rulePres = append(rulePres, rinfo)
				} else {
					qu.Try(func() {
						rinfo.RuleText = v["s_rule"].(string)
						tmp := strings.Split(rinfo.RuleText, "__")
						var pattern string
						if strings.Contains(tmp[0], "\\u") {
							// Escape all backslashes except those starting a
							// \u escape, then decode the \u escapes.
							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
						} else {
							pattern = tmp[0]
						}
						if len(tmp) == 2 {
							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
						} else {
							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
						}
						rulePres = append(rulePres, rinfo)
					}, func(err interface{}) {
						log.Debug(rinfo.Code, rinfo.Field, err)
					})
				}
			}
			rcore.RulePres = rulePres
			// Post rules.
			ruleBacks := []*RegLuaInfo{}
			blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
			for _, v := range *blist {
				rinfo := &RegLuaInfo{
					Field: qu.ObjToString(v["s_field"]),
					Code:  v["s_code"].(string),
					Name:  v["s_name"].(string),
					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
				}
				if rinfo.IsLua {
					rinfo.RuleText = v["s_luascript"].(string)
					ruleBacks = append(ruleBacks, rinfo)
				} else {
					qu.Try(func() {
						rinfo.RuleText = v["s_rule"].(string)
						tmp := strings.Split(rinfo.RuleText, "__")
						var pattern string
						if strings.Contains(tmp[0], "\\u") {
							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
						} else {
							pattern = tmp[0]
						}
						if len(tmp) == 2 {
							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
						} else {
							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
						}
						ruleBacks = append(ruleBacks, rinfo)
					}, func(err interface{}) {
						log.Debug(rinfo.Code, rinfo.Field, err)
					})
				}
			}
			rcore.RuleBacks = ruleBacks
			// Core extraction rules.
			ruleCores := []*RegLuaInfo{}
			clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
			for _, v := range *clist {
				if b, _ := v["isuse"].(bool); !b {
					continue
				}
				field := qu.ObjToString(v["s_field"])
				if isSite {
					e.SiteFields[field] = 1
				} else {
					e.Fields[field] = 1 // remember the field in the extraction field set
				}
				rinfo := &RegLuaInfo{
					Field: field,
					Code:  v["s_code"].(string),
					Name:  v["s_name"].(string),
					Score: qu.Float64All(v["s_default_score"]),
					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
				}
				if rinfo.IsLua {
					rinfo.RuleText = v["s_luascript"].(string)
					// Extract all attributes.
					ruleCores = append(ruleCores, rinfo)
				} else {
					qu.Try(func() {
						rinfo.RuleText = v["s_rule"].(string)
						tmp := strings.Split(rinfo.RuleText, "__")
						var pattern string
						if strings.Contains(tmp[0], "\\u") {
							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
						} else {
							pattern = tmp[0]
						}
						if len(tmp) == 2 {
							// The suffix lists which position feeds which field.
							epos := strings.Split(tmp[1], ",")
							posm := map[string]int{}
							for _, v := range epos {
								ks := strings.Split(v, ":")
								if len(ks) == 2 { // e.g. (.*)招标公告(.*)__2:projectname,4:area
									posm[ks[1]] = qu.IntAll(ks[0])
								} else { // e.g. (.*)招标公告__2
									posm[rinfo.Field] = qu.IntAll(ks[0])
								}
							}
							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
						} else {
							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
						}
						ruleCores = append(ruleCores, rinfo)
					}, func(err interface{}) {
						log.Debug(rinfo.Code, rinfo.Field, err)
					})
				}
			}
			rcore.RuleCores = ruleCores
			// KV rules. These are always treated as regex rules, whatever
			// IsLua says.
			kvRuleCores := []*RegLuaInfo{}
			kvlist, _ := db.Mgo.Find(rule_logickvdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
			for _, v := range *kvlist {
				if b, _ := v["isuse"].(bool); !b {
					continue
				}
				field := qu.ObjToString(v["s_field"])
				if isSite {
					e.SiteFields[field] = 1
				} else {
					e.Fields[field] = 1 // remember the field in the extraction field set
				}
				rinfo := &RegLuaInfo{
					Field: field,
					Code:  v["s_code"].(string),
					Name:  v["s_name"].(string),
					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
				}
				qu.Try(func() {
					rinfo.RuleText = v["s_rule"].(string)
					tmp := strings.Split(rinfo.RuleText, "__")
					var pattern string
					if strings.Contains(tmp[0], "\\u") {
						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
					} else {
						pattern = tmp[0]
					}
					if len(tmp) == 2 {
						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
					} else {
						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
					}
					kvRuleCores = append(kvRuleCores, rinfo)
				}, func(err interface{}) {
					log.Debug(rinfo.Code, rinfo.Field, err)
				})
			}
			rcore.KVRuleCores = kvRuleCores
			if fieldrules[s_field] == nil {
				fieldrules[s_field] = []*RuleCore{}
			}
			fieldrules[s_field] = append(fieldrules[s_field], rcore)
		}
		// Map the field rules of this version-info document onto the info
		// type categories.
		// NOTE(review): this query and the resulting eSiteRuleCores are only
		// used in site mode, yet they run for every version-info document in
		// both modes - consider hoisting or guarding.
		infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
		for _, v := range *infolist {
			topclass := qu.ObjToString(v["topclass"])
			if v["subclass"] == nil {
				eSiteRuleCores[topclass] = make(map[string][]*RuleCore)
				for attr, _ := range v["fields"].(map[string]interface{}) {
					if fieldrules[attr] != nil {
						eSiteRuleCores[topclass][attr] = fieldrules[attr]
					}
				}
			} else {
				for ca, fs := range v["subclass"].(map[string]interface{}) {
					eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
					for field, _ := range fs.(map[string]interface{}) {
						if fieldrules[field] != nil {
							eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field]
						}
					}
				}
			}
		}
		if isSite {
			// Attach the rules to every script code of the owning site.
			sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1})
			if (*sm) == nil || len(*sm) <= 0 {
				eSiteRuleCores = make(map[string]map[string][]*RuleCore)
				fieldrules = map[string][]*RuleCore{}
				continue
			}
			// NOTE(review): unchecked assertion - panics when "site_script"
			// is not a list.
			for _, v2 := range (*sm)["site_script"].([]interface{}) {
				if mdpvalue, ok := e.Luacodes.Load(v2); ok {
					// Attribute configuration.
					if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil {
						mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores
					} else {
						// Merge into the categories already stored for this
						// code. k2/v2 shadow the outer v2 inside this loop.
						for k2, v2 := range eSiteRuleCores {
							// NOTE(review): tmpv is nil when the stored map has
							// no entry for k2; the write below would then panic.
							tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2]
							for kkkk, vvv := range v2 {
								tmpv[kkkk] = vvv
							}
							mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv
						}
					}
					e.Luacodes.Store(v2, mdpvalue)
				}
			}
			// Start the next version-info document with fresh accumulators.
			eSiteRuleCores = make(map[string]map[string][]*RuleCore)
			fieldrules = map[string][]*RuleCore{}
		}
	}
	if !isSite {
		// Attribute configuration: map the accumulated field rules onto the
		// info type categories.
		infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
		for _, v := range *infolist {
			topclass := qu.ObjToString(v["topclass"])
			if v["subclass"] == nil {
				e.RuleCores[topclass] = make(map[string][]*RuleCore)
				for attr, _ := range v["fields"].(map[string]interface{}) {
					if fieldrules[attr] != nil {
						e.RuleCores[topclass][attr] = fieldrules[attr]
					}
				}
			} else {
				for ca, fs := range v["subclass"].(map[string]interface{}) {
					e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
					for field, _ := range fs.(map[string]interface{}) {
						if fieldrules[field] != nil {
							e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
						}
					}
				}
			}
		}
	}
}
// InitPkgCore loads the package-splitting ("pkg") extraction rules.
//
// It queries "pkg_info" for the task's VersionId, skips records whose "isuse"
// flag is not true, and for every remaining record builds a RuleCore (ExtFrom
// is always "detail") that holds:
//   - RuleBacks: post-processing rules read from "pkg_logicback";
//   - RuleCores: extraction rules read from "pkg_logicore" (only "isuse" ones).
//
// The resulting list replaces e.PkgRuleCores. Every field named by an
// extraction rule is also recorded in e.Fields.
func (e *ExtractTask) InitPkgCore() {
	// NOTE(review): qu.Catch presumably recovers panics raised below (e.g. by
	// the unchecked type assertions) — confirm against the qu package.
	defer qu.Catch()
	e.PkgRuleCores = []*RuleCore{}
	pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
	for _, pkginfo := range *pkginfos {
		// Only enabled package definitions are loaded.
		if b, _ := pkginfo["isuse"].(bool); !b {
			continue
		}
		s_field := qu.ObjToString(pkginfo["s_field"])
		sid := qu.BsonIdToSId(pkginfo["_id"])
		rcore := &RuleCore{}
		rcore.Field = s_field
		rcore.ExtFrom = "detail"
		// Post-processing ("back") rules.
		ruleBacks := []*RegLuaInfo{}
		blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
		for _, v := range *blist {
			rinfo := &RegLuaInfo{
				Field: qu.ObjToString(v["s_field"]),
				Code:  v["s_code"].(string),
				Name:  v["s_name"].(string),
				// s_type "1" marks a Lua rule; anything else is a regex rule.
				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
			}
			if rinfo.IsLua {
				rinfo.RuleText = v["s_luascript"].(string)
				ruleBacks = append(ruleBacks, rinfo)
			} else {
				// Regex rule text has the form "pattern" or "pattern__replacement".
				// A failure inside the first closure is reported through the second
				// one, and the rule is then not appended.
				qu.Try(func() {
					rinfo.RuleText = v["s_rule"].(string)
					tmp := strings.Split(rinfo.RuleText, "__")
					var pattern string
					if strings.Contains(tmp[0], "\\u") {
						// Decode \uXXXX escapes while keeping every other backslash
						// literal: double all backslashes, restore the ones that
						// introduce \u, then unquote. The Unquote error is ignored.
						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
					} else {
						pattern = tmp[0]
					}
					if len(tmp) == 2 {
						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
					} else {
						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
					}
					ruleBacks = append(ruleBacks, rinfo)
				}, func(err interface{}) {
					log.Debug(rinfo.Code, rinfo.Field, err)
				})
			}
		}
		rcore.RuleBacks = ruleBacks
		// Extraction ("core") rules.
		ruleCores := []*RegLuaInfo{}
		clist, _ := db.Mgo.Find("pkg_logicore", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
		for _, v := range *clist {
			if b, _ := v["isuse"].(bool); !b {
				continue
			}
			field := qu.ObjToString(v["s_field"])
			e.Fields[field] = 1 // remember the field in the set of extracted attributes for later use
			rinfo := &RegLuaInfo{
				Field: field,
				Code:  v["s_code"].(string),
				Name:  v["s_name"].(string),
				IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
			}
			if rinfo.IsLua {
				rinfo.RuleText = v["s_luascript"].(string)
				// Lua rule: extracts all attributes.
				ruleCores = append(ruleCores, rinfo)
			} else {
				// Regex rule text has the form "pattern" or "pattern__positions",
				// where positions is a comma-separated list of "index" or
				// "index:field" entries.
				qu.Try(func() {
					rinfo.RuleText = v["s_rule"].(string)
					tmp := strings.Split(rinfo.RuleText, "__")
					var pattern string
					if strings.Contains(tmp[0], "\\u") {
						// Same \uXXXX decoding as for the post-processing rules above.
						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
					} else {
						pattern = tmp[0]
					}
					if len(tmp) == 2 {
						epos := strings.Split(tmp[1], ",")
						posm := map[string]int{}
						// Note: this v shadows the outer record variable.
						for _, v := range epos {
							ks := strings.Split(v, ":")
							if len(ks) == 2 { // e.g. (.*)招标公告(.*)__2:projectname,4:area
								posm[ks[1]] = qu.IntAll(ks[0])
							} else { // e.g. (.*)招标公告__2 — the position applies to the rule's own field
								posm[rinfo.Field] = qu.IntAll(ks[0])
							}
						}
						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
					} else {
						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
					}
					ruleCores = append(ruleCores, rinfo)
				}, func(err interface{}) {
					log.Debug(rinfo.Code, rinfo.Field, err)
				})
			}
		}
		rcore.RuleCores = ruleCores
		e.PkgRuleCores = append(e.PkgRuleCores, rcore)
	}
}
// InitTag loads the tag library (string tags first, then regex tags) for the
// task's Version.
//
// With isSite == false it reads "tagdetailinfo", fills e.Tag and stores one
// ju.TagFile per record in ju.TagdbTable (regex tables under name+"_reg").
//
// With isSite == true it reads "site_tagdetailinfo", resets e.SiteTag to an
// empty map, and attaches the tags of each record to the e.Luacodes entries of
// the scripts listed in the record's "site_management" document (under the
// "e.SiteTag" key). Table data goes to ju.SiteTagdbTable.
func (e *ExtractTask) InitTag(isSite bool) {
	// NOTE(review): qu.Catch presumably recovers panics from the unchecked type
	// assertions below — confirm.
	defer qu.Catch()
	var tagdetailinfodb string
	// Per-record accumulator for site tags; it is reset after every record.
	eSiteTag := map[string][]*Tag{}
	if isSite {
		tagdetailinfodb = "site_tagdetailinfo"
		e.SiteTag = map[string][]*Tag{}
	} else {
		tagdetailinfodb = "tagdetailinfo"
		e.Tag = map[string][]*Tag{}
	}
	// String tag library.
	list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
	var tmpMap sync.Map
	for _, v := range *list {
		field := qu.ObjToString(v["s_field"])
		if tmp, ok := v["content"].([]interface{}); ok {
			fname := qu.ObjToString(v["s_name"])
			tab := ju.TagFile{Name: fname} // used for table key/value handling
			tab.Items = make([]*ju.Tag, len(tmp))
			for k, key := range tmp {
				tag := &Tag{Type: "string", Key: key.(string)}
				if isSite {
					eSiteTag[field] = append(eSiteTag[field], tag)
					//e.SiteTag[field] = append(e.SiteTag[field], tag)
				} else {
					e.Tag[field] = append(e.Tag[field], tag)
				}
				// The third value is the negated position of the key in the list.
				tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false}
			}
			sort.Sort(tab.Items)
			//ju.TagdbTable[fname] = &tab
			if isSite {
				sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
				if (*sm) == nil || len(*sm) <= 0 {
					// No site document: drop the tags collected for this record.
					eSiteTag = map[string][]*Tag{}
					continue
				}
				for _, v2 := range (*sm)["site_script"].([]interface{}) {
					if v2 == nil || v2 == "" {
						continue
					}
					if mdpvalue, ok := e.Luacodes.Load(v2); ok {
						if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
							mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
						} else {
							// Merge into the existing map; the inner v2 shadows the
							// script key only inside this loop.
							for k2, v2 := range eSiteTag {
								mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
							}
						}
						e.Luacodes.Store(v2, mdpvalue)
					}
					tmpMap.Store(fname, &tab)
					// NOTE(review): tmpMap is a sync.Map passed by value here, so a
					// copy is stored (go vet's copylocks check flags this), and the
					// same tmpMap is shared across all records — confirm intent.
					ju.SiteTagdbTable.Store(v2, tmpMap)
				}
				//ju.SiteTagdbTable.Store(fname, &tab)
				eSiteTag = map[string][]*Tag{}
			} else {
				ju.TagdbTable.Store(fname, &tab)
			}
		}
		//if isSite {
		//	sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
		//	for _, v2 := range (*sm)["site_script"].([]interface{}) {
		//		if mdpvalue, ok := Luacodes.Load(v2); ok {
		//			if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{
		//				mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
		//			}else {
		//				for k2,v2 := range eSiteTag{
		//					mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
		//				}
		//			}
		//			Luacodes.Store(v2, mdpvalue)
		//		}
		//	}
		//	eSiteTag = map[string][]*Tag{}
		//}
	}
	// Regex tag library.
	list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
	for _, v := range *list {
		field := qu.ObjToString(v["s_field"])
		if tmp, ok := v["content"].([]interface{}); ok {
			fname := qu.ObjToString(v["s_name"])
			tab := ju.TagFile{Name: fname, Type: "reg"} // used for table key/value handling
			tab.Items = make([]*ju.Tag, len(tmp))
			for k, key := range tmp {
				// MustCompile panics on an invalid pattern.
				tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
				if isSite {
					eSiteTag[field] = append(eSiteTag[field], tag)
					//e.SiteTag[field] = append(e.SiteTag[field], tag)
				} else {
					e.Tag[field] = append(e.Tag[field], tag)
				}
				tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false}
			}
			sort.Sort(tab.Items)
			//ju.TagdbTable[fname+"_reg"] = &tab
			// Unlike the string section, site regex tables are keyed by
			// name+"_reg" rather than by script.
			if isSite {
				ju.SiteTagdbTable.Store(fname+"_reg", &tab)
			} else {
				ju.TagdbTable.Store(fname+"_reg", &tab)
			}
		}
		// Unlike the string section, this site lookup runs even when "content"
		// was not a list.
		if isSite {
			sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
			if (*sm) == nil || len(*sm) <= 0 {
				eSiteTag = map[string][]*Tag{}
				continue
			}
			for _, v2 := range (*sm)["site_script"].([]interface{}) {
				if mdpvalue, ok := e.Luacodes.Load(v2); ok {
					if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
						mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
					} else {
						for k2, v2 := range eSiteTag {
							mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
						}
					}
					e.Luacodes.Store(v2, mdpvalue)
				}
			}
			eSiteTag = map[string][]*Tag{}
		}
	}
}
  899. //获取fields
  900. func getALLFields() map[string]string {
  901. fields := map[string]string{}
  902. list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
  903. for _, v := range *list {
  904. fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
  905. }
  906. return fields
  907. }
  908. //加载clear函数
  909. func (e *ExtractTask) InitClearFn(isSite bool) {
  910. defer qu.Catch()
  911. var cleanupdb string
  912. if isSite {
  913. cleanupdb = "site_cleanup"
  914. e.SiteClearFn = map[string][]string{}
  915. } else {
  916. cleanupdb = "cleanup"
  917. }
  918. list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
  919. fn := map[string][]string{}
  920. for _, tmp := range *list {
  921. field := tmp["s_field"].(string)
  922. fns := tmp["clear"].([]interface{})
  923. if fn[field] == nil {
  924. fn[field] = []string{}
  925. }
  926. for _, v := range fns {
  927. fn[field] = append(fn[field], v.(string))
  928. }
  929. if isSite {
  930. sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1})
  931. if (*sm) == nil || len(*sm) <= 0 {
  932. fn = map[string][]string{}
  933. continue
  934. }
  935. for _, v2 := range (*sm)["site_script"].([]interface{}) {
  936. if mdpvalue, ok := e.Luacodes.Load(v2); ok {
  937. if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil {
  938. mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn
  939. } else {
  940. for k2, v2 := range fn {
  941. mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2
  942. }
  943. }
  944. e.Luacodes.Store(v2, mdpvalue)
  945. }
  946. }
  947. fn = map[string][]string{}
  948. }
  949. }
  950. if !isSite {
  951. e.ClearFn = fn
  952. }
  953. }
  954. //加载省份
  955. func InitProvince(version string) map[string]interface{} {
  956. defer qu.Catch()
  957. fn := map[string]interface{}{}
  958. list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
  959. for _, v := range *list {
  960. name := qu.ObjToString(v["s_name"])
  961. content := v["content"]
  962. switch content.(type) {
  963. case string:
  964. fn[name] = []interface{}{content.(string)}
  965. case []interface{}:
  966. fn[name] = content
  967. }
  968. }
  969. return fn
  970. }
  971. //加载所有
  972. func InitProvincesx() []map[string]interface{} {
  973. defer qu.Catch()
  974. provinces := make([]map[string]interface{}, 0)
  975. ju.AddrsSess.Find(map[string]interface{}{
  976. "Remarks": nil,
  977. }).All(&provinces)
  978. return provinces
  979. }
  980. //加载站点库site城市信息
  981. func InitSite() []map[string]interface{} {
  982. defer qu.Catch()
  983. query := map[string]interface{}{
  984. "depttype": map[string]interface{}{
  985. "$ne": "代理机构",
  986. },
  987. }
  988. list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
  989. return *list
  990. }
// InitCityInfo builds the in-memory region lookup structures of the task.
//
// After calling InitVar it:
//  1. fills e.SiteCityMap from InitSite;
//  2. registers the province keywords returned by InitProvince;
//  3. groups the address records returned by InitProvincesx by the length of
//     their "code" value (2, 4, 6, 9 or 12 characters) into per-level maps;
//  4. walks provinces and their cities, delegating districts and streets to
//     initDistricts, to populate the tries and lookup maps;
//  5. publishes the trie lists e.Trie_Fulls and e.Trie_Sims.
func (e *ExtractTask) InitCityInfo() {
	defer qu.Catch()
	e.InitVar() // initialise tries, segmenters and maps
	// Site information: keep sites that have a name and a specific area
	// (not empty and not the literal "全国").
	for _, v := range InitSite() {
		site, _ := v["site"].(string)
		area, _ := v["area"].(string)
		city, _ := v["city"].(string)
		district, _ := v["district"].(string)
		if area != "" && area != "全国" && site != "" {
			s := &SiteCity{
				P: area,
				C: city,
				D: district,
			}
			e.SiteCityMap[site] = s
		}
	}
	// Province keywords: every keyword is added to the full-name province trie
	// and mapped back to the record name it came from.
	fn1 := InitProvince(e.TaskInfo.Version)
	for k, v := range fn1 {
		for _, p := range v.([]interface{}) {
			p1, _ := p.(string)
			e.Trie_Full_Province.AddWords(p1) // e.g. 华中科技大学
			e.ProvinceMap[p1] = k             // e.g. 华中科技大学 -> 湖北
		}
	}
	alldata := InitProvincesx()
	// Records grouped by code length: 2 -> fnx (iterated as provinces below),
	// 4 -> citys_maps, 6 -> districts_maps, 9 -> towns_maps, 12 -> jwhs_maps.
	fnx := make([]map[string]interface{}, 0)
	citys_maps := make(map[string][]map[string]interface{}, 0)
	districts_maps := make(map[string]map[string][]map[string]interface{}, 0)
	towns_maps := make(map[string]map[string]map[string][]map[string]interface{}, 0)
	jwhs_maps := make(map[string]map[string]map[string]map[string][]map[string]interface{}, 0)
	for _, v := range alldata {
		// Panics (handled by the deferred qu.Catch) if "code" is not a string.
		codenum := len(v["code"].(string))
		province := qu.ObjToString(v["province"])
		city := qu.ObjToString(v["city"])
		district := qu.ObjToString(v["district"])
		town := qu.ObjToString(v["town"])
		if codenum == 2 {
			fnx = append(fnx, v)
		} else if codenum == 4 {
			citys_maps[province] = append(citys_maps[province], v)
		} else if codenum == 6 {
			if districts_maps[province] == nil {
				districts_maps[province] = make(map[string][]map[string]interface{}, 0)
			}
			districts_maps[province][city] = append(districts_maps[province][city], v)
		} else if codenum == 9 {
			if towns_maps[province] == nil {
				towns_maps[province] = make(map[string]map[string][]map[string]interface{}, 0)
			}
			if towns_maps[province][city] == nil {
				towns_maps[province][city] = make(map[string][]map[string]interface{}, 0)
			}
			towns_maps[province][city][district] = append(towns_maps[province][city][district], v)
		} else if codenum == 12 {
			if jwhs_maps[province] == nil {
				jwhs_maps[province] = make(map[string]map[string]map[string][]map[string]interface{}, 0)
			}
			if jwhs_maps[province][city] == nil {
				jwhs_maps[province][city] = make(map[string]map[string][]map[string]interface{}, 0)
			}
			if jwhs_maps[province][city][district] == nil {
				jwhs_maps[province][city][district] = make(map[string][]map[string]interface{}, 0)
			}
			jwhs_maps[province][city][district][town] = append(jwhs_maps[province][city][district][town], v)
		}
	}
	// Provinces and, nested below them, cities.
	for _, provinces := range fnx {
		all_province := qu.ObjToString(provinces["all_province"]) // full province name
		jc_province := qu.ObjToString(provinces["province"])      // short province name
		// Province information.
		e.Trie_Full_Province.AddWords(all_province) // add to the full-name province trie (e.g. 浙江省)
		p := &Province{}
		p.Name = all_province                     // full name, e.g. 浙江省
		p.Brief = jc_province                     // short name, e.g. 浙江
		e.Trie_Sim_Province.AddWords(jc_province) // add to the short-name province trie (e.g. 浙江)
		e.ProvinceMap[all_province] = jc_province // e.g. 浙江省 -> 浙江
		e.ProvinceBriefMap[jc_province] = p       // e.g. 浙江 -> province info
		if province_alias, ok := provinces["province_alias"].([]interface{}); ok {
			for _, vprovince_alias := range province_alias {
				e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p
			}
		}
		// City information for this province.
		citys := citys_maps[jc_province]
		// isok records "province_cityname" keys whose districts were already
		// loaded through a city alias, so they are not loaded twice.
		isok := make(map[string]bool)
		for _, vcity := range citys {
			qc_city := qu.ObjToString(vcity["city"])
			jc_city := qu.ObjToString(vcity["brief_city"])
			e.Trie_Full_City.AddWords(qc_city) // add to the full-name city trie (e.g. 杭州市)
			c := &City{}
			c.Name = qc_city // full city name, e.g. 杭州市
			if jc_city != "" {
				c.Brief = jc_city                 // short city name, e.g. 杭州
				e.Trie_Sim_City.AddWords(c.Brief) // add to the short-name city trie (e.g. 杭州)
				e.CityMap[qc_city] = c.Brief      // e.g. 杭州市 -> 杭州
				e.CityBriefMap[c.Brief] = c       // e.g. 杭州 -> city info
				e.CityFullMap[qc_city] = c        // e.g. 杭州市 -> city info
			}
			c.P = p
			if city_alias, ok := vcity["city_alias"].([]interface{}); ok {
				for _, vcity_alias := range city_alias {
					strvcity_alias := qu.ObjToString(vcity_alias)
					if isok[jc_province+"_"+strvcity_alias] {
						continue
					}
					e.CityBriefMap[strvcity_alias] = c
					e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps)
					isok[jc_province+"_"+strvcity_alias] = true
				}
			}
			if isok[jc_province+"_"+qc_city] {
				continue
			}
			e.initDistricts(jc_province, qc_city, c, jc_city, districts_maps, towns_maps, jwhs_maps)
		}
	}
	e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community}
	e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District}
}
// initDistricts registers the districts (and their streets) of one city.
//
// Parameters:
//   - jc_province: short province name, first key into the grouped maps;
//   - qc_city: city name (or city alias) used as the second key;
//   - c: the City the districts are attached to;
//   - jc_city: short city name, used to look the city up in e.CityBriefMap;
//   - districts_maps / towns_maps: records grouped by province, city
//     (and district) as built by InitCityInfo;
//   - jwhs_maps: only referenced by the commented-out village code below.
func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
	jc_city string, districts_maps map[string]map[string][]map[string]interface{},
	towns_maps map[string]map[string]map[string][]map[string]interface{},
	jwhs_maps map[string]map[string]map[string]map[string][]map[string]interface{}) {
	districts := districts_maps[jc_province][qc_city]
	for _, vdistricts := range districts {
		qc_district := qu.ObjToString(vdistricts["district"])
		jc_district := qu.ObjToString(vdistricts["brief_district"])
		d := &District{}
		d.Name = qc_district
		d.C = c
		e.Trie_Full_District.AddWords(qc_district) // add to the full-name district/county trie
		if jc_district != "" {
			e.Trie_Sim_District.AddWords(jc_district) // add to the short-name district/county trie
			// Record short district name -> {full district name: city}.
			// This c shadows the parameter inside this block only; it is the
			// city registered under jc_city and is nil when none is registered.
			c := e.CityBriefMap[jc_city]
			dfullarr := e.NewDistrictSimAndAll[jc_district]
			dfullcity := map[string]*City{qc_district: c}
			if len(dfullarr) == 0 {
				tmparr := []map[string]*City{dfullcity}
				e.NewDistrictSimAndAll[jc_district] = tmparr
			} else {
				e.NewDistrictSimAndAll[jc_district] = append(e.NewDistrictSimAndAll[jc_district], dfullcity)
			}
		}
		// Full district name -> cities that contain a district of that name
		// (here c is the parameter again).
		ctmp := e.NewDistrictCityMap[qc_district]
		if len(ctmp) == 0 {
			tmpcarr := []*City{c}
			e.NewDistrictCityMap[qc_district] = tmpcarr
		} else {
			e.NewDistrictCityMap[qc_district] = append(e.NewDistrictCityMap[qc_district], c)
		}
		// District aliases are treated like full district names.
		if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
			for _, vdistrict_alias := range district_alias {
				strvdistrict_alias := qu.ObjToString(vdistrict_alias)
				e.Trie_Full_District.AddWords(strvdistrict_alias) // add to the full-name district/county trie
				ctmp := e.NewDistrictCityMap[strvdistrict_alias]
				if len(ctmp) == 0 {
					tmpcarr := []*City{c}
					e.NewDistrictCityMap[strvdistrict_alias] = tmpcarr
				} else {
					e.NewDistrictCityMap[strvdistrict_alias] = append(e.NewDistrictCityMap[strvdistrict_alias], c)
				}
			}
		}
		// Streets (towns) of this district.
		towns := towns_maps[jc_province][qc_city][qc_district]
		for _, vtown := range towns {
			strvtown := qu.ObjToString(vtown["town"])
			// NOTE(review): s is populated but not used afterwards in this function.
			s := &Street{}
			s.Name = strvtown
			s.D = d
			e.Trie_Full_Street.AddWords(strvtown) // add to the full-name street trie
			dtmp := e.NewStreetDistrictMap[strvtown]
			if len(dtmp) == 0 {
				tmpdarr := []*District{d}
				e.NewStreetDistrictMap[strvtown] = tmpdarr
			} else {
				e.NewStreetDistrictMap[strvtown] = append(e.NewStreetDistrictMap[strvtown], d)
			}
			// Villages / neighbourhood committees (disabled).
			//jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown]
			//for _, vjwh := range jwhs {
			//	strvillage := qu.ObjToString(vjwh["village"])
			//	e.Trie_Full_Community.AddWords(strvillage) // add to the full-name committee/village trie
			//	cttmp := e.CommunityDistrictMap[strvillage]
			//	if len(cttmp) == 0 {
			//		tmpdarr := []*District{d}
			//		e.CommunityDistrictMap[strvillage] = tmpdarr
			//	} else {
			//		e.CommunityDistrictMap[strvillage] = append(e.CommunityDistrictMap[strvillage], d)
			//	}
			//}
		}
	}
}
  1191. func (e *ExtractTask) InitVar() {
  1192. defer qu.Catch()
  1193. //初始化Trie
  1194. //全称
  1195. e.Trie_Full_Province = &ju.Trie{}
  1196. e.Trie_Full_City = &ju.Trie{}
  1197. e.Trie_Full_District = &ju.Trie{}
  1198. e.Trie_Full_Street = &ju.Trie{}
  1199. e.Trie_Full_Community = &ju.Trie{}
  1200. //简称
  1201. e.Trie_Sim_Province = &ju.Trie{}
  1202. e.Trie_Sim_City = &ju.Trie{}
  1203. e.Trie_Sim_District = &ju.Trie{}
  1204. //初始化分词
  1205. e.Seg_PCD = &gse.Segmenter{}
  1206. e.Seg_SV = &gse.Segmenter{}
  1207. e.Seg_PCD.LoadDict("./res/pcd.txt")
  1208. e.Seg_SV.LoadDict("./res/sv.txt")
  1209. //初始化map
  1210. if e.SiteCityMap == nil {
  1211. e.SiteCityMap = make(map[string]*SiteCity)
  1212. }
  1213. if e.ProvinceMap == nil {
  1214. e.ProvinceMap = make(map[string]string)
  1215. }
  1216. if e.CityMap == nil {
  1217. e.CityMap = make(map[string]string)
  1218. }
  1219. if e.DistrictSimAndAll == nil {
  1220. e.DistrictSimAndAll = make(map[string]string)
  1221. }
  1222. if e.NewDistrictSimAndAll == nil {
  1223. e.NewDistrictSimAndAll = make(map[string][]map[string]*City)
  1224. }
  1225. if e.CityBriefMap == nil {
  1226. e.CityBriefMap = make(map[string]*City)
  1227. }
  1228. if e.CityFullMap == nil {
  1229. e.CityFullMap = make(map[string]*City)
  1230. }
  1231. if e.ProvinceBriefMap == nil {
  1232. e.ProvinceBriefMap = make(map[string]*Province)
  1233. }
  1234. if e.NewDistrictCityMap == nil {
  1235. e.NewDistrictCityMap = make(map[string][]*City)
  1236. }
  1237. if e.NewStreetDistrictMap == nil {
  1238. e.NewStreetDistrictMap = make(map[string][]*District)
  1239. }
  1240. if e.CommunityDistrictMap == nil {
  1241. e.CommunityDistrictMap = make(map[string][]*District)
  1242. }
  1243. }
  1244. //初始化邮编库
  1245. func (e *ExtractTask) InitPostCode() {
  1246. defer qu.Catch()
  1247. if e.PostCodeMap == nil {
  1248. e.PostCodeMap = make(map[string]*PostCode)
  1249. }
  1250. list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
  1251. for _, l := range *list {
  1252. pc := &PostCode{}
  1253. pc.Code = qu.ObjToString(l["code"])
  1254. pc.P = qu.ObjToString(l["province"])
  1255. pc.C = qu.ObjToString(l["city"])
  1256. pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
  1257. e.PostCodeMap[pc.Code] = pc
  1258. }
  1259. }
  1260. //初始化区号库
  1261. func (e *ExtractTask) InitAreaCode() {
  1262. defer qu.Catch()
  1263. if e.AreaCodeMap == nil {
  1264. e.AreaCodeMap = make(map[string]*AreaCode)
  1265. }
  1266. list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
  1267. for _, l := range *list {
  1268. ac := &AreaCode{}
  1269. ac.Code = qu.ObjToString(l["code"])
  1270. ac.P = qu.ObjToString(l["province"])
  1271. ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
  1272. e.AreaCodeMap[ac.Code] = ac
  1273. }
  1274. }
  1275. //保存抽取详情数据
  1276. func (e *ExtractTask) ResultSave(init bool) {
  1277. defer qu.Catch()
  1278. e.RWMutex.Lock()
  1279. if e.ResultArr == nil {
  1280. e.ResultArr = [][]map[string]interface{}{}
  1281. }
  1282. e.RWMutex.Unlock()
  1283. if init {
  1284. go func() {
  1285. for {
  1286. e.RWMutex.Lock()
  1287. if len(e.ResultArr) > saveLimit {
  1288. arr := e.ResultArr[:saveLimit]
  1289. e.ResultArr = e.ResultArr[saveLimit:]
  1290. e.RWMutex.Unlock()
  1291. qu.Try(func() {
  1292. db.Mgo.UpSertBulk("extract_result", arr...)
  1293. }, func(err interface{}) {
  1294. log.Debug(err)
  1295. })
  1296. } else {
  1297. arr := e.ResultArr
  1298. e.ResultArr = [][]map[string]interface{}{}
  1299. e.RWMutex.Unlock()
  1300. qu.Try(func() {
  1301. db.Mgo.UpSertBulk("extract_result", arr...)
  1302. }, func(err interface{}) {
  1303. log.Debug(err)
  1304. })
  1305. }
  1306. time.Sleep(2 * time.Second)
  1307. }
  1308. }()
  1309. } else {
  1310. e.RWMutex.Lock()
  1311. arr := e.ResultArr
  1312. e.ResultArr = [][]map[string]interface{}{}
  1313. e.RWMutex.Unlock()
  1314. qu.Try(func() {
  1315. lenarr := len(arr)
  1316. for {
  1317. if lenarr > saveLimit {
  1318. arr2 := arr[:saveLimit]
  1319. arr = arr[saveLimit:]
  1320. lenarr = len(arr)
  1321. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1322. } else {
  1323. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1324. break
  1325. }
  1326. }
  1327. }, func(err interface{}) {
  1328. defer e.RWMutex.Unlock()
  1329. log.Debug(err)
  1330. })
  1331. }
  1332. }
  1333. //保存抽取数据
  1334. func (e *ExtractTask) BidSave(init bool) {
  1335. defer qu.Catch()
  1336. e.RWMutex.Lock()
  1337. if e.BidArr == nil {
  1338. e.BidArr = [][]map[string]interface{}{}
  1339. }
  1340. e.RWMutex.Unlock()
  1341. if init {
  1342. go func() {
  1343. for {
  1344. e.RWMutex.Lock()
  1345. if len(e.BidArr) > saveLimit {
  1346. arr := e.BidArr[:saveLimit]
  1347. e.BidArr = e.BidArr[saveLimit:]
  1348. e.RWMutex.Unlock()
  1349. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1350. qu.Try(func() {
  1351. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1352. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1353. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1354. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1355. }, func(err interface{}) {
  1356. log.Debug(err)
  1357. })
  1358. } else {
  1359. arr := e.BidArr
  1360. e.BidArr = [][]map[string]interface{}{}
  1361. e.RWMutex.Unlock()
  1362. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1363. qu.Try(func() {
  1364. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1365. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1366. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1367. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1368. }, func(err interface{}) {
  1369. log.Debug(err)
  1370. })
  1371. }
  1372. time.Sleep(2 * time.Second)
  1373. }
  1374. }()
  1375. } else {
  1376. e.RWMutex.Lock()
  1377. arr := e.BidArr
  1378. e.BidArr = [][]map[string]interface{}{}
  1379. e.RWMutex.Unlock()
  1380. qu.Try(func() {
  1381. lenarr := len(arr)
  1382. for {
  1383. if lenarr > saveLimit {
  1384. arr2 := arr[:saveLimit]
  1385. arr = arr[saveLimit:]
  1386. lenarr = len(arr)
  1387. arr2, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr2)
  1388. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
  1389. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1390. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1391. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1392. } else {
  1393. arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
  1394. e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
  1395. e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
  1396. e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
  1397. e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
  1398. break
  1399. }
  1400. }
  1401. }, func(err interface{}) {
  1402. log.Debug(err)
  1403. })
  1404. time.Sleep(1 * time.Second)
  1405. }
  1406. }
  1407. func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) {
  1408. arr = [][]map[string]interface{}{}
  1409. blocks = []map[string]interface{}{}
  1410. fieldalls = []map[string]interface{}{}
  1411. fieldallsf = []map[string]interface{}{}
  1412. for _, v := range a {
  1413. _id, _ := v[0]["_id"]
  1414. if tmp, ok := v[1]["$set"].(map[string]interface{}); ok {
  1415. if ju.SaveBlock {
  1416. if tmp["blocks"] != nil {
  1417. block := map[string]interface{}{
  1418. "_id": _id,
  1419. "blocks": tmp["blocks"],
  1420. }
  1421. blocks = append(blocks, block)
  1422. }
  1423. }
  1424. delete(tmp, "blocks")
  1425. if ju.FieldsFind {
  1426. if f, ok := tmp["fieldall"].(map[string][]map[string]interface{}); ok {
  1427. fieldall := map[string]interface{}{
  1428. "_id": _id,
  1429. }
  1430. for k, v := range f {
  1431. fieldall[k] = v
  1432. }
  1433. fieldalls = append(fieldalls, fieldall)
  1434. }
  1435. if ff, ok := tmp["fieldallf"].(map[string][]map[string]interface{}); ok {
  1436. fieldallf := map[string]interface{}{
  1437. "_id": _id,
  1438. }
  1439. for k, v := range ff {
  1440. fieldallf[k] = v
  1441. }
  1442. fieldallsf = append(fieldalls, fieldallf)
  1443. }
  1444. }
  1445. delete(tmp, "fieldall")
  1446. delete(tmp, "fieldallf")
  1447. v[1] = tmp
  1448. }
  1449. arr = append(arr, v)
  1450. }
  1451. return arr, blocks, fieldalls, fieldallsf
  1452. }
  1453. func (e *ExtractTask) InitAuditRecogField() {
  1454. defer qu.Catch()
  1455. e.RecogFieldMap = make(map[string]map[string]interface{})
  1456. recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
  1457. for _, f := range *recogFieldList {
  1458. field := qu.ObjToString(f["s_recogfield"])
  1459. e.RecogFieldMap[field] = f
  1460. }
  1461. }
  1462. func (e *ExtractTask) InitAuditClass() {
  1463. defer qu.Catch()
  1464. e.FidClassMap = make(map[string][]map[string]interface{})
  1465. class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1466. for _, c := range *class {
  1467. classList := []map[string]interface{}{}
  1468. fid := qu.ObjToString(c["s_fid"])
  1469. if len(e.FidClassMap[fid]) > 0 { //追加
  1470. classList = e.FidClassMap[fid]
  1471. }
  1472. classList = append(classList, c)
  1473. e.FidClassMap[fid] = classList
  1474. }
  1475. }
  1476. //加载规则
  1477. func (e *ExtractTask) InitAuditRule() {
  1478. defer qu.Catch()
  1479. var rureg *regexp.Regexp
  1480. var rs []rune
  1481. var ru string
  1482. var err error
  1483. e.CidRuleMap = make(map[string][]map[string]interface{})
  1484. rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
  1485. for _, v := range *rule {
  1486. i_rule := []interface{}{}
  1487. ss, _ := (v["s_rule"].([]interface{}))
  1488. for _, r := range qu.ObjArrToStringArr(ss) {
  1489. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  1490. rs = []rune(r)
  1491. ru = string(rs[1 : len(rs)-1])
  1492. rureg, err = regexp.Compile(ru)
  1493. if err != nil {
  1494. log.Debug("error---rule:", r)
  1495. continue
  1496. }
  1497. i_rule = append(i_rule, []interface{}{rureg}...)
  1498. } else { //规则
  1499. i_rule = append(i_rule, r)
  1500. }
  1501. }
  1502. v["rule"] = i_rule
  1503. ruleList := []map[string]interface{}{}
  1504. classid := qu.ObjToString(v["s_classid"])
  1505. if len(e.CidRuleMap[classid]) > 0 { //追加
  1506. ruleList = e.CidRuleMap[classid]
  1507. }
  1508. ruleList = append(ruleList, v)
  1509. e.CidRuleMap[classid] = ruleList
  1510. }
  1511. }
  1512. //
  1513. func (e *ExtractTask) InitAuditFields() {
  1514. if len(e.AuditFields) == 0 {
  1515. v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
  1516. if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
  1517. vid := qu.BsonIdToSId((*v)["_id"])
  1518. query := map[string]interface{}{
  1519. "isaudit": true,
  1520. "delete": false,
  1521. "vid": vid,
  1522. }
  1523. data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
  1524. for _, d := range *data {
  1525. field := qu.ObjToString(d["s_field"])
  1526. e.AuditFields = append(e.AuditFields, field)
  1527. }
  1528. }
  1529. }
  1530. }
  1531. //加载附件抽取
  1532. func (e *ExtractTask) InitFile() {
  1533. defer qu.Catch()
  1534. //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
  1535. ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
  1536. //ve, _ := db.Mgo.FindOne("version", query)
  1537. if ve == nil {
  1538. return
  1539. }
  1540. if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
  1541. e.IsFileField = true
  1542. }
  1543. syscefiled := new(sync.Map)
  1544. if (*ve)["s_filefileds"] != nil {
  1545. for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
  1546. syscefiled.Store(vff.(string), 1)
  1547. }
  1548. }
  1549. e.FileFields = syscefiled
  1550. ju.InitOss(ju.Config["istest"].(bool))
  1551. }
  1552. //加载清理任务信息
  1553. func (c *ClearTask) InitClearTaskInfo() {
  1554. cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
  1555. if len(*cleartask) > 1 {
  1556. v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
  1557. c.ClearTaskInfo = &ClearTaskInfo{
  1558. Name: (*cleartask)["s_taskname"].(string),
  1559. Version: (*cleartask)["s_version"].(string),
  1560. VersionId: qu.BsonIdToSId((*v)["_id"]),
  1561. FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
  1562. FromDB: (*cleartask)["s_mgodb"].(string),
  1563. FromColl: (*cleartask)["s_mgocoll"].(string),
  1564. IsCltLog: ju.Config["iscltlog"].(bool),
  1565. ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
  1566. }
  1567. log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
  1568. } else {
  1569. return
  1570. }
  1571. }
  1572. //加载清理脚本
  1573. func (c *ClearTask) InitClearLuas() {
  1574. defer qu.Catch()
  1575. c.ClearLuas = make(map[string][]*ClearLua)
  1576. list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
  1577. for _, l := range *list {
  1578. if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
  1579. continue
  1580. }
  1581. s_field := qu.ObjToString(l["s_field"])
  1582. pid := qu.BsonIdToSId(l["_id"])
  1583. luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
  1584. for _, vv := range *luas {
  1585. if b, _ := vv["isuse"].(bool); !b {
  1586. continue
  1587. }
  1588. clearLua := &ClearLua{
  1589. Field: s_field,
  1590. Code: vv["s_code"].(string),
  1591. Name: vv["s_name"].(string),
  1592. LuaText: vv["s_luascript"].(string),
  1593. LFields: getALLFields(),
  1594. }
  1595. c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
  1596. }
  1597. }
  1598. }
  1599. //加载分块规则
  1600. func (e *ExtractTask) InitBlockRule() {
  1601. datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
  1602. "vid": e.TaskInfo.VersionId,
  1603. "delete": false,
  1604. }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
  1605. brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
  1606. for _, v := range *datas {
  1607. block_reg, _ := v["block_reg"].(string)
  1608. block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
  1609. title_reg, _ := v["title_reg"].(string)
  1610. title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
  1611. if block_reg == "" || title_reg == "" {
  1612. continue
  1613. }
  1614. b_reg, b_err := regexp.Compile(block_reg)
  1615. t_reg, t_err := regexp.Compile(title_reg)
  1616. if b_err != nil || t_err != nil {
  1617. continue
  1618. }
  1619. brs = append(brs, b_reg)
  1620. trs = append(trs, t_reg)
  1621. }
  1622. e.RuleBlock = &ju.RuleBlock{
  1623. BlockRegs: brs,
  1624. TitleRegs: trs,
  1625. Classify: e.InitBlockClassify(),
  1626. }
  1627. }
  1628. //加载分块规则
  1629. func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
  1630. classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
  1631. "vid": e.TaskInfo.VersionId,
  1632. "delete": false,
  1633. }, nil, `{"name":1}`, false, -1, -1)
  1634. classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
  1635. "vid": e.TaskInfo.VersionId,
  1636. "delete": false,
  1637. }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
  1638. classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
  1639. "vid": e.TaskInfo.VersionId,
  1640. "delete": false,
  1641. }, nil, `{"name":1,"pid":1}`, false, -1, -1)
  1642. tag_map := map[string]ju.Tags{}
  1643. for _, v := range *classify_tag {
  1644. pid := qu.ObjToString(v["pid"])
  1645. name := qu.ObjToString(v["name"])
  1646. tag := &ju.Tag{Value: name}
  1647. if strings.HasPrefix(name, "reg__") {
  1648. tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__"))
  1649. }
  1650. tag_map[pid] = append(tag_map[pid], tag)
  1651. }
  1652. //
  1653. info_map := map[string][]*ju.NameCode{}
  1654. info_tag := map[string]*ju.TagFile{}
  1655. for _, v := range *classify_info {
  1656. pid := qu.ObjToString(v["pid"])
  1657. _id := qu.BsonIdToSId(v["_id"])
  1658. name := qu.ObjToString(v["name"])
  1659. info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
  1660. info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
  1661. }
  1662. classify_map := map[string][]*ju.NameCode{}
  1663. for _, v := range *classify {
  1664. _id := qu.BsonIdToSId(v["_id"])
  1665. if info_map[_id] == nil {
  1666. continue
  1667. }
  1668. for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
  1669. classify_map[vv] = append(classify_map[vv], info_map[_id]...)
  1670. }
  1671. }
  1672. return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
  1673. }