|
@@ -1,1361 +0,0 @@
|
|
|
-package main
|
|
|
-
|
|
|
-import (
|
|
|
- "encoding/json"
|
|
|
- "errors"
|
|
|
- "log"
|
|
|
- mongo "qfw/mongodb"
|
|
|
- qu "qfw/util"
|
|
|
- "qfw/util/elastic"
|
|
|
- "regexp"
|
|
|
- "strings"
|
|
|
- "sync"
|
|
|
- "sync/atomic"
|
|
|
- "time"
|
|
|
-
|
|
|
- //"gopkg.in/mgo.v2/bson"
|
|
|
- es "gopkg.in/olivere/elastic.v1"
|
|
|
-)
|
|
|
-
|
|
|
var (
	// task_export_matchtype maps the match-mode codes used in the rule
	// configuration to the document field the words are matched against.
	task_export_matchtype = map[string]interface{}{
		"1": "title",
		"2": "detail",
		"3": "purchasing",
		"4": "filetext",
		"5": "projectname",
		"6": "buyer",
		"7": "s_winner",
	}

	// LetterCase reports whether a string contains at least one ASCII letter.
	LetterCase = regexp.MustCompile(`[A-Za-z]`)

	// FilteReg matches runs of parentheses and braces.
	// NOTE(review): the character class lists "(" and ")" twice; presumably
	// one pair was meant to be the full-width form - confirm against the
	// upstream file.
	FilteReg = regexp.MustCompile(`[()(){}]*`)
)
|
|
|
-var TaskList = make(map[string]*Task) //存储启动任务
|
|
|
-var TaskListLock = &sync.Mutex{}
|
|
|
var (
	// FieldsMap is the field projection used when reading documents from MongoDB.
	FieldsMap = map[string]interface{}{
		"title":   1,
		"detail":  1,
		"tagname": 1,
		"_id":     1,
	}

	// FieldsArr lists the same fields; it is serialised into the "_source"
	// clause of Elasticsearch queries.
	FieldsArr = []string{"title", "detail", "tagname", "_id"}

	// EOS is a sentinel error whose text is "EOS".
	EOS = errors.New("EOS")
)
|
|
|
-
|
|
|
-//任务模型
|
|
|
-type Task struct {
|
|
|
- //任务信息
|
|
|
- Id string //任务id
|
|
|
- StartId string //起始id
|
|
|
- From string //数据出处(es mongodb)
|
|
|
- //To string //数据更新去处(es mongodb)
|
|
|
- Index string //es index
|
|
|
- Itype string //es type
|
|
|
- MgoDb string //mgo db
|
|
|
- MgoColl string //mgo coll
|
|
|
- PRules []*PRule //任务相关规则(对数据打标签)
|
|
|
- IsRun bool //是否运行
|
|
|
- IsIndex bool //是否同步es
|
|
|
- IsClear bool //是否清理原有标签
|
|
|
- AllTagField map[string]bool //所有标签
|
|
|
- AllPreField map[string]bool //所有父标签
|
|
|
- //存储相关
|
|
|
- Wg *sync.WaitGroup
|
|
|
- Lock *sync.Mutex
|
|
|
- Mgo *mongo.MongodbSim //mgo
|
|
|
- Es *elastic.Elastic //es
|
|
|
- DataChan chan bool //
|
|
|
- EsUpdateCache chan []map[string]interface{} //es更新集合
|
|
|
- EsOver chan bool //es结束标志
|
|
|
- SP chan bool //批量更新时的线程控制
|
|
|
- //MgoUpdataCache chan []map[string]interface{} //mgo更新集合
|
|
|
-
|
|
|
-}
|
|
|
-
|
|
|
-type PRule struct {
|
|
|
- GNW *NotWord //全局排除词
|
|
|
- GAW *GlobalWord //全局附加词
|
|
|
- Rule []*Rule //规则
|
|
|
- TagField string //标签属性
|
|
|
- TagName string //标签名称
|
|
|
- PreField string //父标签属性
|
|
|
- PreFieldName string //父标签名称(可为空)
|
|
|
- Id string
|
|
|
-}
|
|
|
-
|
|
|
-//规则
|
|
|
-type Rule struct {
|
|
|
- NW *NotWord //排除词
|
|
|
- KW *KeyWord //关键词
|
|
|
- AW *AddWord //附加词
|
|
|
-}
|
|
|
-
|
|
|
// KeyWord describes the keywords of a rule.
type KeyWord struct {
	// KeyReg holds one compiled pattern per keyword.
	KeyReg []*regexp.Regexp
	// MatchType lists the document fields the keywords are matched against.
	MatchType []string
	// KeyWordMap marks, by index into KeyReg, the patterns that contain letters.
	KeyWordMap map[int]bool
}
|
|
|
-
|
|
|
// AddWord describes the additional words of a rule.
type AddWord struct {
	// KeyReg holds one compiled pattern per additional word.
	KeyReg []*regexp.Regexp
	// MatchType lists the document fields the additional words are matched against.
	MatchType []string
	// AddWordMap marks, by index into KeyReg, the patterns that contain letters.
	AddWordMap map[int]bool
}
|
|
|
-
|
|
|
// NotWord describes exclusion words.
type NotWord struct {
	// KeyReg holds the compiled patterns in two groups. For the configuration
	// "excl1&&excl2,excl3", KeyReg[0] stores excl3 and KeyReg[1] stores
	// excl1 and excl2.
	KeyReg [][]*regexp.Regexp
	// MatchType lists the document fields the exclusion words are matched against.
	MatchType []string
	// NotWordMap marks, per group and by index, the patterns that contain letters.
	NotWordMap []map[int]bool
}
|
|
|
-
|
|
|
// GlobalWord describes global additional words.
type GlobalWord struct {
	// KeyReg holds the compiled patterns in two groups. For the configuration
	// "add1&&add2,add3", KeyReg[0] stores add3 and KeyReg[1] stores add1 and
	// add2.
	KeyReg [][]*regexp.Regexp
	// MatchType lists the document fields the words are matched against.
	MatchType []string
	// MatchWordMap marks, per group and by index, the patterns that contain letters.
	MatchWordMap []map[int]bool
}
|
|
|
-
|
|
|
-//结束任务
|
|
|
-func EndTask(taskid string) {
|
|
|
- defer qu.Catch()
|
|
|
- TaskListLock.Lock()
|
|
|
- delete(TaskList, taskid)
|
|
|
- log.Println("Delete TaskList:", taskid)
|
|
|
- TaskListLock.Unlock()
|
|
|
-}
|
|
|
-
|
|
|
-//开始任务
|
|
|
-func StartTask(taskid string) {
|
|
|
- defer qu.Catch()
|
|
|
- t := &Task{}
|
|
|
- t.InitTask(taskid)
|
|
|
- if len(t.PRules) == 0 { //判断有没有Rules
|
|
|
- log.Println(t.Id, "无启用规则")
|
|
|
- return
|
|
|
- }
|
|
|
- //t.IsRun = true //更新任务状态
|
|
|
- //qu.Debug(t.Id, t.From, t.Index, t.Itype, t.IsRun, t.IsIndex)
|
|
|
- TaskListLock.Lock()
|
|
|
- TaskList[taskid] = t //加入map
|
|
|
- TaskListLock.Unlock()
|
|
|
- if t.From == "mongodb" {
|
|
|
- t.RunMgo() //增量
|
|
|
- //go t.UpdateMgo() //开启mgo保存
|
|
|
- } else {
|
|
|
- go t.UpdateEs() //开启es保存
|
|
|
- t.RunEs()
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//mgo增量
|
|
|
-func (t *Task) RunMgo() {
|
|
|
- defer qu.Catch()
|
|
|
- log.Println("Task Id:", t.Id, "Start...")
|
|
|
- TaskListLock.Lock()
|
|
|
- tmpTask := TaskList[t.Id]
|
|
|
- TaskListLock.Unlock()
|
|
|
- if tmpTask == nil {
|
|
|
- log.Println("Task Id:", t.Id, "Over...")
|
|
|
- t.Mgo.Destory() //销毁连接池
|
|
|
- return
|
|
|
- }
|
|
|
- //oldId := t.StartId //记录起始id
|
|
|
- queryOne := map[string]interface{}{
|
|
|
- "_id": map[string]interface{}{
|
|
|
- "$gte": qu.StringTOBsonId(t.StartId),
|
|
|
- },
|
|
|
- }
|
|
|
- data, _ := t.Mgo.Find(t.MgoColl, queryOne, `{"_id":-1}`, nil, false, 0, 1) //找最后一条数据
|
|
|
- endId := qu.BsonIdToSId((*data)[0]["_id"])
|
|
|
- if endId <= t.StartId { //判断id
|
|
|
- log.Println("Id区间有误:", t.StartId, endId)
|
|
|
- time.AfterFunc(time.Minute*5, t.RunMgo)
|
|
|
- return
|
|
|
- }
|
|
|
- sid := t.StartId
|
|
|
- query := map[string]interface{}{
|
|
|
- "_id": map[string]interface{}{
|
|
|
- "$gt": qu.StringTOBsonId(sid),
|
|
|
- "$lte": qu.StringTOBsonId(endId),
|
|
|
- },
|
|
|
- }
|
|
|
- sess := t.Mgo.GetMgoConn()
|
|
|
- defer t.Mgo.DestoryMongoConn(sess)
|
|
|
- count, _ := sess.DB(t.MgoDb).C(t.MgoColl).Find(&query).Count()
|
|
|
- log.Println("查询语句:", query, "查询总数:", count)
|
|
|
- it := sess.DB(t.MgoDb).C(t.MgoColl).Find(&query).Select(FieldsMap).Sort("_id").Iter()
|
|
|
-
|
|
|
- t.Wg = &sync.WaitGroup{}
|
|
|
- t.Lock = &sync.Mutex{}
|
|
|
- update := [][]map[string]interface{}{}
|
|
|
- //遍历
|
|
|
- index := 0
|
|
|
- n := int64(0)
|
|
|
- for tmp := map[string]interface{}{}; it.Next(&tmp); index++ {
|
|
|
- if index%500 == 0 {
|
|
|
- log.Println("current:", index)
|
|
|
- }
|
|
|
- tid := qu.BsonIdToSId(tmp["_id"])
|
|
|
- t.Wg.Add(1)
|
|
|
- t.DataChan <- true
|
|
|
- go func(tmp map[string]interface{}) {
|
|
|
- defer func() {
|
|
|
- <-t.DataChan
|
|
|
- t.Wg.Done()
|
|
|
- }()
|
|
|
- tmpTagNameMap := map[string][]string{} //记录标签
|
|
|
- tmpPreTagMap := map[string][]string{} //记录父标签
|
|
|
- GetTags(t.PRules, tmp, tmpTagNameMap, tmpPreTagMap)
|
|
|
- for _, ru := range t.PRules {
|
|
|
- //全局排除词
|
|
|
- IsMatchGNotKey := RegMatch(tmp, ru.GNW.MatchType, ru.GNW.KeyReg, ru.GNW.NotWordMap)
|
|
|
- if IsMatchGNotKey { //全局排除词匹配成功
|
|
|
- continue
|
|
|
- } else {
|
|
|
- //全局附加词
|
|
|
- IsMatchGAddKey := RegMatch(tmp, ru.GAW.MatchType, ru.GAW.KeyReg, ru.GAW.MatchWordMap)
|
|
|
- if !IsMatchGAddKey && len(ru.GAW.MatchType) != 0 { //全局附加词没有匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
- L:
|
|
|
- for _, r := range ru.Rule {
|
|
|
- //排除词
|
|
|
- IsMatchNotKey := RegMatch(tmp, r.NW.MatchType, r.NW.KeyReg, r.NW.NotWordMap)
|
|
|
- if IsMatchNotKey { //排除词匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- //L:
|
|
|
- //关键词匹配
|
|
|
- for _, kwm := range r.KW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[kwm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for i, kw_reg := range r.KW.KeyReg {
|
|
|
- IsContinue := false
|
|
|
- if kw_indexArr := kw_reg.FindAllStringIndex(text, -1); len(kw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.KW.KeyWordMap[i] && CheckLetter(text, kw_reg, kw_indexArr) { //kw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- IsContinue = true
|
|
|
- } else if !r.KW.KeyWordMap[i] {
|
|
|
- IsContinue = true
|
|
|
- }
|
|
|
- }
|
|
|
- if IsContinue { //关键词匹配成功,匹配附加词
|
|
|
- if len(r.AW.KeyReg) == 0 { //无附加词
|
|
|
- //tmpTagNameMap[r.TagName] = true
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else {
|
|
|
- for _, awm := range r.AW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[awm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for j, aw_reg := range r.AW.KeyReg {
|
|
|
- if aw_indexArr := aw_reg.FindAllStringIndex(text, -1); len(aw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.AW.AddWordMap[j] && CheckLetter(text, aw_reg, aw_indexArr) { //aw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else if !r.AW.AddWordMap[j] {
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- }
|
|
|
- }
|
|
|
- // if aw_reg.MatchString(text) { //附加词匹配成功
|
|
|
- // tmpTagName[r.TagName] = true
|
|
|
- // break L
|
|
|
- // }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- clearMap := map[string]interface{}{}
|
|
|
- if t.IsClear { //清理
|
|
|
- ClearData(clearMap, t.AllTagField, tmpTagNameMap)
|
|
|
- ClearData(clearMap, t.AllPreField, tmpPreTagMap)
|
|
|
- } else {
|
|
|
- for tf, tn := range tmpTagNameMap { //将原有标签汇总
|
|
|
- tmpTagName := qu.ObjToString(tmp[tf])
|
|
|
- tn = append(tn, strings.Split(tmpTagName, ",")...)
|
|
|
- tmpTagNameMap[tf] = tn
|
|
|
- }
|
|
|
- for pt, pn := range tmpPreTagMap { //将原有父标签汇总
|
|
|
- preField := qu.ObjToString(tmp[pt])
|
|
|
- pn = append(pn, strings.Split(preField, ",")...)
|
|
|
- tmpPreTagMap[pt] = pn
|
|
|
- }
|
|
|
- }
|
|
|
- addMap := map[string]interface{}{}
|
|
|
- if len(tmpTagNameMap) > 0 { //有新标签或者历史标签
|
|
|
- atomic.AddInt64(&n, +1) //n++ 计数
|
|
|
- AddData(addMap, tmpPreTagMap)
|
|
|
- AddData(addMap, tmpTagNameMap)
|
|
|
- }
|
|
|
- t.Lock.Lock()
|
|
|
- idAndSet := []map[string]interface{}{}
|
|
|
- _id := map[string]interface{}{
|
|
|
- "_id": tmp["_id"],
|
|
|
- }
|
|
|
- set := map[string]interface{}{}
|
|
|
- if len(addMap) > 0 {
|
|
|
- set["$set"] = addMap
|
|
|
- }
|
|
|
- if len(clearMap) > 0 {
|
|
|
- set["$unset"] = clearMap
|
|
|
- }
|
|
|
- if len(set) > 0 {
|
|
|
- idAndSet = append(idAndSet, _id) //第一个为查询条件
|
|
|
- idAndSet = append(idAndSet, set) //第二个为更新内容
|
|
|
- update = append(update, idAndSet)
|
|
|
- }
|
|
|
- if len(update) > 500 {
|
|
|
- t.Mgo.UpdateBulk(t.MgoColl, update...)
|
|
|
- update = [][]map[string]interface{}{} //更新后把数据置空
|
|
|
- }
|
|
|
- t.Lock.Unlock()
|
|
|
- }(tmp)
|
|
|
- if tid > t.StartId {
|
|
|
- t.StartId = tid
|
|
|
- }
|
|
|
- tmp = map[string]interface{}{}
|
|
|
- }
|
|
|
- t.Wg.Wait()
|
|
|
- t.Lock.Lock()
|
|
|
- if len(update) > 0 {
|
|
|
- t.Mgo.UpdateBulk(t.MgoColl, update...)
|
|
|
- update = [][]map[string]interface{}{} //更新后把数据置空
|
|
|
- }
|
|
|
- t.Lock.Unlock()
|
|
|
- log.Println("Update Count:", n)
|
|
|
- //更新起始id
|
|
|
- setid := map[string]interface{}{
|
|
|
- "$set": map[string]interface{}{
|
|
|
- "s_startid": t.StartId,
|
|
|
- },
|
|
|
- }
|
|
|
- Mgo.Update("taskinfo", `{"_id":"`+t.Id+`"}`, setid, false, false)
|
|
|
- //是否同步es
|
|
|
- if t.IsIndex {
|
|
|
- go processEs(sid, endId)
|
|
|
- }
|
|
|
- time.AfterFunc(time.Minute*5, t.RunMgo)
|
|
|
-}
|
|
|
-
|
|
|
-func (t *Task) RunEs() {
|
|
|
- defer qu.Catch()
|
|
|
- log.Println("Task Id:", t.Id, "Start...")
|
|
|
- TaskListLock.Lock()
|
|
|
- tmpTask := TaskList[t.Id]
|
|
|
- TaskListLock.Unlock()
|
|
|
- if tmpTask == nil {
|
|
|
- log.Println("Task Id:", t.Id, "Over...")
|
|
|
- t.EsOver <- true //停止保存线程
|
|
|
- t.Es.Close() //关闭连接池
|
|
|
- return
|
|
|
- }
|
|
|
-
|
|
|
- client := t.Es.GetEsConn()
|
|
|
- defer t.Es.DestoryEsConn(client)
|
|
|
- //查询条件
|
|
|
- source, _ := json.Marshal(FieldsArr)
|
|
|
- esquery := `{"query": {"bool": {"must": [{"range": { "id": {"gt": "` + t.StartId + `"}}}]}},"from":0,"size":1,"sort": [{"comeintime": "desc"}],"_source":` + string(source) + `}`
|
|
|
- list := t.Es.Get(t.Index, t.Itype, esquery) //comeintime倒叙,找最后一条数据
|
|
|
- if len(*list) == 0 {
|
|
|
- log.Println("大于Id:", t.StartId, "暂时无数据")
|
|
|
- time.AfterFunc(time.Minute*5, t.RunEs)
|
|
|
- return
|
|
|
- }
|
|
|
- endId := qu.ObjToString((*list)[0]["_id"]) //结束id
|
|
|
- // if endId <= t.StartId { //判断id
|
|
|
- // log.Println("Id区间有误:", t.StartId, endId)
|
|
|
- // time.AfterFunc(time.Second*10, t.RunEs)
|
|
|
- // return
|
|
|
- // }
|
|
|
- escount := t.Es.Count(t.Index, t.Itype, esquery)
|
|
|
- log.Println("查询语句:", esquery, "查询总数:", escount, "结束Id:", endId)
|
|
|
- //查询条件类型转换
|
|
|
- var q es.Query
|
|
|
- tmpQuery := es.BoolQuery{
|
|
|
- QueryStrings: esquery,
|
|
|
- }
|
|
|
- q = tmpQuery
|
|
|
-
|
|
|
- //游标查询,index不支持别名,只能写索引库的名称
|
|
|
- res, err := client.Scroll(t.Index).Query(q).Size(1000).Do() //查询一条获取游标
|
|
|
- if err == nil {
|
|
|
- numDocs := 0
|
|
|
- numTags := int64(0)
|
|
|
- scrollId := res.ScrollId
|
|
|
- t.Wg = &sync.WaitGroup{}
|
|
|
- for {
|
|
|
- if scrollId == "" {
|
|
|
- log.Println("ScrollId Is Error")
|
|
|
- break
|
|
|
- }
|
|
|
- searchResult, err := client.Scroll(t.Index).Size(1000).ScrollId(scrollId).Do() //查询
|
|
|
- if err != nil {
|
|
|
- if err.Error() == "EOS" { //迭代完毕
|
|
|
- log.Println("Es Search Data Over:", err)
|
|
|
- } else {
|
|
|
- log.Println("Es Search Data Error:", err)
|
|
|
- }
|
|
|
- break
|
|
|
- }
|
|
|
-
|
|
|
- for _, hit := range searchResult.Hits.Hits {
|
|
|
- //开始处理数据
|
|
|
- t.Wg.Add(1)
|
|
|
- t.DataChan <- true
|
|
|
- go func(tmpHit *es.SearchHit) {
|
|
|
- defer func() {
|
|
|
- <-t.DataChan
|
|
|
- t.Wg.Done()
|
|
|
- }()
|
|
|
- tmp := make(map[string]interface{})
|
|
|
- if json.Unmarshal(*tmpHit.Source, &tmp) == nil {
|
|
|
- tmpTagNameMap := map[string][]string{} //记录标签
|
|
|
- tmpPreTagMap := map[string][]string{} //记录父标签
|
|
|
- GetTags(t.PRules, tmp, tmpTagNameMap, tmpPreTagMap)
|
|
|
- // for _, ru := range t.PRules {
|
|
|
- // //全局排除词
|
|
|
- // IsMatchGNotKey := RegMatch(tmp, ru.GNW.MatchType, ru.GNW.KeyReg, ru.GNW.NotWordMap)
|
|
|
- // if IsMatchGNotKey { //全局排除词匹配成功
|
|
|
- // continue
|
|
|
- // } else {
|
|
|
- // //全局附加词
|
|
|
- // IsMatchGAddKey := RegMatch(tmp, ru.GAW.MatchType, ru.GAW.KeyReg, ru.GAW.MatchWordMap)
|
|
|
- // if !IsMatchGAddKey && len(ru.GAW.MatchType) != 0 { //全局附加词没有匹配成功
|
|
|
- // continue
|
|
|
- // }
|
|
|
- // }
|
|
|
- // L:
|
|
|
- // for _, r := range ru.Rule {
|
|
|
- // //排除词
|
|
|
- // IsMatchNotKey := RegMatch(tmp, r.NW.MatchType, r.NW.KeyReg, r.NW.NotWordMap)
|
|
|
- // if IsMatchNotKey { //排除词匹配成功,过滤当前rule
|
|
|
- // continue
|
|
|
- // }
|
|
|
- // // L:
|
|
|
- // //关键词匹配
|
|
|
- // for _, kwm := range r.KW.MatchType {
|
|
|
- // if text := qu.ObjToString(tmp[kwm]); text != "" {
|
|
|
- // text = ProcessData(text)
|
|
|
- // for i, kw_reg := range r.KW.KeyReg {
|
|
|
- // IsContinue := false
|
|
|
- // if kw_indexArr := kw_reg.FindAllStringIndex(text, -1); len(kw_indexArr) > 0 { //关键词匹配成功
|
|
|
- // if r.KW.KeyWordMap[i] && CheckLetter(text, kw_reg, kw_indexArr) { //kw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- // IsContinue = true
|
|
|
- // } else if !r.KW.KeyWordMap[i] {
|
|
|
- // IsContinue = true
|
|
|
- // }
|
|
|
- // }
|
|
|
- // if IsContinue { //关键词匹配成功,匹配附加词
|
|
|
- // if len(r.AW.KeyReg) == 0 { //无附加词
|
|
|
- // //tmpTagNameMap[r.TagName] = true
|
|
|
- // RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- // break L
|
|
|
- // } else {
|
|
|
- // for _, awm := range r.AW.MatchType {
|
|
|
- // if text := qu.ObjToString(tmp[awm]); text != "" {
|
|
|
- // text = ProcessData(text)
|
|
|
- // for j, aw_reg := range r.AW.KeyReg {
|
|
|
- // if aw_indexArr := aw_reg.FindAllStringIndex(text, -1); len(aw_indexArr) > 0 { //关键词匹配成功
|
|
|
- // if r.AW.AddWordMap[j] && CheckLetter(text, aw_reg, aw_indexArr) { //aw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- // //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- // RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- // break L
|
|
|
- // } else if !r.AW.AddWordMap[j] {
|
|
|
- // //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- // RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- // break L
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- for _, ru := range t.PRules {
|
|
|
- //全局排除词
|
|
|
- IsMatchGNotKey := RegMatch(tmp, ru.GNW.MatchType, ru.GNW.KeyReg, ru.GNW.NotWordMap)
|
|
|
- if IsMatchGNotKey { //全局排除词匹配成功
|
|
|
- continue
|
|
|
- } else {
|
|
|
- //全局附加词
|
|
|
- IsMatchGAddKey := RegMatch(tmp, ru.GAW.MatchType, ru.GAW.KeyReg, ru.GAW.MatchWordMap)
|
|
|
- if !IsMatchGAddKey && len(ru.GAW.MatchType) != 0 { //全局附加词没有匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
- L:
|
|
|
- for _, r := range ru.Rule {
|
|
|
- // IsMatch := false
|
|
|
- // L1:
|
|
|
- // //排除词匹配
|
|
|
- // for _, nwm := range r.NW.MatchType {
|
|
|
- // if text := qu.ObjToString(tmp[nwm]); text != "" {
|
|
|
- // text = ProcessData(text)
|
|
|
- // //i=0时,nw_regArr任意一个匹配表示有排除词
|
|
|
- // //i=1时,nw_regArr所有匹配表示有排除词
|
|
|
- // for i, nw_regArr := range r.NW.KeyReg {
|
|
|
- // andMatchNum := 0
|
|
|
- // for j, nw_reg := range nw_regArr {
|
|
|
- // if nw_indexArr := nw_reg.FindAllStringIndex(text, -1); len(nw_indexArr) > 0 { //排除词匹配成功
|
|
|
- // if r.NW.NotWordMap[i][j] && CheckLetter(text, nw_reg, nw_indexArr) { //nw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- // andMatchNum++
|
|
|
- // if i == 0 {
|
|
|
- // IsMatch = true
|
|
|
- // break L1
|
|
|
- // } else if i == 1 && len(nw_regArr) == andMatchNum {
|
|
|
- // IsMatch = true
|
|
|
- // break L1
|
|
|
- // }
|
|
|
- // } else if !r.NW.NotWordMap[i][j] {
|
|
|
- // andMatchNum++
|
|
|
- // if i == 0 {
|
|
|
- // IsMatch = true
|
|
|
- // break L1
|
|
|
- // } else if i == 1 && len(nw_regArr) == andMatchNum {
|
|
|
- // IsMatch = true
|
|
|
- // break L1
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- //排除词
|
|
|
- IsMatchNotKey := RegMatch(tmp, r.NW.MatchType, r.NW.KeyReg, r.NW.NotWordMap)
|
|
|
- if IsMatchNotKey { //排除词匹配成功,过滤当前rule
|
|
|
- continue
|
|
|
- }
|
|
|
- // L:
|
|
|
- //关键词匹配
|
|
|
- for _, kwm := range r.KW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[kwm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for i, kw_reg := range r.KW.KeyReg {
|
|
|
- IsContinue := false
|
|
|
- if kw_indexArr := kw_reg.FindAllStringIndex(text, -1); len(kw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.KW.KeyWordMap[i] && CheckLetter(text, kw_reg, kw_indexArr) { //kw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- IsContinue = true
|
|
|
- } else if !r.KW.KeyWordMap[i] {
|
|
|
- IsContinue = true
|
|
|
- }
|
|
|
- }
|
|
|
- if IsContinue { //关键词匹配成功,匹配附加词
|
|
|
- if len(r.AW.KeyReg) == 0 { //无附加词
|
|
|
- //tmpTagNameMap[r.TagName] = true
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else {
|
|
|
- for _, awm := range r.AW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[awm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for j, aw_reg := range r.AW.KeyReg {
|
|
|
- if aw_indexArr := aw_reg.FindAllStringIndex(text, -1); len(aw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.AW.AddWordMap[j] && CheckLetter(text, aw_reg, aw_indexArr) { //aw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else if !r.AW.AddWordMap[j] {
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- clearMap := map[string]interface{}{}
|
|
|
- if t.IsClear { //清理
|
|
|
- ClearData(clearMap, t.AllTagField, tmpTagNameMap)
|
|
|
- ClearData(clearMap, t.AllPreField, tmpPreTagMap)
|
|
|
- } else {
|
|
|
- for tf, tn := range tmpTagNameMap { //将原有标签汇总
|
|
|
- tmpTagName := qu.ObjToString(tmp[tf])
|
|
|
- tn = append(tn, strings.Split(tmpTagName, ",")...)
|
|
|
- tmpTagNameMap[tf] = tn
|
|
|
- }
|
|
|
- for pt, pn := range tmpPreTagMap { //将原有父标签汇总
|
|
|
- preField := qu.ObjToString(tmp[pt])
|
|
|
- pn = append(pn, strings.Split(preField, ",")...)
|
|
|
- tmpPreTagMap[pt] = pn
|
|
|
- }
|
|
|
- }
|
|
|
- addMap := map[string]interface{}{}
|
|
|
- if len(tmpTagNameMap) > 0 { //有新标签或者历史标签
|
|
|
- atomic.AddInt64(&numTags, 1) //n++ 计数
|
|
|
- AddData(addMap, tmpPreTagMap)
|
|
|
- AddData(addMap, tmpTagNameMap)
|
|
|
- }
|
|
|
- update := []string{}
|
|
|
- if len(addMap) > 0 { //新增
|
|
|
- for tn, tv := range addMap {
|
|
|
- update = append(update, `ctx._source.`+tn+`="`+qu.ObjToString(tv)+`"`)
|
|
|
- }
|
|
|
- }
|
|
|
- if len(clearMap) > 0 { //删除
|
|
|
- for cn, _ := range clearMap {
|
|
|
- update = append(update, `ctx._source.remove("`+cn+`")`)
|
|
|
- }
|
|
|
- }
|
|
|
- if len(update) > 0 {
|
|
|
- tmpMap := []map[string]interface{}{}
|
|
|
- tmpMap = append(tmpMap, map[string]interface{}{"id": qu.ObjToString(tmp["_id"])})
|
|
|
- tmpMap = append(tmpMap, map[string]interface{}{"update": update})
|
|
|
- t.EsUpdateCache <- tmpMap
|
|
|
- }
|
|
|
- }
|
|
|
- }(hit)
|
|
|
- numDocs += 1
|
|
|
- if numDocs%500 == 0 {
|
|
|
- log.Println("Current:", numDocs)
|
|
|
- }
|
|
|
- }
|
|
|
- scrollId = searchResult.ScrollId
|
|
|
- }
|
|
|
- t.Wg.Wait()
|
|
|
- client.ClearScroll().ScrollId(scrollId).Do() //清理游标
|
|
|
- time.Sleep(5 * time.Second)
|
|
|
- log.Println("Result Data Count:", numDocs, " Tags Data Count:", numTags)
|
|
|
- t.StartId = endId //替换id
|
|
|
- setid := map[string]interface{}{
|
|
|
- "$set": map[string]interface{}{
|
|
|
- "s_startid": t.StartId,
|
|
|
- },
|
|
|
- }
|
|
|
- Mgo.Update("taskinfo", `{"_id":"`+t.Id+`"}`, setid, false, false)
|
|
|
- time.AfterFunc(time.Minute*5, t.RunEs)
|
|
|
- } else {
|
|
|
- log.Println("Es Scroll Find Error:", err)
|
|
|
- return
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//初始化任务信息
|
|
|
-func (t *Task) InitTask(taskid string) {
|
|
|
- defer qu.Catch()
|
|
|
- data, _ := Mgo.FindById("taskinfo", taskid, nil)
|
|
|
- t.IsClear = (*data)["b_isclearoldtag"].(bool)
|
|
|
- t.Id = taskid
|
|
|
- t.StartId = qu.ObjToString((*data)["s_startid"])
|
|
|
- from := qu.ObjToString((*data)["s_fromtype"])
|
|
|
- t.From = from
|
|
|
- url := qu.ObjToString((*data)["s_fromdburl"])
|
|
|
- dbname := qu.ObjToString((*data)["s_fromdbname"])
|
|
|
- coll := qu.ObjToString((*data)["s_fromdbcoll"])
|
|
|
- if from == "mongodb" { //初始化mgo
|
|
|
- t.Mgo = &mongo.MongodbSim{
|
|
|
- MongodbAddr: url,
|
|
|
- Size: 20,
|
|
|
- DbName: dbname,
|
|
|
- }
|
|
|
- t.Mgo.InitPool()
|
|
|
- t.MgoDb = dbname
|
|
|
- t.MgoColl = coll
|
|
|
- if s_synces := qu.ObjToString((*data)["s_synces"]); s_synces == "1" {
|
|
|
- t.IsIndex = true //mgo打标签是否同步es
|
|
|
- }
|
|
|
- } else { //初始化es
|
|
|
- t.Es = &elastic.Elastic{
|
|
|
- S_esurl: url,
|
|
|
- I_size: 20,
|
|
|
- }
|
|
|
- t.Es.InitElasticSize()
|
|
|
- t.Index = dbname
|
|
|
- t.Itype = coll
|
|
|
- t.EsOver = make(chan bool, 1)
|
|
|
- t.EsUpdateCache = make(chan []map[string]interface{}, 500)
|
|
|
- }
|
|
|
- //t.MgoUpdataCache = make(chan []map[string]interface{}, 500)
|
|
|
- t.SP = make(chan bool, 5)
|
|
|
- t.DataChan = make(chan bool, 10)
|
|
|
- t.PRules = InitRules(qu.ObjToString((*data)["s_tasktype"])) //rules
|
|
|
- t.AllTagField = AllTagField
|
|
|
- t.AllPreField = AllPreField
|
|
|
- // for j, ru := range t.PRules {
|
|
|
- // qu.Debug(j, ru.TagField, ru.TagName, ru.PreField, ru.PreFieldName, "---", ru.GNW.KeyReg, len(ru.GNW.KeyReg), ru.GNW.MatchType, len(ru.GNW.MatchType), ru.GNW.NotWordMap, len(ru.GNW.NotWordMap), "---", ru.GAW.KeyReg, len(ru.GAW.KeyReg), ru.GAW.MatchType, len(ru.GAW.MatchType), ru.GAW.MatchWordMap, len(ru.GAW.MatchWordMap))
|
|
|
- // // for k, v := range ru.GNW.KeyReg {
|
|
|
- // // qu.Debug(k)
|
|
|
- // // for k1, v1 := range v {
|
|
|
- // // qu.Debug(k1, v1)
|
|
|
- // // }
|
|
|
- // // }
|
|
|
- // qu.Debug("---------------------------------------------------------------------------------------------------------------")
|
|
|
- // for i, r := range ru.Rule {
|
|
|
- // qu.Debug(i, r.KW.KeyReg, len(r.KW.KeyReg), r.KW.MatchType, len(r.KW.MatchType), r.KW.KeyWordMap, "---", r.AW.KeyReg, len(r.AW.KeyReg), r.AW.MatchType, len(r.AW.MatchType), r.AW.AddWordMap, "----", r.NW.KeyReg, len(r.NW.KeyReg), r.NW.MatchType, len(r.NW.MatchType), r.NW.NotWordMap, len(r.NW.NotWordMap))
|
|
|
- // }
|
|
|
- // }
|
|
|
-}
|
|
|
-
|
|
|
-//初始化Rules
|
|
|
-func InitRules(tasktype string) (rules []*PRule) {
|
|
|
- defer qu.Catch()
|
|
|
- query := map[string]interface{}{
|
|
|
- "i_isuse": 1, //启用状态
|
|
|
- "s_tasktype": tasktype,
|
|
|
- "b_delete": false,
|
|
|
- }
|
|
|
- list, _ := Mgo.Find("tagrule", query, nil, nil, false, -1, -1)
|
|
|
- if len(*list) == 0 {
|
|
|
- return
|
|
|
- }
|
|
|
- for _, l := range *list {
|
|
|
- tagname := qu.ObjToString(l["s_tagname"])
|
|
|
- tagField := qu.ObjToString(l["s_tagfield"]) //标签属性值
|
|
|
- preTagField := qu.ObjToString(l["s_pretagfield"]) //父标签属性值
|
|
|
- preTagName := qu.ObjToString(l["s_pretagname"]) //父标签名称
|
|
|
- pr := &PRule{}
|
|
|
- pr.Id = qu.BsonIdToSId(l["_id"])
|
|
|
- pr.TagField = tagField
|
|
|
- pr.TagName = tagname
|
|
|
- pr.PreField = preTagField
|
|
|
- pr.PreFieldName = preTagName
|
|
|
- o_list := l["o_list"].([]interface{})
|
|
|
- //全局排除词匹配方式
|
|
|
- gnkm := qu.ObjToString(l["s_globalnotkeymatch"])
|
|
|
- gnkmArr := []string{}
|
|
|
- for _, gnv := range strings.Split(gnkm, ",") {
|
|
|
- if field := qu.ObjToString(task_export_matchtype[gnv]); field != "" {
|
|
|
- gnkmArr = append(gnkmArr, field)
|
|
|
- }
|
|
|
- }
|
|
|
- //全局排除词
|
|
|
- gnotword := qu.ObjToString(l["s_globalnotkey"])
|
|
|
- gnw_commaArr := strings.Split(gnotword, ",")
|
|
|
- gnw := &NotWord{}
|
|
|
- gnw.NotWordMap = []map[int]bool{}
|
|
|
- gnw.MatchType = gnkmArr
|
|
|
- gnw_keyReg1 := []*regexp.Regexp{}
|
|
|
- gnw_keyReg2 := []*regexp.Regexp{}
|
|
|
- gn1, gn2 := 0, 0
|
|
|
- gnotWordMap1 := map[int]bool{}
|
|
|
- gnotWordMap2 := map[int]bool{}
|
|
|
- for _, comma := range gnw_commaArr {
|
|
|
- gnw_notArr := strings.Split(comma, "&&")
|
|
|
- if len(gnw_notArr) == 1 { //,
|
|
|
- tmp_gnw := gnw_notArr[0]
|
|
|
- if tmp_gnw != "" {
|
|
|
- if LetterCase.MatchString(tmp_gnw) { //判断排除词中是否有英文
|
|
|
- tmp_gnw = strings.ToUpper(tmp_gnw) //排除词中有英文全部转为大写
|
|
|
- gnotWordMap1[gn1] = true
|
|
|
- }
|
|
|
- gnw_keyReg1 = append(gnw_keyReg1, regexp.MustCompile(tmp_gnw))
|
|
|
- gn1++
|
|
|
- }
|
|
|
- } else { //&&
|
|
|
- for _, and := range gnw_notArr {
|
|
|
- if and != "" {
|
|
|
- if LetterCase.MatchString(and) { //判断排除词中是否有英文
|
|
|
- and = strings.ToUpper(and) //排除词中有英文全部转为大写
|
|
|
- gnotWordMap2[gn2] = true
|
|
|
- }
|
|
|
- gnw_keyReg2 = append(gnw_keyReg2, regexp.MustCompile(and))
|
|
|
- gn2++
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- gnw.NotWordMap = append(gnw.NotWordMap, gnotWordMap1)
|
|
|
- gnw.NotWordMap = append(gnw.NotWordMap, gnotWordMap2)
|
|
|
- gnw.KeyReg = append(gnw.KeyReg, gnw_keyReg1)
|
|
|
- gnw.KeyReg = append(gnw.KeyReg, gnw_keyReg2)
|
|
|
- pr.GNW = gnw
|
|
|
- //全局附加词匹配方式
|
|
|
- gawm := qu.ObjToString(l["s_globaladdkeymatch"])
|
|
|
- gawmArr := []string{}
|
|
|
- for _, gav := range strings.Split(gawm, ",") {
|
|
|
- if field := qu.ObjToString(task_export_matchtype[gav]); field != "" {
|
|
|
- gawmArr = append(gawmArr, field)
|
|
|
- }
|
|
|
- }
|
|
|
- //全局附加词
|
|
|
- gaddword := qu.ObjToString(l["s_globaladdkey"])
|
|
|
- gaw_commaArr := strings.Split(gaddword, ",")
|
|
|
- gaw := &GlobalWord{}
|
|
|
- gaw.MatchWordMap = []map[int]bool{}
|
|
|
- gaw.MatchType = gawmArr
|
|
|
- gaw_keyReg1 := []*regexp.Regexp{}
|
|
|
- gaw_keyReg2 := []*regexp.Regexp{}
|
|
|
- an1, an2 := 0, 0
|
|
|
- gaddWordMap1 := map[int]bool{}
|
|
|
- gaddWordMap2 := map[int]bool{}
|
|
|
- for _, comma := range gaw_commaArr {
|
|
|
- gaddWordArr := strings.Split(comma, "&&")
|
|
|
- if len(gaddWordArr) == 1 { //,
|
|
|
- tmp_gaw := gaddWordArr[0]
|
|
|
- if tmp_gaw != "" {
|
|
|
- if LetterCase.MatchString(tmp_gaw) { //判断排除词中是否有英文
|
|
|
- tmp_gaw = strings.ToUpper(tmp_gaw) //排除词中有英文全部转为大写
|
|
|
- gaddWordMap1[an1] = true
|
|
|
- }
|
|
|
- gaw_keyReg1 = append(gaw_keyReg1, regexp.MustCompile(tmp_gaw))
|
|
|
- an1++
|
|
|
- }
|
|
|
- } else { //&&
|
|
|
- for _, and := range gaddWordArr {
|
|
|
- if and != "" {
|
|
|
- if LetterCase.MatchString(and) { //判断排除词中是否有英文
|
|
|
- and = strings.ToUpper(and) //排除词中有英文全部转为大写
|
|
|
- gaddWordMap2[an2] = true
|
|
|
- }
|
|
|
- gaw_keyReg2 = append(gaw_keyReg2, regexp.MustCompile(and))
|
|
|
- an2++
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- gaw.MatchWordMap = append(gaw.MatchWordMap, gaddWordMap1)
|
|
|
- gaw.MatchWordMap = append(gaw.MatchWordMap, gaddWordMap2)
|
|
|
- gaw.KeyReg = append(gaw.KeyReg, gaw_keyReg1)
|
|
|
- gaw.KeyReg = append(gaw.KeyReg, gaw_keyReg2)
|
|
|
- pr.GAW = gaw
|
|
|
- for _, o := range o_list {
|
|
|
- o_map := o.(map[string]interface{})
|
|
|
- //排除词匹配方式
|
|
|
- nkm := qu.ObjToString(o_map["s_notkeymatch"])
|
|
|
- nkmArr := []string{}
|
|
|
- for _, nv := range strings.Split(nkm, ",") {
|
|
|
- if field := qu.ObjToString(task_export_matchtype[nv]); field != "" {
|
|
|
- nkmArr = append(nkmArr, field)
|
|
|
- }
|
|
|
- }
|
|
|
- //排除词
|
|
|
- notword := qu.ObjToString(o_map["s_notkey"])
|
|
|
- nw_commaArr := strings.Split(notword, ",")
|
|
|
- nw := &NotWord{}
|
|
|
- nw.NotWordMap = []map[int]bool{}
|
|
|
- nw.MatchType = nkmArr
|
|
|
- nw_keyReg1 := []*regexp.Regexp{}
|
|
|
- nw_keyReg2 := []*regexp.Regexp{}
|
|
|
- n1, n2 := 0, 0
|
|
|
- notWordMap1 := map[int]bool{}
|
|
|
- notWordMap2 := map[int]bool{}
|
|
|
- for _, comma := range nw_commaArr {
|
|
|
- nw_notArr := strings.Split(comma, "&&")
|
|
|
- if len(nw_notArr) == 1 { //,
|
|
|
- tmp_nw := nw_notArr[0]
|
|
|
- if tmp_nw != "" {
|
|
|
- if LetterCase.MatchString(tmp_nw) { //判断排除词中是否有英文
|
|
|
- tmp_nw = strings.ToUpper(tmp_nw) //排除词中有英文全部转为大写
|
|
|
- notWordMap1[n1] = true
|
|
|
- }
|
|
|
- nw_keyReg1 = append(nw_keyReg1, regexp.MustCompile(tmp_nw))
|
|
|
- n1++
|
|
|
- }
|
|
|
- } else { //&&
|
|
|
- for _, and := range nw_notArr {
|
|
|
- if and != "" {
|
|
|
- if LetterCase.MatchString(and) { //判断排除词中是否有英文
|
|
|
- and = strings.ToUpper(and) //排除词中有英文全部转为大写
|
|
|
- notWordMap2[n2] = true
|
|
|
- }
|
|
|
- nw_keyReg2 = append(nw_keyReg2, regexp.MustCompile(and))
|
|
|
- n2++
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- nw.NotWordMap = append(nw.NotWordMap, notWordMap1)
|
|
|
- nw.NotWordMap = append(nw.NotWordMap, notWordMap2)
|
|
|
- nw.KeyReg = append(nw.KeyReg, nw_keyReg1)
|
|
|
- nw.KeyReg = append(nw.KeyReg, nw_keyReg2)
|
|
|
- // tmp_nw := []*NotWord{}
|
|
|
- // notword := qu.ObjToString(o_map["s_notkey"])
|
|
|
- // nw_commaArr := strings.Split(notword, ",")
|
|
|
- // for _, comma := range nw_commaArr {
|
|
|
- // nw := &NotWord{}
|
|
|
- // nw.NotWordMap = make(map[int]bool)
|
|
|
- // nw.MatchType = nkmArr
|
|
|
- // nw_notArr := strings.Split(comma, "&&")
|
|
|
- // if len(nw_notArr) == 1 { //,
|
|
|
- // tmp_nw := nw_notArr[0]
|
|
|
- // if tmp_nw != "" {
|
|
|
- // if LetterCase.MatchString(tmp_nw) { //判断排除词中是否有英文
|
|
|
- // tmp_nw = strings.ToUpper(tmp_nw) //排除词中有英文全部转为大写
|
|
|
- // nw.NotWordMap[len(nw.KeyReg)] = true
|
|
|
- // }
|
|
|
- // nw.KeyReg = append(nw.KeyReg, regexp.MustCompile(tmp_nw))
|
|
|
- // }
|
|
|
- // } else { //&&
|
|
|
- // for _, and := range nw_notArr {
|
|
|
- // if and != "" {
|
|
|
- // if LetterCase.MatchString(and) { //判断排除词中是否有英文
|
|
|
- // and = strings.ToUpper(and) //排除词中有英文全部转为大写
|
|
|
- // nw.NotWordMap[len(nw.KeyReg)] = true
|
|
|
- // }
|
|
|
- // nw.KeyReg = append(nw.KeyReg, regexp.MustCompile(and))
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
- // tmp_nw = append(tmp_nw, nw)
|
|
|
- // }
|
|
|
- //附加词匹配方式
|
|
|
- awm := qu.ObjToString(o_map["s_addkeymatch"])
|
|
|
- awmArr := []string{}
|
|
|
- for _, av := range strings.Split(awm, ",") {
|
|
|
- if field := qu.ObjToString(task_export_matchtype[av]); field != "" {
|
|
|
- awmArr = append(awmArr, field)
|
|
|
- }
|
|
|
- }
|
|
|
- //附加词
|
|
|
- tmp_aw := []*AddWord{}
|
|
|
- addword := qu.ObjToString(o_map["s_addkey"])
|
|
|
- aw_commaArr := strings.Split(addword, ",")
|
|
|
- for _, comma := range aw_commaArr {
|
|
|
- aw := &AddWord{}
|
|
|
- aw.AddWordMap = make(map[int]bool)
|
|
|
- aw.MatchType = awmArr
|
|
|
- aw_addArr := strings.Split(comma, "&&")
|
|
|
- if len(aw_addArr) == 1 { //,
|
|
|
- tmp_aw := aw_addArr[0]
|
|
|
- if tmp_aw != "" {
|
|
|
- if LetterCase.MatchString(tmp_aw) { //判断附加词中是否有英文
|
|
|
- tmp_aw = strings.ToUpper(tmp_aw) //附加词中有英文全部转为大写
|
|
|
- aw.AddWordMap[len(aw.KeyReg)] = true
|
|
|
- }
|
|
|
- aw.KeyReg = append(aw.KeyReg, regexp.MustCompile(tmp_aw))
|
|
|
- }
|
|
|
- } else { //&&
|
|
|
- for _, and := range aw_addArr {
|
|
|
- if and != "" {
|
|
|
- if LetterCase.MatchString(and) { //判断附加词中是否有英文
|
|
|
- and = strings.ToUpper(and) //附加词中有英文全部转为大写
|
|
|
- aw.AddWordMap[len(aw.KeyReg)] = true
|
|
|
- }
|
|
|
- aw.KeyReg = append(aw.KeyReg, regexp.MustCompile(and))
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- tmp_aw = append(tmp_aw, aw)
|
|
|
- }
|
|
|
- //关键词匹配方式
|
|
|
- kwm := qu.ObjToString(o_map["s_keymatch"])
|
|
|
- kwmArr := []string{}
|
|
|
- for _, kv := range strings.Split(kwm, ",") {
|
|
|
- if field := qu.ObjToString(task_export_matchtype[kv]); field != "" {
|
|
|
- kwmArr = append(kwmArr, field)
|
|
|
- }
|
|
|
- }
|
|
|
- //关键词
|
|
|
- tmp_kw := []*KeyWord{}
|
|
|
- keyword := qu.ObjToString(o_map["s_matchkey"])
|
|
|
- kw_commaArr := strings.Split(keyword, ",")
|
|
|
- for _, comma := range kw_commaArr {
|
|
|
- kw := &KeyWord{}
|
|
|
- kw.KeyWordMap = make(map[int]bool)
|
|
|
- kw.MatchType = kwmArr
|
|
|
- kw_addArr := strings.Split(comma, "&&")
|
|
|
- if len(kw_addArr) == 1 { //,
|
|
|
- tmp_kw := kw_addArr[0]
|
|
|
- if tmp_kw != "" {
|
|
|
- if LetterCase.MatchString(tmp_kw) {
|
|
|
- tmp_kw = strings.ToUpper(tmp_kw)
|
|
|
- kw.KeyWordMap[len(kw.KeyReg)] = true
|
|
|
- }
|
|
|
- kw.KeyReg = append(kw.KeyReg, regexp.MustCompile(tmp_kw))
|
|
|
- }
|
|
|
- } else { //&&
|
|
|
- for _, and := range kw_addArr {
|
|
|
- if and != "" {
|
|
|
- if LetterCase.MatchString(and) {
|
|
|
- and = strings.ToUpper(and)
|
|
|
- kw.KeyWordMap[len(kw.KeyReg)] = true
|
|
|
- }
|
|
|
- kw.KeyReg = append(kw.KeyReg, regexp.MustCompile(and))
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- tmp_kw = append(tmp_kw, kw)
|
|
|
- }
|
|
|
-
|
|
|
- //组合
|
|
|
- for _, tk := range tmp_kw {
|
|
|
- for _, aw := range tmp_aw {
|
|
|
- rule := &Rule{}
|
|
|
- rule.KW = tk
|
|
|
- rule.AW = aw
|
|
|
- rule.NW = nw
|
|
|
- pr.Rule = append(pr.Rule, rule)
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
- rules = append(rules, pr)
|
|
|
- }
|
|
|
- return
|
|
|
-}
|
|
|
-
|
|
|
-func GetTags(prules []*PRule, tmp map[string]interface{}, tmpTagNameMap, tmpPreTagMap map[string][]string) {
|
|
|
- for _, ru := range prules {
|
|
|
- //全局排除词
|
|
|
- IsMatchGNotKey := RegMatch(tmp, ru.GNW.MatchType, ru.GNW.KeyReg, ru.GNW.NotWordMap)
|
|
|
- if IsMatchGNotKey { //全局排除词匹配成功
|
|
|
- continue
|
|
|
- } else {
|
|
|
- //全局附加词
|
|
|
- IsMatchGAddKey := RegMatch(tmp, ru.GAW.MatchType, ru.GAW.KeyReg, ru.GAW.MatchWordMap)
|
|
|
- if !IsMatchGAddKey && len(ru.GAW.MatchType) != 0 { //全局附加词没有匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
- L:
|
|
|
- for _, r := range ru.Rule {
|
|
|
- //排除词
|
|
|
- IsMatchNotKey := RegMatch(tmp, r.NW.MatchType, r.NW.KeyReg, r.NW.NotWordMap)
|
|
|
- if IsMatchNotKey { //排除词匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- //L:
|
|
|
- //关键词匹配
|
|
|
- for _, kwm := range r.KW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[kwm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for i, kw_reg := range r.KW.KeyReg {
|
|
|
- IsContinue := false
|
|
|
- if kw_indexArr := kw_reg.FindAllStringIndex(text, -1); len(kw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.KW.KeyWordMap[i] && CheckLetter(text, kw_reg, kw_indexArr) { //kw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- IsContinue = true
|
|
|
- } else if !r.KW.KeyWordMap[i] {
|
|
|
- IsContinue = true
|
|
|
- }
|
|
|
- }
|
|
|
- if IsContinue { //关键词匹配成功,匹配附加词
|
|
|
- if len(r.AW.KeyReg) == 0 { //无附加词
|
|
|
- //tmpTagNameMap[r.TagName] = true
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else {
|
|
|
- for _, awm := range r.AW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[awm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for j, aw_reg := range r.AW.KeyReg {
|
|
|
- if aw_indexArr := aw_reg.FindAllStringIndex(text, -1); len(aw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.AW.AddWordMap[j] && CheckLetter(text, aw_reg, aw_indexArr) { //aw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- } else if !r.AW.AddWordMap[j] {
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break L
|
|
|
- }
|
|
|
- }
|
|
|
- // if aw_reg.MatchString(text) { //附加词匹配成功
|
|
|
- // tmpTagName[r.TagName] = true
|
|
|
- // break L
|
|
|
- // }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//mgo打标签+日志log
|
|
|
-func GetTagsAndLog(prules []*PRule, tmp map[string]interface{}, tmpTagNameMap, tmpPreTagMap map[string][]string, logMap map[string]interface{}) {
|
|
|
- logMap["dataid"] = qu.BsonIdToSId(tmp["_id"])
|
|
|
- for _, ru := range prules {
|
|
|
- logMap["tagid"] = ru.Id
|
|
|
- //全局排除词
|
|
|
- IsMatchGNotKey := RegMatch(tmp, ru.GNW.MatchType, ru.GNW.KeyReg, ru.GNW.NotWordMap)
|
|
|
- if IsMatchGNotKey { //全局排除词匹配成功
|
|
|
- continue
|
|
|
- } else {
|
|
|
- //全局附加词
|
|
|
- IsMatchGAddKey := RegMatch(tmp, ru.GAW.MatchType, ru.GAW.KeyReg, ru.GAW.MatchWordMap)
|
|
|
- if !IsMatchGAddKey && len(ru.GAW.MatchType) != 0 { //全局附加词没有匹配成功
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- mapArr := []map[string]interface{}{}
|
|
|
- for _, r := range ru.Rule {
|
|
|
- keyRule := map[string]interface{}{}
|
|
|
- //排除词
|
|
|
- IsMatchNotKey := RegMatch(tmp, r.NW.MatchType, r.NW.KeyReg, r.NW.NotWordMap)
|
|
|
- if IsMatchNotKey { //排除词匹配成功
|
|
|
- keyRule["s_keyword"] = r.NW.KeyReg
|
|
|
- keyRule["s_keywordmatch"] = strings.Join(r.NW.MatchType, ",")
|
|
|
- continue
|
|
|
- }
|
|
|
- //关键词匹配
|
|
|
- for _, kwm := range r.KW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[kwm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for i, kw_reg := range r.KW.KeyReg {
|
|
|
- IsContinue := false
|
|
|
- if kw_indexArr := kw_reg.FindAllStringIndex(text, -1); len(kw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.KW.KeyWordMap[i] && CheckLetter(text, kw_reg, kw_indexArr) { //kw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- IsContinue = true
|
|
|
- keyRule["s_keyword"] = kw_reg.String()
|
|
|
- keyRule["s_keywordmatch"] = kwm
|
|
|
- } else if !r.KW.KeyWordMap[i] {
|
|
|
- IsContinue = true
|
|
|
- keyRule["s_keyword"] = kw_reg.String()
|
|
|
- keyRule["s_keywordmatch"] = kwm
|
|
|
- }
|
|
|
- }
|
|
|
- if IsContinue { //关键词匹配成功,匹配附加词
|
|
|
- if len(r.AW.KeyReg) == 0 { //无附加词
|
|
|
- //tmpTagNameMap[r.TagName] = true
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- break
|
|
|
- } else {
|
|
|
- for _, awm := range r.AW.MatchType {
|
|
|
- if text := qu.ObjToString(tmp[awm]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- for j, aw_reg := range r.AW.KeyReg {
|
|
|
- if aw_indexArr := aw_reg.FindAllStringIndex(text, -1); len(aw_indexArr) > 0 { //关键词匹配成功
|
|
|
- if r.AW.AddWordMap[j] && CheckLetter(text, aw_reg, aw_indexArr) { //aw_reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- keyRule["s_addkey"] = aw_reg.String()
|
|
|
- keyRule["s_addkeymatch"] = awm
|
|
|
- } else if !r.AW.AddWordMap[j] {
|
|
|
- //tmpTagNameMap[r.TagName] = true //附加词匹配成功
|
|
|
- RecordData(ru, tmpTagNameMap, tmpPreTagMap)
|
|
|
- keyRule["s_addkey"] = aw_reg.String()
|
|
|
- keyRule["s_addkeymatch"] = awm
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if len(keyRule) > 0 {
|
|
|
- mapArr = append(mapArr, keyRule)
|
|
|
- }
|
|
|
- }
|
|
|
- logMap["o_list"] = mapArr
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//匹配
|
|
|
-func RegMatch(tmp map[string]interface{}, matchType []string, matchReg [][]*regexp.Regexp, matchMap []map[int]bool) bool {
|
|
|
- defer qu.Catch()
|
|
|
- for _, mt := range matchType {
|
|
|
- if text := qu.ObjToString(tmp[mt]); text != "" {
|
|
|
- text = ProcessData(text)
|
|
|
- //i=0时,regArr任意一个匹配表示匹配成功
|
|
|
- //i=1时,regArr所有匹配表示匹配成功
|
|
|
- for i, regArr := range matchReg {
|
|
|
- andMatchNum := 0
|
|
|
- for j, reg := range regArr {
|
|
|
- if indexArr := reg.FindAllStringIndex(text, -1); len(indexArr) > 0 { //匹配成功
|
|
|
- if matchMap[i][j] && CheckLetter(text, reg, indexArr) { //reg有字母,判断是否是包含关系(AAAIBBB or AI)
|
|
|
- andMatchNum++
|
|
|
- if i == 0 {
|
|
|
- return true
|
|
|
- //IsMatchGNotKey = true
|
|
|
- //break L0
|
|
|
- } else if i == 1 && len(regArr) == andMatchNum {
|
|
|
- return true
|
|
|
- //IsMatchGNotKey = true
|
|
|
- //break L0
|
|
|
- }
|
|
|
- } else if !matchMap[i][j] { //reg中没有字母
|
|
|
- andMatchNum++
|
|
|
- if i == 0 { //reg是逗号分割的reg
|
|
|
- return true
|
|
|
- //IsMatchGNotKey = true
|
|
|
- //break L0
|
|
|
- } else if i == 1 && len(regArr) == andMatchNum { //&&分割的所有reg都匹配
|
|
|
- return true
|
|
|
- //IsMatchGNotKey = true
|
|
|
- //break L0
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-//处理文本
|
|
|
-func ProcessData(text string) string {
|
|
|
- text = strings.ToUpper(text) //文本中的英文全转为大写
|
|
|
- text = FilteReg.ReplaceAllString(text, "") //去除一些特殊符号
|
|
|
- return text
|
|
|
-}
|
|
|
-
|
|
|
-//校验字母
|
|
|
-func CheckLetter(text string, reg *regexp.Regexp, indexArr [][]int) (flag bool) {
|
|
|
- for _, tmpArr := range indexArr {
|
|
|
- sIndex := tmpArr[0]
|
|
|
- eIndex := tmpArr[1]
|
|
|
- sbyte := ""
|
|
|
- ebyte := ""
|
|
|
- //log.Println("---", sIndex, eIndex)
|
|
|
- if sIndex != 0 {
|
|
|
- sbyte = text[sIndex-1 : sIndex]
|
|
|
- if eIndex != len(text) { //BAIB
|
|
|
- ebyte = text[eIndex : eIndex+1]
|
|
|
- } /*else { //BAI
|
|
|
-
|
|
|
- }*/
|
|
|
- } else {
|
|
|
- if eIndex != len(text) { //AIB
|
|
|
- ebyte = text[eIndex : eIndex+1]
|
|
|
- } /*else { //AI
|
|
|
-
|
|
|
- }*/
|
|
|
- }
|
|
|
- //log.Println("sssss", "s:", sbyte, "e:", ebyte, LetterCase.Match([]byte(sbyte)), LetterCase.Match([]byte(ebyte)))
|
|
|
- if !LetterCase.Match([]byte(sbyte)) && !LetterCase.Match([]byte(ebyte)) {
|
|
|
- flag = true
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- return
|
|
|
-}
|
|
|
-
|
|
|
-//记录匹配数据
|
|
|
-func RecordData(r *PRule, tmpTagNameMap, tmpPreTagMap map[string][]string) {
|
|
|
- //tmpTagNameMap
|
|
|
- tagNameArr := tmpTagNameMap[r.TagField]
|
|
|
- if len(tagNameArr) == 0 {
|
|
|
- tmpTagNameMap[r.TagField] = []string{r.TagName}
|
|
|
- } else {
|
|
|
- tagNameArr = append(tagNameArr, r.TagName)
|
|
|
- tmpTagNameMap[r.TagField] = tagNameArr
|
|
|
- }
|
|
|
- //tmpPreTagMap
|
|
|
- preTagArr := tmpPreTagMap[r.PreField]
|
|
|
- if len(preTagArr) == 0 {
|
|
|
- tmpPreTagMap[r.PreField] = []string{r.PreFieldName}
|
|
|
- } else {
|
|
|
- preTagArr = append(preTagArr, r.PreFieldName)
|
|
|
- tmpPreTagMap[r.PreField] = preTagArr
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
// ClearData schedules stale tags for removal: for every field name that is a
// key of allField (regardless of its bool value) and has no entries in tmp,
// clearMap[field] is set to the empty string. Fields that do have entries in
// tmp are left untouched, as are unrelated keys already in clearMap.
func ClearData(clearMap map[string]interface{}, allField map[string]bool, tmp map[string][]string) {
	for field := range allField {
		if len(tmp[field]) > 0 {
			continue // the field received a tag this round; nothing to clear
		}
		clearMap[field] = ""
	}
}
|
|
|
-
|
|
|
// AddData writes the collected tags into addMap: for each field in tmp the
// non-empty names are de-duplicated (first occurrence wins, order preserved)
// and stored as a single comma-joined string under that field. Fields whose
// list contains no non-empty name are not written at all.
func AddData(addMap map[string]interface{}, tmp map[string][]string) {
	for field, names := range tmp {
		seen := make(map[string]bool, len(names))
		uniq := make([]string, 0, len(names))
		for _, name := range names {
			if name == "" || seen[name] {
				continue
			}
			seen[name] = true
			uniq = append(uniq, name)
		}
		if len(uniq) == 0 {
			continue // nothing usable for this field, leave addMap untouched
		}
		addMap[field] = strings.Join(uniq, ",")
	}
}
|
|
|
-
|
|
|
-//更新es
|
|
|
-func (t *Task) UpdateEs() {
|
|
|
- log.Println("Es Save...")
|
|
|
- arru := make([][]map[string]interface{}, 200)
|
|
|
- indexu := 0
|
|
|
-Loop:
|
|
|
- for {
|
|
|
- select {
|
|
|
- case v := <-t.EsUpdateCache:
|
|
|
- arru[indexu] = v
|
|
|
- indexu++
|
|
|
- if indexu == 200 {
|
|
|
- t.SP <- true
|
|
|
- go func(arru [][]map[string]interface{}) {
|
|
|
- defer func() {
|
|
|
- <-t.SP
|
|
|
- }()
|
|
|
- t.Es.BulkUpdateMultipleFields(t.Index, t.Itype, arru)
|
|
|
- }(arru)
|
|
|
- arru = make([][]map[string]interface{}, 200)
|
|
|
- indexu = 0
|
|
|
- }
|
|
|
- case <-time.After(1000 * time.Millisecond):
|
|
|
- if indexu > 0 {
|
|
|
- t.SP <- true
|
|
|
- go func(arru [][]map[string]interface{}) {
|
|
|
- defer func() {
|
|
|
- <-t.SP
|
|
|
- }()
|
|
|
- t.Es.BulkUpdateMultipleFields(t.Index, t.Itype, arru)
|
|
|
- }(arru[:indexu])
|
|
|
- arru = make([][]map[string]interface{}, 200)
|
|
|
- indexu = 0
|
|
|
- }
|
|
|
- case <-t.EsOver: //结束es任务时结束保存
|
|
|
- break Loop
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//更新mongo
|
|
|
-// func (t *Task) UpdateMgo() {
|
|
|
-// log.Println("Mgo Save...")
|
|
|
-// arru := make([][]map[string]interface{}, 200)
|
|
|
-// indexu := 0
|
|
|
-// for {
|
|
|
-// select {
|
|
|
-// case v := <-t.MgoUpdataCache:
|
|
|
-// arru[indexu] = v
|
|
|
-// indexu++
|
|
|
-// if indexu == 200 {
|
|
|
-// t.SP <- true
|
|
|
-// go func(arru [][]map[string]interface{}) {
|
|
|
-// defer func() {
|
|
|
-// <-t.SP
|
|
|
-// }()
|
|
|
-// t.Mgo.UpdateBulk(t.MgoColl, arru...)
|
|
|
-// }(arru)
|
|
|
-// arru = make([][]map[string]interface{}, 200)
|
|
|
-// indexu = 0
|
|
|
-// }
|
|
|
-// case <-time.After(1000 * time.Millisecond):
|
|
|
-// if indexu > 0 {
|
|
|
-// t.SP <- true
|
|
|
-// go func(arru [][]map[string]interface{}) {
|
|
|
-// defer func() {
|
|
|
-// <-t.SP
|
|
|
-// }()
|
|
|
-// t.Mgo.UpdateBulk(t.MgoColl, arru...)
|
|
|
-// }(arru[:indexu])
|
|
|
-// arru = make([][]map[string]interface{}, 200)
|
|
|
-// indexu = 0
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|