|
@@ -261,6 +261,7 @@ func NewClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap
|
|
|
// if tt.I_multiclass == 0 { //单分类临时记录保存字段
|
|
|
// tmpSavefield[savefield] = ""
|
|
|
// }
|
|
|
+ //同一个字段,例如toptype有值了就跳过
|
|
|
if savefield != "" && tt.I_multiclass == 0 && SMap.Map[savefield] != nil {
|
|
|
//同一个字段,只要一个类识别...
|
|
|
continue
|
|
@@ -281,15 +282,18 @@ func NewClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap
|
|
|
val = PreFilter(val, tt.Task_PreRule) //任务的前置过滤
|
|
|
rulval[f] = val //整个任务仅过滤一次,将其存储
|
|
|
}
|
|
|
+
|
|
|
if val != "" {
|
|
|
- val = PreFilter(val, c.Class_PreRule) //分类的前置过滤
|
|
|
+ //1.分类的前置过滤,不符合直接跳过当前循环
|
|
|
+ val = PreFilter(val, c.Class_PreRule)
|
|
|
if val == "" {
|
|
|
continue
|
|
|
}
|
|
|
+ //2.循环当前类规则
|
|
|
for _, r := range rule {
|
|
|
-
|
|
|
- if s_pid != "" { //只要此类的父类不为空
|
|
|
- ru_s_pid := r.S_pid //当前规则的父规则id
|
|
|
+ //只要此类的父类不为空
|
|
|
+ if s_pid != "" {
|
|
|
+ ru_s_pid := r.S_pid //当前规则的父规则id,57a02d4fd368081fac39eaf2,57a18dd5d368081d70e185d0,5a4d99b7e138236b380378be
|
|
|
class_s_pid := strings.Split(s_pid, ",") //当前规则所属类的父类集合
|
|
|
bcontinue := true
|
|
|
L1:
|
|
@@ -316,12 +320,13 @@ func NewClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap
|
|
|
continue
|
|
|
}
|
|
|
//开始识别
|
|
|
- i_rule := r.Reg
|
|
|
+ i_rule := r.Reg //toptype=结果 的所有规则
|
|
|
//util.Debug("text---", ruval, "rule---", i_rule)
|
|
|
b, rulearr := DFAAnalyRules(ruval, i_rule)
|
|
|
//util.Debug("===============", b, f, rulearr, r.S_name)
|
|
|
if b {
|
|
|
- if r.S_name == "中选" && len(r.DetailReg) > 0 { //title、channel二级分类中标处理
|
|
|
+ //title、channel二级分类中标处理
|
|
|
+ if r.S_name == "中选" && len(r.DetailReg) > 0 {
|
|
|
detail := util.ObjToString(tmp["detail"])
|
|
|
if len(r.NotReg) > 0 { //排除规则
|
|
|
dnrb, _ := DFAAnalyRules(detail, r.NotReg)
|
|
@@ -371,8 +376,10 @@ func NewClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap
|
|
|
} else {
|
|
|
fflag[cid] = []string{r_id} //第一个规则中的_id
|
|
|
}
|
|
|
+ //57a02bc0d368081fac39eaef =[57a02cdcd368081fac39eaf1]
|
|
|
s_name := r.S_name
|
|
|
- if tt.I_wordcount == 1 && len(rulearr) > 0 { //词频统计
|
|
|
+ //词频统计
|
|
|
+ if tt.I_wordcount == 1 && len(rulearr) > 0 {
|
|
|
tt.WcLock.Lock()
|
|
|
map1 := tt.WordCount[s_name]
|
|
|
if map1 == nil {
|
|
@@ -461,6 +468,244 @@ func NewClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap
|
|
|
return SMap
|
|
|
}
|
|
|
|
|
|
+func ReSub(tt *TTask, tmp map[string]interface{}, top string) *tools.SortMap {
|
|
|
+ SMap := tools.NewSortMap()
|
|
|
+ //tmpSavefield := map[string]interface{}{}
|
|
|
+ //fflag := map[string][]string{} //标志父类从属关系
|
|
|
+ rulval := map[string]string{} //存储过滤记录
|
|
|
+ class := tt.Class //获取任务中的多个分类
|
|
|
+
|
|
|
+ var top_class_ids []string //一级分类的规则ID
|
|
|
+ if len(class) > 0 {
|
|
|
+ for _, cla := range class {
|
|
|
+ rules := cla.Rule
|
|
|
+ for _, rule := range rules {
|
|
|
+ if rule.S_name == top {
|
|
|
+ top_class_ids = append(top_class_ids, rule.Rid)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if class != nil && len(class) > 0 {
|
|
|
+ for _, c := range class {
|
|
|
+ //预处理lua
|
|
|
+ s_fields := c.S_fields //识别字段 o_jy.a_key.key detail,title
|
|
|
+ for _, ftmp := range strings.Split(s_fields, ",") { //先将所有o_jy.a_key.key类型字段的值处理到tmp中
|
|
|
+ fieldarr := strings.Split(ftmp, ".")
|
|
|
+ if len(fieldarr) > 1 {
|
|
|
+ key := fieldarr[0] //字段名称
|
|
|
+ field := fieldarr[len(fieldarr)-1]
|
|
|
+ if tmp[key] == nil { //判断是否有该字段的值
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ rMap := map[string]interface{}{}
|
|
|
+ if tMap, ok := tmp[key].(map[string]interface{}); ok {
|
|
|
+ rMap = tMap
|
|
|
+ } else if sMap, ok := tmp[key].(string); ok {
|
|
|
+ json.Unmarshal([]byte(sMap), &rMap)
|
|
|
+ logger.Warn("Projectinfo Type Is Wrong:", tmp["_id"])
|
|
|
+ }
|
|
|
+ if len(rMap) > 0 {
|
|
|
+ if s != nil && s[ftmp] != nil {
|
|
|
+ newscript := NewLuaScript(s[ftmp].Name, s[ftmp].File) //newlua
|
|
|
+ datamap := Dealdata(newscript, rMap, field) //处理数据
|
|
|
+ dataarr := maptoarr(datamap)
|
|
|
+ if len(dataarr) == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ tmp[ftmp] = dataarr
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //cid := c.Cid //类id
|
|
|
+ s_pid := c.S_pid //父类id
|
|
|
+
|
|
|
+ if s_pid == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ savefield := c.S_savefield //保存字段,toptype
|
|
|
+ // if tt.I_multiclass == 0 { //单分类临时记录保存字段
|
|
|
+ // tmpSavefield[savefield] = ""
|
|
|
+ // }
|
|
|
+ //同一个字段,例如toptype有值了就跳过
|
|
|
+ if savefield != "" && tt.I_multiclass == 0 && SMap.Map[savefield] != nil {
|
|
|
+ //同一个字段,只要一个类识别...
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ rule := c.Rule //取规则
|
|
|
+ fields := strings.Split(s_fields, ",")
|
|
|
+ L:
|
|
|
+ for _, f := range fields { //f:每一个识别字段
|
|
|
+ val := rulval[f]
|
|
|
+ if val == "" { //无过滤记录
|
|
|
+ strArr, ok := tmp[f].([]string)
|
|
|
+ if ok {
|
|
|
+ //strArr := tools.ObjArrToStringArr(strArr)
|
|
|
+ val = strings.Join(strArr, "")
|
|
|
+ } else {
|
|
|
+ val = util.ObjToString(tmp[f]) //取识别内容
|
|
|
+ }
|
|
|
+ val = PreFilter(val, tt.Task_PreRule) //任务的前置过滤
|
|
|
+ rulval[f] = val //整个任务仅过滤一次,将其存储
|
|
|
+ }
|
|
|
+
|
|
|
+ if val != "" {
|
|
|
+ //1.分类的前置过滤,不符合直接跳过当前循环
|
|
|
+ val = PreFilter(val, c.Class_PreRule)
|
|
|
+ if val == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //2.循环当前类规则
|
|
|
+ for _, r := range rule {
|
|
|
+ //只要此类的父类不为空
|
|
|
+ if s_pid != "" {
|
|
|
+ ru_s_pid := r.S_pid //当前规则的父规则id,57a02d4fd368081fac39eaf2,57a18dd5d368081d70e185d0,5a4d99b7e138236b380378be
|
|
|
+ //class_s_pid := strings.Split(s_pid, ",") //当前规则所属类的父类集合
|
|
|
+ bcontinue := true
|
|
|
+ L1:
|
|
|
+ for _, ids1 := range top_class_ids {
|
|
|
+ if strings.Index(ru_s_pid, ids1) > -1 {
|
|
|
+ bcontinue = false
|
|
|
+ break L1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bcontinue { //没有找到父类的识别,跳过此规则
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ruval := val //避免不同class的rule串行过滤
|
|
|
+ if len(r.Rule_PreRule) > 0 {
|
|
|
+ ruval = PreFilter(ruval, r.Rule_PreRule) //rule的前置过滤
|
|
|
+ }
|
|
|
+ if ruval == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //开始识别
|
|
|
+ i_rule := r.Reg //toptype=结果 的所有规则
|
|
|
+ //util.Debug("text---", ruval, "rule---", i_rule)
|
|
|
+ b, rulearr := DFAAnalyRules(ruval, i_rule)
|
|
|
+ //util.Debug("===============", b, f, rulearr, r.S_name)
|
|
|
+ if b {
|
|
|
+ detail := util.ObjToString(tmp["detail"])
|
|
|
+ if len(r.NotReg) > 0 { //排除规则
|
|
|
+ dnrb, _ := DFAAnalyRules(detail, r.NotReg)
|
|
|
+ if dnrb { //排除规则匹配成功,匹配下一条
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if b {
|
|
|
+ //title、channel二级分类中标处理
|
|
|
+ if r.S_name == "中选" && len(r.DetailReg) > 0 {
|
|
|
+ detail := util.ObjToString(tmp["detail"])
|
|
|
+ if len(r.NotReg) > 0 { //排除规则
|
|
|
+ dnrb, _ := DFAAnalyRules(detail, r.NotReg)
|
|
|
+ if dnrb { //排除规则匹配成功,匹配下一条
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //util.Debug("detail---", detail)
|
|
|
+ drb, _ := DFAAnalyRules(detail, r.DetailReg)
|
|
|
+ //util.Debug("-----", drb)
|
|
|
+ if drb {
|
|
|
+ SMap.AddKey(savefield, "中标")
|
|
|
+ } else {
|
|
|
+ SMap.AddKey(savefield, "成交")
|
|
|
+ }
|
|
|
+ break L
|
|
|
+ }
|
|
|
+
|
|
|
+ //57a02bc0d368081fac39eaef =[57a02cdcd368081fac39eaf1]
|
|
|
+ s_name := r.S_name
|
|
|
+ //词频统计
|
|
|
+ if tt.I_wordcount == 1 && len(rulearr) > 0 {
|
|
|
+ tt.WcLock.Lock()
|
|
|
+ map1 := tt.WordCount[s_name]
|
|
|
+ if map1 == nil {
|
|
|
+ map1 = map[string]int{}
|
|
|
+ }
|
|
|
+ for _, rulekey := range rulearr {
|
|
|
+ map1[rulekey]++
|
|
|
+ }
|
|
|
+ tt.WordCount[s_name] = map1
|
|
|
+ tt.WcLock.Unlock()
|
|
|
+ }
|
|
|
+ if savefield != "" {
|
|
|
+ s_code := r.S_code
|
|
|
+ if tt.I_multiclass == 0 { //单分类
|
|
|
+ if tt.I_savetype == 1 { //存储属性
|
|
|
+ SMap.AddKey(savefield, s_name)
|
|
|
+ } else {
|
|
|
+ SMap.AddKey(savefield, s_code)
|
|
|
+ }
|
|
|
+ break L //停止其他字段识别
|
|
|
+ } else { //多分类
|
|
|
+ sf := []string{}
|
|
|
+ b := false
|
|
|
+ if SMap.Map[savefield] != nil {
|
|
|
+ b = true
|
|
|
+ sf = SMap.Map[savefield].([]string)
|
|
|
+ }
|
|
|
+ if tt.I_savetype == 1 { //
|
|
|
+ sf = append(sf, s_name)
|
|
|
+ } else {
|
|
|
+ sf = append(sf, s_code)
|
|
|
+ }
|
|
|
+ if b {
|
|
|
+ SMap.Map[savefield] = sf
|
|
|
+ } else {
|
|
|
+ SMap.AddKey(savefield, sf)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //设置默认值
|
|
|
+ //log.Println("smp", SMap.Map, SMap.Keys)
|
|
|
+ if savefield != "" && SMap.Map[savefield] == nil && c.S_default != "" {
|
|
|
+ if tt.I_multiclass == 0 { //单分类
|
|
|
+ //if savefield == "buyerclass" && util.ObjToString(tmp["buyer"]) == "" { //buyer不存在时,无buyerclass字段
|
|
|
+ // return SMap
|
|
|
+ //}
|
|
|
+ SMap.AddKey(savefield, c.S_default)
|
|
|
+ } else { //多分类
|
|
|
+ SMap.AddKey(savefield, []string{c.S_default})
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //多分类去重
|
|
|
+ if tt.I_multiclass != 0 {
|
|
|
+ for _, k := range SMap.Keys {
|
|
|
+ repeatMap := make(map[string]string)
|
|
|
+ tmpArr := make([]string, 0)
|
|
|
+ if arr, ok := SMap.Map[k].([]string); ok {
|
|
|
+ for _, v := range arr {
|
|
|
+ if repeatMap[v] != "has" {
|
|
|
+ tmpArr = append(tmpArr, v)
|
|
|
+ repeatMap[v] = "has"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ SMap.Map[k] = tmpArr
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } /* else { //单分类
|
|
|
+ for k, v := range tmpSavefield {
|
|
|
+ if k != "" && SMap.Map[k] == nil {
|
|
|
+ SMap.Map[k] = v
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }*/
|
|
|
+ return SMap
|
|
|
+}
|
|
|
+
|
|
|
//标签识别过程
|
|
|
func TagClassificationRun(tt *TTask, tmp map[string]interface{}) *tools.SortMap {
|
|
|
SMap := tools.NewSortMap()
|
|
@@ -778,9 +1023,22 @@ func LoadUpdateTask(_id, s_mgourl, s_mgodb, s_coll, i_poolsize, s_esurl, s_esdb,
|
|
|
if util.IntAll(tmp["infoformat"]) == 2 { //此处增加特例
|
|
|
SMap.AddKey("toptype", "拟建")
|
|
|
SMap.AddKey("subtype", "拟建")
|
|
|
+ } else if util.IntAll(tmp["infoformat"]) == 3 {
|
|
|
+ SMap.AddKey("toptype", "产权")
|
|
|
+ SMap.AddKey("subtype", "产权")
|
|
|
} else {
|
|
|
//SMap = ClassificationRun(tt, tmp)
|
|
|
SMap = NewClassificationRun(tt, tmp)
|
|
|
+ //一级分类时,符合结果中成交规则时
|
|
|
+ if SMap.Map["toptype"] == "招标" && SMap.Map["subtype"] != "单一" {
|
|
|
+ if u.ChargeDetailResult(tmp["detail"].(string)) {
|
|
|
+ SMap.Map["toptype"] = "结果"
|
|
|
+ resa := ReSub(tt, tmp, "结果")
|
|
|
+ subtype := resa.Map["subtype"]
|
|
|
+ delete(SMap.Map, "subtype")
|
|
|
+ SMap.Map["subtype"] = subtype
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
bupdate := false
|
|
|
//对比,是否保存
|