|
@@ -25,21 +25,9 @@ var (
|
|
|
TaskList map[string]*ExtractTask //任务列表
|
|
|
saveLimit = 200 //抽取日志批量保存
|
|
|
|
|
|
- AreaGet DFA //敏感词
|
|
|
- AreaProvinceGet DFA //敏感词
|
|
|
- AreaSimGet DFA //敏感词
|
|
|
-
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
|
|
|
)
|
|
|
|
|
|
-var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
|
|
|
-var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
|
|
|
-var ProviceConfig map[string]interface{} = make(map[string]interface{}) //省份
|
|
|
-var ProvinceMap map[string]string = make(map[string]string)
|
|
|
-var CityBrief map[string]*City = make(map[string]*City) //只加载一次即可
|
|
|
-var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
|
|
|
-var AreaToCity map[string][]*City = make(map[string][]*City) //两个文件共用
|
|
|
-
|
|
|
//启动测试抽取
|
|
|
func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
|
|
|
defer qu.Catch()
|
|
@@ -53,6 +41,12 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
|
ext.InitRuleCore()
|
|
|
ext.InitTag()
|
|
|
ext.InitClearFn()
|
|
|
+
|
|
|
+ ext.InitProvince()
|
|
|
+ ext.InitCityAll()
|
|
|
+ ext.InitCitySim()
|
|
|
+ InitDFA()
|
|
|
+
|
|
|
return RunExtractTestTask(ext, startId, num)
|
|
|
}
|
|
|
|
|
@@ -186,7 +180,7 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
City: qu.ObjToString(doc["city"]),
|
|
|
Province: qu.ObjToString(doc["area"]),
|
|
|
Result: map[string][]*ju.ExtField{},
|
|
|
- //BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
}
|
|
|
pretreated.AnalyStart(j)
|
|
|
return j
|
|
@@ -242,12 +236,6 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
|
|
|
}
|
|
|
//bs, _ := json.Marshal(j.Result)
|
|
|
//log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
-
|
|
|
- //抽取省份城市县
|
|
|
-
|
|
|
- //fmt.Println("-----------", j.Province, j.City, j.BuyerAddr, j.Title) //j.Address
|
|
|
- //ExtractPC(j.Result, j.Province, j.City, j.Title, j.BuyerAddr, j.SourceMid) //j.Address
|
|
|
- ExtractPC2(j.Result, "Province", "City", "Title", "Addr", j.SourceMid)
|
|
|
//分析抽取结果并保存 todo
|
|
|
AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
|
|
|
|
|
@@ -736,7 +724,8 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
|
|
|
values[key] = ju.ExtSort(objects)
|
|
|
}
|
|
|
//从排序结果中取值
|
|
|
- tmp := map[string]interface{}{}
|
|
|
+ tmp := map[string]interface{}{} //抽取值
|
|
|
+ resulttmp := tmp //保存结果
|
|
|
for key, val := range values {
|
|
|
for _, v := range val { //取第一个
|
|
|
if v.Key != "" {
|
|
@@ -745,202 +734,39 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ resulttmp["result"] = result
|
|
|
+ for k, v := range *doc {
|
|
|
+ if resulttmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
|
|
|
+ resulttmp[k] = v
|
|
|
+ }
|
|
|
+ }
|
|
|
+ b, p, c, d := TransmitData(resulttmp, _id) //抽取省份城市
|
|
|
+ //log.Println("抽取省份,城市,县结果=====", b, p, c, d)
|
|
|
+ resulttmp["district"] = d
|
|
|
+ if b {
|
|
|
+ resulttmp["city"] = c
|
|
|
+ resulttmp["area"] = p
|
|
|
+ }
|
|
|
if task.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
|
task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
|
}
|
|
|
//保存抽取详情
|
|
|
- tmp["result"] = result
|
|
|
- for k, v := range *doc {
|
|
|
- if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
|
|
|
- tmp[k] = v
|
|
|
- }
|
|
|
- }
|
|
|
- db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
|
+ // tmp["result"] = result
|
|
|
+ // for k, v := range *doc {
|
|
|
+ // if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
|
|
|
+ // tmp[k] = v
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
|
|
|
} else { //测试结果
|
|
|
//保存抽取详情
|
|
|
- tmp["result"] = result
|
|
|
- for k, v := range *doc {
|
|
|
- if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
|
|
|
- tmp[k] = v
|
|
|
- }
|
|
|
- }
|
|
|
- db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-//抽取城市、省份
|
|
|
-func ExtractPC2(result map[string][]*ju.ExtField, province, city, title, addr, sourcemid string) (bres bool, c, p string) {
|
|
|
- var pjnarr, buyerarr []string
|
|
|
- var pb []interface{}
|
|
|
- for n, val := range result["projectname"] {
|
|
|
- pjnarr[n] = fmt.Sprint(val.Value)
|
|
|
- }
|
|
|
- for n, val := range result["buyer"] {
|
|
|
- buyerarr[n] = fmt.Sprint(val.Value)
|
|
|
- }
|
|
|
- pl := len(pjnarr)
|
|
|
- bl := len(buyerarr)
|
|
|
- max := 0
|
|
|
- if pl > bl {
|
|
|
- max = pl
|
|
|
- } else {
|
|
|
- max = bl
|
|
|
- }
|
|
|
- //city, buyer, addr, projectname, title
|
|
|
- if max == 0 { //没有projectname和buyer结果集
|
|
|
- tmp1 := []string{city, "", addr, "", title}
|
|
|
- pb = append(pb, tmp1)
|
|
|
- } else { //至少有一个结果集
|
|
|
- if max == pl {
|
|
|
- for i := 0; i < max; i++ {
|
|
|
- p := pjnarr[i]
|
|
|
- b := ""
|
|
|
- if i < bl {
|
|
|
- b = buyerarr[i]
|
|
|
- }
|
|
|
- tmp2 := []string{city, b, addr, p, title}
|
|
|
- pb = append(pb, tmp2)
|
|
|
- }
|
|
|
- } else {
|
|
|
- for i := 0; i < max; i++ {
|
|
|
- b := buyerarr[i]
|
|
|
- p := ""
|
|
|
- if i < pl {
|
|
|
- p = pjnarr[i]
|
|
|
- }
|
|
|
- tmp3 := []string{city, b, addr, p, title}
|
|
|
- pb = append(pb, tmp3)
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- }
|
|
|
- log.Println(pb)
|
|
|
- return
|
|
|
-}
|
|
|
-func ExtractPC(buyer, projectname, title, city, province, addr string, id interface{}) (bres bool, c, p string) {
|
|
|
- defer qu.Catch()
|
|
|
- bc := true //是否继续抽取
|
|
|
- if city != "" {
|
|
|
- if CityBrief[city] == nil { //简称不存在
|
|
|
- //log.Println("city err:", city, id)
|
|
|
- } else { //简称存在
|
|
|
- if province != CityBrief[city].P.Brief { //省份不对
|
|
|
- log.Println("province err:", city, province, id)
|
|
|
- } else {
|
|
|
- bc = false
|
|
|
- //原值正确,不用抽取
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //有省份
|
|
|
- bp := false
|
|
|
- if ProvinceBrief[province] != nil {
|
|
|
- bp = true
|
|
|
- } else { //没有省份,先识别省份
|
|
|
- for _, str := range []string{city, buyer, addr, projectname, title} {
|
|
|
- word := AreaProvinceGet.CheckSensitiveWord(str) //省全称
|
|
|
- if word != "" {
|
|
|
- province = ProvinceMap[word] //省简称
|
|
|
- bp = true
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //匹配城市
|
|
|
- if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不对,继续抽取
|
|
|
- //目前是全匹配模式,如果再加上精简匹配,加一层循环
|
|
|
- for pos, GET := range []DFA{AreaGet, AreaSimGet} {
|
|
|
- ws := make([]string, 5)
|
|
|
- for n, str := range []string{city, buyer, addr, projectname, title} {
|
|
|
- if str != "" {
|
|
|
- word := GET.CheckSensitiveWord(str)
|
|
|
- if pos == 1 { //用简称 后辍为路、集团替换
|
|
|
- str1 := strings.Replace(str, word+"路", "", 1)
|
|
|
- if str1 != str {
|
|
|
- word = GET.CheckSensitiveWord(str1)
|
|
|
- }
|
|
|
- }
|
|
|
- ws[n] = word
|
|
|
- if word != "" {
|
|
|
- res := AreaToCity[word]
|
|
|
- if len(res) == 1 {
|
|
|
- //判断省份
|
|
|
- if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回
|
|
|
- bres = true
|
|
|
- c = res[0].Brief
|
|
|
- p = res[0].P.Brief
|
|
|
- break
|
|
|
- } else { //不一致时。。暂时不处理
|
|
|
- }
|
|
|
- } else { //多个时
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if !bres {
|
|
|
- mc := map[string]int{}
|
|
|
- for _, w := range ws {
|
|
|
- res := AreaToCity[w]
|
|
|
- for _, ct := range res {
|
|
|
- if ct == nil {
|
|
|
- continue
|
|
|
- }
|
|
|
- if bp { //有省份
|
|
|
- if ct.P != nil && ct.P.Brief == province {
|
|
|
- mc[ct.Brief]++
|
|
|
- }
|
|
|
- } else { //没有省份
|
|
|
- mc[ct.Brief]++
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //计算mc中最大值且大于1
|
|
|
- max := 1
|
|
|
- v := ""
|
|
|
- for mk, mv := range mc {
|
|
|
- if mv > max {
|
|
|
- v = mk
|
|
|
- }
|
|
|
- }
|
|
|
- if v != "" {
|
|
|
- bres = true
|
|
|
- c = CityBrief[v].Brief
|
|
|
- p = CityBrief[v].P.Brief
|
|
|
- } else if len(mc) > 0 {
|
|
|
- //取级别更大的
|
|
|
- v := ""
|
|
|
- for mk, _ := range mc {
|
|
|
- if CityBrief[mk].P.Cap == mk {
|
|
|
- bres = true
|
|
|
- c = CityBrief[mk].Brief
|
|
|
- p = CityBrief[mk].P.Brief
|
|
|
- break
|
|
|
- } else {
|
|
|
- v = mk
|
|
|
- }
|
|
|
- }
|
|
|
- if !bres {
|
|
|
- bres = true
|
|
|
- c = CityBrief[v].Brief
|
|
|
- p = CityBrief[v].P.Brief
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if bres {
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- } else {
|
|
|
- return
|
|
|
- }
|
|
|
- if !bres {
|
|
|
- //取默认省会
|
|
|
- if ProvinceBrief[province] != nil {
|
|
|
- bres = true
|
|
|
- c = ProvinceBrief[province].Cap
|
|
|
- p = province
|
|
|
- }
|
|
|
+ // tmp["result"] = result
|
|
|
+ // for k, v := range *doc {
|
|
|
+ // if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
|
|
|
+ // tmp[k] = v
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
|
|
|
}
|
|
|
- return
|
|
|
}
|