123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274 |
- package main
- import (
- "bufio"
- "fmt"
- "io"
- cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
- su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
- "os"
- "regexp"
- "strconv"
- "strings"
- )
- var (
- CmmonDFA *DFA //常用字
- NotCommonDFA *DFA //不常用字
- TimesLimit int //常用字界限
- UpdateLimit float64 //更新界限
- OssSite map[string]float64 //解析附件站点集合
- HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
- SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
- SpecialReg = regexp.MustCompile("图片(\\d)+") //
- SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
- )
- func DealFile(tmp map[string]interface{}) {
- site := cu.ObjToString(tmp["site"]) //解析附件站点
- if limitRatio := OssSite[site]; limitRatio > 0 { //配置站点解析附件,根据准确率情况替换正文
- replace, filetext := AnalysisFile(true, limitRatio, tmp)
- if replace { //替换正文
- tmp["detail"] = filetext
- }
- } else { //其它网站附件信息,detail无效,只有一个附件且不是ocr识别的,替换正文
- //判断detail是否有效
- detail := cu.ObjToString(tmp["detail"])
- detail = su.FilterDetail(detail) //只保留文本内容
- if len([]rune(detail)) <= 5 || (len([]rune(detail)) <= 50 && SpecialTextReg.MatchString(detail)) {
- replace, filetext := AnalysisFile(false, 0, tmp)
- if replace { //替换正文
- tmp["detail"] = filetext
- }
- }
- }
- }
- func InitFileInfo() {
- OssSite = map[string]float64{}
- TimesLimit = cu.IntAll(Config["timeslimit"])
- UpdateLimit = cu.Float64All(Config["updatelimit"])
- for site, b := range Config["osssite"].(map[string]interface{}) {
- OssSite[site] = cu.Float64All(b)
- }
- fmt.Println(TimesLimit, UpdateLimit, OssSite)
- CmmonDFA = &DFA{}
- NotCommonDFA = &DFA{}
- LoadDict("common.txt") //初始化常用字典
- }
// DFA is a keyword matcher backed by a byte-level trie.
//
// Link is the root node. Every node is a map in which
//   - a one-byte key leads to a child node, stored as *map[string]interface{};
//   - dfaEndFlag holds "1" when a word ends at the node and "0" otherwise;
//   - dfaWordKey holds the complete word when it was added with haskey set.
//
// Both metadata keys are longer than one byte, so they can never collide
// with a child key.
type DFA struct {
	Link map[string]interface{}
}

const (
	dfaEndFlag = "YN"  // end-of-word marker stored in a node
	dfaWordKey = "KEY" // complete word stored in a terminal node
)

// AddWord inserts every key into the trie and records the key text on its
// terminal node, so CheckSensitiveWord can report it.
func (d *DFA) AddWord(keys ...string) {
	d.AddWordAll(true, keys...)
}

// AddWordAll inserts every key into the trie, byte by byte. When haskey is
// true the key text is stored on the terminal node; otherwise matches of
// that key are reported as empty strings. Empty keys are ignored.
//
// The word used to be stored under "K", the same slot as the child node for
// the byte 'K': adding a word that extended an existing one with 'K'
// (e.g. "A" then "AK") dereferenced a nil node and panicked.
func (d *DFA) AddWordAll(haskey bool, keys ...string) {
	if d.Link == nil {
		d.Link = make(map[string]interface{})
	}
	for _, key := range keys {
		if key == "" {
			continue
		}
		node := &d.Link
		for i := 0; i < len(key); i++ {
			b := key[i : i+1]
			// A missing slot and a slot that does not hold a node are both
			// treated as "no child yet".
			next, _ := (*node)[b].(*map[string]interface{})
			if next == nil {
				child := map[string]interface{}{dfaEndFlag: "0"}
				next = &child
				(*node)[b] = next
			}
			node = next
		}
		(*node)[dfaEndFlag] = "1"
		if haskey {
			(*node)[dfaWordKey] = key
		}
	}
}

// CheckSensitiveWord scans src and returns one entry per dictionary match,
// in order of start position and then length. Overlapping matches and
// matches that are prefixes of longer matches are all reported. The result
// is never nil.
func (d *DFA) CheckSensitiveWord(src string) []string {
	res := make([]string, 0)
	for start := 0; start < len(src); start++ {
		node := &d.Link
		for i := start; i < len(src); i++ {
			next, _ := (*node)[src[i:i+1]].(*map[string]interface{})
			if next == nil {
				break // no dictionary word continues with this byte
			}
			node = next
			if flag, _ := (*node)[dfaEndFlag].(string); flag == "1" {
				word, _ := (*node)[dfaWordKey].(string)
				res = append(res, word)
			}
		}
	}
	return res
}
- // 加载统计的常用词
- func LoadDict(path string) {
- dictFile, err := os.Open(path)
- if err != nil {
- fmt.Println("Load Common.txt Error")
- os.Exit(-1)
- }
- defer dictFile.Close()
- reader := bufio.NewReader(dictFile)
- var (
- text string
- frequency int
- )
- // 逐行读入分词
- line := 0
- for {
- line++
- size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
- if fsErr == io.EOF { //读取到结尾
- break
- }
- if size == 2 { //正确数据
- if frequency >= TimesLimit { //常用字
- CmmonDFA.AddWord(text)
- } else { //非常用字
- //NotCommonDFA.AddWord(text)
- }
- } else {
- fmt.Println("Read Line Error:", line)
- }
- }
- }
- func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
- defer cu.Catch()
- filetext, byOcr := GetFileText(tmp) //解析附件
- if filetext == "" {
- return false, filetext
- }
- if !replaceSite { //不是指定站点解析的数据,若是ocr识别的不进行替换
- return !byOcr, filetext
- } else if replaceSite && !byOcr { //指定站点解析的数据,非ocr识别,认为附件可替换正文
- return true, filetext
- }
- //下面是指定站点附件识别后,按准确率判断是否替换detail
- //特殊情况:图片0 图片1
- filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
- if filetextTmp == "" { //附件为空
- return false, filetext
- }
- //中文匹配
- HanArr := HanReg.FindAllString(filetextTmp, -1)
- hanText := strings.Join(HanArr, "")
- hanLen := len([]rune(hanText))
- //filetextTmp = sp.FilterDetail(filetextTmp) //只保留文本内容
- //filetextLen := len([]rune(filetextTmp))
- //长度过滤
- if hanLen <= 100 {
- return false, filetext
- }
- commonArr := CmmonDFA.CheckSensitiveWord(hanText)
- commonLen := len(commonArr)
- //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
- commonRatio := float64(commonLen) / float64(hanLen)
- commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
- if commonRatio >= limitRatio {
- return true, filetext
- }
- return false, filetext
- }
- // 解析附件
- func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
- defer cu.Catch()
- filetext, _ := GetFileText(tmp) //解析附件
- //过滤空格
- filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
- if filetextTmp == "" { //附件为空
- return false, false, filetext
- }
- //特殊情况:图片0 图片1
- filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
- if filetextTmp == "" { //附件为空
- return false, false, filetext
- }
- //中文匹配
- HanArr := HanReg.FindAllString(filetextTmp, -1)
- hanText := strings.Join(HanArr, "")
- hanTextLen := len([]rune(hanText))
- //长度过滤
- if hanTextLen <= 20 {
- return false, false, filetext
- } else if replaceSite && 20 < hanTextLen && hanTextLen <= 100 {
- return false, false, filetext
- }
- //fmt.Println(hanTextLen, hanText)
- commonArr := CmmonDFA.CheckSensitiveWord(hanText)
- commonLen := len(commonArr)
- //fmt.Println(commonLen, commonArr)
- //commonText := strings.Join(commonArr, "")
- //notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
- //notCommonLen := len(notCommonArr)
- //fmt.Println(notCommonLen, notCommonArr)
- //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
- commonRatio := float64(commonLen) / float64(hanTextLen)
- commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
- if commonRatio < 0.5 { //常用字占比低于x<50%
- return false, false, filetext
- } else if replaceSite {
- if commonRatio < UpdateLimit { //50%<x<UpdateLimit
- return false, true, filetext
- } else { //x>=UpdateLimit
- return true, true, filetext
- }
- }
- //fmt.Println(commonRatio)
- //notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
- //notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
- return false, true, filetext
- }
- // 测试方法
- func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
- //fmt.Println(detail)
- defer cu.Catch()
- //过滤空格
- filetextTmp := SpaceReg.ReplaceAllString(detail, "")
- if filetextTmp == "" { //附件为空
- return false, "", 0, 0, 0
- }
- //特殊情况:图片0 图片1
- filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
- if filetextTmp == "" { //附件为空
- return false, "", 1, 0, 0
- }
- //中文匹配
- HanArr := HanReg.FindAllString(filetextTmp, -1)
- hanText := strings.Join(HanArr, "")
- hanTextLen := len([]rune(hanText))
- //长度过滤
- if hanTextLen <= 100 {
- return false, "", 2, 0, 0
- }
- //fmt.Println(textLen, text)
- commonArr := CmmonDFA.CheckSensitiveWord(hanText)
- commonLen := len(commonArr)
- fmt.Println(commonLen, commonArr)
- //commonText := strings.Join(commonArr, "")
- notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
- notCommonLen := len(notCommonArr)
- fmt.Println(notCommonLen, notCommonArr)
- //notCommonText := strings.Join(notCommonArr, "")
- //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
- commonRatio := float64(commonLen) / float64(hanTextLen)
- commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
- notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
- notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
- return true, filetextTmp, 10, commonRatio, notCommonRatio
- }
|