package main

import (
	"bufio"
	"fmt"
	"io"
	cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
	su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// Package-level state shared by the attachment-analysis helpers in this file.
// The non-regexp values are assigned in InitFileInfo.
var (
	CmmonDFA       *DFA                                             // trie of common characters; filled by LoadDict
	NotCommonDFA   *DFA                                             // trie of uncommon characters; LoadDict currently adds nothing to it
	TimesLimit     int                                              // minimum dictionary frequency for an entry to count as common
	UpdateLimit    float64                                          // update threshold; compared with the common-character ratio in AnalysisFile_back
	OssSite        map[string]float64                               // sites whose attachments are parsed, mapped to the ratio limit used by DealFile
	HanReg         = regexp.MustCompile("[\u4e00-\u9fa5]+")         // runs of Chinese characters (U+4E00..U+9FA5)
	SpaceReg       = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") // runs of whitespace, including U+3000, U+2003 and U+00A0
	SpecialReg     = regexp.MustCompile("图片(\\d)+")                 // the literal "图片" followed by digits (the "图片0 图片1" special case stripped before analysis)
	// SpecialTextReg matches short filler phrases that refer the reader to the
	// original page or to an attachment; DealFile uses it to decide that a short
	// detail text carries no real content.
	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件！|详情请访问原网页！)")
)

// DealFile decides whether tmp["detail"] should be replaced by the text
// extracted from the record's attachment, and performs the replacement in
// place.
//
// Two paths exist:
//   - tmp["site"] is listed in OssSite with a positive ratio: AnalysisFile is
//     called in "configured site" mode with that ratio as the limit.
//   - any other site: the attachment is consulted only when the filtered
//     detail text is considered invalid, i.e. it is at most 5 runes long, or
//     at most 50 runes long and matches SpecialTextReg.
//
// In both paths detail is overwritten only when AnalysisFile reports that the
// attachment text should replace it.
func DealFile(tmp map[string]interface{}) {
	siteName := cu.ObjToString(tmp["site"])
	ratioLimit := OssSite[siteName]

	// Configured site: let the accuracy check in AnalysisFile decide.
	if ratioLimit > 0 {
		if ok, text := AnalysisFile(true, ratioLimit, tmp); ok {
			tmp["detail"] = text
		}
		return
	}

	// Other sites: keep the existing detail unless it is effectively empty.
	body := su.FilterDetail(cu.ObjToString(tmp["detail"])) // text content only
	runeCount := len([]rune(body))
	placeholderOnly := runeCount <= 5 || (runeCount <= 50 && SpecialTextReg.MatchString(body))
	if !placeholderOnly {
		return
	}
	if ok, text := AnalysisFile(false, 0, tmp); ok {
		tmp["detail"] = text
	}
}

// InitFileInfo initialises the package-level settings used by the attachment
// analysis: TimesLimit, UpdateLimit and OssSite are read from Config, the two
// DFA instances are reset to empty tries, and the common-character dictionary
// is loaded from "common.txt".
//
// A missing or wrongly typed "osssite" entry no longer panics on the type
// assertion; OssSite is simply left empty and a notice is printed.
func InitFileInfo() {
	OssSite = map[string]float64{}
	TimesLimit = cu.IntAll(Config["timeslimit"])
	UpdateLimit = cu.Float64All(Config["updatelimit"])

	// Comma-ok assertion: an absent key or an unexpected type must not crash
	// start-up with an interface-conversion panic.
	if sites, ok := Config["osssite"].(map[string]interface{}); ok {
		for site, ratio := range sites {
			OssSite[site] = cu.Float64All(ratio)
		}
	} else {
		fmt.Println("osssite config is missing or not an object; no attachment sites configured")
	}
	fmt.Println(TimesLimit, UpdateLimit, OssSite)

	CmmonDFA = &DFA{}
	NotCommonDFA = &DFA{}
	LoadDict("common.txt") // load the common-character dictionary
}

// DFA is a small trie used for multi-pattern matching.
//
// Words are stored byte by byte: every node is a map whose single-byte keys
// lead to child nodes (*map[string]interface{}). Because no stored word can
// start on a UTF-8 continuation byte, byte-wise matching of valid UTF-8 text
// only ever reports whole-character matches.
type DFA struct {
	// Link is the root node; it is created lazily by AddWordAll.
	Link map[string]interface{}
}

// Reserved node keys. Both are longer than one byte on purpose: child keys
// are always exactly one byte long, so a reserved key can never collide with
// a child. (The word marker used to be "K", which clashed with the letter K
// and could overwrite a child node or cause a nil dereference.)
const (
	dfaEndFlagKey = "YN"  // "1" when a word ends at this node, "0" otherwise
	dfaWordKey    = "KEY" // the complete word ending at this node
)

// AddWord inserts every key into the trie and remembers the full word at its
// terminal node, so CheckSensitiveWord can return it.
func (d *DFA) AddWord(keys ...string) {
	d.AddWordAll(true, keys...)
}

// AddWordAll inserts every key into the trie. When haskey is true the full
// word is stored at the terminal node; when false the node is only flagged as
// a word end and CheckSensitiveWord reports it as an empty string. Empty keys
// are ignored.
func (d *DFA) AddWordAll(haskey bool, keys ...string) {
	if d.Link == nil {
		d.Link = make(map[string]interface{})
	}
	for _, key := range keys {
		if len(key) == 0 {
			continue // nothing to insert; never flag the root as a word end
		}
		node := &d.Link
		for i := 0; i < len(key); i++ {
			kc := key[i : i+1]
			child, _ := (*node)[kc].(*map[string]interface{})
			if child == nil {
				fresh := map[string]interface{}{dfaEndFlagKey: "0"}
				child = &fresh
				(*node)[kc] = child
			}
			node = child
		}
		(*node)[dfaEndFlagKey] = "1"
		if haskey {
			(*node)[dfaWordKey] = key
		}
	}
}

// CheckSensitiveWord scans src and returns one entry for every occurrence of
// a stored word, in order of start offset and then of increasing length.
// Overlapping and nested matches are all reported. Words added without their
// key (AddWordAll with haskey == false) appear as "". The result is never
// nil.
func (d *DFA) CheckSensitiveWord(src string) []string {
	res := make([]string, 0)
	for start := 0; start < len(src); start++ {
		node := &d.Link
		for i := start; i < len(src); i++ {
			next, _ := (*node)[src[i:i+1]].(*map[string]interface{})
			if next == nil {
				break // no stored word continues with this byte
			}
			node = next
			if flag, _ := (*node)[dfaEndFlagKey].(string); flag == "1" {
				word, _ := (*node)[dfaWordKey].(string)
				res = append(res, word)
			}
		}
	}
	return res
}

// LoadDict loads the character-frequency dictionary at path into CmmonDFA.
//
// Each non-blank line is expected to hold a word followed by an integer
// frequency, separated by whitespace. Entries whose frequency is at least
// TimesLimit are added to CmmonDFA; less frequent entries are not stored
// anywhere at the moment. Fields after the second one are ignored. Malformed
// lines are reported with their real line number and skipped; blank lines are
// skipped silently.
//
// The process exits with status -1 when the file cannot be opened or when
// reading fails with anything other than end of file. Reading whole lines
// (rather than fmt.Fscanln) keeps the line counter accurate after a malformed
// line and guarantees that a persistent read error cannot loop forever.
func LoadDict(path string) {
	dictFile, err := os.Open(path)
	if err != nil {
		fmt.Println("Load Common.txt Error")
		os.Exit(-1)
	}
	defer dictFile.Close()
	reader := bufio.NewReader(dictFile)

	for line := 1; ; line++ {
		raw, readErr := reader.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			// A hard read error (e.g. path is a directory) will not go away.
			fmt.Println("Load Common.txt Error:", readErr)
			os.Exit(-1)
		}

		fields := strings.Fields(raw)
		switch {
		case len(fields) == 0:
			// Blank line, or the empty remainder at end of file.
		case len(fields) < 2:
			fmt.Println("Read Line Error:", line)
		default:
			// Base 0 keeps the numeric syntax fmt.Fscanln accepted for ints.
			frequency, convErr := strconv.ParseInt(fields[1], 0, strconv.IntSize)
			if convErr != nil {
				fmt.Println("Read Line Error:", line)
			} else if int(frequency) >= TimesLimit { // common character
				CmmonDFA.AddWord(fields[0])
			}
			// Uncommon characters are intentionally not loaded:
			// NotCommonDFA.AddWord(fields[0])
		}

		if readErr == io.EOF {
			break
		}
	}
}

// AnalysisFile extracts the attachment text of tmp via GetFileText and
// decides whether it should replace the record's detail.
//
// Parameters:
//   - replaceSite: true when the record's site is configured for attachment
//     parsing.
//   - limitRatio: minimum share of common characters required for OCR text
//     from a configured site.
//   - tmp: the record handed to GetFileText.
//
// Returns (replace, filetext): replace reports whether detail should be
// replaced, filetext is the extracted attachment text (unmodified).
//
// Decision rules, in order:
//  1. empty attachment text: never replace;
//  2. non-configured site: replace only when the text is not flagged as OCR;
//  3. configured site, not OCR: always replace;
//  4. configured site, OCR: after removing SpecialReg matches, the text must
//     contain more than 100 Chinese characters and the ratio
//     (CmmonDFA matches / Chinese characters), rounded to two decimals, must
//     reach limitRatio.
//
// cu.Catch is deferred first (presumably to recover from panics; confirm in
// commonutil).
func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) {
	defer cu.Catch()

	filetext, byOcr := GetFileText(tmp)
	switch {
	case filetext == "":
		return false, filetext
	case !replaceSite:
		return !byOcr, filetext
	case !byOcr:
		return true, filetext
	}

	// Configured site with OCR text: judge it by the common-character ratio.
	stripped := SpecialReg.ReplaceAllString(filetext, "") // drop "图片0 图片1" style placeholders
	if stripped == "" {
		return false, filetext
	}

	hanText := strings.Join(HanReg.FindAllString(stripped, -1), "")
	hanCount := len([]rune(hanText))
	if hanCount <= 100 { // too short to judge
		return false, filetext
	}

	hits := len(CmmonDFA.CheckSensitiveWord(hanText))
	// Round to two decimals the same way the threshold values are expressed.
	ratio, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", float64(hits)/float64(hanCount)), 64)
	return ratio >= limitRatio, filetext
}

// AnalysisFile_back is the earlier variant of the attachment check.
//
// It returns (replace, usable, filetext):
//   - replace: whether detail should be replaced by the attachment text;
//   - usable: whether the attachment text passed the sanity checks;
//   - filetext: the text returned by GetFileText, unmodified.
//
// The text is rejected (false, false) when it is empty after removing
// whitespace and SpecialReg matches, when it holds at most 20 Chinese
// characters (at most 100 for a configured site), or when the share of
// CmmonDFA matches, rounded to two decimals, is below 0.5. Otherwise the text
// is usable, and it replaces detail only for a configured site whose ratio is
// not below UpdateLimit.
func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) {
	defer cu.Catch()

	filetext, _ := GetFileText(tmp)

	// Strip whitespace first, then the "图片0 图片1" style placeholders.
	compact := SpaceReg.ReplaceAllString(filetext, "")
	if compact == "" {
		return false, false, filetext
	}
	compact = SpecialReg.ReplaceAllString(compact, "")
	if compact == "" {
		return false, false, filetext
	}

	// Keep Chinese characters only and apply the length filter.
	hanText := strings.Join(HanReg.FindAllString(compact, -1), "")
	hanCount := len([]rune(hanText))
	if hanCount <= 20 || (replaceSite && hanCount <= 100) {
		return false, false, filetext
	}

	// Share of common characters; the dictionary is incomplete, so common and
	// uncommon shares need not add up to 100%.
	hits := len(CmmonDFA.CheckSensitiveWord(hanText))
	ratio, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", float64(hits)/float64(hanCount)), 64)
	if ratio < 0.5 {
		return false, false, filetext
	}
	if !replaceSite || ratio < UpdateLimit {
		return false, true, filetext
	}
	return true, true, filetext
}

// AnalysisFileTest is a diagnostic helper that runs the character-ratio
// analysis directly on detail and prints the matches it finds.
//
// It returns (ok, text, stage, commonRatio, notCommonRatio):
//   - stage 0: detail is empty after whitespace removal;
//   - stage 1: nothing is left after removing SpecialReg matches;
//   - stage 2: the text holds at most 100 Chinese characters;
//   - stage 10: analysis completed; ok is true, text is the cleaned input and
//     the two ratios are the CmmonDFA / NotCommonDFA match counts divided by
//     the number of Chinese characters, rounded to two decimals.
//
// For stages 0-2 ok is false, text is empty and both ratios are 0.
func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
	defer cu.Catch()

	cleaned := SpaceReg.ReplaceAllString(detail, "")
	if cleaned == "" {
		return false, "", 0, 0, 0
	}
	cleaned = SpecialReg.ReplaceAllString(cleaned, "") // "图片0 图片1" style placeholders
	if cleaned == "" {
		return false, "", 1, 0, 0
	}

	hanText := strings.Join(HanReg.FindAllString(cleaned, -1), "")
	hanCount := len([]rune(hanText))
	if hanCount <= 100 {
		return false, "", 2, 0, 0
	}

	// share converts a match count into a ratio rounded to two decimals.
	share := func(hits int) float64 {
		rounded, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", float64(hits)/float64(hanCount)), 64)
		return rounded
	}

	commonHits := CmmonDFA.CheckSensitiveWord(hanText)
	fmt.Println(len(commonHits), commonHits)
	rareHits := NotCommonDFA.CheckSensitiveWord(hanText)
	fmt.Println(len(rareHits), rareHits)

	// The dictionaries are incomplete, so the two shares need not sum to 1.
	return true, cleaned, 10, share(len(commonHits)), share(len(rareHits))
}