|
@@ -8,6 +8,7 @@ import (
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
|
|
|
|
+ llog "log"
|
|
"os"
|
|
"os"
|
|
"regexp"
|
|
"regexp"
|
|
"strconv"
|
|
"strconv"
|
|
@@ -25,6 +26,7 @@ var (
|
|
insertUrl = make(map[string]bool, 0)
|
|
insertUrl = make(map[string]bool, 0)
|
|
//specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
|
|
//specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
|
|
// "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
|
|
// "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
|
|
|
|
+ zkMap = make(map[string]string) //重客类型
|
|
)
|
|
)
|
|
|
|
|
|
func InitConfig() (err error) {
|
|
func InitConfig() (err error) {
|
|
@@ -99,7 +101,8 @@ func main() {
|
|
InitConfig()
|
|
InitConfig()
|
|
InitLog()
|
|
InitLog()
|
|
InitMgo()
|
|
InitMgo()
|
|
- //readFile()
|
|
|
|
|
|
+ //readFile() // 读文件写入数据库
|
|
|
|
+ readZK() //
|
|
|
|
|
|
exportFiles()
|
|
exportFiles()
|
|
|
|
|
|
@@ -162,6 +165,19 @@ func readFile() {
|
|
if len(rows[i]) > 6 {
|
|
if len(rows[i]) > 6 {
|
|
insert["wtype"] = rows[i][6]
|
|
insert["wtype"] = rows[i][6]
|
|
}
|
|
}
|
|
|
|
+ if len(rows[i]) > 7 {
|
|
|
|
+ insert["rule_type"] = rows[i][7]
|
|
|
|
+ }
|
|
|
|
+ //大模型给的是否相关
|
|
|
|
+ if len(rows[i]) > 8 {
|
|
|
|
+ insert["model_ai"] = rows[i][8]
|
|
|
|
+ }
|
|
|
|
+ if len(rows[i]) > 9 {
|
|
|
|
+ insert["model_weight"] = rows[i][9]
|
|
|
|
+ }
|
|
|
|
+ if len(rows[i]) > 10 {
|
|
|
|
+ insert["l2"] = rows[i][10]
|
|
|
|
+ }
|
|
MgoN.Save(GF.MongoN.Coll, insert)
|
|
MgoN.Save(GF.MongoN.Coll, insert)
|
|
}
|
|
}
|
|
|
|
|
|
@@ -203,12 +219,13 @@ func exportFiles() {
|
|
if infourl != "" {
|
|
if infourl != "" {
|
|
infourls[infourl] = true
|
|
infourls[infourl] = true
|
|
}
|
|
}
|
|
- if count%1000 == 0 {
|
|
|
|
|
|
+ if count%10000 == 0 {
|
|
log.Info("infourl", zap.Int("current", count))
|
|
log.Info("infourl", zap.Int("current", count))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
|
|
log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
|
|
-
|
|
|
|
|
|
+ //上月1号时间戳
|
|
|
|
+ lastMonthTimestamp := getLastMonthFirstDayTimestamp()
|
|
//2.官网数据
|
|
//2.官网数据
|
|
var data = make([]map[string]interface{}, 0)
|
|
var data = make([]map[string]interface{}, 0)
|
|
count2 := 0
|
|
count2 := 0
|
|
@@ -237,12 +254,25 @@ func exportFiles() {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ //标题内容排除词
|
|
|
|
+ rsa, rsb := isOutData(tmp)
|
|
|
|
+ if rsa {
|
|
|
|
+ continue
|
|
|
|
+ } else {
|
|
|
|
+ tmp["typea"] = rsb
|
|
|
|
+ }
|
|
|
|
+
|
|
datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
|
|
datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
|
|
|
|
+ ////发布时间,小于上月1号,直接过滤
|
|
|
|
+ if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+
|
|
if datetime > 0 {
|
|
if datetime > 0 {
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
}
|
|
}
|
|
tmp["wtype"] = "官网-python"
|
|
tmp["wtype"] = "官网-python"
|
|
-
|
|
|
|
|
|
+ tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
|
|
data = append(data, tmp)
|
|
data = append(data, tmp)
|
|
insertUrl[infourl] = true
|
|
insertUrl[infourl] = true
|
|
}
|
|
}
|
|
@@ -272,6 +302,13 @@ func exportFiles() {
|
|
if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
|
|
if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
+ //标题内容排除词
|
|
|
|
+ rsa, rsb := isOutData(tmp)
|
|
|
|
+ if rsa {
|
|
|
|
+ continue
|
|
|
|
+ } else {
|
|
|
|
+ tmp["typea"] = rsb
|
|
|
|
+ }
|
|
|
|
|
|
if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
|
|
if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
|
|
res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
|
|
res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
|
|
@@ -280,10 +317,16 @@ func exportFiles() {
|
|
}
|
|
}
|
|
|
|
|
|
datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
|
|
datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
|
|
|
|
+ ////发布时间,小于上月1号,直接过滤
|
|
|
|
+ if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+
|
|
if datetime > 0 {
|
|
if datetime > 0 {
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
}
|
|
}
|
|
tmp["wtype"] = "百度-Python"
|
|
tmp["wtype"] = "百度-Python"
|
|
|
|
+ tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
|
|
data = append(data, tmp)
|
|
data = append(data, tmp)
|
|
insertUrl[infourl] = true
|
|
insertUrl[infourl] = true
|
|
}
|
|
}
|
|
@@ -314,12 +357,25 @@ func exportFiles() {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ //标题内容排除词
|
|
|
|
+ rsa, rsb := isOutData(tmp)
|
|
|
|
+ if rsa {
|
|
|
|
+ continue
|
|
|
|
+ } else {
|
|
|
|
+ tmp["typea"] = rsb
|
|
|
|
+ }
|
|
datetime := util.Int64All(tmp["publishtime"])
|
|
datetime := util.Int64All(tmp["publishtime"])
|
|
|
|
+ ////发布时间,小于上月1号,直接过滤
|
|
|
|
+ if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+
|
|
if datetime > 0 {
|
|
if datetime > 0 {
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
|
|
}
|
|
}
|
|
tmp["infourl"] = tmp["href"]
|
|
tmp["infourl"] = tmp["href"]
|
|
tmp["wtype"] = "官网-lua"
|
|
tmp["wtype"] = "官网-lua"
|
|
|
|
+ tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
|
|
data = append(data, tmp)
|
|
data = append(data, tmp)
|
|
insertUrl[infourl] = true
|
|
insertUrl[infourl] = true
|
|
}
|
|
}
|
|
@@ -353,7 +409,7 @@ func exportFiles() {
|
|
_ = xlsx.SetColWidth(sheet, "E", "E", 45)
|
|
_ = xlsx.SetColWidth(sheet, "E", "E", 45)
|
|
_ = xlsx.SetColWidth(sheet, "F", "F", 20)
|
|
_ = xlsx.SetColWidth(sheet, "F", "F", 20)
|
|
|
|
|
|
- subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式"}
|
|
|
|
|
|
+ subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式", "规则相关度", "ai相关度", "ai权重", "L2"}
|
|
line++
|
|
line++
|
|
//设置第一行title
|
|
//设置第一行title
|
|
_ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
|
|
_ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
|
|
@@ -361,11 +417,39 @@ func exportFiles() {
|
|
//file := "20230825惠普_舆情.xlsx"
|
|
//file := "20230825惠普_舆情.xlsx"
|
|
fmt.Println("导出数据总数:-------", len(data))
|
|
fmt.Println("导出数据总数:-------", len(data))
|
|
for k, _ := range data {
|
|
for k, _ := range data {
|
|
- fmt.Println("导出数据-------", k)
|
|
|
|
|
|
+ llog.Println("导出数据-------", k)
|
|
line++
|
|
line++
|
|
val := []interface{}{
|
|
val := []interface{}{
|
|
- data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], "", data[k]["wtype"],
|
|
|
|
|
|
+ data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], data[k]["zk"], data[k]["wtype"], data[k]["typea"],
|
|
|
|
+ }
|
|
|
|
+ //调用智普AI
|
|
|
|
+ if GF.Env.Key != "" && GF.Env.Model != "" {
|
|
|
|
+ res := ZpRelated(GF.Env.Key, GF.Env.Model, util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
|
|
|
|
+ //res := normalChat(util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
|
|
|
|
+ val = append(val, res["type_ai"])
|
|
|
|
+ val = append(val, res["type_weight"])
|
|
|
|
+ //只要在L0 名单里,就不需要返回L2,否则返回大模型识别的主体
|
|
|
|
+ if zkMap[util.ObjToString(data[k]["site"])] != "" {
|
|
|
|
+ val = append(val, res["name"])
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ insert := map[string]interface{}{
|
|
|
|
+ "pubulishtime": strings.Replace(util.ObjToString(data[k]["pubulishtime"]), "/", "-", -1),
|
|
|
|
+ "site": data[k]["site"],
|
|
|
|
+ "title": data[k]["title"],
|
|
|
|
+ "detail": data[k]["detail"],
|
|
|
|
+ "infourl": data[k]["infourl"],
|
|
|
|
+ "type": data[k]["zk"],
|
|
|
|
+ "wtype": data[k]["wtype"],
|
|
|
|
+ "label": data[k]["typea"],
|
|
|
|
+ "type_ai": data[k]["type_ai"],
|
|
|
|
+ "type_weight": data[k]["type_weight"],
|
|
|
|
+ "name": data[k]["name"],
|
|
|
|
+ "createtime": time.Now().Format("2006-01-02:15:04:05"),
|
|
}
|
|
}
|
|
|
|
+ MgoN.Save(GF.MongoN.Coll, insert)
|
|
|
|
+
|
|
err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
|
|
err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
|
|
if err != nil {
|
|
if err != nil {
|
|
log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
|
|
log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
|
|
@@ -380,6 +464,30 @@ func exportFiles() {
|
|
fmt.Println("数据导出结束")
|
|
fmt.Println("数据导出结束")
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+// readZK 读取重客类型
|
|
|
|
+func readZK() {
|
|
|
|
+ f, err := excelize.OpenFile("重客类型.xlsx")
|
|
|
|
+ if err != nil {
|
|
|
|
+ fmt.Println(err)
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+ defer func() {
|
|
|
|
+ if err := f.Close(); err != nil {
|
|
|
|
+ fmt.Println(err)
|
|
|
|
+ }
|
|
|
|
+ }()
|
|
|
|
+
|
|
|
|
+ rows, err := f.GetRows("Sheet1")
|
|
|
|
+ if err != nil {
|
|
|
|
+ fmt.Println(err)
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+ //
|
|
|
|
+ for i := 1; i < len(rows); i++ {
|
|
|
|
+ zkMap[rows[i][1]] = rows[i][2]
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
// contains contains
|
|
// contains contains
|
|
func contains(data string, specials []string) bool {
|
|
func contains(data string, specials []string) bool {
|
|
for _, v := range specials {
|
|
for _, v := range specials {
|
|
@@ -390,6 +498,7 @@ func contains(data string, specials []string) bool {
|
|
return false
|
|
return false
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+// ParseDateString parses a date/time string and returns it as a Unix timestamp in seconds.
|
|
func ParseDateString(dateString string) (int64, error) {
|
|
func ParseDateString(dateString string) (int64, error) {
|
|
// Regular expressions for different date formats
|
|
// Regular expressions for different date formats
|
|
regexPatterns := []string{
|
|
regexPatterns := []string{
|