|
@@ -1,19 +1,46 @@
|
|
|
package main
|
|
|
|
|
|
import (
|
|
|
+ "bytes"
|
|
|
+ "encoding/json"
|
|
|
"fmt"
|
|
|
log "github.com/donnie4w/go-logger/logger"
|
|
|
+ "github.com/tealeg/xlsx"
|
|
|
+ "io/ioutil"
|
|
|
+ "net/http"
|
|
|
+ "os"
|
|
|
qu "qfw/util"
|
|
|
"qfw/util/elastic"
|
|
|
"qfw/util/redis"
|
|
|
+ "reflect"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "sync"
|
|
|
+ "time"
|
|
|
+ "unicode"
|
|
|
+ "unicode/utf8"
|
|
|
)
|
|
|
|
|
|
+
|
|
|
var (
	// task is a one-slot semaphore: postBaiDuYun sends on it before issuing a
	// request and receives from it afterwards, so at most one request is in
	// flight at a time.
	task = make(chan struct{}, 1)

	// rpre matches the fixed jianyu360 article URL prefix. The dots are
	// escaped so they only match a literal '.'.
	rpre = regexp.MustCompile(`https://www\.jianyu360\.com/article/content/`)

	// rsuf matches a literal ".html" and everything after it. With an
	// unescaped dot, an encoded ID that happens to contain "<any char>html"
	// would be truncated at that point.
	rsuf = regexp.MustCompile(`(\.html).*`)
)
|
|
|
+
|
|
|
//解密
|
|
|
func decodeJyUrl() {
|
|
|
|
|
|
- test := "ABCY1wJYzwOMyg4NHdxZ3IkCCQCIDFjcWhwPw4nLS4NYGpzcQFUCSs%3D"
|
|
|
+ //jyurl := ""
|
|
|
+ //jyurl = rpre.ReplaceAllString(jyurl, "")
|
|
|
+ //jyurl = rsuf.ReplaceAllString(jyurl, "")
|
|
|
+ //new_id := qu.CommonDecodeArticle("content", jyurl)[0]
|
|
|
+ //log.Debug(new_id)
|
|
|
+
|
|
|
+
|
|
|
+ test := "ABCY1wJYjxYJys7RHhjZHUoCDI4QCJ0XFJ0KB4nKDodd3tzeD9UCjE%3D"
|
|
|
var Decode = qu.CommonDecodeArticle("content", test)
|
|
|
log.Debug(Decode[0])
|
|
|
+
|
|
|
+ return
|
|
|
}
|
|
|
//加密
|
|
|
func encodeJyUrl() {
|
|
@@ -21,6 +48,335 @@ func encodeJyUrl() {
|
|
|
var Encode = fmt.Sprintf(Url, qu.CommonEncodeArticle("content", "60b9bf4a8a2adb30a5a25000"))
|
|
|
log.Debug(Encode)
|
|
|
}
|
|
|
+func dealWithBaiduYunData() {
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
+ q,total,isOK,exist,un_exist:=map[string]interface{}{},0,0,0,0
|
|
|
+ arr := make([]map[string]string,0)
|
|
|
+ it := sess.DB(save_mgo.DbName).C("bidding_test").Find(&q).Select(map[string]interface{}{
|
|
|
+ "detail":1,
|
|
|
+ "buyer":1,
|
|
|
+ }).Iter()
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
|
|
|
+ if total%1000==0 {
|
|
|
+ log.Debug("cur index ",total,isOK,exist,un_exist,tmp["_id"])
|
|
|
+ }
|
|
|
+ detail := trimHtml(qu.ObjToString(tmp["detail"]))
|
|
|
+ buyer := qu.ObjToString(tmp["buyer"])
|
|
|
+ length := utf8.RuneCountInString(detail)
|
|
|
+ if length > 50 && buyer!="" {
|
|
|
+ isOK++
|
|
|
+ new_buyer := getBaiduYunBuyer(detail)
|
|
|
+ if new_buyer!="" {
|
|
|
+ exist++
|
|
|
+ //log.Debug("原值:",buyer," 最终匹配:",new_buyer)
|
|
|
+ }else {
|
|
|
+ un_exist++
|
|
|
+ //log.Debug("未匹配:",tmp["_id"])
|
|
|
+ }
|
|
|
+ arr = append(arr, map[string]string{
|
|
|
+ "source":buyer,
|
|
|
+ "buyer":new_buyer,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ if total>200 {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
+ }
|
|
|
+
|
|
|
+ log.Debug("处理完毕over......",total,isOK,exist,un_exist)
|
|
|
+ log.Debug("准备写入xlsx...",len(arr))
|
|
|
+ os.Remove("百度云统计.xlsx")
|
|
|
+ f :=xlsx.NewFile()
|
|
|
+ sheet, _ := f.AddSheet("训练")
|
|
|
+ row := sheet.AddRow()
|
|
|
+ row.AddCell().Value = "原采购"
|
|
|
+ row.AddCell().Value = "结果"
|
|
|
+ row.AddCell().Value = "新采购等"
|
|
|
+ for _,v :=range arr{
|
|
|
+ row := sheet.AddRow()
|
|
|
+ buyer:=v["source"]
|
|
|
+ new_buyer := v["buyer"]
|
|
|
+ row.AddCell().SetString(buyer)
|
|
|
+ buyerArr := strings.Split(new_buyer,"~")
|
|
|
+ isTrue := false
|
|
|
+ for _,name :=range buyerArr{
|
|
|
+ if name == buyer {
|
|
|
+ isTrue = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if isTrue {
|
|
|
+ row.AddCell().Value = "正确"
|
|
|
+ }else {
|
|
|
+ if new_buyer !="" {
|
|
|
+ row.AddCell().Value = "异常"
|
|
|
+ }else {
|
|
|
+ row.AddCell().Value = ""
|
|
|
+ }
|
|
|
+ }
|
|
|
+ row.AddCell().SetString(new_buyer)
|
|
|
+ }
|
|
|
+ err := f.Save("百度云统计.xlsx")
|
|
|
+ if err != nil {
|
|
|
+ log.Debug("保存xlsx失败:", err)
|
|
|
+ }else {
|
|
|
+ log.Debug("保存xlsx成功:", err)
|
|
|
+ }
|
|
|
+}
|
|
|
+//百度云相关
|
|
|
+func getBaiduYunBuyer(detail string) string {
|
|
|
+ //fmt.Println("runing...")
|
|
|
+ buyer:=""
|
|
|
+ body := map[string]interface{}{"text":detail}
|
|
|
+ data := postBaiDuYun("https://aip.baidubce.com/rpc/2.0/ai_custom/v1/entity_xtr/allbuyer?access_token=24.595a79beb92df28ae44081d8c069e32c.2592000.1627033355.282335-24414386",
|
|
|
+ body, "application/json")
|
|
|
+ //fmt.Println("post...end")
|
|
|
+ if results, ok := data["results"].([]interface{}); ok {
|
|
|
+ for _,v:=range results{
|
|
|
+ tmp := *qu.ObjToMap(v)
|
|
|
+ span := qu.ObjToString(tmp["span"])
|
|
|
+ if span!="" {
|
|
|
+ if buyer=="" {
|
|
|
+ buyer = span
|
|
|
+ }else {
|
|
|
+ buyer = buyer+"~"+qu.ObjToString(tmp["span"])
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ log.Debug("异常:",reflect.TypeOf(data["results"]),data["results"])
|
|
|
+ }
|
|
|
+
|
|
|
+ return buyer
|
|
|
+}
|
|
|
+func postBaiDuYun(url string, data interface{}, contentType string) map[string]interface{}{
|
|
|
+ task <- struct{}{}
|
|
|
+ defer func() {
|
|
|
+ <-task
|
|
|
+ }()
|
|
|
+ client := &http.Client{Timeout: 15 * time.Second}
|
|
|
+ jsonStr, _ := json.Marshal(data)
|
|
|
+ resp, err := client.Post(url, contentType, bytes.NewBuffer(jsonStr))
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+ defer resp.Body.Close()
|
|
|
+ result, _ := ioutil.ReadAll(resp.Body)
|
|
|
+ dict := make(map[string]interface{})
|
|
|
+ json.Unmarshal(result, &dict)
|
|
|
+ //fmt.Println("post...start")
|
|
|
+ return dict
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+//修复全量-指定字段数据
|
|
|
+func dealWithFullData() {
|
|
|
+ log.Debug("......处理全量数据")
|
|
|
+
|
|
|
+
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
+ q:=map[string]interface{}{}
|
|
|
+ total,isOK := 0,0
|
|
|
+ pool := make(chan bool, 10)
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
+ it := sess.DB(save_mgo.DbName).C("result_dis_0618").Find(&q).Select(map[string]interface{}{
|
|
|
+ "bidmode":1,
|
|
|
+ "getdocmethod":1,
|
|
|
+ "agencyfee":1,
|
|
|
+ "agencyrate":1,
|
|
|
+ "docamount":1,
|
|
|
+ }).Iter()
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
|
|
|
+ if total%100000==0 {
|
|
|
+ log.Debug("cur index ",total,isOK,tmp["_id"])
|
|
|
+ }
|
|
|
+ b,dict := fieldValidValue(tmp)
|
|
|
+ curID := BsonTOStringId(tmp["_id"])
|
|
|
+ if b {
|
|
|
+ source := save_mgo.FindById("result_20210108",curID)
|
|
|
+ if source!=nil && len(source)>2 {
|
|
|
+ isOK++ //符合条件-可以更新
|
|
|
+ pool <- true
|
|
|
+ wg.Add(1)
|
|
|
+ go func(dict map[string]interface{},curID string) {
|
|
|
+ defer func() {
|
|
|
+ <-pool
|
|
|
+ wg.Done()
|
|
|
+ }()
|
|
|
+ //更新方法
|
|
|
+ save_mgo.UpdateById("result_20210108",curID,map[string]interface{}{
|
|
|
+ "$set": dict,
|
|
|
+ })
|
|
|
+ }(dict,curID)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
+ }
|
|
|
+
|
|
|
+ wg.Wait()
|
|
|
+ time.Sleep(30*time.Second)
|
|
|
+ log.Debug("处理完毕over......",total,isOK)
|
|
|
+}
|
|
|
+func fieldValidValue(data map[string]interface{}) (bool,map[string]interface{}) {
|
|
|
+ b:=false
|
|
|
+ dict := make(map[string]interface{},0)
|
|
|
+ bidmode := qu.IntAll(data["bidmode"])
|
|
|
+ if bidmode == 1 {
|
|
|
+ b=true
|
|
|
+ dict["bidway"] = "纸质投标"
|
|
|
+ }else if bidmode == 2 {
|
|
|
+ b=true
|
|
|
+ dict["bidway"] = "电子投标"
|
|
|
+ }
|
|
|
+ getdocmethod := qu.ObjToString(data["getdocmethod"])
|
|
|
+ if getdocmethod !="" {
|
|
|
+ b=true
|
|
|
+ dict["getdocmethod"] = getdocmethod
|
|
|
+ }
|
|
|
+ agencyfee := qu.Float64All(data["agencyfee"])
|
|
|
+ if agencyfee > float64(0) {
|
|
|
+ b=true
|
|
|
+ dict["agencyfee"] = agencyfee
|
|
|
+ }
|
|
|
+ agencyrate := qu.Float64All(data["agencyrate"])
|
|
|
+ if agencyrate > float64(0) {
|
|
|
+ b=true
|
|
|
+ dict["agencyrate"] = agencyrate
|
|
|
+ }
|
|
|
+ docamount := qu.Float64All(data["docamount"])
|
|
|
+ if docamount > float64(0) {
|
|
|
+ b=true
|
|
|
+ dict["docamount"] = docamount
|
|
|
+ }
|
|
|
+ return b,dict
|
|
|
+}
|
|
|
+//导出实体数据
|
|
|
+func exportEntityData() {
|
|
|
+ log.Debug("......导出数据")
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
+ q:=map[string]interface{}{}
|
|
|
+ arr := make([]map[string]string,0)
|
|
|
+ total := 0
|
|
|
+ it := sess.DB(save_mgo.DbName).C("bidding_buyer_test").Find(&q).Iter()
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
|
|
|
+ if total%1000==0 {
|
|
|
+ log.Debug("cur index ",total,tmp["_id"])
|
|
|
+ }
|
|
|
+
|
|
|
+ detail := trimHtml(qu.ObjToString(tmp["detail"]))
|
|
|
+ buyer := qu.ObjToString(tmp["buyer"])
|
|
|
+ //buyer不能有符号
|
|
|
+ buyer = strings.ReplaceAll(buyer,"(","(")
|
|
|
+ buyer = strings.ReplaceAll(buyer,")",")")
|
|
|
+
|
|
|
+ length := utf8.RuneCountInString(detail)
|
|
|
+ if length > 50 && buyer!="" {
|
|
|
+ if length > 500 {
|
|
|
+ detail = string([]rune(detail)[0:500])
|
|
|
+ }
|
|
|
+ if strings.Contains(detail,buyer) {
|
|
|
+ arr = append(arr, map[string]string{
|
|
|
+ "detail":detail,
|
|
|
+ "buyer":buyer,
|
|
|
+ })
|
|
|
+ //log.Debug("长度:",len(detail),utf8.RuneCountInString(detail))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
+ }
|
|
|
+ log.Debug("准备完毕......",len(arr))
|
|
|
+
|
|
|
+
|
|
|
+ maxTag :=0
|
|
|
+ indexArrStr := make([]map[string]string,0)
|
|
|
+ for _,v:=range arr{
|
|
|
+ detail:= v["detail"]
|
|
|
+ buyer:= v["buyer"]
|
|
|
+ reg := regexp.MustCompile(buyer)
|
|
|
+ indexArr := reg.FindAllStringIndex(detail,-1)
|
|
|
+ //log.Debug(indexArr)
|
|
|
+ if len(indexArr)>maxTag {
|
|
|
+ maxTag = len(indexArr)
|
|
|
+ }
|
|
|
+ //处理下标 [7,8],LOC [[3 30] [48 75] [304 331]]
|
|
|
+ str := ""
|
|
|
+ for _,index := range indexArr {
|
|
|
+ first_index:=index[0]
|
|
|
+ tempStr := detail[0:first_index]
|
|
|
+ head,length := utf8.RuneCountInString(tempStr),utf8.RuneCountInString(buyer)
|
|
|
+ if str =="" {
|
|
|
+ str = fmt.Sprintf("[%d,%d],采购单位",head,head+length)
|
|
|
+ }else {
|
|
|
+ str = str+fmt.Sprintf(":[%d,%d],采购单位",head,head+length)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ indexArrStr = append(indexArrStr, map[string]string{
|
|
|
+ "detail":string(detail),
|
|
|
+ "index":str,
|
|
|
+ })
|
|
|
+ //log.Debug(str)
|
|
|
+ }
|
|
|
+ os.Remove("训练模型.xlsx")
|
|
|
+ f :=xlsx.NewFile()
|
|
|
+ sheet, _ := f.AddSheet("extract")
|
|
|
+ row := sheet.AddRow()
|
|
|
+ row.AddCell().Value = "文本内容"
|
|
|
+ for i := 1; i <= maxTag; i++ {
|
|
|
+ row.AddCell().SetString(fmt.Sprintf("实体标注%d", i))
|
|
|
+ }
|
|
|
+ for _,tmp:=range indexArrStr {
|
|
|
+ row = sheet.AddRow()
|
|
|
+ row.AddCell().SetString(fmt.Sprintf("%s入库量", tmp["detail"]))
|
|
|
+ indexArr := strings.Split(tmp["index"], ":")
|
|
|
+ for _, str := range indexArr {
|
|
|
+ row.AddCell().SetString(fmt.Sprintf("%s", str))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ err := f.Save("训练模型.xlsx")
|
|
|
+ if err != nil {
|
|
|
+ log.Debug("保存xlsx失败:", err)
|
|
|
+ return
|
|
|
+ }else {
|
|
|
+ log.Debug("保存xlsx成功:", err)
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
// Patterns used by trimHtml. They are compiled once at package scope instead
// of on every call; MustCompile makes an invalid pattern fail loudly rather
// than leaving a nil *Regexp behind.
var (
	htmlTagRe    = regexp.MustCompile(`\<[\S\s]+?\>`)
	htmlStyleRe  = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	htmlScriptRe = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	htmlSpaceRe  = regexp.MustCompile(`\s{2,}`)
)

// trimHtml strips HTML markup from src and returns the remaining text.
//
// Tags are lower-cased first so the <style> and <script> patterns match
// regardless of the original case; those elements are removed together with
// their content. Every remaining tag is replaced by a newline, each run of two
// or more whitespace characters collapses to a single newline, and leading and
// trailing whitespace is trimmed.
func trimHtml(src string) string {
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	src = htmlStyleRe.ReplaceAllString(src, "")
	src = htmlScriptRe.ReplaceAllString(src, "")
	src = htmlTagRe.ReplaceAllString(src, "\n")
	src = htmlSpaceRe.ReplaceAllString(src, "\n")
	return strings.TrimSpace(src)
}
|
|
|
// escape returns s with every listed symbol prefixed by an escape sequence.
//
// Han characters, numbers and letters are copied unchanged. Each symbol in the
// fixed set below is emitted with a two-rune prefix. Any other rune (for
// example whitespace) makes the whole result "".
func escape(s string) string {
	// Symbols that get the prefix.
	const symbols = `\+-!():^[]"{}~*?|&/#@><“”、.`

	// NOTE(review): the prefix is os.PathSeparator followed by a backslash, so
	// the output differs between platforms ('/' vs '\' as the first rune).
	// That looks unintended for an escaping helper — confirm the desired
	// prefix.
	prefix := string([]rune{os.PathSeparator, '\\'})

	var out strings.Builder
	for _, r := range s {
		switch {
		case unicode.Is(unicode.Han, r), unicode.IsNumber(r), unicode.IsLetter(r):
			out.WriteRune(r)
		case strings.ContainsRune(symbols, r):
			out.WriteString(prefix)
			out.WriteRune(r)
		default:
			return ""
		}
	}
	return out.String()
}
|
|
|
|
|
|
func testMethod() {
|
|
|
qu.Catch()
|