|
@@ -3,6 +3,7 @@ package main
|
|
|
import (
|
|
|
"bytes"
|
|
|
"crypto/tls"
|
|
|
+ "encoding/base64"
|
|
|
"fmt"
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
"io"
|
|
@@ -50,11 +51,12 @@ var (
|
|
|
)
|
|
|
|
|
|
type Data struct { // Data describes one attachment candidate processed by DealAndDownload.
|
|
|
- Url string
|
|
|
- Text string
|
|
|
- Ok bool
|
|
|
- By string
|
|
|
- FileType string
|
|
|
+ Url string // link of the attachment; passed to Download, or read as a data: URI when Base64Type is set
|
|
|
+ Text string // presumably the link text; not referenced in the visible hunks - TODO confirm
|
|
|
+ Ok bool // result reported by sp.OssPutObject for this attachment's upload
|
|
|
+ By string // purpose not visible in this diff - TODO confirm
|
|
|
+ FileType string // attachment type taken from the url/text; filled from GetType when empty
|
|
|
+ Base64Type bool // set for links starting with "data:image/"; selects the base64 decoding path
|
|
|
}
|
|
|
|
|
|
// DownloadFile 补充未下附件
|
|
@@ -63,13 +65,15 @@ func DownloadFile() bool {
|
|
|
if gtid == "" || lteid == "" {
|
|
|
return false
|
|
|
}
|
|
|
- GetDataAndDownload(gtid, lteid) //下载
|
|
|
- SendUdp(gtid, lteid, NextStype, NextAddr, NextPort)
|
|
|
+ getdata := GetDataAndDownload(gtid, lteid) //下载
|
|
|
+ if getdata {
|
|
|
+ SendUdp(gtid, lteid, NextStype, NextAddr, NextPort)
|
|
|
+ }
|
|
|
return true
|
|
|
}
|
|
|
|
|
|
// GetDataAndDownload backfills attachments that have not been downloaded yet.
|
|
|
-func GetDataAndDownload(gtid, lteid string) {
|
|
|
+func GetDataAndDownload(gtid, lteid string) (getdata bool) {
|
|
|
defer qu.Catch()
|
|
|
//查询数据
|
|
|
sess := MgoB.GetMgoConn()
|
|
@@ -83,19 +87,11 @@ func GetDataAndDownload(gtid, lteid string) {
|
|
|
"$lte": mongodb.StringTOBsonId(lteid),
|
|
|
},
|
|
|
}
|
|
|
- //field := map[string]interface{}{
|
|
|
- // "contenthtml": 1,
|
|
|
- // "spidercode": 1,
|
|
|
- // "href": 1,
|
|
|
- // "site": 1,
|
|
|
- // "channel": 1,
|
|
|
- // "title": 1,
|
|
|
- // "competehref": 1,
|
|
|
- // "projectinfo": 1,
|
|
|
+ //query = map[string]interface{}{
|
|
|
+ // "_id": mongodb.StringTOBsonId("64a3fa52b44bf087514687b3"), //64a216f2b44bf0875142bc1e
|
|
|
//}
|
|
|
- query = map[string]interface{}{
|
|
|
- "_id": mongodb.StringTOBsonId("64a216f2b44bf0875142bc1e"),
|
|
|
- }
|
|
|
+ count := MgoB.Count("bidding", query)
|
|
|
+ qu.Debug("数据量:", count, " query:", query)
|
|
|
it := sess.DB(MgoB.DbName).C("bidding").Find(&query).Iter()
|
|
|
n := 0
|
|
|
arr := []map[string]interface{}{}
|
|
@@ -137,6 +133,7 @@ func GetDataAndDownload(gtid, lteid string) {
|
|
|
tmp["projectinfo"] = map[string]interface{}{"attachments": attachments}
|
|
|
}
|
|
|
if len(attchText) > 0 {
|
|
|
+ getdata = true
|
|
|
tmp["attach_text"] = attchText
|
|
|
}
|
|
|
lock.Lock()
|
|
@@ -159,6 +156,7 @@ func GetDataAndDownload(gtid, lteid string) {
|
|
|
arr = []map[string]interface{}{}
|
|
|
}
|
|
|
qu.Debug("当前轮执行完毕:", gtid, lteid)
|
|
|
+ return
|
|
|
}
|
|
|
|
|
|
// FilterAndDownload 筛选有效数据并下载对应附件
|
|
@@ -231,6 +229,9 @@ func DealAndDownload(tmp []*Data, href string) (result []*Data, attachments, att
|
|
|
if !strings.HasPrefix(url, "https") && !strings.HasPrefix(url, "http") { //异常链接
|
|
|
if strings.HasPrefix(url, "data:image/") { //base64图片
|
|
|
//待处理TODO
|
|
|
+ data.Base64Type = true
|
|
|
+ result = append(result, data)
|
|
|
+ data.Url = ""
|
|
|
} else {
|
|
|
url = reg_repair_href1.ReplaceAllString(url, "") //处理../ ./ /
|
|
|
//获取href域名
|
|
@@ -253,53 +254,88 @@ func DealAndDownload(tmp []*Data, href string) (result []*Data, attachments, att
|
|
|
if len(result) > 0 {
|
|
|
index := 0
|
|
|
for _, data := range result {
|
|
|
- contentType, ret := Download(data.Url) //下载
|
|
|
- fileType := data.FileType //从url或者text提取的附件类型
|
|
|
- if fileType == "" {
|
|
|
- fileType = GetType(contentType, ret) //获取附件类型
|
|
|
- data.FileType = fileType
|
|
|
- }
|
|
|
- if fileType != "" {
|
|
|
- fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
|
|
|
- fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
|
|
|
- bs := bytes.NewReader(ret)
|
|
|
- size := qu.ConvertFileSize(bs.Len())
|
|
|
- b, _ := sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
|
|
|
- //qu.Debug("oss", fileName, size, fileType, fid)
|
|
|
- data.Ok = b
|
|
|
- if b {
|
|
|
- attachments[fmt.Sprint(index+1)] = map[string]interface{}{
|
|
|
- "fid": fid,
|
|
|
- "filename": fileName,
|
|
|
- "ftype": fileType,
|
|
|
- "org_url": data.Url,
|
|
|
- "size": size,
|
|
|
- "url": "oss",
|
|
|
+ if data.Base64Type {
|
|
|
+ fileName := "附件" + fmt.Sprint(index+1) + ".jpg"
|
|
|
+ i := strings.Index(data.Url, ",")
|
|
|
+ dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(data.Url[i+1:]))
|
|
|
+ ret, err := io.ReadAll(dec)
|
|
|
+ if err == nil && len(ret) >= 1024*3 {
|
|
|
+ fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
|
|
|
+ bs := bytes.NewReader(ret)
|
|
|
+ size := qu.ConvertFileSize(bs.Len())
|
|
|
+ data.Ok, err = sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
|
|
|
+ if data.Ok { //上传成功,解析附件
|
|
|
+ GetAttachText(fid, fileName, "jpg", "", size, index, ret, attachments, attachText)
|
|
|
+ index++
|
|
|
}
|
|
|
- //附件解析
|
|
|
- conn, err := serviced.GetOcrServerConn() //链接ocr服务治理中心
|
|
|
- if err == nil {
|
|
|
- resp := GetFileText(conn, fileName, fid, fileType, ret)
|
|
|
- if resp != nil {
|
|
|
- tmap := map[string]interface{}{}
|
|
|
- for i, r := range resp.Result {
|
|
|
- rmap := map[string]interface{}{
|
|
|
- "file_name": r.FileName,
|
|
|
- "attach_url": r.TextUrl,
|
|
|
- "state": r.ErrorState,
|
|
|
- }
|
|
|
- tmap[fmt.Sprint(i)] = rmap
|
|
|
- }
|
|
|
- if len(tmap) > 0 {
|
|
|
- attachText[fmt.Sprint(index)] = tmap
|
|
|
- }
|
|
|
- }
|
|
|
- } else {
|
|
|
- qu.Debug("附件解析服务连接失败:", err)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ contentType, ret := Download(data.Url) //下载
|
|
|
+ fileType := data.FileType //从url或者text提取的附件类型
|
|
|
+ if fileType == "" {
|
|
|
+ fileType = GetType(contentType, ret) //获取附件类型
|
|
|
+ data.FileType = fileType
|
|
|
+ }
|
|
|
+ if fileType != "" && len(ret) >= 1024*3 {
|
|
|
+ fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
|
|
|
+ fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
|
|
|
+ bs := bytes.NewReader(ret)
|
|
|
+ size := qu.ConvertFileSize(bs.Len())
|
|
|
+ data.Ok, _ = sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
|
|
|
+ if data.Ok { //上传成功,解析附件
|
|
|
+ GetAttachText(fid, fileName, fileType, data.Url, size, index, ret, attachments, attachText)
|
|
|
+ index++
|
|
|
}
|
|
|
- index++
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ //contentType, ret := Download(data.Url) //下载
|
|
|
+ //fileType := data.FileType //从url或者text提取的附件类型
|
|
|
+ //if fileType == "" {
|
|
|
+ // fileType = GetType(contentType, ret) //获取附件类型
|
|
|
+ // data.FileType = fileType
|
|
|
+ //}
|
|
|
+ //if fileType != "" {
|
|
|
+ // fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
|
|
|
+ // fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
|
|
|
+ // bs := bytes.NewReader(ret)
|
|
|
+ // size := qu.ConvertFileSize(bs.Len())
|
|
|
+ // b, _ := sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
|
|
|
+ // //qu.Debug("oss", fileName, size, fileType, fid)
|
|
|
+ // data.Ok = b
|
|
|
+ // if b {
|
|
|
+ // attachments[fmt.Sprint(index+1)] = map[string]interface{}{
|
|
|
+ // "fid": fid,
|
|
|
+ // "filename": fileName,
|
|
|
+ // "ftype": fileType,
|
|
|
+ // "org_url": data.Url,
|
|
|
+ // "size": size,
|
|
|
+ // "url": "oss",
|
|
|
+ // }
|
|
|
+ // //附件解析
|
|
|
+ // conn, err := serviced.GetOcrServerConn() //链接ocr服务治理中心
|
|
|
+ // if err == nil {
|
|
|
+ // resp := GetFileText(conn, fileName, fid, fileType, ret)
|
|
|
+ // if resp != nil {
|
|
|
+ // tmap := map[string]interface{}{}
|
|
|
+ // for i, r := range resp.Result {
|
|
|
+ // rmap := map[string]interface{}{
|
|
|
+ // "file_name": r.FileName,
|
|
|
+ // "attach_url": r.TextUrl,
|
|
|
+ // "state": r.ErrorState,
|
|
|
+ // }
|
|
|
+ // tmap[fmt.Sprint(i)] = rmap
|
|
|
+ // }
|
|
|
+ // if len(tmap) > 0 {
|
|
|
+ // attachText[fmt.Sprint(index)] = tmap
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // } else {
|
|
|
+ // qu.Debug("附件解析服务连接失败:", err)
|
|
|
+ // }
|
|
|
+ // index++
|
|
|
+ // }
|
|
|
+ //}
|
|
|
}
|
|
|
}
|
|
|
return
|
|
@@ -354,3 +390,36 @@ func GetType(contentType string, ret []byte) string {
|
|
|
}
|
|
|
return ""
|
|
|
}
|
|
|
+
|
|
|
+func GetAttachText(fid, fileName, fileType, url, size string, index int, ret []byte, attachments, attachText map[string]interface{}) { // GetAttachText records one attachment's metadata in attachments and stores its text-parsing results in attachText.
|
|
|
+ defer qu.Catch() // presumably recovers from panics raised below - TODO confirm qu.Catch semantics
|
|
|
+ attachments[fmt.Sprint(index+1)] = map[string]interface{}{ // metadata is keyed by the 1-based position (index+1)
|
|
|
+ "fid": fid,
|
|
|
+ "filename": fileName,
|
|
|
+ "ftype": fileType,
|
|
|
+ "org_url": url, // original link; the base64 caller passes ""
|
|
|
+ "size": size,
|
|
|
+ "url": "oss", // always the literal "oss"
|
|
|
+ }
|
|
|
+ // Attachment parsing
|
|
|
+ conn, err := serviced.GetOcrServerConn() // connect to the OCR service governance center
|
|
|
+ if err == nil {
|
|
|
+ resp := GetFileText(conn, fileName, fid, fileType, ret) // ret holds the attachment bytes supplied by the caller
|
|
|
+ if resp != nil { // a nil resp leaves attachText untouched
|
|
|
+ tmap := map[string]interface{}{}
|
|
|
+ for i, r := range resp.Result { // one entry per element of resp.Result, keyed by i
|
|
|
+ rmap := map[string]interface{}{
|
|
|
+ "file_name": r.FileName,
|
|
|
+ "attach_url": r.TextUrl,
|
|
|
+ "state": r.ErrorState,
|
|
|
+ }
|
|
|
+ tmap[fmt.Sprint(i)] = rmap
|
|
|
+ }
|
|
|
+ if len(tmap) > 0 { // empty results are not stored
|
|
|
+ attachText[fmt.Sprint(index)] = tmap // NOTE(review): keyed by index, while attachments above uses index+1 - confirm this offset is intended
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ qu.Debug("附件解析服务连接失败:", err) // connection failed: only logged; the attachments entry written above is kept
|
|
|
+ }
|
|
|
+}
|