123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- package main
- import (
- "bytes"
- "crypto/tls"
- "encoding/base64"
- "fmt"
- "github.com/PuerkitoBio/goquery"
- "io"
- "io/ioutil"
- cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
- su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
- "net/http"
- "regexp"
- "strings"
- "time"
- )
- var (
- htmlModelReg = regexp.MustCompile(`{{[a-zA-z.()\d,:]{5,}}}|^(\$)`) //过滤模板语言
- reg_filter_url = regexp.MustCompile(`((\.\./)+|null|[。))]+$)`)
- reg_invalid_url = regexp.MustCompile(`(^(tel)|^#[\p{Han}]+$|^[\p{Han}]+$|javascript|login|mailto|\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)[))##/、]{0,}$)+`)
- reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
- reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg)$`)
- reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
- //全匹配无效内容
- reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
- //包含无效关键词
- reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
- //以关键词结尾
- reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
- //修复链接
- reg_repair_href1 = regexp.MustCompile(`^(\.\./|\./|/)+`)
- reg_domain = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.)[^/]+/`)
- reg_domain_param = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.).*/`)
- //附件类型
- reg_jpg = regexp.MustCompile(`(jpg|png|jpeg|image)`)
- reg_docx = regexp.MustCompile(`(docx|word)`)
- reg_doc = regexp.MustCompile(`doc`)
- reg_xlsx = regexp.MustCompile(`(xlsx|xls|sheet)`)
- reg_pdf = regexp.MustCompile(`pdf`)
- reg_zip = regexp.MustCompile(`zip`)
- reg_rar = regexp.MustCompile(`rar`)
- )
// Data describes one candidate attachment link taken from an announcement
// page, together with the state accumulated while it is processed.
type Data struct {
	// Url is the link target. For relative links it is rewritten to an
	// absolute URL before downloading.
	Url string
	// Text is the anchor text of the link (raw, or cleaned when the link was
	// accepted by the text filter).
	Text string
	// Ok reports whether the attachment was uploaded to object storage.
	Ok bool
	// By records which rule accepted the link: "url", "text" or "filter".
	By string
	// FileType is the attachment extension without the leading dot; empty
	// until it is known.
	FileType string
	// Base64Type marks links that are inline "data:image/" payloads.
	Base64Type bool
}
- // DownloadFile 补充未下附件
- func GetDataAndDownload(tmp map[string]interface{}) (isEnd int, saveMgo bool) {
- defer cu.Catch()
- isEnd = 1
- //1、筛选a标签
- hrefMap := map[string]string{} //记录contenthtml中筛出的a标签信息;key:url,val:text
- contenthtml := cu.ObjToString(tmp["contenthtml"])
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(contenthtml))
- doc.Find("a[href]").Each(func(index int, element *goquery.Selection) {
- attachmentURL, _ := element.Attr("href") //链接
- if attachmentURL != "" && !htmlModelReg.MatchString(attachmentURL) {
- hrefMap[attachmentURL] = element.Text()
- }
- })
- tmpResult := FilterAndDownload(hrefMap) //筛选有效附件链接
- if len(tmpResult) > 0 {
- href := cu.ObjToString(tmp["href"])
- _, attachments, attchText := DealAndDownload(tmpResult, href) //修复链接和文本并下载附件
- if len(attachments) > 0 {
- //tmp["file_add_log"] = result
- if projectinfo, ok := tmp["projectinfo"].(map[string]interface{}); ok {
- projectinfo["attachments"] = attachments
- } else {
- tmp["projectinfo"] = map[string]interface{}{"attachments": attachments}
- }
- if len(attchText) > 0 {
- tmp["attach_text"] = attchText
- } else {
- saveMgo = true
- }
- isEnd = 0
- }
- }
- return
- }
- // FilterAndDownload 筛选有效数据并下载对应附件
- func FilterAndDownload(hrefMap map[string]string) (result []*Data) {
- defer cu.Catch()
- if len(hrefMap) == 0 {
- return
- }
- for url, text := range hrefMap {
- //url长度过滤
- tmpUrl := strings.ToLower(url)
- if len([]rune(tmpUrl)) <= 10 { //长度
- continue
- }
- //url无效字符过滤
- tmpUrl = reg_filter_url.ReplaceAllString(tmpUrl, "")
- if tmpUrl == "" || reg_invalid_url.MatchString(tmpUrl) {
- continue
- }
- tmpText := strings.ToLower(text)
- //url、text无效附件类型过滤
- if reg_err_filetype.MatchString(tmpUrl) || reg_err_filetype.MatchString(tmpText) { //无效附件类型
- continue
- }
- tmpText = reg_fileter_text.ReplaceAllString(tmpText, "") //过滤无效字符
- //text过滤
- if fileType := reg_filetype.FindString(tmpUrl); fileType != "" { //含常见附件类型结尾的url
- result = append(result, &Data{
- Url: url,
- Text: text,
- By: "url",
- FileType: strings.ReplaceAll(fileType, ".", ""),
- })
- } else if fileType := reg_filetype.FindString(tmpText); fileType != "" { //含常见附件类型结尾的text
- result = append(result, &Data{
- Url: url,
- Text: text,
- By: "text",
- FileType: strings.ReplaceAll(fileType, ".", ""),
- })
- } else {
- //textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
- if reg_invalid_text.ReplaceAllString(tmpText, "") == "" { //无效,全文本匹配,舍弃
- continue
- } else if reg_filter_text1.MatchString(tmpText) || reg_filter_text2.MatchString(tmpText) { //无效,部分文本匹配,舍弃
- continue
- }
- result = append(result, &Data{
- Url: url,
- Text: tmpText,
- By: "filter",
- })
- }
- }
- return
- }
- // DealAndDownload 修复链接和文本并下载附件
- func DealAndDownload(tmp []*Data, href string) (result []*Data, attachments, attachText map[string]interface{}) {
- defer cu.Catch()
- attachments = map[string]interface{}{}
- attachText = map[string]interface{}{}
- for _, data := range tmp {
- url := strings.ReplaceAll(data.Url, "\\", "/")
- //异常链接修复
- if !strings.HasPrefix(url, "https") && !strings.HasPrefix(url, "http") { //异常链接
- if strings.HasPrefix(url, "data:image/") { //base64图片
- //待处理TODO
- data.Base64Type = true
- result = append(result, data)
- data.Url = ""
- } else {
- url = reg_repair_href1.ReplaceAllString(url, "") //处理../ ./ /
- //获取href域名
- domain := reg_domain.FindString(href)
- //var urlArr []string
- param_domain := reg_domain_param.FindString(href)
- if domain != "" { //优先拼接域名
- data.Url = domain + url
- result = append(result, data)
- }
- if param_domain != "" { //再拼接带参链接
- data.Url = param_domain + url
- result = append(result, data)
- }
- }
- } else {
- result = append(result, data)
- }
- }
- if len(result) > 0 {
- index := 0
- for _, data := range result {
- if data.Base64Type {
- fileName := "附件" + fmt.Sprint(index+1) + ".jpg"
- i := strings.Index(data.Url, ",")
- dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(data.Url[i+1:]))
- ret, err := io.ReadAll(dec)
- if err == nil && len(ret) >= 1024*3 && len(ret) < 15*1024*1024 {
- fid := su.GetHashKey(ret) + su.TypeByExt(fileName)
- bs := bytes.NewReader(ret)
- size := su.ConvertFileSize(bs.Len())
- data.Ok, err = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传
- if data.Ok { //上传成功,解析附件
- GetAttachText(fid, fileName, "jpg", "", size, index, ret, attachments, attachText)
- index++
- }
- }
- } else {
- contentType, ret := Download(data.Url) //下载
- if len(ret) > 15*1024*1024 || len(ret) < 1024*3 {
- fmt.Println("file size is too big or small!")
- continue
- }
- fileType := data.FileType //从url或者text提取的附件类型
- if fileType == "" {
- fileType = GetType(contentType, ret) //获取附件类型
- data.FileType = fileType
- }
- if fileType != "" {
- fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
- fid := su.GetHashKey(ret) + su.TypeByExt(fileName)
- bs := bytes.NewReader(ret)
- size := su.ConvertFileSize(bs.Len())
- data.Ok, _ = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传
- if data.Ok { //上传成功,解析附件
- GetAttachText(fid, fileName, fileType, data.Url, size, index, ret, attachments, attachText)
- index++
- }
- }
- }
- //contentType, ret := Download(data.Url) //下载
- //fileType := data.FileType //从url或者text提取的附件类型
- //if fileType == "" {
- // fileType = GetType(contentType, ret) //获取附件类型
- // data.FileType = fileType
- //}
- //if fileType != "" {
- // fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
- // fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
- // bs := bytes.NewReader(ret)
- // size := qu.ConvertFileSize(bs.Len())
- // b, _ := sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
- // //qu.Debug("oss", fileName, size, fileType, fid)
- // data.Ok = b
- // if b {
- // attachments[fmt.Sprint(index+1)] = map[string]interface{}{
- // "fid": fid,
- // "filename": fileName,
- // "ftype": fileType,
- // "org_url": data.Url,
- // "size": size,
- // "url": "oss",
- // }
- // //附件解析
- // conn, err := serviced.GetOcrServerConn() //链接ocr服务治理中心
- // if err == nil {
- // resp := GetFileText(conn, fileName, fid, fileType, ret)
- // if resp != nil {
- // tmap := map[string]interface{}{}
- // for i, r := range resp.Result {
- // rmap := map[string]interface{}{
- // "file_name": r.FileName,
- // "attach_url": r.TextUrl,
- // "state": r.ErrorState,
- // }
- // tmap[fmt.Sprint(i)] = rmap
- // }
- // if len(tmap) > 0 {
- // attachText[fmt.Sprint(index)] = tmap
- // }
- // }
- // } else {
- // qu.Debug("附件解析服务连接失败:", err)
- // }
- // index++
- // }
- //}
- }
- }
- return
- }
- // 下载
- func Download(url string) (string, []byte) {
- defer cu.Catch()
- client := &http.Client{
- Timeout: 3 * time.Minute,
- Transport: &http.Transport{
- TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
- },
- }
- req, err := http.NewRequest("GET", url, nil)
- if err != nil {
- //fmt.Println("Error creating request:", err)
- return "", []byte{}
- }
- resp, err := client.Do(req)
- if err != nil {
- //fmt.Println("Error sending request:", err)
- return "", []byte{}
- }
- defer resp.Body.Close()
- if resp.StatusCode == 200 {
- bodyBytes, _ := ioutil.ReadAll(resp.Body)
- return resp.Header.Get("Content-Type"), bodyBytes
- }
- return "", []byte{}
- }
- func GetType(contentType string, ret []byte) string {
- if contentType != "" {
- if reg_jpg.MatchString(contentType) {
- return "jpg"
- } else if reg_docx.MatchString(contentType) {
- return "docx"
- } else if reg_doc.MatchString(contentType) {
- return "doc"
- } else if reg_xlsx.MatchString(contentType) {
- return "xlsx"
- } else if reg_pdf.MatchString(contentType) {
- return "pdf"
- } else if reg_zip.MatchString(contentType) {
- return "zip"
- } else if reg_rar.MatchString(contentType) {
- return "rar"
- }
- } else if len(ret) > 0 {
- return su.GetFileType(ret)
- }
- return ""
- }
- func GetAttachText(fid, fileName, fileType, url, size string, index int, ret []byte, attachments, attachText map[string]interface{}) {
- defer cu.Catch()
- attachments[fmt.Sprint(index+1)] = map[string]interface{}{
- "fid": fid,
- "filename": fileName,
- "ftype": fileType,
- "org_url": url,
- "size": size,
- "url": "oss",
- }
- //附件解析
- resp := GetFileText(fileName, fid, fileType, ret)
- if resp != nil {
- tmap := map[string]interface{}{}
- for i, r := range resp.Result {
- rmap := map[string]interface{}{
- "file_name": r.FileName,
- "attach_url": r.TextUrl,
- "state": r.ErrorState,
- }
- tmap[fmt.Sprint(i)] = rmap
- }
- if len(tmap) > 0 {
- attachText[fmt.Sprint(index)] = tmap
- }
- }
- }
|