package main import ( "bytes" "crypto/tls" "encoding/base64" "fmt" "github.com/PuerkitoBio/goquery" "io" "io/ioutil" cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil" su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil" "net/http" "regexp" "strings" "time" ) var ( htmlModelReg = regexp.MustCompile(`{{[a-zA-z.()\d,:]{5,}}}|^(\$)`) //过滤模板语言 reg_filter_url = regexp.MustCompile(`((\.\./)+|null|[。))]+$)`) reg_invalid_url = regexp.MustCompile(`(^(tel)|^#[\p{Han}]+$|^[\p{Han}]+$|javascript|login|mailto|\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)[))##/、]{0,}$)+`) reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)") reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg)$`) reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`) //全匹配无效内容 reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`) //包含无效关键词 reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`) //以关键词结尾 reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`) //修复链接 reg_repair_href1 = regexp.MustCompile(`^(\.\./|\./|/)+`) reg_domain = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.)[^/]+/`) reg_domain_param = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.).*/`) //附件类型 reg_jpg = regexp.MustCompile(`(jpg|png|jpeg|image)`) reg_docx = regexp.MustCompile(`(docx|word)`) reg_doc = regexp.MustCompile(`doc`) reg_xlsx = regexp.MustCompile(`(xlsx|xls|sheet)`) reg_pdf = regexp.MustCompile(`pdf`) reg_zip = regexp.MustCompile(`zip`) reg_rar = regexp.MustCompile(`rar`) ) type Data struct { Url string Text string Ok bool By string FileType string Base64Type bool } // DownloadFile 补充未下附件 func GetDataAndDownload(tmp map[string]interface{}) (isEnd int, saveMgo bool) { defer cu.Catch() isEnd = 1 //1、筛选a标签 hrefMap := map[string]string{} //记录contenthtml中筛出的a标签信息;key:url,val:text contenthtml := cu.ObjToString(tmp["contenthtml"]) doc, _ := goquery.NewDocumentFromReader(strings.NewReader(contenthtml)) doc.Find("a[href]").Each(func(index int, element *goquery.Selection) { attachmentURL, _ := element.Attr("href") //链接 if attachmentURL != "" && !htmlModelReg.MatchString(attachmentURL) { hrefMap[attachmentURL] = element.Text() } }) tmpResult := FilterAndDownload(hrefMap) //筛选有效附件链接 if len(tmpResult) > 0 { href := cu.ObjToString(tmp["href"]) _, attachments, attchText := DealAndDownload(tmpResult, href) //修复链接和文本并下载附件 if len(attachments) > 0 { //tmp["file_add_log"] = result if projectinfo, ok := tmp["projectinfo"].(map[string]interface{}); ok { projectinfo["attachments"] = attachments } else { tmp["projectinfo"] = map[string]interface{}{"attachments": attachments} } if len(attchText) > 0 { tmp["attach_text"] = attchText } else { saveMgo = true } isEnd = 0 } } return } // FilterAndDownload 筛选有效数据并下载对应附件 func FilterAndDownload(hrefMap map[string]string) (result []*Data) { defer cu.Catch() if len(hrefMap) == 0 { return } for url, text := range hrefMap { //url长度过滤 tmpUrl := strings.ToLower(url) if len([]rune(tmpUrl)) <= 10 { //长度 continue } //url无效字符过滤 tmpUrl = reg_filter_url.ReplaceAllString(tmpUrl, "") if tmpUrl == "" || reg_invalid_url.MatchString(tmpUrl) { continue } tmpText := strings.ToLower(text) //url、text无效附件类型过滤 if reg_err_filetype.MatchString(tmpUrl) || reg_err_filetype.MatchString(tmpText) { //无效附件类型 continue } tmpText = reg_fileter_text.ReplaceAllString(tmpText, "") //过滤无效字符 //text过滤 if fileType := reg_filetype.FindString(tmpUrl); fileType != "" { //含常见附件类型结尾的url result = append(result, &Data{ Url: url, Text: text, By: "url", FileType: strings.ReplaceAll(fileType, ".", ""), }) } else if fileType := reg_filetype.FindString(tmpText); fileType != "" { //含常见附件类型结尾的text result = append(result, &Data{ Url: url, Text: text, By: "text", FileType: strings.ReplaceAll(fileType, ".", ""), }) } else { //textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符 if reg_invalid_text.ReplaceAllString(tmpText, "") == "" { //无效,全文本匹配,舍弃 continue } else if reg_filter_text1.MatchString(tmpText) || reg_filter_text2.MatchString(tmpText) { //无效,部分文本匹配,舍弃 continue } result = append(result, &Data{ Url: url, Text: tmpText, By: "filter", }) } } return } // DealAndDownload 修复链接和文本并下载附件 func DealAndDownload(tmp []*Data, href string) (result []*Data, attachments, attachText map[string]interface{}) { defer cu.Catch() attachments = map[string]interface{}{} attachText = map[string]interface{}{} for _, data := range tmp { url := strings.ReplaceAll(data.Url, "\\", "/") //异常链接修复 if !strings.HasPrefix(url, "https") && !strings.HasPrefix(url, "http") { //异常链接 if strings.HasPrefix(url, "data:image/") { //base64图片 //待处理TODO data.Base64Type = true result = append(result, data) data.Url = "" } else { url = reg_repair_href1.ReplaceAllString(url, "") //处理../ ./ / //获取href域名 domain := reg_domain.FindString(href) //var urlArr []string param_domain := reg_domain_param.FindString(href) if domain != "" { //优先拼接域名 data.Url = domain + url result = append(result, data) } if param_domain != "" { //再拼接带参链接 data.Url = param_domain + url result = append(result, data) } } } else { result = append(result, data) } } if len(result) > 0 { index := 0 for _, data := range result { if data.Base64Type { fileName := "附件" + fmt.Sprint(index+1) + ".jpg" i := strings.Index(data.Url, ",") dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(data.Url[i+1:])) ret, err := io.ReadAll(dec) if err == nil && len(ret) >= 1024*3 && len(ret) < 15*1024*1024 { fid := su.GetHashKey(ret) + su.TypeByExt(fileName) bs := bytes.NewReader(ret) size := su.ConvertFileSize(bs.Len()) data.Ok, err = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传 if data.Ok { //上传成功,解析附件 GetAttachText(fid, fileName, "jpg", "", size, index, ret, attachments, attachText) index++ } } } else { contentType, ret := Download(data.Url) //下载 if len(ret) > 15*1024*1024 || len(ret) < 1024*3 { fmt.Println("file size is too big or small!") continue } fileType := data.FileType //从url或者text提取的附件类型 if fileType == "" { fileType = GetType(contentType, ret) //获取附件类型 data.FileType = fileType } if fileType != "" { fileName := "附件" + fmt.Sprint(index+1) + "." + fileType fid := su.GetHashKey(ret) + su.TypeByExt(fileName) bs := bytes.NewReader(ret) size := su.ConvertFileSize(bs.Len()) data.Ok, _ = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传 if data.Ok { //上传成功,解析附件 GetAttachText(fid, fileName, fileType, data.Url, size, index, ret, attachments, attachText) index++ } } } //contentType, ret := Download(data.Url) //下载 //fileType := data.FileType //从url或者text提取的附件类型 //if fileType == "" { // fileType = GetType(contentType, ret) //获取附件类型 // data.FileType = fileType //} //if fileType != "" { // fileName := "附件" + fmt.Sprint(index+1) + "." + fileType // fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName) // bs := bytes.NewReader(ret) // size := qu.ConvertFileSize(bs.Len()) // b, _ := sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传 // //qu.Debug("oss", fileName, size, fileType, fid) // data.Ok = b // if b { // attachments[fmt.Sprint(index+1)] = map[string]interface{}{ // "fid": fid, // "filename": fileName, // "ftype": fileType, // "org_url": data.Url, // "size": size, // "url": "oss", // } // //附件解析 // conn, err := serviced.GetOcrServerConn() //链接ocr服务治理中心 // if err == nil { // resp := GetFileText(conn, fileName, fid, fileType, ret) // if resp != nil { // tmap := map[string]interface{}{} // for i, r := range resp.Result { // rmap := map[string]interface{}{ // "file_name": r.FileName, // "attach_url": r.TextUrl, // "state": r.ErrorState, // } // tmap[fmt.Sprint(i)] = rmap // } // if len(tmap) > 0 { // attachText[fmt.Sprint(index)] = tmap // } // } // } else { // qu.Debug("附件解析服务连接失败:", err) // } // index++ // } //} } } return } // 下载 func Download(url string) (string, []byte) { defer cu.Catch() client := &http.Client{ Timeout: 3 * time.Minute, Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, }, } req, err := http.NewRequest("GET", url, nil) if err != nil { //fmt.Println("Error creating request:", err) return "", []byte{} } resp, err := client.Do(req) if err != nil { //fmt.Println("Error sending request:", err) return "", []byte{} } defer resp.Body.Close() if resp.StatusCode == 200 { bodyBytes, _ := ioutil.ReadAll(resp.Body) return resp.Header.Get("Content-Type"), bodyBytes } return "", []byte{} } func GetType(contentType string, ret []byte) string { if contentType != "" { if reg_jpg.MatchString(contentType) { return "jpg" } else if reg_docx.MatchString(contentType) { return "docx" } else if reg_doc.MatchString(contentType) { return "doc" } else if reg_xlsx.MatchString(contentType) { return "xlsx" } else if reg_pdf.MatchString(contentType) { return "pdf" } else if reg_zip.MatchString(contentType) { return "zip" } else if reg_rar.MatchString(contentType) { return "rar" } } else if len(ret) > 0 { return su.GetFileType(ret) } return "" } func GetAttachText(fid, fileName, fileType, url, size string, index int, ret []byte, attachments, attachText map[string]interface{}) { defer cu.Catch() attachments[fmt.Sprint(index+1)] = map[string]interface{}{ "fid": fid, "filename": fileName, "ftype": fileType, "org_url": url, "size": size, "url": "oss", } //附件解析 resp := GetFileText(fileName, fid, fileType, ret) if resp != nil { tmap := map[string]interface{}{} for i, r := range resp.Result { rmap := map[string]interface{}{ "file_name": r.FileName, "attach_url": r.TextUrl, "state": r.ErrorState, } tmap[fmt.Sprint(i)] = rmap } if len(tmap) > 0 { attachText[fmt.Sprint(index)] = tmap } } }