task.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. package main
  2. import (
  3. "bytes"
  4. "crypto/tls"
  5. "encoding/base64"
  6. "fmt"
  7. "github.com/PuerkitoBio/goquery"
  8. "io"
  9. "io/ioutil"
  10. cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
  11. su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
  12. "net/http"
  13. "regexp"
  14. "strings"
  15. "time"
  16. )
// Package-level regular expressions, compiled once at package init
// (regexp.MustCompile panics on an invalid pattern, which is the intended
// fail-fast behavior for patterns this complex).
var (
	// Filters template-language placeholders such as {{item.url}} so template
	// fragments are never treated as attachment links.
	// NOTE(review): [a-zA-z] also matches [ \ ] ^ _ ` (ASCII 91-96) — confirm
	// whether [a-zA-Z_] was intended.
	htmlModelReg = regexp.MustCompile(`{{[a-zA-z.()\d,:]{5,}}}|^(\$)`)
	// Strips junk URL fragments: relative "../" runs, the literal "null",
	// and trailing Chinese/ASCII punctuation.
	reg_filter_url = regexp.MustCompile(`((\.\./)+|null|[。))]+$)`)
	// Matches URLs that are navigation rather than files: tel:/mailto:/javascript
	// links, pure-Chinese anchors, login pages, and bare web-page extensions.
	reg_invalid_url = regexp.MustCompile(`(^(tel)|^#[\p{Han}]+$|^[\p{Han}]+$|javascript|login|mailto|\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)[))##/、]{0,}$)+`)
	// Strips noise characters from anchor text: brackets, CJK punctuation,
	// whitespace (including full-width spaces) and literal \n\t sequences.
	reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
	// Common downloadable attachment extensions (kept).
	reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg)$`)
	// Extensions/prefixes known to be unusable attachments (rejected).
	reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
	// Anchor text that is invalid when it matches in FULL (dates, "more",
	// "login", "back", single digits, etc.).
	reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
	// Anchor text that is invalid when it CONTAINS any of these navigation /
	// boilerplate keywords ("next page", "see original site", ...).
	reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
	// Anchor text that is invalid when it ENDS with one of these organization /
	// announcement-category suffixes ("company", "notice", "project", ...).
	reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
	// Link repair: leading "../", "./" or "/" runs to strip before re-basing.
	reg_repair_href1 = regexp.MustCompile(`^(\.\./|\./|/)+`)
	// reg_domain captures scheme://host/ ; reg_domain_param greedily captures
	// scheme://host/path/.../ up to the LAST slash of the page URL.
	reg_domain = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.)[^/]+/`)
	reg_domain_param = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.).*/`)
	// Attachment type keywords matched against the HTTP Content-Type header.
	reg_jpg = regexp.MustCompile(`(jpg|png|jpeg|image)`)
	reg_docx = regexp.MustCompile(`(docx|word)`)
	reg_doc = regexp.MustCompile(`doc`)
	reg_xlsx = regexp.MustCompile(`(xlsx|xls|sheet)`)
	reg_pdf = regexp.MustCompile(`pdf`)
	reg_zip = regexp.MustCompile(`zip`)
	reg_rar = regexp.MustCompile(`rar`)
)
// Data describes one candidate attachment link harvested from content HTML.
type Data struct {
	Url        string // attachment URL; may be a data:image/... URI for inline base64 images
	Text       string // anchor text of the <a> tag (lowercased/cleaned in the "filter" case)
	Ok         bool   // true once the file has been uploaded to OSS successfully
	By         string // how the link was classified: "url", "text" or "filter"
	FileType   string // file extension without the dot, e.g. "pdf"; empty until detected
	Base64Type bool   // true when Url holds an inline base64-encoded image
}
  51. // DownloadFile 补充未下附件
  52. func GetDataAndDownload(tmp map[string]interface{}) (isEnd int, saveMgo bool) {
  53. defer cu.Catch()
  54. isEnd = 1
  55. //1、筛选a标签
  56. hrefMap := map[string]string{} //记录contenthtml中筛出的a标签信息;key:url,val:text
  57. contenthtml := cu.ObjToString(tmp["contenthtml"])
  58. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(contenthtml))
  59. doc.Find("a[href]").Each(func(index int, element *goquery.Selection) {
  60. attachmentURL, _ := element.Attr("href") //链接
  61. if attachmentURL != "" && !htmlModelReg.MatchString(attachmentURL) {
  62. hrefMap[attachmentURL] = element.Text()
  63. }
  64. })
  65. tmpResult := FilterAndDownload(hrefMap) //筛选有效附件链接
  66. if len(tmpResult) > 0 {
  67. href := cu.ObjToString(tmp["href"])
  68. _, attachments, attchText := DealAndDownload(tmpResult, href) //修复链接和文本并下载附件
  69. if len(attachments) > 0 {
  70. //tmp["file_add_log"] = result
  71. if projectinfo, ok := tmp["projectinfo"].(map[string]interface{}); ok {
  72. projectinfo["attachments"] = attachments
  73. } else {
  74. tmp["projectinfo"] = map[string]interface{}{"attachments": attachments}
  75. }
  76. if len(attchText) > 0 {
  77. tmp["attach_text"] = attchText
  78. } else {
  79. saveMgo = true
  80. }
  81. isEnd = 0
  82. }
  83. }
  84. return
  85. }
  86. // FilterAndDownload 筛选有效数据并下载对应附件
  87. func FilterAndDownload(hrefMap map[string]string) (result []*Data) {
  88. defer cu.Catch()
  89. if len(hrefMap) == 0 {
  90. return
  91. }
  92. for url, text := range hrefMap {
  93. //url长度过滤
  94. tmpUrl := strings.ToLower(url)
  95. if len([]rune(tmpUrl)) <= 10 { //长度
  96. continue
  97. }
  98. //url无效字符过滤
  99. tmpUrl = reg_filter_url.ReplaceAllString(tmpUrl, "")
  100. if tmpUrl == "" || reg_invalid_url.MatchString(tmpUrl) {
  101. continue
  102. }
  103. tmpText := strings.ToLower(text)
  104. //url、text无效附件类型过滤
  105. if reg_err_filetype.MatchString(tmpUrl) || reg_err_filetype.MatchString(tmpText) { //无效附件类型
  106. continue
  107. }
  108. tmpText = reg_fileter_text.ReplaceAllString(tmpText, "") //过滤无效字符
  109. //text过滤
  110. if fileType := reg_filetype.FindString(tmpUrl); fileType != "" { //含常见附件类型结尾的url
  111. result = append(result, &Data{
  112. Url: url,
  113. Text: text,
  114. By: "url",
  115. FileType: strings.ReplaceAll(fileType, ".", ""),
  116. })
  117. } else if fileType := reg_filetype.FindString(tmpText); fileType != "" { //含常见附件类型结尾的text
  118. result = append(result, &Data{
  119. Url: url,
  120. Text: text,
  121. By: "text",
  122. FileType: strings.ReplaceAll(fileType, ".", ""),
  123. })
  124. } else {
  125. //textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
  126. if reg_invalid_text.ReplaceAllString(tmpText, "") == "" { //无效,全文本匹配,舍弃
  127. continue
  128. } else if reg_filter_text1.MatchString(tmpText) || reg_filter_text2.MatchString(tmpText) { //无效,部分文本匹配,舍弃
  129. continue
  130. }
  131. result = append(result, &Data{
  132. Url: url,
  133. Text: tmpText,
  134. By: "filter",
  135. })
  136. }
  137. }
  138. return
  139. }
  140. // DealAndDownload 修复链接和文本并下载附件
  141. func DealAndDownload(tmp []*Data, href string) (result []*Data, attachments, attachText map[string]interface{}) {
  142. defer cu.Catch()
  143. attachments = map[string]interface{}{}
  144. attachText = map[string]interface{}{}
  145. for _, data := range tmp {
  146. url := strings.ReplaceAll(data.Url, "\\", "/")
  147. //异常链接修复
  148. if !strings.HasPrefix(url, "https") && !strings.HasPrefix(url, "http") { //异常链接
  149. if strings.HasPrefix(url, "data:image/") { //base64图片
  150. //待处理TODO
  151. data.Base64Type = true
  152. result = append(result, data)
  153. data.Url = ""
  154. } else {
  155. url = reg_repair_href1.ReplaceAllString(url, "") //处理../ ./ /
  156. //获取href域名
  157. domain := reg_domain.FindString(href)
  158. //var urlArr []string
  159. param_domain := reg_domain_param.FindString(href)
  160. if domain != "" { //优先拼接域名
  161. data.Url = domain + url
  162. result = append(result, data)
  163. }
  164. if param_domain != "" { //再拼接带参链接
  165. data.Url = param_domain + url
  166. result = append(result, data)
  167. }
  168. }
  169. } else {
  170. result = append(result, data)
  171. }
  172. }
  173. if len(result) > 0 {
  174. index := 0
  175. for _, data := range result {
  176. if data.Base64Type {
  177. fileName := "附件" + fmt.Sprint(index+1) + ".jpg"
  178. i := strings.Index(data.Url, ",")
  179. dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(data.Url[i+1:]))
  180. ret, err := io.ReadAll(dec)
  181. if err == nil && len(ret) >= 1024*3 && len(ret) < 15*1024*1024 {
  182. fid := su.GetHashKey(ret) + su.TypeByExt(fileName)
  183. bs := bytes.NewReader(ret)
  184. size := su.ConvertFileSize(bs.Len())
  185. data.Ok, err = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传
  186. if data.Ok { //上传成功,解析附件
  187. GetAttachText(fid, fileName, "jpg", "", size, index, ret, attachments, attachText)
  188. index++
  189. }
  190. }
  191. } else {
  192. contentType, ret := Download(data.Url) //下载
  193. if len(ret) > 15*1024*1024 || len(ret) < 1024*3 {
  194. fmt.Println("file size is too big or small!")
  195. continue
  196. }
  197. fileType := data.FileType //从url或者text提取的附件类型
  198. if fileType == "" {
  199. fileType = GetType(contentType, ret) //获取附件类型
  200. data.FileType = fileType
  201. }
  202. if fileType != "" {
  203. fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
  204. fid := su.GetHashKey(ret) + su.TypeByExt(fileName)
  205. bs := bytes.NewReader(ret)
  206. size := su.ConvertFileSize(bs.Len())
  207. data.Ok, _ = su.OssPutObject(fid, io.MultiReader(bs)) //附件上传
  208. if data.Ok { //上传成功,解析附件
  209. GetAttachText(fid, fileName, fileType, data.Url, size, index, ret, attachments, attachText)
  210. index++
  211. }
  212. }
  213. }
  214. //contentType, ret := Download(data.Url) //下载
  215. //fileType := data.FileType //从url或者text提取的附件类型
  216. //if fileType == "" {
  217. // fileType = GetType(contentType, ret) //获取附件类型
  218. // data.FileType = fileType
  219. //}
  220. //if fileType != "" {
  221. // fileName := "附件" + fmt.Sprint(index+1) + "." + fileType
  222. // fid := sp.GetHashKey(ret) + sp.TypeByExt(fileName)
  223. // bs := bytes.NewReader(ret)
  224. // size := qu.ConvertFileSize(bs.Len())
  225. // b, _ := sp.OssPutObject(fid, io.MultiReader(bs)) //附件上传
  226. // //qu.Debug("oss", fileName, size, fileType, fid)
  227. // data.Ok = b
  228. // if b {
  229. // attachments[fmt.Sprint(index+1)] = map[string]interface{}{
  230. // "fid": fid,
  231. // "filename": fileName,
  232. // "ftype": fileType,
  233. // "org_url": data.Url,
  234. // "size": size,
  235. // "url": "oss",
  236. // }
  237. // //附件解析
  238. // conn, err := serviced.GetOcrServerConn() //链接ocr服务治理中心
  239. // if err == nil {
  240. // resp := GetFileText(conn, fileName, fid, fileType, ret)
  241. // if resp != nil {
  242. // tmap := map[string]interface{}{}
  243. // for i, r := range resp.Result {
  244. // rmap := map[string]interface{}{
  245. // "file_name": r.FileName,
  246. // "attach_url": r.TextUrl,
  247. // "state": r.ErrorState,
  248. // }
  249. // tmap[fmt.Sprint(i)] = rmap
  250. // }
  251. // if len(tmap) > 0 {
  252. // attachText[fmt.Sprint(index)] = tmap
  253. // }
  254. // }
  255. // } else {
  256. // qu.Debug("附件解析服务连接失败:", err)
  257. // }
  258. // index++
  259. // }
  260. //}
  261. }
  262. }
  263. return
  264. }
  265. // 下载
  266. func Download(url string) (string, []byte) {
  267. defer cu.Catch()
  268. client := &http.Client{
  269. Timeout: 3 * time.Minute,
  270. Transport: &http.Transport{
  271. TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
  272. },
  273. }
  274. req, err := http.NewRequest("GET", url, nil)
  275. if err != nil {
  276. //fmt.Println("Error creating request:", err)
  277. return "", []byte{}
  278. }
  279. resp, err := client.Do(req)
  280. if err != nil {
  281. //fmt.Println("Error sending request:", err)
  282. return "", []byte{}
  283. }
  284. defer resp.Body.Close()
  285. if resp.StatusCode == 200 {
  286. bodyBytes, _ := ioutil.ReadAll(resp.Body)
  287. return resp.Header.Get("Content-Type"), bodyBytes
  288. }
  289. return "", []byte{}
  290. }
  291. func GetType(contentType string, ret []byte) string {
  292. if contentType != "" {
  293. if reg_jpg.MatchString(contentType) {
  294. return "jpg"
  295. } else if reg_docx.MatchString(contentType) {
  296. return "docx"
  297. } else if reg_doc.MatchString(contentType) {
  298. return "doc"
  299. } else if reg_xlsx.MatchString(contentType) {
  300. return "xlsx"
  301. } else if reg_pdf.MatchString(contentType) {
  302. return "pdf"
  303. } else if reg_zip.MatchString(contentType) {
  304. return "zip"
  305. } else if reg_rar.MatchString(contentType) {
  306. return "rar"
  307. }
  308. } else if len(ret) > 0 {
  309. return su.GetFileType(ret)
  310. }
  311. return ""
  312. }
  313. func GetAttachText(fid, fileName, fileType, url, size string, index int, ret []byte, attachments, attachText map[string]interface{}) {
  314. defer cu.Catch()
  315. attachments[fmt.Sprint(index+1)] = map[string]interface{}{
  316. "fid": fid,
  317. "filename": fileName,
  318. "ftype": fileType,
  319. "org_url": url,
  320. "size": size,
  321. "url": "oss",
  322. }
  323. //附件解析
  324. resp := GetFileText(fileName, fid, fileType, ret)
  325. if resp != nil {
  326. tmap := map[string]interface{}{}
  327. for i, r := range resp.Result {
  328. rmap := map[string]interface{}{
  329. "file_name": r.FileName,
  330. "attach_url": r.TextUrl,
  331. "state": r.ErrorState,
  332. }
  333. tmap[fmt.Sprint(i)] = rmap
  334. }
  335. if len(tmap) > 0 {
  336. attachText[fmt.Sprint(index)] = tmap
  337. }
  338. }
  339. }