-- 2层URL翻页模版.lua: two-level URL pagination scraping template
  -- Scraper configuration.
  -- Two-level crawl: a paginated list of entries (per-unit information), then
  -- each entry's own page.
  -- List-page URLs are built by substitution, so a URL template must be given.
  -- The listed pages may contain attachments.
  require("scripts/inc/util")

  -- URL of the first list page, and the template used for later page numbers
  -- (`%d` is replaced by the page number).
  local href = "https://czj.xinyang.gov.cn/c/113/1.html"
  local href_tpl = "https://czj.xinyang.gov.cn/c/113/%d.html"

  -- CSS selector for the entry links on a list page. A narrow selector keeps
  -- junk links out of the results; a bare "a" also works if you do not want to
  -- define one.
  local info_list_css_selector = "div.zfxxgk_zdgkc ul a"

  -- List-page range: first page number, last page number, and increment.
  local first_page_no = 1
  local last_page_no = 100
  local page_step = 1

  -- 120 * 60 * 1000 (presumably milliseconds, i.e. two hours - confirm).
  -- Not referenced anywhere else in the visible script.
  local timeout = 120 * 60 * 1000
  -- TODO: real next-page detection is not implemented yet.
  --
  -- JavaScript snippet intended to report whether the list has a next page.
  -- The selector-based check is commented out inside the snippet, so it
  -- currently always evaluates to the string "true". The snippet is not
  -- referenced anywhere else in the visible script.
  --
  -- The template still carries a `%s` placeholder (inside a JS comment), so it
  -- has to go through string.format. The selector passed to it was never
  -- defined in this script; formatting an undefined (nil) value with `%s`
  -- either raises or splices the text "nil" into the snippet, depending on the
  -- Lua implementation. Declare the selector explicitly instead: a global of
  -- the same name is honoured if one exists, otherwise the JS literal `null`
  -- is substituted so the (commented-out) check stays valid JavaScript.
  local next_page_css_selector = rawget(_G, "next_page_css_selector") or "null"

  local is_has_next_page_js = [[
var ret="false";
//var obj = %s;
//if(obj){ret="true"};
ret="true";
ret
]]
  is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)
  -- JavaScript that collects the entries of interest from a list page.
  -- For every element matched by the list selector it forces target="_blank",
  -- then keeps the link when its text contains one of the topic keywords
  -- (预算 / 预决算 / 绩效 / 目标) and also contains "2024年". Each kept link
  -- is recorded as {index, title, href}; the snippet evaluates to that array.
  -- `%s` in the template is replaced by info_list_css_selector.
  local find_info_list_js_tpl = [[
var ns = [];
document.querySelectorAll("%s").forEach((v,i)=>{
v.setAttribute("target","_blank");
var linkText = v.innerText;
if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1||linkText.indexOf("绩效")>-1 ||linkText.indexOf("目标")>-1 ) &&
(linkText.indexOf("2024年")>-1)) { //linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 ||
ns.push({"index":i,"title":linkText,"href":v.href})
}
});
ns
]]
  local find_info_list_js = string.format(find_info_list_js_tpl, info_list_css_selector)
  -- JavaScript that collects attachment links from the current page.
  -- It walks every <a> element, forces target="_blank", and keeps the link when
  -- either its text or its href ends with one of the listed file extensions
  -- (the match is case-insensitive). Each kept link is recorded as
  -- {index, title, href}; the snippet evaluates to that array.
  -- A level-1 long bracket is used because the JavaScript itself contains
  -- square brackets.
  local find_info_attach_file_js = [=[
var ns = [];
var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar','PDF','DOC','DOCX','XLS','XLSX','ZIP','RAR'];
var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')$', 'i');
document.querySelectorAll("a").forEach((v,i)=>{
v.setAttribute("target","_blank");
var linkText = v.innerText;
var href = v.href
if(extensionRegex.test(linkText.toLowerCase())||extensionRegex.test(href.toLowerCase())){
ns.push({"index":i,"title":linkText,"href":v.href})
}
});
ns
]=]
  -- Crawl driver (TODO step 1: open the main page).
  -- Walks the list pages from first_page_no to last_page_no in steps of
  -- page_step. On each list page it runs find_info_list_js, opens every
  -- returned entry, looks for attachment links with find_info_attach_file_js
  -- and stores one record per attachment. When an entry yields no attachment
  -- the page is saved as a PDF and a single record without attachment is stored.

  -- Directory handed to save_page_to_pdf for pages without attachments.
  local pdf_output_dir = "/Users/taozhang/Downloads/docs2/"

  local page_no = first_page_no
  -- NOTE(review): this flag is maintained below but never read; the
  -- "stop when a page has no data" exit condition is disabled in this script.
  local current_page_has_data = true

  -- Store one record for `entry`; the entry title is used as the department
  -- and the entry href as the source.
  local function save_record(entry, info_title, attach_href)
    browser_save("", "", "", "", {
      ["department"] = entry.title,
      ["info_title"] = info_title,
      ["attach_href"] = attach_href,
      ["source"] = entry.href,
    })
  end

  -- Open one list entry, harvest its attachments (or fall back to a PDF of the
  -- page), close its tab, and report whether any attachment record was stored.
  local function visit_entry(entry)
    print(entry.title, entry.href, entry.index)
    -- NOTE(review): the leading `true` presumably opens the page in a new tab -
    -- confirm against the host API.
    browser_navagite(true, 1000 * 30, entry.href)

    -- `tab` is the identifier later passed to the page-level helpers; it
    -- starts as the full href.
    local tab = entry.href
    local status, attachments =
      browser_executejs(entry.href, 1000 * 30, 1, find_info_attach_file_js)
    if status ~= "ok" then
      -- Retry once, addressing the tab by the value browser_url_last_segs
      -- returns for (2, href).
      -- NOTE(review): the retry uses 1000*3 where the first attempt uses
      -- 1000*30 - confirm this is intentional.
      tab = browser_url_last_segs(2, entry.href)
      status, attachments =
        browser_executejs(tab, 1000 * 3, 1, find_info_attach_file_js)
    end
    -- NOTE(review): table_length is also called when the JS call did not
    -- return "ok"; confirm it tolerates whatever `attachments` holds then.
    print('查找详情页附件链接', status, attachments, table_length(attachments))

    local saved_any = false
    if status == "ok" then
      for _, attachment in pairs(attachments) do
        -- Store the data.
        saved_any = true
        print(attachment.title, attachment.href)
        save_record(entry, attachment.title, attachment.href)
      end
    end

    -- No attachment found: save the main document area to a file instead.
    if table_length(attachments) == 0 then
      print("找不到附件,准备生成网页pdf")
      clear_page_el("", tab, 1000 * 2, "div.article_txt")
      save_page_to_pdf("", tab, pdf_output_dir, entry.title)
      save_record(entry, entry.title, "")
    end

    browser_closetabs(tab, 500)
    return saved_any
  end

  while true do
    -- The first page uses the literal start URL; later pages use the template.
    if page_no > first_page_no then
      href = string.format(href_tpl, page_no)
    end
    browser_navagite(1000 * 120, href)
    browser_sleep(1000)
    current_page_has_data = false

    local status, entries = browser_executejs(1000 * 30, 1, find_info_list_js)
    if status == "ok" then
      print("信息列表信息", entries, table_length(entries))
      for _, entry in pairs(entries) do
        if visit_entry(entry) then
          current_page_has_data = true
        end
      end
    end

    browser_reset()
    page_no = page_no + page_step
    print("翻页::", page_no, last_page_no)
    if page_no > last_page_no then
      break
    end
  end

  print("所有链接都爬完了")
  browser_sleep(1000 * 5)