-- File: 2层URL翻页模版.lua_bak (4.0 KB)
  -- Crawl parameters.
  -- Two levels: a list of departments, then the info page of each entry.
  -- List-page URLs are built from a template, which must be specified.
  -- The list entries contain attachments.

  -- URL of the first list page.
  local href = "https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/2023bmys/index.html"

  -- Template for the following list pages; %d receives the page number.
  local href_tpl = "https://www.nanning.gov.cn/zwgk/fdzdgknr/czxx/sbjyjs/sbjbmys/2023bmys/index_%d.html"

  -- CSS selector for entries on the list page. A narrower selector avoids
  -- picking up junk links; plain "a" works if you do not want to define one.
  local info_list_css_selector = "a"

  -- Marker of a third-level page, meant for locating its tab so that JS can
  -- be run in it and the tab closed afterwards.
  local info_url_path_tpl = "t20240306"

  -- Range of list pages to walk: first page number, last page number, step.
  local first_page_no = 0
  local last_page_no = 6
  local page_step = 1

  -- Overall timeout value (presumably milliseconds, i.e. 120 minutes -- TODO confirm).
  local timeout = 120 * 60 * 1000
  -- TODO: real next-page detection is not implemented yet.
  -- JavaScript snippet that reports whether a next page exists. It currently
  -- always evaluates to "true"; the selector-based check is commented out
  -- inside the snippet.
  --
  -- The template still contains one %s placeholder, so string.format needs a
  -- non-nil argument. Reuse a selector defined elsewhere if there is one,
  -- otherwise fall back to an empty string instead of passing nil.
  local next_page_css_selector = next_page_css_selector or ""
  local is_has_next_page_js = [[
  var ret="false";
  //var obj = %s;
  //if(obj){ret="true"};
  ret="true";
  ret
  ]]
  is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)
  -- Find list entries.
  -- JavaScript snippet that collects the links matched by
  -- info_list_css_selector on the current page. It evaluates to an array of
  -- {index, title, href} objects, where index is the position of the link
  -- in the querySelectorAll result.
  --
  -- Each link is pushed exactly once. Pushing it a second time when the
  -- keyword/year filter matched would make the crawl loop visit and save
  -- that entry twice.
  --
  -- Note: this text goes through string.format, so it must contain exactly
  -- one format placeholder (the selector) and no other percent signs.
  local find_info_list_js = [[
  var ns = [];
  document.querySelectorAll("%s").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    // To restrict the crawl to budget links only, wrap the push below in:
    // if((linkText.indexOf("预算")>-1 || linkText.indexOf("预决算")>-1) &&
    //    (linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) { ... }
    ns.push({"index":i,"title":linkText,"href":v.href});
  });
  ns
  ]]
  find_info_list_js = string.format(find_info_list_js, info_list_css_selector)
  -- Find attachments.
  -- JavaScript snippet that collects every link on the current page whose
  -- text or URL contains a known attachment extension. It evaluates to an
  -- array of {index, title, href} objects.
  --
  -- The pattern is case-insensitive, so upper-case extensions do not need to
  -- be listed separately. The trailing word boundary keeps longer tokens
  -- such as ".docs" or ".zipper" from being treated as attachments, while
  -- still matching "file.pdf?x=1" or an extension followed by CJK text.
  -- This string is NOT passed through string.format.
  local find_info_attach_file_js = [[
  var ns = [];
  var extensionRegex = /\.(pdf|docx?|xlsx?|zip|rar)\b/i;
  document.querySelectorAll("a").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var href = v.href;
    if(extensionRegex.test(linkText) || extensionRegex.test(href)){
      ns.push({"index":i,"title":linkText,"href":href});
    }
  });
  ns
  ]]
  -- TODO 1. Open the main (list) page.
  -- Main crawl loop: walks the list pages from first_page_no to last_page_no,
  -- opens every entry found on each list page, collects the attachment links
  -- on the entry page and stores one record per attachment.
  -- The browser_* functions are provided by the host environment; their
  -- exact argument meanings are not visible in this script.
  local page_no = first_page_no
  -- Set to true whenever an attachment is saved for the current list page.
  -- Only the commented-out stop condition at the bottom would read it.
  local current_page_has_data = true
  repeat
    -- The first page uses the fixed URL; later pages come from the template.
    if page_no > first_page_no then
      href = string.format(href_tpl, page_no)
    end
    browser_navagite(1000*120, href)
    browser_sleep(1000 * 1)
    current_page_has_data = false
    local ok, list = browser_executejs(1000*30, 1, find_info_list_js)
    -- Only iterate when the script succeeded AND actually returned a table;
    -- taking the length of nil would abort the whole crawl.
    if ok == "ok" and type(list) == "table" then
      print("信息列表信息", list, #list)
      for _, v in pairs(list) do
        print(v.title, v.href, v.index)
        --browser_click(1000*5, 3, string.format('document.querySelectorAll("%s")[', info_list_css_selector) .. v['index'] .. ']')
        --browser_sleep(1000 * 1)
        browser_navagite(true,1000*30, v.href)
        --local path = string.match(v.href, info_url_path_tpl)
        --print("path::",path)
        local path = v.href
        local ok2, list2 = browser_executejs(v.href, 1000*30, 1, find_info_attach_file_js)
        if ok2 ~= "ok" then
          -- Retry, addressing the tab by the last two URL segments instead
          -- of the full URL.
          path = browser_url_last_segs(2, v.href)
          ok2, list2 = browser_executejs(path, 1000*3, 1, find_info_attach_file_js)
        end
        -- list2 may be nil or a non-table value when the script failed, so
        -- compute the count defensively before logging it.
        local attach_count = 0
        if type(list2) == "table" then
          attach_count = #list2
        end
        print('查找详情页附件链接', ok2, list2, attach_count)
        if ok2 == "ok" and type(list2) == "table" then
          for _, v2 in pairs(list2) do
            -- Store the record.
            current_page_has_data = true
            print(v2.title, v2.href)
            browser_save("", "", "", "", { ["department"] = v['title'], ["info_title"] = v2['title'], ["attach_href"] = v2["href"] })
          end
        end
        browser_closetabs(path, 500)
      end
    end
    -- browser_reset()
    page_no = page_no + page_step
    -- Close the other tabs; for some reason some tabs do not get closed.
    browser_closetabs_without("", href, 1000*1)
    print("翻页::", page_no, last_page_no)
    browser_sleep(1000 * 2)
  until page_no > last_page_no --not current_page_has_data or
  print("所有链接都爬完了")
  browser_sleep(1000 * 5)