-- 2层JS翻页模版.lua (4.6 KB) — two-level (list page -> detail page) scraping template that pages through the list by clicking the "next page" link with JavaScript.
  -- Two-level scraping template: list page -> detail page -> attachment links.
  -- The list page is paged by simulating a click on its "next page" link, so
  -- list-page URLs never have to be assembled by hand.
  --
  -- All browser_* functions are provided by the host application; their exact
  -- signatures are not visible in this file, so argument meanings noted below
  -- are assumptions to be confirmed against the host documentation.

  ---------------------------------------------------------------------------
  -- Scrape parameters
  ---------------------------------------------------------------------------

  -- Entry (list) page. Another URL this template was apparently used with:
  -- https://www.mas.gov.cn/xxgk/opennessTarget/?branch_id=57a3df762c262ea9a00aae84&column_code=150101&topic_id=&type=bumen
  local href = "https://www.songjiang.gov.cn/Template/dynamic/currency/infoOpenMore.html?categorynum=004003002"

  -- CSS selector for the links on the list page. A narrow selector keeps junk
  -- links out; a plain "a" also works if you do not want to define one.
  local info_list_css_selector = "div.ewb-infos td a"

  -- Marker of a third-level page URL, meant for locating its tab (to run JS in
  -- it / close it). Not referenced anywhere else in this script.
  local info_url_path_tpl = "contents"

  -- JS whose final expression evaluates to the "next page" link/button element.
  local next_page_css_selector = 'var a;document.querySelectorAll("ul.m-pagination-page li a").forEach((v,i)=>{if(v.innerText.indexOf("下一页")>-1){a=v;}});a'

  -- Maximum number of list pages to visit.
  local max_pages = 7

  -- Deprecated: paging is done by clicking "next page" with JS, so this URL
  -- template is not used.
  local href_tpl = "https://www.bjshy.gov.cn/web/zwgk/czsj/3fb44002-%d.html"

  -- Timeout passed to the initial navigation (two hours if the unit is
  -- milliseconds — TODO confirm the unit).
  local timeout = 1000 * 60 * 120

  ---------------------------------------------------------------------------
  -- JavaScript snippets executed inside the browser
  ---------------------------------------------------------------------------

  -- TODO: real "is there a next page" check. The element lookup is commented
  -- out, so this snippet currently always yields "true" and the loop below is
  -- bounded only by max_pages.
  local is_has_next_page_js = [[
  var ret="false";
  //var obj = %s;
  //if(obj){ret="true"};
  ret="true";
  ret
  ]]
  --is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)

  -- Collect list-page links whose text mentions a budget/performance keyword
  -- AND one of the years 2022-2024. Every matched link is also forced to open
  -- in a new tab (target="_blank").
  local find_info_list_js = [[
  var ns = [];
  document.querySelectorAll("%s").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var href = v.href;
    if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1||linkText.indexOf("绩效")>-1 ||linkText.indexOf("目标")>-1 ) &&
       (linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) {
      ns.push({"index":i,"title":linkText,"href":href})
    }
  });
  ns
  ]]
  find_info_list_js = string.format(find_info_list_js, info_list_css_selector)

  -- Collect attachment links on a detail page: any <a> whose text or href
  -- contains a known document/archive extension. The regex carries the "i"
  -- flag, so each extension is listed once and no lower-casing is needed.
  local find_info_attach_file_js = [[
  var ns = [];
  var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'];
  var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')', 'i');
  document.querySelectorAll("a").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var href = v.href;
    if(extensionRegex.test(linkText)||extensionRegex.test(href)){
      ns.push({"index":i,"title":linkText,"href":href})
    }
  });
  ns
  ]]

  ---------------------------------------------------------------------------
  -- Helpers
  ---------------------------------------------------------------------------

  -- Length of `value` for logging; 0 when `#` cannot be applied (e.g. nil).
  -- Needed because a failed browser_executejs may not return a list at all.
  local function safe_len(value)
    local ok, n = pcall(function() return #value end)
    if ok then
      return n
    end
    return 0
  end

  -- Open one list item's detail page, look for attachment links and save each
  -- one via browser_save. Returns the number of attachments saved.
  local function collect_attachments(item)
    -- NOTE(review): the leading `true` presumably opens the URL in a new tab —
    -- confirm against the host API.
    browser_navagite(true, 1000 * 30, item.href)

    -- First try to address the detail tab by its full URL; if that fails, fall
    -- back to the last two URL segments.
    local tab_key = item.href
    local ok, files = browser_executejs(tab_key, 1000 * 3, 1, find_info_attach_file_js)
    if ok ~= "ok" then
      tab_key = browser_url_last_segs(2, item.href)
      ok, files = browser_executejs(tab_key, 1000 * 3, 1, find_info_attach_file_js)
    end
    print('查找详情页附件链接', ok, files, safe_len(files))

    if ok ~= "ok" then
      print("未下载", item.title, item.href)
      browser_log("下载失败", item.title, item.href)
      return 0
    end

    local saved = 0
    for _, file in pairs(files or {}) do
      -- Store the record.
      print(file.title, file.href)
      browser_save("", "", "", "", { ["department"] = item['title'], ["info_title"] = file['title'], ["attach_href"] = file["href"] })
      saved = saved + 1
    end
    -- Detail tabs are not closed here; they are cleaned up once per list page
    -- by browser_closetabs_without in the main loop.
    return saved
  end

  ---------------------------------------------------------------------------
  -- Main flow
  ---------------------------------------------------------------------------

  -- Step 1: open the entry page and give it time to render.
  browser_navagite(timeout, href)
  browser_sleep(1000 * 3)

  local has_next_page, current_page_has_data = true, false
  local page_no = 1
  repeat
    -- Tracks whether this list page produced any attachment; kept for the
    -- optional stop condition noted at the `until` below.
    current_page_has_data = false

    local ok, list = browser_executejs(1000 * 3, 1, find_info_list_js)
    if ok == "ok" then
      print("信息列表信息", list, safe_len(list))
      for _, v in pairs(list or {}) do
        print(v.title, v.href, v.index)
        -- Skip entries without a usable URL instead of navigating to nothing.
        if v.href ~= nil and v.href ~= "" then
          if collect_attachments(v) > 0 then
            current_page_has_data = true
          end
        end
      end
    else
      print("执行列表页查询信息js失败", ok)
    end

    -- Decide whether to continue, then click "next page" on the list page.
    local ok3, ret = browser_executejs(1000, 0, is_has_next_page_js)
    print(ok3, ret)
    if ret == "true" then
      has_next_page = true
      browser_click(1000, 3, next_page_css_selector)
      page_no = page_no + 1
    else
      has_next_page = false
    end
    print("翻页::", page_no, max_pages)
    browser_sleep(1000 * 3)

    -- Close the other tabs; some tabs inexplicably refuse to close otherwise.
    browser_closetabs_without("", href, 1000 * 1)
  until not has_next_page or page_no > max_pages -- optionally also: not current_page_has_data

  print("所有链接都爬完了")
  browser_sleep(1000 * 5)