-- Source file: 3级列表不翻页不定css.lua (4.0 KB)
  -- Crawl configuration.
  -- The site has three levels: department / organisation list -> organisation
  -- info list -> info page.
  -- Neither the department list nor the info list is paginated.
  local href = [[https://www.hlj.gov.cn/hlj/c108397/zfxxgk.shtml]]
  -- JS expression selecting the organisation (unit) links.
  local organ_list_css_selector = [[document.querySelectorAll("div#tableList table a")]]
  -- JS expression selecting the "next page" link or button.
  local next_page_css_selector = [[document.querySelector("div#page_div a.nextbtn")]]
  -- CSS selector for links on the info list page. A narrower selector helps
  -- avoid junk links; a plain "a" is fine when no narrowing is wanted.
  local info_list_css_selector = [[a]]
  -- JS snippet that checks for a next page: it evaluates to the string "true"
  -- when the next-page selector expression yields an element, else "false".
  -- The %s placeholder is filled with next_page_css_selector.
  local is_has_next_page_js = string.format([[
var ret="false";
var obj = %s;
if(obj){ret="true"};
ret
]], next_page_css_selector)
  -- JS snippet that lists the organisation links: every element matched by
  -- organ_list_css_selector gets target="_blank" and is reported as
  -- {index, href, text}. The %s placeholder is filled with the selector
  -- expression.
  local find_organ_list_js = string.format([[
var ns = [];
%s.forEach((v,i)=>{
v.setAttribute("target","_blank");
ns.push({"index":i,"href":v.href,"text":v.innerText});
});
ns
]], organ_list_css_selector)
  -- JS snippet that collects an organisation's annual budget links.
  -- Every <a> on the page gets target="_blank"; a link is reported as
  -- {index, title, href} when its text contains a budget keyword
  -- ("预算" or "预决算") AND one of the years 2022-2024.
  -- The keyword test uses indexOf(...) >= 0 so that a keyword at the very
  -- start of the title (e.g. "2023年部门预算") is matched as well.
  local find_organ_info_js = [[
var ns = [];
var topics = ["预算", "预决算"];
var years = ["2022年", "2023年", "2024年"];
var hasAny = (text, words) => words.some((w) => text.indexOf(w) >= 0);
document.querySelectorAll("a").forEach((v, i) => {
  v.setAttribute("target", "_blank");
  var linkText = v.innerText || "";
  if (hasAny(linkText, topics) && hasAny(linkText, years)) {
    ns.push({"index": i, "title": linkText, "href": v.href});
  }
});
ns
]]
  -- JS snippet that collects attachment links from an info page.
  -- Every <a> gets target="_blank"; a link is reported as {index, title, href}
  -- when its file extension is in allow_exts. The extension is taken from the
  -- link text first and from the href when the text does not end in an
  -- allowed extension. This covers link texts without any dot such as "附件",
  -- which previously suppressed the href fallback.
  -- Extensions are compared lower-cased, and a ?query / #fragment on the href
  -- is ignored when extracting the extension.
  local find_info_attach_file_js = [=[
var ns = [];
var allow_exts = ["doc","docx","xls","xlsx","pdf","txt","jpg","png","ppt","pptx","zip","rar"];
var getFileExt = (text) => {
  var s = String(text || "").trim();
  var dot = s.lastIndexOf(".");
  return dot < 0 ? "" : s.substring(dot + 1).toLowerCase();
};
var stripQuery = (url) => String(url || "").split("#")[0].split("?")[0];
document.querySelectorAll("a").forEach((v, i) => {
  v.setAttribute("target", "_blank");
  var linkText = v.innerText;
  var ext = getFileExt(linkText);
  if (allow_exts.indexOf(ext) < 0) {
    ext = getFileExt(stripQuery(v.href));
  }
  if (allow_exts.indexOf(ext) >= 0) {
    ns.push({"index": i, "title": linkText, "href": v.href});
  }
});
ns
]=]
  -- Main crawl flow: open the landing page, click through every organisation
  -- link, collect the attachment links of the opened page and save one record
  -- per attachment.
  -- NOTE(review): the browser_* functions are provided by the host; the
  -- meaning of their numeric arguments (timeouts, the mode argument 0/1/3) is
  -- not visible in this file - confirm against the host API.
  local timeout = 1000 * 60 * 120
  browser_sleep(1000 * 2)
  -- Step 1: open the landing page.
  browser_navagite(timeout, href)
  browser_sleep(1000 * 2)
  -- Start with the departmental budget / final-accounts links.
  local has_next_page = false
  repeat
    -- Step 2: find the organisation links.
    print("找单位链接")
    local ok, list = browser_executejs(timeout, 1, find_organ_list_js)
    if ok == "ok" and list ~= nil then
      print("返回拉取市政府下边子级单位清单", list, #list)
      for _, v in pairs(list) do
        print(v)
        -- Open the organisation link.
        browser_click(6000, 3, organ_list_css_selector .. '[' .. v['index'] .. ']')
        browser_sleep(1000 * 1)
        -- No pagination at this level. The selector is now actually
        -- substituted into the expression (the old format string had no %s,
        -- so info_list_css_selector was silently ignored).
        -- NOTE(review): this reuses the organisation link's index as the
        -- index into the info list - confirm that this is intended.
        browser_click(1000 * 5, 3,
          string.format('document.querySelectorAll("%s")[', info_list_css_selector) .. v['index'] .. ']')
        browser_sleep(1000 * 1)
        --local path = string.match(v.href, info_url_path_tpl)
        --print("path::",path)
        local ok2, list2 = browser_executejs(v.href, timeout, 1, find_info_attach_file_js)
        -- Guard the length operator: a nil result must not abort the crawl.
        print('查找详情页附件链接', ok2, list2, list2 ~= nil and #list2 or 0)
        if ok2 == "ok" and list2 ~= nil then
          for _, v2 in pairs(list2) do
            -- Store the record.
            -- NOTE(review): current_page_has_data is an undeclared global that
            -- is never read in this file; kept in case the host reads it.
            current_page_has_data = true
            print(v2.title, v2.href)
            -- Items from find_organ_list_js carry "text", not "title", so
            -- fall back to it; otherwise the department field is always nil.
            browser_save("", "", "", "", {
              ["department"] = v['title'] or v['text'],
              ["info_title"] = v2['title'],
              ["attach_href"] = v2["href"],
            })
          end
        end
        browser_closetabs(v.href, 500)
        -- Close the list page.
        browser_closetabs('bmys', 1000)
      end
    else
      print("--错误,第一页就打不开")
    end
    local _, next_page_flag = browser_executejs(timeout, 0, is_has_next_page_js)
    -- NOTE(review): nothing in this loop navigates to the next page, so if the
    -- next-page element exists the same page is processed again on every
    -- iteration - confirm that the landing page never shows that element.
    has_next_page = (next_page_flag == "true")
  until not has_next_page
  print("找单位链接结束")
  browser_sleep(1000 * 60)