-- File: 3层带机构列表无需翻页模版.lua_bak (4.7 KB)

  -- Crawl settings. The first screen is the list of organisations.
  -- NOTE(review): the original comment mentions the Wuhu site, but `href` below
  -- points at bjxch.gov.cn — confirm which site this template targets.
  -- Second level: the per-organisation information list / information items.
  -- List-page URLs are not assembled by hand; the "next page" link on the list
  -- page is clicked instead, so `href_tpl` can be ignored.

  -- Entry page that shows the organisation list.
  local href = "https://www.bjxch.gov.cn/zt/yjs/index.html"

  -- CSS selector for the organisation links on the entry page.
  local organ_list_css_selector = "div#Section1 div.box-danwei a"

  -- CSS selector for the links on an organisation's list page. A narrow selector
  -- avoids picking up junk links; use plain "a" when no narrowing is wanted.
  -- It must never be empty: the value is spliced into
  -- document.querySelectorAll("..."), and an empty selector makes that call throw.
  local info_list_css_selector = "a"

  -- URL fragment that characterises third-level (detail) pages; intended for
  -- locating the tab in which to run JS / which to close.
  -- Not referenced in the visible part of this script.
  local info_url_path_tpl = "contents"

  -- JS expression locating the next-page link or button.
  -- Not referenced in the visible part of this script.
  local next_page_css_selector = 'document.querySelector("div#page_public_info a[aria-label=跳转至尾页]")'

  -- Maximum number of list pages. Not referenced in the visible part of this script.
  local max_pages = 1

  -- The next page is reached by clicking via JS, so this URL template is slated
  -- for removal. Not referenced in the visible part of this script.
  local href_tpl = "https://www.bjshy.gov.cn/web/zwgk/czsj/3fb44002-%d.html"

  -- Timeout for the initial navigation (presumably milliseconds, i.e. 120
  -- minutes — confirm against the host API).
  local timeout = 1000 * 60 * 120
  -- JavaScript snippets evaluated inside the browser page.

  -- find_organ_list_js: collects every link matched by organ_list_css_selector.
  -- Each link is forced to open in a new tab (target="_blank"), and the script
  -- evaluates to an array of {index, title, href} objects, where `index` is the
  -- link's position in the querySelectorAll result.
  --
  -- The template is passed through string.format, so "%s" is the selector
  -- placeholder and the JS text must not contain any other "%" sequence.
  --
  -- An earlier variant (previously left as unreachable code after a bare
  -- `return`) descended into each matched element and kept only inner links
  -- whose text contained 财政预算, 三公 or 专项资金. Re-introduce such a filter
  -- here if the organisation list needs narrowing.
  local find_organ_list_js = [[
    var ns = [];
    document.querySelectorAll("%s").forEach((v, i) => {
      v.setAttribute("target", "_blank");
      ns.push({"index": i, "title": v.innerText, "href": v.href});
    });
    ns
  ]]
  find_organ_list_js = string.format(find_organ_list_js, organ_list_css_selector)
  -- find_info_list_js: scans the links matched by info_list_css_selector on an
  -- organisation page. Every matched link gets target="_blank"; a link is kept
  -- when its text contains 预算 or 预决算 AND one of 2022年 / 2023年 / 2024年.
  -- The script evaluates to an array of {index, title, href} objects, where
  -- `index` is the link's position in the querySelectorAll result.
  -- "%s" in the template is the selector placeholder filled by string.format.
  local find_info_list_js = string.format([[
  var ns = [];
  document.querySelectorAll("%s").forEach((v,i)=>{
  v.setAttribute("target","_blank");
  var linkText = v.innerText;
  //ns.push({"index":i,"title":linkText,"href":v.href})
  if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1) &&
  (linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) {
  ns.push({"index":i,"title":linkText,"href":v.href})
  }
  });
  ns
  ]], info_list_css_selector)
  -- find_info_attach_file_js: scans every <a> on a detail page and keeps those
  -- whose link text or href carries one of the attachment extensions listed in
  -- `extensions`. Every link gets target="_blank". The script evaluates to an
  -- array of {index, title, href} objects, where `index` is the link's position
  -- among all <a> elements.
  --
  -- This is a Lua long string, so backslashes reach JavaScript untouched:
  -- '\\.(' is the JS string "\.(" and therefore a literal dot in the RegExp.
  local find_info_attach_file_js = [[
    var ns = [];
    var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'];
    // The extension must not be followed by another letter or digit, so that
    // ".docs", ".pdfx" or a host such as "www.document.example" do not count
    // as attachments. The "i" flag already makes the match case-insensitive.
    var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')(?![a-z0-9])', 'i');
    document.querySelectorAll("a").forEach((v, i) => {
      v.setAttribute("target", "_blank");
      var linkText = v.innerText;
      var href = v.href;
      if (extensionRegex.test(linkText) || extensionRegex.test(href)) {
        ns.push({"index": i, "title": linkText, "href": href});
      }
    });
    ns
  ]]
  -- Main crawl flow:
  --   1. open the entry page and collect the organisation links;
  --   2. for each organisation, open its page, click "more" and collect the
  --      matching information links;
  --   3. for each information link, open the detail page, collect attachment
  --      links and store one record per attachment via browser_save.
  -- All browser_* functions are provided by the host; their argument meanings
  -- are not visible in this file, so the calls keep their original shape.

  -- Length of a result list for log output; 0 when the host returned no table
  -- (e.g. on a failed call), so that logging itself can never raise an error
  -- before the status check runs.
  local function result_count(t)
    if type(t) == "table" then
      return #t
    end
    return 0
  end

  browser_navagite(timeout, href)
  browser_sleep(1000 * 2)
  local ok, list = browser_executejs(1000 * 30, 1, find_organ_list_js)
  print('执行查找机构列表', ok, list, result_count(list))
  if ok == "ok" then
    for _, v in pairs(list) do
      -- Open the organisation page.
      print("机构信息", v.title, v.href)
      -- Close the other tabs first; for unknown reasons some tabs fail to close.
      browser_closetabs_without("", href, 1000 * 1)
      -- Alternative kept for reference: click the link instead of navigating.
      --browser_click(1000*5, 3, string.format('document.querySelectorAll("%s")[', organ_list_css_selector) .. v['index'] .. ']')
      -- NOTE(review): the leading `true` presumably selects a new tab — confirm
      -- against the host API.
      browser_navagite(true, 1000 * 30, v.href)
      -- Click the "more" link once; `path` identifies the organisation tab.
      local path = browser_url_last_segs(2, v.href)
      print("机构页path", path)
      browser_click(path, 1000 * 10, 3, 'document.querySelector("div.zfxxgk_zdgkc div.more a")')
      browser_sleep(1000 * 1)
      -- Collect the information list on the organisation page.
      local ok2, list2 = browser_executejs(path, 1000 * 30, 1, find_info_list_js)
      print('执行查找信息列表', ok2, list2, result_count(list2))
      if ok2 == "ok" then
        for _, v2 in pairs(list2) do
          print(v2.title, v2.href)
          -- Open the information page by clicking the link at the index that
          -- find_info_list_js reported for the same selector.
          local link_js = string.format('document.querySelectorAll("%s")[', info_list_css_selector)
            .. v2['index'] .. ']'
          browser_click(path, 1000 * 5, 3, link_js)
          browser_sleep(1000 * 1)
          local path2 = browser_url_last_segs(1, v2.href)
          print("详情页path", path2)
          -- NOTE(review): the detail tab is addressed by the full href here but
          -- by path2 when closing — confirm both forms match the same tab.
          local ok3, list3 = browser_executejs(v2.href, 1000 * 30, 1, find_info_attach_file_js)
          print('执行查找附件列表', ok3, list3, result_count(list3))
          if ok3 == "ok" then
            for _, v3 in pairs(list3) do
              print("找到附件", v.title, v2.title, v3.title, v3.href)
              browser_save("", "", "", "", {
                ["department"] = v.title,
                ["info_title"] = v2.title,
                ["attach_href"] = v3.href,
              })
            end
          else
            print("错误", v2.title, v2.href)
            browser_log("错误", v.title, v.href, v2.title, v2.href)
          end
          browser_closetabs(path2, 500)
        end
      else
        print("错误", v.title, v.href)
        browser_log("错误", v.title, v.href, "", "")
      end
      browser_sleep(1000 * 2)
      browser_closetabs(path, 500)
    end
  else
    print("抓取失败", ok)
  end
  print("任务完成")
  117. print("任务完成")