123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
-- Crawl configuration.
-- Original note: "Wuhu: the first screen is the list of organizations."
-- NOTE(review): the entry URL below is on jdz.gov.cn, so the Wuhu remark looks
-- carried over from a script for another site -- confirm.
-- Two levels are crawled: each organization's info list, then each info page.
-- Original note: list-page URLs do not need to be assembled; the list page's
-- "next page" link is clicked instead. href_tpl can be ignored.
local href = "https://www.jdz.gov.cn/zwzx/ztbd/yjsgk/2024/ys/"

-- CSS selector for the organization links on the entry page.
local organ_list_css_selector = "div.ys-tab a"

-- CSS selector for the links on an organization's list page. A narrow selector
-- keeps junk links out of the result; plain "a" also works if you do not want
-- to define one.
local info_list_css_selector = "ul li a"

-- 120 minutes, in milliseconds.
local timeout = 1000 * 60 * 120

-- JavaScript snippets evaluated in the page. Each one ends with the bare
-- expression `ns`, an array of {index, title, href} objects. `index` is the
-- element's position in the querySelectorAll result, so a link can later be
-- addressed again as querySelectorAll(selector)[index].

-- Lists every organization link (the "%s" placeholder receives the selector).
local find_organ_list_js = string.format([[
var ns = [];
document.querySelectorAll("%s").forEach((v,i)=>{
  v.setAttribute("target","_blank");
  ns.push({"index":i,"title":v.innerText,"href":v.href});
  return
});
ns
]], organ_list_css_selector)

-- Lists the info links whose text mentions a budget ("预算" or "预决算") and one
-- of the years 2022, 2023 or 2024.
local find_info_list_js = string.format([[
var ns = [];
document.querySelectorAll("%s").forEach((v,i)=>{
  v.setAttribute("target","_blank");
  var linkText = v.innerText;
  if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1) &&
  (linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) {
    ns.push({"index":i,"title":linkText,"href":v.href})
  }
});
ns
]], info_list_css_selector)

-- Lists every <a> whose text or href contains one of the attachment
-- extensions. This snippet is not passed through string.format, and Lua long
-- strings do not process escapes, so the "\\." reaches JavaScript unchanged.
local find_info_attach_file_js = [[
var ns = [];
var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'];
var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')', 'i');
document.querySelectorAll("a").forEach((v,i)=>{
  v.setAttribute("target","_blank");
  var linkText = v.innerText;
  var href = v.href
  if(extensionRegex.test(linkText.toLowerCase())||extensionRegex.test(href.toLowerCase())){
    ns.push({"index":i,"title":linkText,"href":v.href})
  }
});
ns
]]
--- Count the entries of a table.
-- Every key/value pair visited by pairs() is counted, so the result covers
-- non-integer keys and sequences with holes, where the `#` operator is not
-- reliable.
-- Intentionally left as a global function so existing callers keep resolving it.
-- @param t value to measure
-- @treturn number number of entries in t, or 0 when t is not a table
function getn(t)
  if type(t) ~= "table" then
    return 0
  end
  local len = 0
  for _ in pairs(t) do
    len = len + 1
  end
  return len
end
-- Crawl driver.
-- Flow: open the entry page and collect the organization links; for each
-- organization open its page, click its "more" link and collect the matching
-- info links; click each info link and save every attachment link found on
-- the page it opens.
-- The browser_* functions come from the host crawler. Their argument order is
-- reproduced exactly from the original calls; it is not documented here.

-- Return `value` if it is a table, otherwise an empty table, so that a missing
-- JS result skips the loop instead of raising inside pairs().
-- NOTE(review): assumes browser_executejs can hand back a non-table result
-- even with status "ok" -- confirm against the host API.
local function as_list(value)
  if type(value) == "table" then
    return value
  end
  return {}
end

browser_navagite(timeout, href)
browser_sleep(1000 * 2)
local ok, list = browser_executejs(1000 * 30, 1, find_organ_list_js)
local total, index = getn(list), 1
print('执行查找机构列表', ok, total)
if ok == "ok" then
  -- Prefix of the JS expression that addresses one info link by its index;
  -- it does not change between iterations, so it is built once.
  local info_item_js_prefix = string.format('document.querySelectorAll("%s")[', info_list_css_selector)

  for _, v in pairs(as_list(list)) do
    -- Open the organization page.
    print("机构信息", index, total, v.title, v.href)
    index = index + 1
    -- Close the other pages; for some unexplained reason a tab sometimes
    -- refuses to close.
    browser_closetabs_without("", href, 1000 * 1)
    --browser_click(1000*5, 3, string.format('document.querySelectorAll("%s")[', organ_list_css_selector) .. v['index'] .. ']')
    browser_navagite(true, 1000 * 30, v.href)

    -- Click the "more" link on the organization page.
    local path = browser_url_last_segs(2, v.href)
    print("机构页path", path)
    browser_click(path, 1000 * 10, 3, 'document.querySelector("div.zfxxgk_zdgkc div.more a")')
    browser_sleep(1000 * 1)

    -- Collect the info links on the organization page.
    local ok2, list2 = browser_executejs(path, 1000 * 30, 1, find_info_list_js)
    -- Log this call's own status and result (the outer ok/list were printed
    -- here before); getn is safe when list2 is nil.
    print('执行查找信息列表', ok2, list2, getn(list2))
    if ok2 == "ok" then
      for _, v2 in pairs(as_list(list2)) do
        print(v2.title, v2.href)
        -- Open the info page by clicking its link in the list.
        browser_click(path, 1000 * 5, 3, info_item_js_prefix .. v2['index'] .. ']')
        browser_sleep(1000 * 1)
        local path2 = browser_url_last_segs(1, v2.href)
        print("详情页path", path2)
        -- NOTE(review): the full href is passed as the first argument here,
        -- while the list-page call above passes the shortened `path` and
        -- path2 is only used for closing the tab -- confirm which form the
        -- host expects.
        local ok3, list3 = browser_executejs(v2.href, 1000 * 30, 1, find_info_attach_file_js)
        -- Log this call's own status and result, not the outer ones.
        print('执行查找附件列表', ok3, list3, getn(list3))
        if ok3 == "ok" then
          for _, v3 in pairs(as_list(list3)) do
            print("找到附件", v.title, v2.title, v3.title, v3.href)
            browser_save("", "", "", "", { ["department"] = v.title, ["info_title"] = v2.title, ["attach_href"] = v3.href })
          end
        else
          print("错误", v2.title, v2.href)
          browser_log("错误", v.title, v.href, v2.title, v2.href)
        end
        browser_closetabs(path2, 500)
      end
    else
      print("错误", v.title, v.href)
      browser_log("错误", v.title, v.href, "", "")
    end
    browser_sleep(1000 * 2)
    browser_closetabs(path, 500)
  end
else
  print("抓取失败", ok)
end
print("任务完成")
|