-- Crawl settings.
-- Two levels: a paginated list of announcements, then the detail page of each
-- announcement, from which attachment links are collected.
-- List-page URLs are not built from a template; the script simulates a click on
-- the list page's "next page" link instead, so href_tpl can be ignored.
-- Example of another entry URL this script has been pointed at:
--https://www.mas.gov.cn/xxgk/opennessTarget/?branch_id=57a3df762c262ea9a00aae84&column_code=150101&topic_id=&type=bumen
local href = "https://www.songjiang.gov.cn/Template/dynamic/currency/infoOpenMore.html?categorynum=004003002"

-- CSS selector for the announcement links on the list page. A narrow selector
-- keeps unrelated links out of the result; a plain "a" also works.
local info_list_css_selector = "div.ewb-infos td a"

-- URL feature of the detail page, meant for locating its tab (to run JS in it
-- or close it). Not referenced anywhere else in this script.
local info_url_path_tpl = "contents"

-- JS snippet whose final expression is the "next page" link/button element.
local next_page_css_selector = 'var a;document.querySelectorAll("ul.m-pagination-page li a").forEach((v,i)=>{if(v.innerText.indexOf("下一页")>-1){a=v;}});a'

-- Maximum number of list pages to process.
local max_pages = 7

-- Paging is done by clicking via JS, so this template is slated for removal.
-- Not referenced anywhere else in this script.
local href_tpl = "https://www.bjshy.gov.cn/web/zwgk/czsj/3fb44002-%d.html"

-- Timeout handed to the initial navigation (120 minutes if the unit is
-- milliseconds -- NOTE(review): unit is defined by the host API, confirm).
local timeout = 1000 * 60 * 120

-- TODO: real "is there a next page" detection.
-- The probe is currently stubbed and always evaluates to "true", so the crawl
-- only stops once page_no exceeds max_pages.
local is_has_next_page_js = [[
var ret="false";
//var obj = %s;
//if(obj){ret="true"};
ret="true";
ret
]]
--is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)

-- JS run on the list page: collects links whose text mentions one of the
-- budget/performance keywords AND one of the years 2022-2024.
-- The single %s placeholder receives the list selector.
local find_info_list_js = [[
var ns = [];
document.querySelectorAll("%s").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var href = v.href;
    if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1||linkText.indexOf("绩效")>-1 ||linkText.indexOf("目标")>-1 ) && (linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) {
        ns.push({"index":i,"title":linkText,"href":href})
    }
});
ns
]]
find_info_list_js = string.format(find_info_list_js, info_list_css_selector)

-- JS run on a detail page: collects every <a> whose text or href contains a
-- known document/archive file extension.
local find_info_attach_file_js = [[
var ns = [];
var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar','PDF','DOC','DOCX','XLS','XLSX','ZIP','RAR'];
var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')', 'i');
document.querySelectorAll("a").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var href = v.href
    if(extensionRegex.test(linkText.toLowerCase())||extensionRegex.test(href.toLowerCase())){
        ns.push({"index":i,"title":linkText,"href":v.href})
    }
});
ns
]]

-- TODO 1. Open the entry (list) page.
browser_navagite(timeout, href)
browser_sleep(1000 * 3)

-- current_page_has_data is maintained for an alternative stop condition that
-- is currently disabled (see the trailing comment on the `until` line).
local has_next_page, current_page_has_data = true, false
local page_no = 1

repeat
    current_page_has_data = false

    local ok, list = browser_executejs(1000 * 3, 1, find_info_list_js)
    -- Require a table as well as the "ok" status so that `#` and pairs()
    -- below cannot raise on an unexpected result.
    if ok == "ok" and type(list) == "table" then
        print("信息列表信息", list, #list)
        for _, v in pairs(list) do
            print(v.title, v.href, v.index)
            --browser_click(1000*5, 3, string.format('document.querySelectorAll("%s")[', info_list_css_selector) .. v['index'] .. ']')
            --browser_sleep(1000 * 1)
            -- NOTE(review): the leading `true` presumably opens the detail
            -- page in a separate tab -- confirm against the host API.
            browser_navagite(true, 1000 * 30, v.href)
            --local path = browser_url_last_segs(2,v.href)
            --print("path::",path)

            -- Was an accidental global in the original; keep it local.
            local path = v.href
            local ok2, list2 = browser_executejs(path, 1000 * 3, 1, find_info_attach_file_js)
            if ok2 ~= "ok" then
                -- Retry, addressing the tab by the value returned from
                -- browser_url_last_segs(2, ...) instead of the full URL.
                path = browser_url_last_segs(2, v.href)
                ok2, list2 = browser_executejs(path, 1000 * 3, 1, find_info_attach_file_js)
            end

            -- list2 may not be a table when the JS call failed; taking `#`
            -- of nil would abort the whole crawl, so compute it defensively.
            local has_attach_list = type(list2) == "table"
            local attach_count = 0
            if has_attach_list then
                attach_count = #list2
            end
            print('查找详情页附件链接', ok2, list2, attach_count)

            if ok2 == "ok" and has_attach_list then
                for _, v2 in pairs(list2) do
                    -- Persist one record per attachment link.
                    current_page_has_data = true
                    print(v2.title, v2.href)
                    browser_save("", "", "", "", {
                        ["department"] = v['title'],
                        ["info_title"] = v2['title'],
                        ["attach_href"] = v2["href"]
                    })
                end
            else
                print("未下载", v.title, v.href)
                browser_log("下载失败", v.title, v.href)
            end
            --browser_closetabs(path, 500)
        end
    else
        print("执行列表页查询信息js失败", ok)
    end

    -- Ask the page whether another list page exists (stubbed, see above).
    local ok3, ret = browser_executejs(1000, 0, is_has_next_page_js)
    print(ok3, ret)
    if ret == "true" then
        has_next_page = true
        -- Note: this click also happens after the last processed page.
        browser_click(1000, 3, next_page_css_selector)
        page_no = page_no + 1
    else
        has_next_page = false
    end
    print("翻页::", page_no, max_pages)
    browser_sleep(1000 * 3)

    -- Close the other pages; for unknown reasons some tabs refuse to close.
    browser_closetabs_without("", href, 1000 * 1)
until not has_next_page or page_no > max_pages --not current_page_has_data

print("所有链接都爬完了")
browser_sleep(1000 * 5)