-- Scrape configuration.
-- The site has three levels: department/organ list -> per-organ info list -> info page.
-- Neither the department list nor the info list is paginated.

-- Entry page that lists the organs.
local href = "https://www.hlj.gov.cn/hlj/c108397/zfxxgk.shtml"

-- JS expression that selects the organ links on the entry page.
local organ_list_css_selector = 'document.querySelectorAll("div#tableList table a")'

-- JS expression that selects the "next page" link or button.
local next_page_css_selector = 'document.querySelector("div#page_div a.nextbtn")'

-- CSS selector for the links on a list page. Narrowing it avoids picking up
-- junk links; plain "a" matches every anchor.
local info_list_css_selector = "a"

-- JS snippet: evaluates to the string "true" when the next-page element
-- exists, otherwise "false". The %s placeholder receives the selector expression.
local is_has_next_page_js = [[
var ret="false";
var obj = %s;
if(obj){ret="true"};
ret
]]
is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)

-- JS snippet: forces every organ link to open in a new tab and evaluates to an
-- array of {index, href, text} records, one per link.
local find_organ_list_js = [[
var ns = [];
%s.forEach((v,i)=>{
    v.setAttribute("target","_blank");
    ns.push({"index":i,"href":v.href,"text":v.innerText});
});
ns
]]
find_organ_list_js = string.format(find_organ_list_js, organ_list_css_selector)

-- JS snippet: collects the links whose text mentions a budget / final-accounts
-- keyword together with one of the years 2022-2024, as {index, title, href}.
-- indexOf is compared with ">= 0" so that a keyword or year at the very start
-- of the link text (index 0) is also accepted.
local find_organ_info_js = [[
var ns = [];
document.querySelectorAll("a").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    if( (linkText.indexOf("预算")>=0 || linkText.indexOf("预决算")>=0) &&
        (linkText.indexOf("2022年")>=0 || linkText.indexOf("2023年")>=0 || linkText.indexOf("2024年")>=0)) {
        ns.push({"index":i,"title":linkText,"href":v.href})
    }
});
ns
]]

-- JS snippet: collects attachment links on an info page as {index, title, href}.
-- The extension is taken from the link text first and, when that does not look
-- like an extension (empty or longer than 4 characters), from the href with
-- any query string or fragment removed. Extensions are trimmed and lower-cased
-- before the allow-list check so that "X.PDF" or "x.pdf " still match.
local find_info_attach_file_js = [[
var ns = [];
var allow_exts = ["doc","docx","xls","xlsx","pdf","txt","jpg","png","ppt","pptx","zip","rar"];
var getFileExt = (text)=>{var tmp=text.split(".");return tmp[tmp.length-1].trim().toLowerCase();};
document.querySelectorAll("a").forEach((v,i)=>{
    v.setAttribute("target","_blank");
    var linkText = v.innerText;
    var ext = getFileExt(linkText);
    if(ext.length==0 || ext.length>4){
        ext = getFileExt(v.href.split(/[?#]/)[0]);
    }
    if(ext.length==0 || ext.length>4){return;}
    if(allow_exts.indexOf(ext)>=0){
        ns.push({"index":i,"title":linkText,"href":v.href})
    }
});
ns
]]
-- Main scraping flow: open the entry page, visit every organ link, collect the
-- attachment links found on the opened page and store them with browser_save.
-- All browser_* functions are provided by the host environment.

-- Timeout in milliseconds (2 hours) passed to navigation and script calls.
local timeout = 1000 * 60 * 120
browser_sleep(1000 * 2)

-- Step 1: open the entry page.
browser_navagite(timeout, href)
browser_sleep(1000 * 2)

local has_next_page = false
repeat
    -- Step 2: collect the organ links on the current page.
    print("找单位链接")
    local ok, list = browser_executejs(timeout, 1, find_organ_list_js)
    -- Also require a table payload: "#list" and pairs() raise on nil.
    if ok == "ok" and type(list) == "table" then
        print("返回拉取市政府下边子级单位清单", list, #list)
        -- pairs() is kept because the key layout of the table returned by the
        -- host is not visible here. NOTE(review): switch to ipairs if it is a
        -- 1-based sequence and visiting order matters.
        for _, v in pairs(list) do
            print(v)
            -- Open the organ link (the snippet set target=_blank on it).
            browser_click(6000, 3, organ_list_css_selector .. '[' .. v['index'] .. ']')
            browser_sleep(1000 * 1)
            -- No pagination on the info list. The format string now really
            -- uses info_list_css_selector; previously it had no %s and the
            -- argument was silently dropped.
            -- NOTE(review): this reuses the organ's index to pick a link and
            -- find_organ_info_js is never called - confirm this is intended.
            browser_click(1000 * 5, 3,
                string.format('document.querySelectorAll("%s")[', info_list_css_selector) .. v['index'] .. ']')
            browser_sleep(1000 * 1)
            -- Look for attachment links; the leading v.href argument
            -- presumably selects the tab by URL - verify against the host API.
            local ok2, list2 = browser_executejs(v.href, timeout, 1, find_info_attach_file_js)
            if ok2 == "ok" and type(list2) == "table" then
                print('查找详情页附件链接', ok2, list2, #list2)
                for _, v2 in pairs(list2) do
                    -- Store one record per attachment.
                    print(v2.title, v2.href)
                    -- Organ entries carry "text" (not "title"), so the
                    -- department name must be read from v['text'].
                    browser_save("", "", "", "", {
                        ["department"] = v['text'],
                        ["info_title"] = v2['title'],
                        ["attach_href"] = v2["href"],
                    })
                end
            else
                -- Do not apply "#" to a failed (possibly nil) result.
                print('查找详情页附件链接', ok2, list2)
            end
            browser_closetabs(v.href, 500)
            -- Close the list page tab.
            browser_closetabs('bmys', 1000)
        end
    else
        print("--错误,第一页就打不开")
    end

    -- Step 3: check for a next page and actually move to it; without the
    -- click the loop would re-scrape the same page forever.
    local _, ret = browser_executejs(timeout, 0, is_has_next_page_js)
    has_next_page = (ret == "true")
    if has_next_page then
        -- NOTE(review): assumes the next-page element disappears on the last
        -- page; confirm, otherwise this loop will not terminate.
        browser_click(6000, 3, next_page_css_selector)
        browser_sleep(1000 * 2)
    end
until not has_next_page

print("找单位链接结束")
browser_sleep(1000 * 60)