-- Crawler script: walks a paginated list page, opens every matching entry,
-- and records the attachment links found on each detail page.
-- All browser_* functions are provided by the host runtime, not by this file.

-- ---------------------------------------------------------------------------
-- Crawl settings
-- ---------------------------------------------------------------------------
-- Two levels: a list of department entries, then the entry's detail page.
-- List-page URLs are not built from a template; the script clicks the list
-- page's "next page" link instead, so href_tpl can be ignored.
-- Example of another list URL:
-- https://www.mas.gov.cn/xxgk/opennessTarget/?branch_id=57a3df762c262ea9a00aae84&column_code=150101&topic_id=&type=bumen
local href = "https://www.songjiang.gov.cn/Template/dynamic/currency/infoOpenMore.html?categorynum=004003002"
-- CSS selector for the links on the list page. A narrow selector avoids
-- picking up junk links; plain "a" also works if you do not want to define one.
local info_list_css_selector = "div.ewb-infos td a"
-- URL fragment identifying detail pages, meant for locating the tab to run JS
-- in / to close. NOTE(review): currently not referenced anywhere in this script.
local info_url_path_tpl = "contents"
-- JS snippet that evaluates to the "next page" link or button.
local next_page_css_selector = 'var a;document.querySelectorAll("ul.m-pagination-page li a").forEach((v,i)=>{if(v.innerText.indexOf("下一页")>-1){a=v;}});a'
-- Maximum number of list pages to visit.
local max_pages = 7
-- Deprecated: paging is done by clicking "next page" through JS.
-- NOTE(review): currently not referenced anywhere in this script.
local href_tpl = "https://www.bjshy.gov.cn/web/zwgk/czsj/3fb44002-%d.html"
-- Timeout for loading the first page (presumably milliseconds, i.e. 120
-- minutes -- TODO confirm the unit against the host API).
local timeout = 1000 * 60 * 120

-- ---------------------------------------------------------------------------
-- JavaScript snippets executed in the browser
-- ---------------------------------------------------------------------------
-- TODO: check whether a next page really exists.
-- The snippet is a stub that always yields "true", so the main loop only stops
-- once max_pages is exceeded. The commented-out variant cannot simply be
-- re-enabled: next_page_css_selector is a multi-statement snippet
-- ("var a; ...; a"), so substituting it into "var obj = %s;" would not produce
-- valid JavaScript.
local is_has_next_page_js = [[
var ret="false";
//var obj = %s;
//if(obj){ret="true"};
ret="true";
ret
]]
--is_has_next_page_js = string.format(is_has_next_page_js, next_page_css_selector)

-- Collects the list-page links whose text mentions one of the wanted topics
-- together with one of the wanted years. %s is replaced by the list selector.
local find_info_list_js = [[
var ns = [];
document.querySelectorAll("%s").forEach((v,i)=>{
v.setAttribute("target","_blank");
var linkText = v.innerText;
var href = v.href;
if((linkText.indexOf("预算")>-1 ||linkText.indexOf("预决算")>-1||linkText.indexOf("绩效")>-1 ||linkText.indexOf("目标")>-1 ) &&
(linkText.indexOf("2022年")>-1 || linkText.indexOf("2023年")>-1 || linkText.indexOf("2024年")>-1)) {
ns.push({"index":i,"title":linkText,"href":href})
}

});
ns
]]
find_info_list_js = string.format(find_info_list_js, info_list_css_selector)

-- Collects every link on the page whose text or URL contains a known
-- attachment file extension.
local find_info_attach_file_js = [[
var ns = [];
var extensions = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar','PDF','DOC','DOCX','XLS','XLSX','ZIP','RAR'];
var extensionRegex = new RegExp('\\.(' + extensions.join('|') + ')', 'i');
document.querySelectorAll("a").forEach((v,i)=>{
v.setAttribute("target","_blank");
var linkText = v.innerText;
var href = v.href
if(extensionRegex.test(linkText.toLowerCase())||extensionRegex.test(href.toLowerCase())){
ns.push({"index":i,"title":linkText,"href":v.href})
}
});
ns
]]

-- ---------------------------------------------------------------------------
-- Helpers
-- ---------------------------------------------------------------------------

--- Length of a value for logging purposes.
-- A failed browser_executejs call may not hand back a table; applying `#` to
-- nil would raise an error and abort the whole crawl just for a log line.
-- @param value anything returned by the host
-- @treturn number `#value` for tables, 0 for everything else
local function safe_len(value)
    if type(value) == "table" then
        return #value
    end
    return 0
end

--- Open one list entry and save every attachment link found on its page.
-- @tparam table info list entry with `title` and `href` fields, as produced
--   by find_info_list_js
local function save_attachments(info)
    -- An alternative that clicks the list link instead of navigating:
    --browser_click(1000*5, 3, string.format('document.querySelectorAll("%s")[', info_list_css_selector) .. info['index'] .. ']')
    --browser_sleep(1000 * 1)
    browser_navagite(true, 1000 * 30, info.href)

    -- First address the page by its full URL; if that fails, retry with the
    -- value browser_url_last_segs(2, ...) derives from the same URL.
    local path = info.href
    local ok2, attachments = browser_executejs(path, 1000 * 3, 1, find_info_attach_file_js)
    if ok2 ~= "ok" then
        path = browser_url_last_segs(2, info.href)
        ok2, attachments = browser_executejs(path, 1000 * 3, 1, find_info_attach_file_js)
    end
    print('查找详情页附件链接', ok2, attachments, safe_len(attachments))

    if ok2 ~= "ok" then
        print("未下载", info.title, info.href)
        browser_log("下载失败", info.title, info.href)
        return
    end

    -- pairs is kept on purpose: the shape of host-returned tables is not
    -- visible here, so no assumption is made that they are 1-based sequences.
    for _, attachment in pairs(attachments) do
        -- Store the record.
        print(attachment.title, attachment.href)
        browser_save("", "", "", "", {
            department = info.title,
            info_title = attachment.title,
            attach_href = attachment.href,
        })
    end
    --browser_closetabs(path, 500)
end

-- ---------------------------------------------------------------------------
-- Main flow
-- ---------------------------------------------------------------------------
-- 1. Open the start page.
browser_navagite(timeout, href)
browser_sleep(1000 * 3)

local has_next_page = true
local page_no = 1
repeat
    -- 2. Collect the matching entries of the current list page.
    local ok, list = browser_executejs(1000 * 3, 1, find_info_list_js)
    if ok == "ok" then
        print("信息列表信息", list, safe_len(list))
        for _, v in pairs(list) do
            print(v.title, v.href, v.index)
            save_attachments(v)
        end
    else
        print("执行列表页查询信息js失败", ok)
    end

    -- 3. Move on to the next list page when the check says there is one.
    local ok3, ret = browser_executejs(1000, 0, is_has_next_page_js)
    print(ok3, ret)
    has_next_page = (ret == "true")
    if has_next_page then
        browser_click(1000, 3, next_page_css_selector)
        page_no = page_no + 1
    end
    print("翻页::", page_no, max_pages)
    browser_sleep(1000 * 3)

    -- Close the other pages; the original author notes that some tabs
    -- inexplicably fail to close.
    browser_closetabs_without("", href, 1000 * 1)
until not has_next_page or page_no > max_pages
print("所有链接都爬完了")
browser_sleep(1000 * 5)