1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
# Manual smoke-test script: fetch a single listing page through the
# rendering downloader and print whatever ``get`` returns.
from crawler.download import Downloader, RenderDownloader
from crawler.services.channel import bfs

if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Catalogue of pages this script has been pointed at.  To try another
    # page, uncomment its ``url = ...`` line; the last uncommented
    # assignment is the one that takes effect.
    # ------------------------------------------------------------------
    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
    # url = 'http://113.200.193.24:8009/Main/Projects#'
    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
    # url = 'https://www.elongbiao.com/List/NoticeP/9'
    # url = 'https://www.elongbiao.com/List/Notice/12'  # multiple time texts; algorithm optimized once
    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # time text; algorithm optimized once
    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html'  # algorithm optimized once
    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # channel extraction with multiple time-text windows; optimization done
    # Optimized: there were more time-text blocks than ancestor nodes, so
    # not all of them could be removed and the leftover time-text blocks
    # interfered with extraction.
    # url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html'
    # url = 'http://www.shanghang.gov.cn/zwgk/zwgkzdgz/gczb/sphzbaxx/'
    # url = 'http://www.qlebid.com/cms/channel/1ywgg4qb/index.htm'
    # url = ' http://zhaobiao.elongcheng.com:82/'
    # url = 'http://www.gdgpo.gov.cn/queryPlanList.do'
    # url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
    # url = 'http://www.xtsrmyy.com.cn/newlist.asp?bigclassid=4&smallclassid=5'
    # url = 'http://jsj.yima.gov.cn/col/col109/index.html'
    # url = 'http://zw.hainan.gov.cn/wssc/ra/projects/rp_list.html?num=3'
    # url = 'http://www.hlbeggzyjy.org.cn/jygk/021001/trade_public.html'
    # url = 'http://jsj.yima.gov.cn/col/col109/index.html?uid=8327&pageNum=4'
    # url = 'http://oldzfcg.scsczt.cn/CmsNewsController.do?method=recommendBulletinList&moreType=provincebuyBulletinMore&channelCode=cggg&rp=25&page=1'
    # url = 'http://www.ccgp-xizang.gov.cn/freecms/site/xizang/index.html'
    # url = 'http://ggzy.yn.gov.cn/#/tradeHall/tradeList'
    # url = 'http://www.gdgpo.gov.cn/queryPlanList.do'
    # url = 'http://www.ccgp-gansu.gov.cn/web/contract/0/index.htm?contractsInfo.id=d0'

    # --- JavaScript-rendered pages ---
    # url = 'http://zhaobiao.elongcheng.com:82/'  # detail link lives in an onclick handler
    url = 'https://ebid.espic.com.cn/newgdtcms//category/purchaseListNew.html?dates=300&categoryId=2&tenderMethod=00&tabName=%E9%87%87%E8%B4%AD%E4%BF%A1%E6%81%AF&page=1'

    # Both downloaders are constructed, in this order, even though only the
    # rendering one issues the request below.
    # NOTE(review): the plain Downloader is unused on the active path;
    # its constructor's side effects are not visible here, so confirm
    # before removing it.
    downloader = Downloader()
    render_downloader = RenderDownloader()

    # NOTE(review): the unit of ``timeout=3`` is not visible in this file
    # (presumably seconds) -- confirm against RenderDownloader.get.
    response = render_downloader.get(url, timeout=3)
    # response = render_downloader.get(url, timeout=3)
    print(response)

    # Next step, currently disabled: hand the response to the channel BFS.
    # bfs(response, url)
|