spider.py

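# Overview: list/detail spider for tender announcements on www.biaozhaozhao.com.
# select_label() walks the search API for each channel in type_maps and stages new rows
# in the MongoDB collection bzz_zb_list, de-duplicating detail URLs in a Redis hash;
# detail_page() then downloads each pending detail page, strips boilerplate with cleaner(),
# and archives the record into data_bak. Hosts, credentials, cookies and request keys below
# are the original hard-coded values.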
import json
import re
import time
import redis
import requests
from bson.int64 import Int64
from parsel import Selector
from pymongo import MongoClient
from urllib.parse import quote

# MongoDB: remote store for list rows / archived details, local store for pushed copies
cli = MongoClient("192.168.20.248", 27017)
bzz_list = cli['dzr']['bzz_zb_list']
cli1 = MongoClient("127.0.0.1", 27001)
data_bak = cli['dzr']['data_bak']

# Redis hash used to de-duplicate detail URLs.
# decode_responses must be set on the pool: Redis() ignores the kwarg when a pool is supplied.
_pool = redis.ConnectionPool(
    host='192.168.20.248',
    port=6379,
    password='top@123',
    db=2,
    decode_responses=True
)
r = redis.Redis(connection_pool=_pool)
redis_key = 'duplicate_bzz_list'

# channel name -> ifbprogress code used in the search filter
type_maps = {
    '中标': '42',
    '成交': '43',
    '单一来源': '410',
    '合同及验收': '48',
}
# channel = "中标"
# channel_sign = "42"
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Origin": "https://www.biaozhaozhao.com",
    "Pragma": "no-cache",
    "Referer": "https://www.biaozhaozhao.com/search?keyword=%E4%B8%80%E4%BD%93%E6%9C%BA&restore=1",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sentry-trace": "aaa4d61b9e3243a5baa5cca1c1c88108-bd50500fc742e0e9-0",
    "x-kzz-request-from": "qcc-tender-web",
    "x-kzz-request-id": "27328858-5576-2734-188319849847",
    "x-kzz-request-key": "E76F330BE1AA411B23876BC67C11750D",
    "x-kzz-request-time": "1668845731606"
}
cookies = {
    "QCCSESSID": "b410c26e4aa5895529a652519c",
    "gid_87f036f882014cd8": "b410c26e4aa5895529a652519c",
    "ls_371ef9f55b6b63dc": "42d6a4659d87930e"
}

# Standalone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # non-breaking / full-width spaces
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input fields
    '<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    # '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings
    # '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    '<div>|<div [^>]*>|</div>': '<br>',  # division
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word (Office) paragraphs
}
# Miscellaneous
OTHER = {
    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体:[\s\S]*】': '',
    r'文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
    r'相关链接:[\s\S]+': '',
    r'阅读数[::]\d+': '',
}
# Styles
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace
BLANKS = {
    r'\n\s*\n': '\n',
    r'\s*\n\s*': '\n',
    r'[^\S\n]': ' ',
    r'\s+': ' ',
}
# Tag set used for tag repairs
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute set used for tag repairs
ATTRS = {'id', 'class', 'style', 'width'}
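# The tables above are merged and applied in order by cleaner() below:
# INDEPENDENT_TAGS -> INLINE_TAGS -> BLOCK_TAGS -> OTHER -> CSS_STYLE -> BLANKS.
# Keys are regexes, values their replacements. TAGS/ATTRS feed _repair_tag(), which fixes
# tags whose attribute got glued to the tag name (e.g. "divclass" -> "div class").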
def _repair_tag():
    """Abnormal tag combinations, used to repair tags on non-standard pages"""
    _repairs = {}
    for tag in TAGS:
        for attr in ATTRS:
            key = '{}{}'.format(tag, attr)
            val = '{} {}'.format(tag, attr)
            _repairs[key] = val
    return _repairs


def _escape_character(html):
    """Unescape common HTML entities"""
    html = html.replace('&lt;', '<')
    html = html.replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    return html


def _lowercase_tag(html):
    """Normalize tags (convert them all to lowercase), then repair glued tag/attribute pairs"""
    tags = re.findall("<[^>]+>", html)
    for tag in tags:
        html = html.replace(tag, str(tag).lower())
    repair_tags = _repair_tag()
    for err, right in repair_tags.items():
        html = html.replace(err, right)
    return html

def cleaner(html, special=None, completely=False):
    """
    Clean page source.
    :param html: page to clean
    :param special: extra, page-specific cleaning rules
    :param completely: whether to clean the page aggressively
    :return: cleaned page source
    """
    if special is None:
        special = {}
    OTHER.update(special)
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)
    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframes
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
    html = _escape_character(html)
    return html

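# Rough usage sketch (hypothetical input): cleaner('<DIV class="x"><p>正文&nbsp;</p></DIV>')
# lowercases the tags, turns the <div>/<p> wrappers into <br>, drops the class attribute and
# the &nbsp; entity, and collapses the remaining whitespace.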
def save_data(documents, col):
    if isinstance(documents, list):
        col.insert_many(documents)
    else:
        col.insert_one(documents)

def crawl_spider_zb(channel, progress, page, keyword):
    """Fetch one page of search results and stage new rows into bzz_zb_list.

    Returns (stop, count): stop=True when the cookie is rejected or the page is empty.
    """
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    if response.status_code == 403:
        print(f"{keyword}>>>当前第{page}页, 账号cookie失效")
        return True, 0
    results = []
    info_list = response.json()["Result"]
    if len(info_list) == 0:
        print(f"{keyword}>>>暂无数据")
        return True, 0
    for item in info_list:
        href = 'https://www.biaozhaozhao.com/detail/{}?keywords=%5B"{}"%5D'.format(item["id"], quote(keyword))
        if not r.hexists(redis_key, href):
            title = item['title']
            publish_time = item['publishdate']
            time_array = time.strptime(publish_time, "%Y-%m-%d %H:%M:%S")
            publishtime_ts = int(time.mktime(time_array))
            data = {
                'site': '标找找',
                'channel': channel,
                'spidercode': 'sdxzbiddingsjzypc',
                'title': title,
                'area': '广东',  # province
                'city': item['city'],  # city
                'district': item['district'],
                'publishdept': '',
                'type': '',
                'T': 'bidding',  # destination table name
                'sendflag': 'false',
                '_d': 'comeintime',
                'iscompete': True,  # new spider
                'crawl': False,
                'href': '#',
                'competehref': href,
                'publishtime': publish_time,
                'l_np_publishtime': Int64(publishtime_ts),
            }
            results.append(data)
            r.hset(redis_key, href, '')
    if len(results) > 0:
        save_data(results, bzz_list)
    print(f"{keyword}>>>第{page}页采集完成,收录{len(results)}条")
    return False, len(results)

def list_page(channel, progress, keyword, pages):
    for page in range(1, pages + 1):
        stop, total = crawl_spider_zb(channel, progress, page, keyword)
        if stop or total < 50:
            return False, page
        time.sleep(2)
    return True, page


def select_label(keyword, pages):
    print(f"{keyword}>>>开始采集{pages}页")
    for name, index in type_maps.items():
        normal_stop, page = list_page(name, index, keyword, pages)
        if not normal_stop:
            return False
        print(f"{name}>>>完成采集:{keyword} 第{page}页")
    return True

def handler_detail(items):
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"107\", \"Chromium\";v=\"107\", \"Not=A?Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    channel = items['channel']
    url = items['competehref']
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        if '您当天的查看项目额度已用完!' in response.text:
            print(f'{channel}>>>{items["title"]} 额度已用完!')
            return False
        response = Selector(response.text)
        rubbish = response.xpath('//div[@class="dfUzPvnO"]').extract_first()
        html = response.xpath('//div[@class="_4nFsQzFd"]').extract_first()
        if html is not None:
            # guard: the "rubbish" block may be missing; str.replace(None, ...) would raise
            contenthtml = html.replace(rubbish, '') if rubbish else html
            items['contenthtml'] = contenthtml
            items['detail'] = cleaner(contenthtml)
            items['comeintime'] = Int64(int(time.time()))
            bzz_list.update_one(
                {'_id': items['_id']},
                {'$set': {'crawl': True}}
            )
            if '_id' in items:
                del items['_id'], items['crawl']
            data_bak.insert_one(items)
            print(f'{channel}>>>{items["title"]} 下载成功')
        return True
    except requests.RequestException:
        print(f'{channel}>>>{items["title"]} 下载失败')
        return False

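# detail_page() drains the rows staged by crawl_spider_zb(): every document with crawl=False
# is fetched through handler_detail(), which flags it crawl=True in bzz_zb_list and copies
# the cleaned record into data_bak.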
def detail_page():
    # only rows not yet downloaded (crawl_spider_zb stages them with crawl=False)
    q = {'crawl': False}
    amount_of_project = 0
    # total = bzz_list.count_documents(q)
    # while total > 0:
    #     print(f"{channel}>>>剩余任务{total}条")
    tasks = list(bzz_list.find(q))
    for item in tasks:
        handler_detail(item)
        # if stop_crawl:
        #     amount_of_project += 1
        time.sleep(1.5)
    print("任务结束")

def _count_zb_total(page, keyword, progress):
    """Return TotalRecords reported by the search API for one keyword/channel."""
    url = "https://www.biaozhaozhao.com/qcc/tender/search"
    data = {
        "sortField": "publishdate",
        "sortOrder": "DESC",
        "searchType": "accurate",
        "searchKeyList": [keyword],
        "filter": {
            "publishdate": [
                {
                    "currently": True,
                    "flag": 5,
                    "number": 1,
                    "unit": "day",
                    "min": "2022-07-31T16:00:00.000Z",
                    "max": "2022-08-31T15:59:59.999Z"
                }
            ],
            "ifbprogress": [
                progress
            ],
            "region": [
                {
                    "pr": "GD",
                    "ct": "4403"
                }
            ],
            "isvalid": 1
        },
        "queryLink": "or",
        "pageIndex": page,
        "pageSize": 50,
        "isHighlight": True,
        "isDegrade": True
    }
    data = json.dumps(data)
    response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=30)
    # print(response.json())
    total_records = response.json()['Paging']['TotalRecords']
    return int(total_records)

def count_total():
    count = 0
    for tp_name, tp_index in type_maps.items():
        totals = 0
        for kw in ['一体机', '白板', '黑板', '大屏', '智慧屏', '录播', '智能教师']:
            totals += _count_zb_total(1, kw, tp_index)
        print(tp_name, totals)
        count += totals
    print("总计 ", count)

def push_spider_dbs():
    """Copy archived documents from the remote data_bak into the local py_spider.data_bak."""
    dbk = cli1['py_spider']['data_bak']
    cur1 = data_bak.find()
    for item in cur1:
        del item['_id']
        print(item)
        item['comeintime'] = Int64(int(time.time()))
        dbk.insert_one(item)

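# Entry point: collect list pages for one keyword, then download the pending detail pages.
# count_total() and push_spider_dbs() are helper routines not invoked from this entry point.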
if __name__ == '__main__':
    select_label('一体机', 13)
    # select_label('白板', 2)
    # select_label('黑板', 2)
    # select_label('大屏', 3)
    # select_label('智慧屏', 1)
    # select_label('录播', 2)
    # select_label('智能教师', 1)
    detail_page()