news_list.py

# coding: utf-8
import time
import uuid

import httpx
import requests
from loguru import logger
from lxml.html import fromstring

from tools import dedup
from tools import get_proxy, ua
from tools import news_keyword_coll, news_list_coll
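
# The helpers imported from `tools` are not shown in this file; based on how they
# are used below they are assumed to provide roughly:
#   dedup             - a URL de-duplication store with .get(url) / .add(url)
#   get_proxy()       - returns a proxy URL string for outgoing requests
#   ua                - a user-agent pool exposing .random (fake_useragent style)
#   news_keyword_coll / news_list_coll - MongoDB-like collections exposing
#                       insert_one, update_one, update_many and find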


def analysis_info(site, page, search_word, response):
    data_count = 0
    select_lst = []
    if response:
        html_parse = fromstring(response.content.decode())
        select_lst = html_parse.xpath("//div[@id='content_left']//h3/a")
    for elem in select_lst:
        # The aria-label carries the headline as "标题:<title>"; strip the Chinese "Title:" prefix.
        title = str(elem.xpath("./@aria-label")[0]).replace("标题:", "")
        url = elem.xpath("./@href")[0]
        if not dedup.get(url):
            item = dict(
                _id=str(uuid.uuid4()),
                url=url,
                list_title=title,
                searchengine="baidu",
                searchwords=search_word,
                site=site,
            )
            news_list_coll.insert_one(item)
            dedup.add(url)
            data_count += 1
    tips = [
        f"page {page} -- {search_word}",
        f"scraped: {len(select_lst)}",
        f"stored: {data_count}"
    ]
    logger.info(",".join(tips))
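
# Illustrative only: the XPath in analysis_info assumes each result anchor looks roughly like
#   <div id="content_left"> ... <h3><a href="https://..." aria-label="标题:Some headline">...</a></h3> ... </div>
# i.e. the headline is carried in the aria-label attribute with a "标题:" ("Title:") prefix.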


def get_list_response(key, page_num, follow_redirects=False):
    url = "https://www.baidu.com/s"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "User-Agent": ua.random,
    }
    params = {
        "tn": "news",
        "rtt": "4",
        "bsst": "1",
        "cl": "2",
        "wd": key,
        "medium": "0",
        "tngroupname": "organic_news",
        "newVideo": "12",
        "goods_entry_switch": "1",
        "rsv_dl": "news_b_pn",
        "pn": page_num * 20
    }
    try:
        proxy = get_proxy()
    except requests.RequestException as e:
        logger.error(f"HTTP proxy exception: {e}")
        raise e
    request_kwargs = dict(
        headers=headers,
        params=params,
        timeout=10,
        proxy=proxy,
        follow_redirects=follow_redirects  # Baidu's captcha check answers with a 302 redirect; don't follow it by default
    )
    try:
        response = httpx.get(url, **request_kwargs)
        response.raise_for_status()
        return response
    except httpx.HTTPError as exc:
        logger.error(f"HTTP Exception for {exc.request.url} - {exc}")
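
# Each keyword document pulled from news_keyword_coll is assumed to look roughly like
#   {"_id": ..., "key": "<search keyword>", "site": "<source site>", "down": 1}
# where "down" is set once the keyword has been crawled in this round (see start() below).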


def baidu_search(document):
    titles = document["key"]
    site = document["site"]
    for pn in range(0, 4):
        try:
            response = get_list_response(f"intitle:{titles}", pn)
            analysis_info(site, pn + 1, titles, response)
        except Exception:
            break
    news_keyword_coll.update_one({"_id": document["_id"]}, {"$set": {"down": 1}})


def start():
    news_keyword_coll.update_many({}, {"$unset": {"down": ""}})  # reset the "down" flag on every keyword
    search_items = [item for item in news_keyword_coll.find({"down": {"$exists": 0}})]
    while search_items:
        item = search_items.pop(0)
        baidu_search(item)


if __name__ == '__main__':
    while 1:
        start()
        logger.info("Round finished; sleeping for 3 hours before the next run.")
        time.sleep(3 * 3600)