tools.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-04-09
  4. ---------
  5. @summary: 主题爬虫 工具类
  6. ---------
  7. @author: Lzz
  8. """
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))  # allow sibling packages to be imported when run as a script
import re
import time
import bson
import redis
import requests
import datetime
import calendar
import hashlib
import random
import execjs
import functools
from hashlib import md5
from loguru import logger
from collections import namedtuple
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
from .clean_html import cleaner

# Result wrapper for text_search(): `total` is the count of CJK characters found.
SearchText = namedtuple('SearchText', ['total'])
def nsssjss():
    """Compute the site's `nsssjss` anti-crawl token via Node.js.

    Mirrors the target site's front-end: RSA-encrypts the request path plus
    the current millisecond timestamp with a fixed public key.

    Returns:
        str: the RSA-encrypted token.

    NOTE(review): requires a node runtime with the `jsdom` and `jsencrypt`
    packages installed — confirm on the deployment host.
    """
    ex_js = '''
const jsdom = require("jsdom");
const {JSDOM} = jsdom;
const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
window = dom.window;
document = window.document;
JSEncrypt = require('jsencrypt')
function encryptByRSA(value) {
var encrypt = new JSEncrypt;
var RSAPublicKey = "MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCS2TZDs5+orLYCL5SsJ54+bPCVs1ZQQwP2RoPkFQF2jcT0HnNNT8ZoQgJTrGwNi5QNTBDoHC4oJesAVYe6DoxXS9Nls8WbGE8ZNgOC5tVv1WVjyBw7k2x72C/qjPoyo/kO7TYl6Qnu4jqW/ImLoup/nsJppUznF0YgbyU/dFFNBQIDAQAB";
encrypt.setPublicKey('-----BEGIN PUBLIC KEY-----' + RSAPublicKey + '-----END PUBLIC KEY-----')
return encrypt.encrypt(value)
}
function get_njs(){
nsssjss = encryptByRSA('/freecms' + '/rest/v1/notice/selectInfoMoreChannel.do' + '$$' + new Date().getTime())
return nsssjss
}
'''
    ctx = execjs.compile(ex_js)
    njs = ctx.call('get_njs')
    return njs
  52. def get_QGIP():
  53. proxy = "http://6278CF0D:41D9C796172D@tun-vdpzuj.qg.net:15254"
  54. proxies = {
  55. "http": proxy,
  56. "https": proxy,
  57. }
  58. return proxies
  59. def pinyi_proxy(count=100):
  60. url = f"http://zltiqu.pyhttp.taolop.com/getip?count={count}&neek=80160&type=2&yys=0&port=2&sb=&mr=1&sep=0&ts=1"
  61. retry = 0
  62. while (retry := retry + 1) < 30:
  63. try:
  64. res = requests.get(url, timeout=10)
  65. data_list = res.json().get('data')
  66. if not data_list:
  67. time.sleep(3)
  68. continue
  69. if "白名单" in res.text:
  70. logger.warning("请将此IP加入品易白名单")
  71. return []
  72. new_list = []
  73. for pp in data_list:
  74. proxy = {'http': f'http://{pp.get("ip")}:{pp.get("port")}',
  75. 'https': f'http://{pp.get("ip")}:{pp.get("port")}'}
  76. new_list.append(proxy)
  77. return new_list
  78. except:
  79. logger.error("pinyi访问异常!")
  80. time.sleep(3)
def get_proxy(scheme=None, default=None, socks5h=False):
    """Fetch a socks5 proxy from the internal proxy service; blocks until one is available.

    Args:
        scheme: when given ("http"/"https"), return only that entry.
        default: fallback value when `scheme` is missing from the result.
        socks5h: rewrite socks5:// to socks5h:// so DNS resolves on the proxy.

    Returns:
        dict of proxies, or a single proxy URL string when `scheme` is given.
    """
    headers = {
        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
    }
    while True:
        proxy = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
        # proxy = requests.get("http://39.106.157.58:1405/crawl/proxy/socks5/fetch", headers=headers).json()
        proxies = proxy.get("data")
        if proxies:
            break
        else:
            logger.warning("暂无代理...")
            time.sleep(3)
    if socks5h:
        # NOTE(review): both entries derive from proxies["http"] — presumably
        # the service returns the same endpoint for both schemes; confirm.
        proxyh = {
            "http": proxies.get("http").replace("socks5", "socks5h"),
            "https": proxies.get("http").replace("socks5", "socks5h")
        }
        proxies = proxyh
    logger.info(f"切换代理: {proxies}")
    if not scheme:
        return proxies
    else:
        return proxies.get(scheme, default)
def Mongo_client():
    """Build a MongoClient for the crawler's MongoDB instance (172.17.4.87:27080)."""
    client = MongoClient("172.17.4.87", 27080)
    # client = MongoClient("127.0.0.1", 27017)
    return client
def Redis_client():
    """Build a Redis client backed by a connection pool (db 1)."""
    _pool = redis.ConnectionPool(
        host='172.17.162.28',
        # host='127.0.0.1',
        port=7361,
        password='k5ZJR5KV4q7DRZ92DQ',
        db=1
    )
    # _pool = redis.ConnectionPool(
    #     host='127.0.0.1',
    #     port=6379,
    #     db=1
    # )
    # NOTE(review): decode_responses passed to Redis() is likely ignored when
    # an explicit connection_pool is supplied — it is a connection option and
    # would need to be set on the ConnectionPool; verify replies are decoded.
    r = redis.Redis(connection_pool=_pool, decode_responses=True)
    return r
def int2long(param: int):
    """Coerce an int to BSON Int64 so MongoDB stores it as a 64-bit long."""
    return bson.int64.Int64(param)
  127. def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
  128. return datetime.datetime.now().strftime(date_format)
  129. def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
  130. """
  131. @summary:
  132. ---------
  133. @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
  134. @param format:时间格式
  135. ---------
  136. @result: 返回时间戳
  137. """
  138. if ":" in date:
  139. timestamp = time.mktime(time.strptime(date, time_format))
  140. else:
  141. timestamp = time.mktime(time.strptime(date, "%Y-%m-%d"))
  142. return int(timestamp)
  143. def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
  144. """
  145. @summary:
  146. ---------
  147. @param timestamp: 将时间戳转化为日期
  148. @param format: 日期格式
  149. ---------
  150. @result: 返回日期
  151. """
  152. if timestamp is None:
  153. raise ValueError("timestamp is null")
  154. date = time.localtime(timestamp)
  155. return time.strftime(time_format, date)
  156. def get_sha1(*args):
  157. """
  158. @summary: 获取唯一的40位值, 用于获取唯一的id
  159. ---------
  160. @param *args: 参与联合去重的值
  161. ---------
  162. @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
  163. """
  164. sha1 = hashlib.sha1()
  165. for arg in args:
  166. sha1.update(str(arg).encode())
  167. return sha1.hexdigest() # 40位
  168. def get_sha256(*args):
  169. """
  170. @summary: 获取唯一的64位值, 用于获取唯一的id
  171. ---------
  172. @param *args: 参与联合去重的值
  173. ---------
  174. @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
  175. """
  176. sha256 = hashlib.sha256()
  177. for arg in args:
  178. sha256.update(str(arg).encode())
  179. return sha256.hexdigest() # 64位
  180. def md5value(val):
  181. md5 = hashlib.md5()
  182. if isinstance(val, bytes):
  183. md5.update(str(val).encode("utf-8"))
  184. elif isinstance(val, str):
  185. md5.update(val.encode("utf-8"))
  186. return md5.hexdigest()
def ensure_int64(n):
    """Coerce `n` to BSON Int64; falsy values (None, False, 0, "") become 0.

    >>> ensure_int64(None)
    0
    >>> ensure_int64(False)
    0
    >>> ensure_int64(12)
    12
    >>> ensure_int64("72")
    72
    """
    if not n:
        return bson.int64.Int64(0)
    return bson.int64.Int64(n)
  201. def get_today_of_day(day_offset=0):
  202. return str(datetime.date.today() + datetime.timedelta(days=day_offset))
  203. def get_current_timestamp():
  204. return int(time.time())
def add_zero(n):
    """Zero-pad a number to at least two digits, e.g. 4 -> "04"."""
    return "%02d" % n
  207. def sup_zero(indate):
  208. deal = indate.split(' ')
  209. head = deal[0].split('-')
  210. tail = ""
  211. if len(deal) == 2:
  212. tail = " " + deal[1]
  213. year = int(head[0])
  214. month = int(head[1])
  215. day = int(head[2])
  216. fdate = datetime.datetime(year=year, month=month, day=day)
  217. formatted_date = fdate.strftime("%Y-%m-%d") + tail
  218. return formatted_date
  219. def get_days_of_month(year, month):
  220. """
  221. 返回天数
  222. """
  223. return calendar.monthrange(year, month)[1]
  224. def get_year_month_and_days(month_offset=0):
  225. """
  226. @summary:
  227. ---------
  228. @param month_offset: 月份偏移量
  229. ---------
  230. @result: ('2019', '04', '30')
  231. """
  232. today = datetime.datetime.now()
  233. year, month = today.year, today.month
  234. this_year = int(year)
  235. this_month = int(month)
  236. total_month = this_month + month_offset
  237. if month_offset >= 0:
  238. if total_month <= 12:
  239. days = str(get_days_of_month(this_year, total_month))
  240. total_month = add_zero(total_month)
  241. return (year, total_month, days)
  242. else:
  243. i = total_month // 12
  244. j = total_month % 12
  245. if j == 0:
  246. i -= 1
  247. j = 12
  248. this_year += i
  249. days = str(get_days_of_month(this_year, j))
  250. j = add_zero(j)
  251. return (str(this_year), str(j), days)
  252. else:
  253. if (total_month > 0) and (total_month < 12):
  254. days = str(get_days_of_month(this_year, total_month))
  255. total_month = add_zero(total_month)
  256. return (year, total_month, days)
  257. else:
  258. i = total_month // 12
  259. j = total_month % 12
  260. if j == 0:
  261. i -= 1
  262. j = 12
  263. this_year += i
  264. days = str(get_days_of_month(this_year, j))
  265. j = add_zero(j)
  266. return (str(this_year), str(j), days)
  267. def get_month(month_offset=0):
  268. """''
  269. 获取当前日期前后N月的日期
  270. if month_offset>0, 获取当前日期前N月的日期
  271. if month_offset<0, 获取当前日期后N月的日期
  272. date format = "YYYY-MM-DD"
  273. """
  274. today = datetime.datetime.now()
  275. day = add_zero(today.day)
  276. (y, m, d) = get_year_month_and_days(month_offset)
  277. arr = (y, m, d)
  278. if int(day) < int(d):
  279. arr = (y, m, day)
  280. return "-".join("%s" % i for i in arr)
  281. def extract_file_type(file_name="附件名", file_url="附件地址", file_type_list=[]):
  282. """
  283. 抽取附件类型
  284. Args:
  285. file_name: 附件名
  286. file_url: 附件地址
  287. file_type_list: 其他附件后缀
  288. Returns: 附件类型
  289. """
  290. if file_name and file_url:
  291. file_name = file_name.strip()
  292. file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
  293. 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
  294. if file_type_list:
  295. ftp_list = list(map(lambda x: x.lower(), file_type_list))
  296. file_types.extend(ftp_list)
  297. file_type = file_url.split('?')[0].split('.')[-1].lower()
  298. if file_type not in file_types:
  299. file_type = file_url.split('?')[-1].split('.')[-1].lower()
  300. if file_type in file_types:
  301. return file_type
  302. else:
  303. for ftp in file_types:
  304. file_type = re.search(ftp, file_name) or re.search("\." + ftp, file_url)
  305. if file_type:
  306. return file_type.group(0).replace('.', '')
  307. else:
  308. return file_type
  309. return None
  310. def remove_htmldata(remove_info_list: list, html: str, response):
  311. """
  312. 过滤详情页无效数据
  313. Args:
  314. remove_info_list: 需删除内容的xpath或文本 -> list [xpath,re,str] eg:['<re>data:image/(.*?)"',]
  315. html: 待清洗文本
  316. response: 原文响应体
  317. Returns: 清洗后的文本
  318. """
  319. if html and remove_info_list:
  320. for extra_item in remove_info_list:
  321. if re.search('^//.*', extra_item):
  322. extra_html_list = response.xpath(extra_item).extract()
  323. for extra_html in extra_html_list:
  324. if extra_html:
  325. html = html.replace(extra_html, '')
  326. elif re.search('^<re>.*', extra_item):
  327. extra_item = extra_item.replace('<re>', '')
  328. extra_html_list = re.findall(f'{extra_item}', html, re.S | re.I | re.M)
  329. if extra_html_list:
  330. for exhtml in extra_html_list:
  331. html = html.replace(exhtml, '')
  332. else:
  333. extra_html = extra_item
  334. if extra_html:
  335. html = html.replace(extra_html, '')
  336. return html
  337. def text_search(content: str) -> SearchText:
  338. """
  339. 中文检索
  340. :param content: 文本
  341. :return: 中文数量
  342. """
  343. if not content:
  344. return SearchText(0)
  345. results = re.findall('[\u4e00-\u9fa5]', content, re.S)
  346. # 列表长度即是中文的字数
  347. return SearchText(len(results))
  348. def clean_title(title):
  349. '''清洗标题'''
  350. if title:
  351. rule_list = [
  352. '\(\d{1,20}\)',
  353. '\[[\u4e00-\u9fa5]{1,9}\]',
  354. '【[\u4e00-\u9fa5]{1,9}】',
  355. ]
  356. for rule in rule_list:
  357. title = re.sub(rule, '', title)
  358. return title
def substitute(html_str, special=None, completely=False):
    """Clean an HTML string by delegating to clean_html.cleaner.

    Args:
        html_str: raw HTML to clean.
        special: extra rules, forwarded to `cleaner` — see clean_html module.
        completely: forwarded to `cleaner`; presumably enables a more
            aggressive clean — confirm against clean_html.
    """
    html_str = cleaner(html=html_str, special=special, completely=completely)
    return html_str
def handle_publish_time(publishtime):
    """Normalize a publish time string and derive its Int64 timestamp.

    A missing time-of-day, or a placeholder "00:00:00", is replaced with the
    current time-of-day; future times are clamped by
    handle_publish_time_overdue.

    Args:
        publishtime: date string, with or without "HH:MM:SS".

    Returns:
        tuple: (normalized publishtime string, Int64 timestamp).

    Raises:
        EOFError: on any failure while parsing/normalizing.
    """
    try:
        time_str = get_current_date().split(' ')[-1]
        if ':' not in publishtime:
            publishtime = publishtime + ' ' + time_str
        else:
            if '00:00:00' in publishtime:
                publishtime = publishtime.split(' ')[0] + ' ' + time_str
        l_np_publishtime = int2long(date_to_timestamp(publishtime))
        publishtime, l_np_publishtime = handle_publish_time_overdue(publishtime, l_np_publishtime)
        return publishtime, l_np_publishtime
    except:
        # NOTE(review): bare except re-raised as EOFError is unusual; kept
        # as-is because downstream callers may catch EOFError specifically.
        raise EOFError("publishtime 格式错误!")
  377. def handle_publish_time_overdue(publishtime, l_np_publishtime):
  378. """处理超期发布时间"""
  379. if l_np_publishtime and l_np_publishtime > get_current_timestamp():
  380. logger.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
  381. publishtime = get_current_date()
  382. l_np_publishtime = ensure_int64(date_to_timestamp(publishtime))
  383. return publishtime, l_np_publishtime
  384. def handle_page_html(item):
  385. '''检测正文'''
  386. title = item.get('title')
  387. publishtime = item.get('publishtime')
  388. href = item.get('href')
  389. if href == "#":
  390. href = item.get('competehref')
  391. contenthtml = item.get('contenthtml')
  392. detail = item.get('detail')
  393. if not contenthtml:
  394. logger.warning(f"页面源码不能为空!\n 发布地址:{href}\n 发布时间:{publishtime}\n 标题:{title}")
  395. raise ValueError("无效正文!")
  396. else:
  397. if text_search(detail).total == 0:
  398. logger.warning("无内容数据,数据不入保存服务!")
  399. item['sendflag'] = "true"
  400. def check_data_validity(item):
  401. '''检测基础字段是否完整'''
  402. title = item.get('title')
  403. publishtime = item.get('publishtime')
  404. href = item.get('href')
  405. if href == "#":
  406. href = item.get('competehref')
  407. if not title or not publishtime or not href:
  408. logger.error(f"基础数据不能为空!\n 发布地址:{href}\n 发布时间:{publishtime}\n 标题:{title}")
  409. raise ValueError("基础数据异常")
def format_fileds(item, **kwargs):
    """Format storage fields for a bidding item.

    Drops keys outside the whitelist, derives detail / s_title / publishtime,
    stamps constant flags, merges whitelisted kwargs, validates the item,
    then sets comeintime.

    Args:
        item: crawled data dict (mutated in place).
        **kwargs: extra fields to set; must appear in the whitelist.

    Returns:
        dict: the formatted item.
    """
    req_fileds = ['title', 'publishtime', 'spidercode', 'infoformat', 'site', 'channel', 'area', 'city', 'jsondata',
                  'district', 'href', 'is_mixed', 'comeintime', 's_title', 'l_np_publishtime', 'contenthtml','competehref',
                  'detail', 'iscompete', 'sendflag', '_d', 'publishdept', 'type', 'T', 'projectinfo', 'is_theme']
    rm_list = []
    for key, val in item.items():  # collect keys outside the whitelist
        if key not in req_fileds:
            rm_list.append(key)
    for kk in rm_list:
        item.pop(kk, None)
    item['detail'] = substitute(item.get('contenthtml'))
    item['s_title'] = item.get('s_title') or item.get('title')
    pub_time = handle_publish_time(item.get('publishtime'))
    item['publishtime'] = pub_time[0]
    item['l_np_publishtime'] = pub_time[1]
    item['infoformat'] = 1
    item['iscompete'] = True
    item['sendflag'] = "false"
    item['_d'] = "comeintime"
    item['publishdept'] = ""
    item['type'] = ""
    item['T'] = "bidding"
    for k, v in kwargs.items():
        if k in req_fileds:
            item[k] = v
        else:
            logger.error(f"{k} 入库字段未定义!")
    handle_page_html(item)
    check_data_validity(item)
    item['comeintime'] = int2long(time.time())
    return item
def format_fileds_njpc(item, **kwargs):
    """Format storage fields for a proposed-construction (拟建) item.

    Same pipeline as format_fileds but with the njpc whitelist and
    infoformat=2.

    Args:
        item: crawled data dict (mutated in place).
        **kwargs: extra fields to set; must appear in the whitelist.

    Returns:
        dict: the formatted item.
    """
    req_fileds = ['site', 'approvenumber', 'method', 'project_scale', 'area', 'is_mixed','competehref',
                  'air_conditioner', 'funds', 'scale', 'construction_area', 'channel', 'contenthtml', 'elevator',
                  'building_floors', 'ownertel', 'parking', 'building', 'spidercode', 'title',
                  'detail', 'projectinfo', 'exterior', 'constructionunit', 'owner_info', 'approvetime',
                  'project_startdate', 'investment', 'heating', 'district', 'constructionunitperson',
                  'designunitperson', 'publishtime', 'system', 'pace', 'total', 'project_scale_info', 'passive',
                  'phone', 'construction', 'parking_pace', 'floors', 'freshair_system', 'other_project_scale',
                  'conditioner', 'wall', 'designunit', 'owneraddr', 'prefabricated_building', 'materials',
                  'constructionunitaddr', 'constructionunit_info', 'project_person', 'approvecontent',
                  'constructionunittel', 'floor', 'person', 'city', 'floor_area', 'project', 'approvestatus',
                  'project_completedate', 'completedate', 'ownerperson', 'sendflag', 'comeintime',
                  'steel_structure', 'projectaddr', 'freshair', 'T', 'startdate', 'house', 'projectname',
                  'exterior_wall_materials', 'other', 'passive_house', 'jsondata', 'air', 'prefabricated',
                  'designunit_info', 'approvedept', 'total_investment', 'infoformat', 'project_phone',
                  'owner', 'designunittel', 'projecttype', 'approvecode', 'steel', 'is_theme', 'designunitaddr',
                  'heating_method', 'href', 'projectperiod', 'structure']
    rm_list = []
    for key, val in item.items():  # collect keys outside the whitelist
        if key not in req_fileds:
            rm_list.append(key)
    for kk in rm_list:
        item.pop(kk, None)
    item['detail'] = substitute(item.get('contenthtml'))
    item['title'] = item.get('title') or item.get('projectname')
    pub_time = handle_publish_time(item.get('publishtime'))
    # NOTE(review): stores pub_time[1] (the Int64 timestamp) into publishtime,
    # unlike format_fileds which stores pub_time[0] (the string) — looks like
    # a copy/paste bug, but the njpc whitelist has no l_np_publishtime field;
    # confirm against the storage schema before changing.
    item['publishtime'] = pub_time[1]
    item['infoformat'] = 2
    item['sendflag'] = "false"
    item['T'] = "bidding"
    for k, v in kwargs.items():
        if k in req_fileds:
            item[k] = v
        else:
            logger.error(f"{k} 入库字段未定义!")
    handle_page_html(item)
    check_data_validity(item)
    item['comeintime'] = int2long(time.time())
    return item
  482. def search(pattern, string):
  483. result = re.search(pattern, string)
  484. if result:
  485. return result.groups()[0]
  486. def sleep_time(start_time: int, end_time=0, step=-1):
  487. time.sleep(random.random())
  488. for i in range(start_time, end_time, step):
  489. print(f"\r *** 休眠中... {i} 秒 *** ", end='')
  490. time.sleep(1)
  491. print("\r <* 休眠结束 *> ", end='')
  492. # 装饰器
  493. class Singleton(object):
  494. def __init__(self, cls):
  495. self._cls = cls
  496. self._instance = {}
  497. def __call__(self, *args, **kwargs):
  498. if self._cls not in self._instance:
  499. self._instance[self._cls] = self._cls(*args, **kwargs)
  500. return self._instance[self._cls]
def down_load_image(proxy=None):
    """Fetch the GD gov-procurement captcha image and solve it via the captcha service.

    Args:
        proxy: optional requests-style proxies dict.

    Returns:
        str | None: the 4-character captcha code, or None on failure.
    """
    img_url = 'https://gdgpo.czt.gd.gov.cn/freecms/verify/verifyCode.do?createTypeFlag=n'
    header = {
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Referer": "https://gdgpo.czt.gd.gov.cn/cms-gd/site/guangdong/qwjsy/index.html?",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    }
    res = requests.get(img_url, headers=header, proxies=proxy, timeout=30, verify=False)
    # upload the raw image bytes to the internal recognition service
    upload_address = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
    content = {'file': res.content}
    # with open('image.jpg', 'wb+') as f:
    #     f.write(res.content)
    headers = {'accept': 'application/json'}
    json_resp = requests.post(upload_address, headers=headers, files=content, stream=True).json()
    if "msg" in json_resp and "success" == json_resp["msg"]:
        code = json_resp["r"]["code"]
        # only a 4-character result is considered a valid solve
        if len(code) == 4:
            return code
    return None
  522. def _pack_file(file):
  523. """包装验证码格式"""
  524. if isinstance(file, str) and file.startswith("data:image"):
  525. img_file = {"file": file}
  526. elif isinstance(file, bytes):
  527. img_file = {"file": file}
  528. else:
  529. with open(file, "rb") as f:
  530. img_bytes = f.read()
  531. img_file = {"file": img_bytes}
  532. return img_file
def simple_captcha(file):
    """Solve a plain image captcha via the internal recognition service.

    Args:
        file: captcha image — filesystem path, raw bytes, or base64 data-URI.

    Returns:
        str | None: the recognized code, or None on failure.
    """
    url = "http://pycaptcha.spdata.jianyu360.com/v1/images/verify"
    files = _pack_file(file)
    r = requests.post(url, headers={"accept": "application/json"}, files=files, stream=True, timeout=10)
    rp_json = r.json()
    if "msg" in rp_json and "success" == rp_json["msg"]:
        return str(rp_json["r"]["code"])
    return None
  546. def retry_on_exception(retries=1, timeout=1):
  547. def decorate(func):
  548. @functools.wraps(func)
  549. def warp(*args, **kwargs):
  550. for _ in range(retries):
  551. try:
  552. return func(*args, **kwargs)
  553. except Exception as e:
  554. print(f"执行[{func.__name__}]失败, args:{args}, kwargs:{kwargs} 异常:{e}")
  555. time.sleep(timeout)
  556. raise RuntimeError(f"执行[{func.__name__}]达到最大重试次数")
  557. return warp
  558. return decorate
  559. class PySpiderError(Exception):
  560. def __init__(self, *args, **kwargs):
  561. if 'code' not in kwargs and 'reason' not in kwargs:
  562. kwargs['code'] = 10000
  563. kwargs['reason'] = '未知爬虫错误,请手动处理'
  564. for key, val in kwargs.items():
  565. setattr(self, key, val)
  566. super(PySpiderError, self).__init__(*args, kwargs)
  567. class AttachmentNullError(PySpiderError):
  568. def __init__(self, code: int = 10004, reason: str = '附件下载异常'):
  569. super(AttachmentNullError, self).__init__(code=code, reason=reason)
class CustomError(Exception):
    """Generic exception whose str() is exactly the message passed in."""
    def __init__(self, ErrorInfo):
        # stored verbatim; returned by __str__
        self.ErrorInfo = ErrorInfo
    def __str__(self):
        return self.ErrorInfo