TimeExtractor.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. import re
  2. from lxml.html import HtmlElement
  3. DATETIME_PATTERN = [
  4. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
  5. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
  6. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
  7. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
  8. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
  9. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
  10. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
  11. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
  12. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
  13. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
  14. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
  15. "(\d{1,2}[-|/|.]\d{1,2})",
  16. "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
  17. "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
  18. "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
  19. "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
  20. "(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
  21. "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
  22. "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
  23. "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
  24. "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
  25. "(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
  26. "(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
  27. "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
  28. "(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
  29. "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
  30. "(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
  31. "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
  32. "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
  33. "(\d{4}年\d{1,2}月\d{1,2}日)",
  34. "(\d{2}年\d{1,2}月\d{1,2}日)",
  35. "(\d{1,2}月\d{1,2}日)"
  36. ]
  37. class TimeExtractor:
  38. def __init__(self):
  39. self.time_pattern = DATETIME_PATTERN
  40. def extractor(self, element: HtmlElement) -> str:
  41. # text = ''.join(element.xpath('.//text()'))
  42. text = ''.join(element.xpath('string(.)').split())
  43. for dt in self.time_pattern:
  44. dt_obj = re.search(dt, text)
  45. if dt_obj:
  46. return dt_obj.group(1)
  47. else:
  48. return ''