1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253 |
- import re
- from lxml.html import HtmlElement
# Regular expressions used to locate a publication date/time inside page text.
#
# Order matters: consumers try the patterns from first to last and take the
# first one that matches, so more specific forms (date + time) are listed
# before looser ones (date only, month/day only).
#
# Conventions shared by every entry:
#   * Raw strings, so ``\d`` / ``\s`` reach the regex engine verbatim instead
#     of being treated as (invalid) string escape sequences.
#   * Exactly one capturing group -- group(1) is the whole date/time text.
#   * ``[-/.]`` is the date separator class.  ``|`` is deliberately absent:
#     inside a character class it is a literal pipe, not alternation.
#   * ``\s*?`` between date and time tolerates text whose whitespace has
#     been kept as well as text whose whitespace has been stripped.
#   * Chinese clock notation is ``<hour>时`` optionally followed by
#     ``<minute>分``; the hour is limited to 0-23 and the minute to 0-59.
#   * The ``2[0-3]`` hour variants are subsumed by the ``[0-2]?[0-9]`` entry
#     directly above them; they are kept so the list layout stays stable.
DATETIME_PATTERN = [
    # yyyy-mm-dd (also "/" or ".") followed by a time
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时(?:[0-5]?\d分)?)",
    # yy-mm-dd followed by a time
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?(?:[01]?\d|2[0-3])时(?:[0-5]?\d分)?)",
    # date only (yyyy-mm-dd), then bare mm-dd
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{1,2}[-/.]\d{1,2})",
    # yyyy年mm月dd日 followed by a time
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时(?:[0-5]?\d分)?)",
    # yy年mm月dd日 followed by a time
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时(?:[0-5]?\d分)?)",
    # mm月dd日 followed by a time
    r"(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
    r"(\d{1,2}月\d{1,2}日\s*?(?:[01]?\d|2[0-3])时(?:[0-5]?\d分)?)",
    # date-only fallbacks
    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2})",
    r"(\d{4}年\d{1,2}月\d{1,2}日)",
    r"(\d{2}年\d{1,2}月\d{1,2}日)",
    r"(\d{1,2}月\d{1,2}日)",
]
class TimeExtractor:
    """Find the first date/time-looking substring in an HTML element's text.

    The extractor walks an ordered list of regular expressions and returns
    the text captured by group 1 of the first pattern that matches.
    """

    def __init__(self, patterns=None):
        """Create an extractor.

        Args:
            patterns: Optional iterable of regex strings to try, in priority
                order.  Each pattern must define capture group 1 as the
                date/time text.  When omitted, the module-level
                ``DATETIME_PATTERN`` list is used.
        """
        # Copy caller-supplied patterns so later changes to their iterable do
        # not silently alter this extractor; the default list is shared as-is.
        self.time_pattern = DATETIME_PATTERN if patterns is None else list(patterns)

    def extractor(self, element: "HtmlElement") -> str:
        """Return the first date/time string found in ``element``'s text.

        Args:
            element: Node whose ``xpath('string(.)')`` yields the text to
                scan.  ``None`` is treated as "nothing to scan".

        Returns:
            Group 1 of the first pattern in ``self.time_pattern`` that
            matches, or ``''`` when no pattern matches.  All whitespace is
            removed from the text before matching, so a result such as
            ``"2020-01-0210:30"`` has no gap between date and time.
        """
        if element is None:
            return ''
        # str.split() with no argument drops every run of whitespace, so the
        # join yields the node's text with all whitespace removed.
        text = ''.join(element.xpath('string(.)').split())
        for pattern in self.time_pattern:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return ''
|