Browse Source

fixbug - 时间抽取规则修正

dongzhaorui 2 years ago
parent
commit
b14f666eed
1 changed file with 10 additions and 10 deletions
  1. 10 10
      find_source/crawler/analysis/TimeExtractor.py

+ 10 - 10
find_source/crawler/analysis/TimeExtractor.py

@@ -3,31 +3,31 @@ import re
 from lxml.html import HtmlElement
 
 DATETIME_PATTERN = [
-    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
-    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
     "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
-    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
-    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-2]?[0-9]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
     "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",
     "(\d{1,2}[-|/|.]\d{1,2})",
-    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
-    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
-    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
-    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
     "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
-    "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    "(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
     "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
-    "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    "(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]:[0-5]?[0-9])",
     "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])",
     "(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)",
     "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})",