Переглянути джерело

文本中提取发布时间

maxiaoshan 4 роки тому
батько
коміт
cf6482c5d5
1 змінений файл: 14 рядків додано та 10 видалено
  1. 14 10
      src/spiderutil/publishtime.go

+ 14 - 10
src/spiderutil/publishtime.go

@@ -46,9 +46,10 @@ var (
 )
 
 func GetPublishtime(textArr []string) string {
+	defer util.Catch()
 	for _, text := range textArr {
 		t, b := GetPublishtimeByText(text)
-		if b {
+		if b && t < time.Now().Unix() {
 			return util.FormatDateByInt64(&t, util.Date_Full_Layout)
 		}
 	}
@@ -56,12 +57,12 @@ func GetPublishtime(textArr []string) string {
 }
 
 func GetPublishtimeByText(text string) (int64, bool) {
+	defer util.Catch()
 	runeDetail := []rune(text)
 	length := len(runeDetail)
 	if length < 30 {
 		return int64(0), false
 	}
-	comeintime := time.Now().Unix()
 	text = FilterReg.ReplaceAllString(text, "")   //去除html标签
 	text = FilterReg2.ReplaceAllString(text, "")  //去除字母
 	text = RegEndSpace.ReplaceAllString(text, "") //去除结尾空格
@@ -73,32 +74,32 @@ func GetPublishtimeByText(text string) (int64, bool) {
 
 	//1、结尾时间
 	result = strings.ReplaceAll(result, "○", "0")
-	publishtime, endMatch := MatchDate(result, comeintime)
+	publishtime, endMatch := MatchDate(result)
 	if !endMatch { //开头时间
 		if length > 70 {
 			result = string(runeDetail[:70])
 		}
 		result = strings.ReplaceAll(result, "○", "0")
-		publishtime, endMatch = MatchDate(result, comeintime)
+		publishtime, endMatch = MatchDate(result)
 	}
 
 	//2.发布时间
 	if !endMatch { //结尾时间没有匹配到
 		if date := Date3.FindString(text); date != "" {
-			publishtime, endMatch = MatchDate(date, comeintime)
+			publishtime, endMatch = MatchDate(date)
 
 		}
 	}
 	//4.公告(开始)时间
 	if !endMatch {
 		if date := Date4.FindString(text); date != "" {
-			publishtime, endMatch = MatchDate(date, comeintime)
+			publishtime, endMatch = MatchDate(date)
 		}
 	}
 	//5.
 	if !endMatch {
 		if date := Date6.FindString(text); date != "" {
-			publishtime, endMatch = MatchDate(date, comeintime)
+			publishtime, endMatch = MatchDate(date)
 		}
 	}
 	//6.定于2021年1月18日
@@ -118,7 +119,7 @@ func GetPublishtimeByText(text string) (int64, bool) {
 			d = CompletDate(d)
 			if d != "" {
 				t, err := time.ParseInLocation(util.Date_Short_Layout, d, time.Local)
-				if err == nil && t.Unix() < comeintime {
+				if err == nil {
 					publishtime = t.Unix()
 					endMatch = true
 				}
@@ -127,12 +128,13 @@ func GetPublishtimeByText(text string) (int64, bool) {
 	}
 
 	//if date := Date7.FindString(text); date != "" {
-	//	publishtime, _ = MatchDate( date, comeintime)
+	//	publishtime, _ = MatchDate( date)
 	//}
 	return publishtime, endMatch
 }
 
-func MatchDate(text string, comeintime int64) (int64, bool) {
+func MatchDate(text string) (int64, bool) {
+	defer util.Catch()
 	var dateArr []int64
 	for _, reg := range []*regexp.Regexp{HanDate, Date1, Date2} {
 		date := reg.FindAllString(text, -1)
@@ -164,6 +166,7 @@ func MatchDate(text string, comeintime int64) (int64, bool) {
 }
 
 func replaceDate(s string) string {
+	defer util.Catch()
 	dateText := RegSpace.ReplaceAllString(s, "")
 	d := ""
 	for _, r := range []rune(dateText) {
@@ -179,6 +182,7 @@ func replaceDate(s string) string {
 
 //2020-2-2 ->2020-02-02
 func CompletDate(date string) string {
+	defer util.Catch()
 	result := ""
 	dateArr := strings.Split(date, "-")
 	if len(dateArr) == 3 {