fengweiqiang 5 年之前
父节点
当前提交
23a62f84a2
共有 1 个文件被更改,包括 25 次插入 和 19 次删除
  1. 25 19
      src/jy/extract/extract.go

+ 25 - 19
src/jy/extract/extract.go

@@ -27,12 +27,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask //任务列表
+	ClearTaskList map[string]*ClearTask   //清理任务列表
+	saveLimit     = 100                   //抽取日志批量保存
+	PageSize      = 5000                  //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -267,22 +267,28 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
-	//正文小于200个字,有附件把附件内容加到正文
-	tmpDeatil := detail
-	tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
-	if err == nil {
-		conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
-		if conlen < 200 {
-			if isextFile {
-				detail += qu.ObjToString(doc["detailfile"])
-				doc["detail"] = detail
+	if utf8.RuneCountInString(detail) < 2000 {
+		detail += qu.ObjToString(doc["detailfile"])
+		doc["detail"] = detail
+	} else {
+		//正文小于2000个字,有附件把附件内容加到正文
+		tmpDeatil := detail
+		tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
+		if err == nil {
+			conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
+			if conlen < 2000 {
+				if isextFile {
+					detail += qu.ObjToString(doc["detailfile"])
+					doc["detail"] = detail
+				}
+			} else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
+				//防止文本过长,造成抽取阻塞
+				log.Debug("文本太长", doc["_id"], conlen)
+				doc["detail"] = d3
 			}
-		} else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
-			//防止文本过长,造成抽取阻塞
-			log.Debug("文本太长", doc["_id"], conlen)
-			doc["detail"] = d3
 		}
 	}
+
 	toptype := qu.ObjToString(doc["toptype"])
 	subtype := qu.ObjToString(doc["subtype"])
 	if qu.ObjToString(doc["type"]) == "bid" {