|
@@ -27,12 +27,12 @@ import (
|
|
|
var (
|
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ cut = ju.NewCut() // extracts the body text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
|
|
|
+ TaskList map[string]*ExtractTask // task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 100 // batch-save limit for extraction logs
|
|
|
+ PageSize = 5000 // query page size
|
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -267,22 +267,28 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
if isextFile {
|
|
|
file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
}
|
|
|
- //正文小于200个字,有附件把附件内容加到正文
|
|
|
- tmpDeatil := detail
|
|
|
- tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
|
|
|
- if err == nil {
|
|
|
- conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
|
|
|
- if conlen < 200 {
|
|
|
- if isextFile {
|
|
|
- detail += qu.ObjToString(doc["detailfile"])
|
|
|
- doc["detail"] = detail
|
|
|
+ if utf8.RuneCountInString(detail) < 2000 {
|
|
|
+ detail += qu.ObjToString(doc["detailfile"])
|
|
|
+ doc["detail"] = detail
|
|
|
+ } else {
|
|
|
+ // When the body text is shorter than 2000 characters and there are attachments, append the attachment content to the body
|
|
|
+ tmpDeatil := detail
|
|
|
+ tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
|
|
|
+ if err == nil {
|
|
|
+ conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
|
|
|
+ if conlen < 2000 {
|
|
|
+ if isextFile {
|
|
|
+ detail += qu.ObjToString(doc["detailfile"])
|
|
|
+ doc["detail"] = detail
|
|
|
+ }
|
|
|
+ } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
+ //防止文本过长,造成抽取阻塞
|
|
|
+ log.Debug("文本太长", doc["_id"], conlen)
|
|
|
+ doc["detail"] = d3
|
|
|
}
|
|
|
- } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
- //防止文本过长,造成抽取阻塞
|
|
|
- log.Debug("文本太长", doc["_id"], conlen)
|
|
|
- doc["detail"] = d3
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
|
subtype := qu.ObjToString(doc["subtype"])
|
|
|
if qu.ObjToString(doc["type"]) == "bid" {
|