|
@@ -4,7 +4,6 @@ import (
|
|
|
"bytes"
|
|
|
"encoding/json"
|
|
|
"fmt"
|
|
|
- "github.com/PuerkitoBio/goquery"
|
|
|
"jy/clear"
|
|
|
db "jy/mongodbutil"
|
|
|
"jy/pretreated"
|
|
@@ -18,6 +17,8 @@ import (
|
|
|
"time"
|
|
|
"unicode/utf8"
|
|
|
|
|
|
+ "github.com/PuerkitoBio/goquery"
|
|
|
+
|
|
|
log "github.com/donnie4w/go-logger/logger"
|
|
|
"gopkg.in/mgo.v2/bson"
|
|
|
)
|
|
@@ -25,12 +26,12 @@ import (
|
|
|
var (
|
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
+ PageSize = 5000 //查询分页
|
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -248,8 +249,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
|
//正文小于50个字,有附件把附件内容加到正文
|
|
|
tmpDeatil := detail
|
|
|
tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
|
|
|
- if err == nil{
|
|
|
- if utf8.RuneCountInString(strings.Trim(tmpdocument.Text()," ")) < 50{
|
|
|
+ if err == nil {
|
|
|
+ if utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " ")) < 50 {
|
|
|
if isextFile {
|
|
|
detail += qu.ObjToString(doc["detailfile"])
|
|
|
}
|
|
@@ -716,7 +717,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
|
|
|
//lua脚本根据属性设置提取kv值
|
|
|
func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
|
|
|
kvmap := map[string][]map[string]interface{}{}
|
|
|
- if len(j.Winnerorder) > 0 {
|
|
|
+ if len(j.Winnerorder) > 1 {
|
|
|
if vc.Field == "bidamount" {
|
|
|
for _, v := range j.Winnerorder {
|
|
|
kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
|
|
@@ -1280,8 +1281,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
tmp["kvtext"] = kvtext.String()
|
|
|
}
|
|
|
if len(blocks) > 0 {
|
|
|
- if blocksBytes, err := json.Marshal(blocks);err == nil{
|
|
|
- if utf8.RuneCount(blocksBytes) < 100000{
|
|
|
+ if blocksBytes, err := json.Marshal(blocks); err == nil {
|
|
|
+ if utf8.RuneCount(blocksBytes) < 100000 {
|
|
|
tmp["blocks"] = string(blocksBytes)
|
|
|
}
|
|
|
}
|
|
@@ -1431,7 +1432,7 @@ func delFiled(k string) bool {
|
|
|
return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|
|
|
}
|
|
|
|
|
|
-func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|
|
|
+func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|
|
|
defer qu.Catch()
|
|
|
doc := j.Data
|
|
|
result := j.Result
|
|
@@ -1442,7 +1443,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[stri
|
|
|
for _, val := range result {
|
|
|
ju.Sort(val)
|
|
|
}
|
|
|
- j.Result = JsonDataMergeProcessing(j,e)
|
|
|
+ j.Result = JsonDataMergeProcessing(j, e)
|
|
|
return doc, result, _id
|
|
|
}
|
|
|
|
|
@@ -1512,7 +1513,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //redis未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|