|
@@ -18,8 +18,6 @@ import (
|
|
|
"time"
|
|
|
"unicode/utf8"
|
|
|
|
|
|
- "github.com/PuerkitoBio/goquery"
|
|
|
-
|
|
|
log "github.com/donnie4w/go-logger/logger"
|
|
|
"gopkg.in/mgo.v2/bson"
|
|
|
)
|
|
@@ -33,8 +31,8 @@ var (
|
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
saveLimit = 100 //抽取日志批量保存
|
|
|
PageSize = 5000 //查询分页
|
|
|
- Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
|
- //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
|
|
|
+ Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
|
|
|
+ //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
|
|
|
@@ -62,6 +60,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
|
ext.InitTag(true)
|
|
|
ext.InitClearFn(false)
|
|
|
ext.InitClearFn(true)
|
|
|
+ ext.Lock()
|
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
|
//初始化城市DFA信息
|
|
|
ext.InitCityInfo()
|
|
@@ -69,6 +68,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
|
ext.InitAreaCode()
|
|
|
ext.InitPostCode()
|
|
|
}
|
|
|
+ ext.Unlock()
|
|
|
//质量审核
|
|
|
ext.InitAuditFields()
|
|
|
ext.InitAuditRule()
|
|
@@ -151,6 +151,7 @@ func StartExtractTaskId(taskId string) bool {
|
|
|
ext.InitTag(true)
|
|
|
ext.InitClearFn(false)
|
|
|
ext.InitClearFn(true)
|
|
|
+ ext.Lock()
|
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
|
//初始化城市DFA信息
|
|
|
//ext.InitCityDFA()
|
|
@@ -158,6 +159,7 @@ func StartExtractTaskId(taskId string) bool {
|
|
|
ext.InitAreaCode()
|
|
|
ext.InitPostCode()
|
|
|
}
|
|
|
+ ext.Unlock()
|
|
|
//质量审核
|
|
|
ext.InitAuditFields()
|
|
|
ext.InitAuditRule()
|
|
@@ -216,7 +218,7 @@ func RunExtractTask(taskId string) {
|
|
|
//}
|
|
|
if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时
|
|
|
continue
|
|
|
- } //根据标题判断是否抽取
|
|
|
+ }
|
|
|
b := IsExtract("title", qu.ObjToString(v["title"]), "")
|
|
|
if !b {
|
|
|
continue
|
|
@@ -285,30 +287,22 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
if isextFile {
|
|
|
file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
}
|
|
|
- if utf8.RuneCountInString(detail) < 2000 {
|
|
|
- if doc["detailfile"] == nil || doc["detailfile"] == "" {
|
|
|
- file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
- }
|
|
|
- detail += qu.ObjToString(doc["detailfile"])
|
|
|
- doc["detail"] = detail
|
|
|
- } else {
|
|
|
- //正文小于200个字,有附件把附件内容加到正文
|
|
|
- tmpDeatil := detail
|
|
|
- tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
|
|
|
- if err == nil {
|
|
|
- conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
|
|
|
- if conlen < 2000 {
|
|
|
- if isextFile {
|
|
|
- detail += qu.ObjToString(doc["detailfile"])
|
|
|
- doc["detail"] = detail
|
|
|
- }
|
|
|
- } else if conlen > qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
- //防止文本过长,造成抽取阻塞
|
|
|
- log.Debug("文本太长", doc["_id"], conlen)
|
|
|
- doc["detail"] = d3
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
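+ // If the body text is shorter than 2000 characters and attachments exist, append the attachment text to the body (the logic below is currently commented out).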
|
|
|
+ //tmpDeatil := detail
|
|
|
+ //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
|
|
|
+ //if err == nil {
|
|
|
+ // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
|
|
|
+ // if conlen < 2000 {
|
|
|
+ // if isextFile {
|
|
|
+ // detail += qu.ObjToString(doc["detailfile"])
|
|
|
+ // doc["detail"] = detail
|
|
|
+ // }
|
|
|
+ // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) {
|
|
|
+ // //防止文本过长,造成抽取阻塞
|
|
|
+ // log.Debug("文本太长", doc["_id"], conlen)
|
|
|
+ // doc["detail"] = d3
|
|
|
+ // }
|
|
|
+ //}
|
|
|
|
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
|
subtype := qu.ObjToString(doc["subtype"])
|
|
@@ -331,6 +325,9 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
if (*toMap)["extweight"] == nil {
|
|
|
(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
|
|
|
}
|
|
|
+ if (*toMap)["jsoncontent"] != nil {
|
|
|
+ delete(*toMap, "jsoncontent")
|
|
|
+ }
|
|
|
}
|
|
|
j = &ju.Job{
|
|
|
SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
@@ -398,7 +395,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
}
|
|
|
qu.Try(func() {
|
|
|
pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
|
|
|
- if isextFile {
|
|
|
+ if isextFile && strings.TrimSpace(jf.Content) != "" {
|
|
|
pretreated.AnalyStart(jf, isSite, codeSite)
|
|
|
}
|
|
|
}, func(err interface{}) {
|
|
@@ -407,9 +404,14 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
return j, jf, isSite
|
|
|
}
|
|
|
|
|
|
+var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
|
|
|
+var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
|
|
|
+
|
|
|
//遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
func file2text(doc *map[string]interface{}) {
|
|
|
- tmpstr := ""
|
|
|
+ mnameone := map[string]bool{}
|
|
|
+ mname := map[string]bool{}
|
|
|
+ murl := map[string]string{}
|
|
|
//if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
|
|
|
if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
|
|
|
for _, attachs := range attach_text {
|
|
@@ -417,22 +419,41 @@ func file2text(doc *map[string]interface{}) {
|
|
|
for _, fileinfo := range fileinfos {
|
|
|
if ff, ok := fileinfo.(map[string]interface{}); ok {
|
|
|
attach_url := qu.ObjToString(ff["attach_url"])
|
|
|
- //if utf8.RuneCountInString(tmpstr+attach_url) < qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
- // tmpstr += attach_url + "\n"
|
|
|
- //} else {
|
|
|
- // break
|
|
|
- //}
|
|
|
- bs := ju.OssGetObject(attach_url)
|
|
|
- if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
- tmpstr += bs + "\n"
|
|
|
- } else {
|
|
|
- break
|
|
|
+ ffname := qu.ObjToString(ff["file_name"])
|
|
|
+ if clearStrReg.MatchString(ffname) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ mname[ffname] = true
|
|
|
+ murl[ffname] = attach_url
|
|
|
+ if sortStrReg.MatchString(ffname) {
|
|
|
+ mnameone[ffname] = true
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ tmpstr := ""
|
|
|
+ for k := range mnameone {
|
|
|
+ if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
|
|
|
+ (*doc)["detailfile"] = tmpstr
|
|
|
+ return
|
|
|
+ }
|
|
|
+ bs := ju.OssGetObject(murl[k])
|
|
|
+ if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
|
|
|
+ tmpstr += bs + "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for k := range mname {
|
|
|
+ if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) {
|
|
|
+ (*doc)["detailfile"] = tmpstr
|
|
|
+ return
|
|
|
+ }
|
|
|
+ bs := ju.OssGetObject(murl[k])
|
|
|
+ if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) {
|
|
|
+ tmpstr += bs + "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
(*doc)["detailfile"] = tmpstr
|
|
|
}
|
|
|
|
|
@@ -441,6 +462,14 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
|
e.ExtractDetail(j, isSite, j.SpiderCode)
|
|
|
if jf != nil && jf.IsFile {
|
|
|
e.ExtractFile(jf, isSite, j.SpiderCode)
|
|
|
+ for tmpk, _ := range jf.Result {
|
|
|
+ if len(j.Result[tmpk]) == 0 {
|
|
|
+ j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
|
|
|
+ j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
|
|
|
+ }
|
|
|
}
|
|
|
if isSite {
|
|
|
ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
|
|
@@ -605,6 +634,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
lockclear.Lock()
|
|
|
var cfn = []string{}
|
|
|
if isSite {
|
|
|
+ cfn = e.SiteClearFn[key]
|
|
|
} else {
|
|
|
cfn = e.ClearFn[key]
|
|
|
}
|
|
@@ -623,7 +653,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
}
|
|
|
before, _ := v.Value.(string)
|
|
|
v.Value = data[0]
|
|
|
- BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
|
|
|
+ BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
|
|
|
//添加行数清理的日志
|
|
|
//清理特殊符号
|
|
|
lockclear.Lock()
|
|
@@ -631,7 +661,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
text := qu.ObjToString(v.Value)
|
|
|
before = text
|
|
|
v.Value = clear.OtherClean(key, text)
|
|
|
- BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
|
|
|
+ BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
|
|
|
}
|
|
|
//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
|
|
|
lockclear.Unlock()
|
|
@@ -1198,34 +1228,10 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
|
|
|
for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
|
|
|
if k == 0 {
|
|
|
tp = "colon"
|
|
|
- // for _, vv := range v.Kvs {
|
|
|
- // qu.Debug("colon-kvs:", vv.Key, vv.Value)
|
|
|
- // }
|
|
|
- // for kkk, vv := range v.KvTags {
|
|
|
- // for _, vvv := range vv {
|
|
|
- // qu.Debug("colon-tags", kkk, vvv.Key, vvv.Value)
|
|
|
- // }
|
|
|
- // }
|
|
|
} else if k == 1 {
|
|
|
tp = "space"
|
|
|
- // for _, vv := range v.Kvs {
|
|
|
- // qu.Debug("space-kvs:", vv.Key, vv.Value)
|
|
|
- // }
|
|
|
- // for kkk, vv := range v.KvTags {
|
|
|
- // for _, vvv := range vv {
|
|
|
- // qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
|
|
|
- // }
|
|
|
- // }
|
|
|
} else if k == 2 {
|
|
|
tp = "table"
|
|
|
- // for _, vv := range v.Kvs {
|
|
|
- // qu.Debug("table-kvs:", vv.Key, vv.Value)
|
|
|
- // }
|
|
|
- // for kkk, vv := range v.KvTags {
|
|
|
- // for _, vvv := range vv {
|
|
|
- // qu.Debug("table-tags", kkk, vvv.Key, vvv.Value)
|
|
|
- // }
|
|
|
- // }
|
|
|
}
|
|
|
if v == nil || v.KvTags == nil {
|
|
|
continue
|
|
@@ -1724,11 +1730,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
tmp := map[string]interface{}{} //抽取值
|
|
|
tmp["spidercode"] = j.SpiderCode
|
|
|
tmp["site"] = j.Site
|
|
|
- tmp["jsondata"] = j.Jsondata
|
|
|
+ if len(*j.Jsondata) > 0 {
|
|
|
+ tmp["jsondata"] = j.Jsondata
|
|
|
+ }
|
|
|
for _, val := range result {
|
|
|
for _, v := range val { //取第一个非负数,项目名称除外
|
|
|
//存0是否有效
|
|
|
- if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
|
|
|
+ if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue{
|
|
|
tmp[v.Field] = v.Value
|
|
|
break
|
|
|
}
|
|
@@ -1771,7 +1779,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
if qu.Float64All(tmp["budget"]) < tmpBudget {
|
|
|
tmp["budget"] = tmpBudget
|
|
|
}
|
|
|
- if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/100 > qu.Float64All(tmp["budget"])) {
|
|
|
+ if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
|
|
|
tmp["bidamount"] = tmpBidamount
|
|
|
} else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
|
|
|
tmp["bidamount"] = tmpBidamount
|
|
@@ -1817,8 +1825,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
if len(j.Winnerorder) > 0 { //候选人信息
|
|
|
for i, v := range j.Winnerorder {
|
|
|
if v["price"] != nil {
|
|
|
- j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0]
|
|
|
- }
|
|
|
+ j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0] }
|
|
|
}
|
|
|
tmp["winnerorder"] = j.Winnerorder
|
|
|
}
|
|
@@ -1831,6 +1838,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
for _, v := range val { //取第一个非负数
|
|
|
if v.Score > -1 {
|
|
|
ffield[v.Field] = v.Value
|
|
|
+ if tmp[v.Field] == nil {
|
|
|
+ if v.Field == "budget" || v.Field == "bidamount" {
|
|
|
+ if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
|
|
|
+ tmp[v.Field] = v.Value
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ tmp[v.Field] = v.Value
|
|
|
+ }
|
|
|
+ }
|
|
|
break
|
|
|
}
|
|
|
}
|
|
@@ -1959,9 +1975,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
tmp["projectname"] = j.Title
|
|
|
}
|
|
|
tmp["repeat"] = 0
|
|
|
- if ju.Ffield {
|
|
|
- tmp["ffield"] = ffield
|
|
|
- }
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
|
/* if len(e.SiteFields) <= 0 {
|
|
@@ -2013,9 +2026,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
//if len(auxinfof) > 0 {
|
|
|
// tmp["fieldallf"] = auxinfof
|
|
|
//}
|
|
|
- //if len(ffield) > 0 {
|
|
|
- // tmp["ffield"] = ffield
|
|
|
- //}
|
|
|
+ if ju.Ffield {
|
|
|
+ if len(ffield) > 0 {
|
|
|
+ tmp["ffield"] = ffield
|
|
|
+ }
|
|
|
+ }
|
|
|
delete(tmp, "fieldall")
|
|
|
if len(j.BlockPackage) > 0 { //分包详情
|
|
|
if len(j.BlockPackage) > 10 {
|
|
@@ -2040,6 +2055,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
func checkFields(tmp map[string]interface{}) map[string]interface{} {
|
|
|
delete(tmp, "contenthtml")
|
|
|
delete(tmp, "detail")
|
|
|
+ //delete(tmp, "toptype")
|
|
|
+ //delete(tmp, "subtype")
|
|
|
if _, ok := tmp["bidamount"].(string); ok {
|
|
|
delete(tmp, "bidamount")
|
|
|
} else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]) {
|
|
@@ -2167,7 +2184,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
}
|
|
|
if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
|
|
|
//jsondata清理
|
|
|
- clearJd(j.Jsondata, e, j.SpiderCode)
|
|
|
+ clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney)
|
|
|
marshalbt, _ := json.Marshal(j.Jsondata)
|
|
|
tmpjddata := make(map[string]interface{})
|
|
|
json.Unmarshal(marshalbt, &tmpjddata)
|
|
@@ -2393,7 +2410,7 @@ func resetWinnerorder(j *ju.Job) {
|
|
|
} else if len(bidamounts) > 0 {
|
|
|
j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
}
|
|
|
func RemoveReplicaSliceString(slc []string) []string {
|
|
|
result := make([]string, 0)
|