wcj 6 ani în urmă
părinte
comite
38ddfc361b
2 fișiere modificate, cu 225 adăugări și 1 ștergere
  1. 1 1
      src/main.go
  2. 224 0
      src/main_blocktest.go

+ 1 - 1
src/main.go

@@ -36,7 +36,7 @@ func init() {
 	//elastic.InitElasticSize(qu.ObjToString(util.Config["elasticsearch"]), qu.IntAllDef(util.Config["elasticPoolSize"], 30))
 }
 
-func main12() {
+func main() {
 	extract.ExtractUdp() //udp通知抽取
 	extract.ClearUdp()   //udp通知清理
 	go extract.Export()

+ 224 - 0
src/main_blocktest.go

@@ -0,0 +1,224 @@
+package main
+
+import (
+	"fmt"
+	"jy/extract"
+	"jy/mongodbutil"
+	"jy/pretreated"
+	ju "jy/util"
+	"log"
+	"os"
+	qu "qfw/util"
+	"regexp"
+	"time"
+)
+
+// f is the output file that main12 opens ("./title.txt"). It is only written
+// by the block-title dump inside com.
+var f *os.File
+// m is the set of block titles the title dump in com has already seen; com
+// consults it to skip repeated titles. Note that all() and one() each declare
+// a local variable named m that shadows this one inside those functions.
+var m = map[string]bool{}
+
+func main12() {
+	//log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
+	//return
+	f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777)
+	//all()
+	one()
+}
+// all feeds every document of the extract_kf.bidding201901 collection to com,
+// running at most five com calls at the same time, and logs the running
+// document count after every 200 documents.
+func all() {
+	factory := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27082", "extract_kf")
+	session := factory.Get()
+	defer factory.Close(session)
+
+	iter := session.DB("extract_kf").C("bidding201901").Find(nil).Iter()
+
+	// slots acts as a counting semaphore: a token is pushed before a worker
+	// starts and popped when it finishes, so the send blocks while five
+	// workers are busy.
+	slots := make(chan bool, 5)
+	process := func(d map[string]interface{}) {
+		defer func() {
+			<-slots
+		}()
+		com(d)
+	}
+
+	processed := 0
+	for {
+		// Each document is decoded into its own map because the map is
+		// handed to a goroutine that outlives this iteration.
+		doc := make(map[string]interface{})
+		if !iter.Next(&doc) {
+			break
+		}
+		slots <- true
+		processed++
+		go process(doc)
+		if processed%200 == 0 {
+			log.Println(processed)
+		}
+	}
+	log.Println("over...")
+	// Workers still in flight are not awaited explicitly; the function just
+	// stays alive for an hour so they can finish.
+	time.Sleep(time.Hour)
+}
+// one loads the bidding document with id 5d423d13a5cb26b9b76e4479 from the
+// qfw database and runs com on it. If the lookup yields no document the
+// function logs that and returns instead of dereferencing a nil pointer.
+func one() {
+	db := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
+	d, _ := db.FindById("bidding", "5d423d13a5cb26b9b76e4479", nil)
+	// com stores into the map, so both a nil pointer and a nil map would panic.
+	if d == nil || *d == nil {
+		log.Println("bidding document 5d423d13a5cb26b9b76e4479 not found")
+		return
+	}
+	com(*d)
+}
+func com(doc map[string]interface{}) {
+	detail := GetDetail(doc)
+	doc["detail"] = detail
+	toptype := qu.ObjToString(doc["toptype"])
+	subtype := qu.ObjToString(doc["subtype"])
+	if qu.ObjToString(doc["type"]) == "bid" {
+		toptype = "结果"
+	}
+	if toptype == "" {
+		toptype = "*"
+	}
+	e := &extract.ExtractTask{
+		TaskInfo: &extract.TaskInfo{
+			Version:     "V3.1.2",
+			VersionId:   "5cdd1c70e138234848c1d703",
+			ProcessPool: make(chan bool, 1),
+		},
+	}
+
+	e.Id = qu.ObjToString(ju.Config["udptaskid"])
+	e.InitTaskInfo()
+	//d.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	//d.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
+	e.InitRulePres()
+	e.InitRuleBacks()
+	e.InitRuleCore()
+	e.InitBlockRule()
+	e.InitTag()
+	e.InitClearFn()
+	if e.IsExtractCity { //版本上控制是否开始城市抽取
+		//初始化城市DFA信息
+		e.InitCityDFA()
+		e.InitAreaCode()
+		e.InitPostCode()
+	}
+	//质量审核
+	e.InitAuditFields()
+	e.InitAuditRule()
+	e.InitAuditClass()
+	e.InitAuditRecogField()
+
+	//品牌抽取是否开启
+	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+
+	j := &ju.Job{
+		SourceMid:      qu.BsonIdToSId(doc["_id"]),
+		Category:       toptype,
+		CategorySecond: subtype,
+		Content:        qu.ObjToString(doc["detail"]),
+		SpiderCode:     qu.ObjToString(doc["spidercode"]),
+		//Domain:     qu.ObjToString(doc["domain"]),
+		//Href:       qu.ObjToString(doc["href"]),
+		Title:     qu.ObjToString(doc["title"]),
+		Data:      &doc,
+		City:      qu.ObjToString(doc["city"]),
+		Province:  qu.ObjToString(doc["area"]),
+		Result:    map[string][]*ju.ExtField{},
+		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
+		RuleBlock: e.RuleBlock,
+	}
+	e.TaskInfo.ProcessPool <- true
+	pretreated.AnalyStart(j)
+	e.ExtractProcess(j, nil)
+	log.Println("=============KvTags================")
+	for _, v := range j.Block {
+		if v.ColonKV != nil {
+			for kk, vv := range v.ColonKV.KvTags {
+				for _, vvv := range vv {
+					log.Println("ColonKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
+				}
+			}
+		}
+		if v.SpaceKV != nil {
+			for kk, vv := range v.SpaceKV.KvTags {
+				for _, vvv := range vv {
+					log.Println("SpaceKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
+				}
+			}
+		}
+		if v.TableKV != nil {
+			for kk, vv := range v.TableKV.KvTags {
+				for _, vvv := range vv {
+					log.Println("TableKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
+				}
+			}
+		}
+	}
+	log.Println("=============抽取结果================")
+	set := (e.ResultArr[0][1]["$set"]).(map[string]interface{})
+	for k, v := range set {
+		if k == "budget" || k == "amount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {
+			log.Println(k, "---", v)
+		}
+	}
+	log.Println("=============抽取结果 result================")
+	for k, v := range set["result"].(map[string][]*ju.ExtField) {
+		break
+		for _, vv := range v {
+			log.Println(k, fmt.Sprintf("%+v", vv))
+		}
+	}
+	log.Println("=============正文================")
+	//log.Println(j.Content)
+	return
+	for _, v := range j.Block {
+		if v.ColonKV != nil && v.ColonKV.KvTags != nil {
+			for kk, vv := range v.ColonKV.KvTags {
+				for _, vvv := range vv {
+					log.Println(kk, vvv.Weight, vvv.Value)
+				}
+			}
+		}
+		if v.TableKV != nil && v.TableKV.KvTags != nil {
+			for kk, vv := range v.TableKV.KvTags {
+				for _, vvv := range vv {
+					log.Println(kk, vvv.Weight, vvv.Value)
+				}
+			}
+		}
+		if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
+			for kk, vv := range v.SpaceKV.KvTags {
+				for _, vvv := range vv {
+					log.Println(kk, vvv.Weight, vvv.Value)
+				}
+			}
+		}
+	}
+	log.Println(len(j.Block))
+	return
+	for _, v := range j.Block {
+		if m[v.Title] || v.Title == "" {
+			continue
+		}
+		if !regexp.MustCompile("或|和|以?及|与|、|或").MatchString(v.Title) {
+			//continue
+		}
+		m[v.Title] = true
+		f.WriteString(j.SourceMid + "-----" + v.Title + "---" + fmt.Sprint(v.Titles) + "\n")
+		continue
+		for _, kv := range v.ColonKV.Kvs {
+			log.Println("\n")
+			log.Println(kv.Key, "---", kv.Value)
+			log.Println(kv.Line)
+			log.Println("=======================")
+		}
+	}
+}
+// GetDetail returns the cleaned body text of doc.
+//
+// The longer of the string values doc["detail"] and doc["contenthtml"] is
+// chosen (doc["detail"] wins ties; a missing or non-string value counts as
+// empty) and passed through ju.CutLableStr and then ClearHtml. If
+// pretreated.ComputeConRatio reports at least one table for that text and
+// pretreated.FindBigText returns a non-empty text with a ratio of 0, that
+// text is returned instead.
+func GetDetail(doc map[string]interface{}) (detail string) {
+	fromDetail, _ := doc["detail"].(string)
+	fromHTML, _ := doc["contenthtml"].(string)
+
+	detail = fromDetail
+	if len(fromHTML) > len(fromDetail) {
+		detail = fromHTML
+	}
+
+	detail = ju.CutLableStr(detail)
+	detail = ju.NewCut().ClearHtml(detail)
+
+	tables, ratio := pretreated.ComputeConRatio(detail, 1)
+	if len(tables) == 0 {
+		return detail
+	}
+	bigText, _, bigRatio := pretreated.FindBigText(detail, ratio, tables)
+	//log.Println(newcon, newtabs, newration)
+	if bigText != "" && bigRatio == 0 {
+		detail = bigText
+	}
+	return detail
+}