|
@@ -0,0 +1,224 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "jy/extract"
|
|
|
+ "jy/mongodbutil"
|
|
|
+ "jy/pretreated"
|
|
|
+ ju "jy/util"
|
|
|
+ "log"
|
|
|
+ "os"
|
|
|
+ qu "qfw/util"
|
|
|
+ "regexp"
|
|
|
+ "time"
|
|
|
+)
|
|
|
+
|
|
|
// Package-level state shared by the debugging helpers in this file.
var (
	// f is the output file opened by main12; it stays nil until then.
	f *os.File
	// m records the block titles that have already been seen.
	m = make(map[string]bool)
)
|
|
|
+
|
|
|
+func main12() {
|
|
|
+ //log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
|
|
|
+ //return
|
|
|
+ f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777)
|
|
|
+ //all()
|
|
|
+ one()
|
|
|
+}
|
|
|
+func all() {
|
|
|
+ m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27082", "extract_kf")
|
|
|
+ sess := m.Get()
|
|
|
+ defer m.Close(sess)
|
|
|
+ it := sess.DB("extract_kf").C("bidding201901").Find(nil).Iter()
|
|
|
+ pool := make(chan bool, 5)
|
|
|
+ count := 0
|
|
|
+ for temp := make(map[string]interface{}); it.Next(&temp); {
|
|
|
+ pool <- true
|
|
|
+ count++
|
|
|
+ go func(d map[string]interface{}) {
|
|
|
+ defer func() {
|
|
|
+ <-pool
|
|
|
+ }()
|
|
|
+ com(d)
|
|
|
+ }(temp)
|
|
|
+ temp = make(map[string]interface{})
|
|
|
+ if count%200 == 0 {
|
|
|
+ log.Println(count)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println("over...")
|
|
|
+ time.Sleep(time.Hour)
|
|
|
+}
|
|
|
+func one() {
|
|
|
+ m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
|
|
|
+ d, _ := m.FindById("bidding", "5d423d13a5cb26b9b76e4479", nil)
|
|
|
+ com(*d)
|
|
|
+}
|
|
|
+func com(doc map[string]interface{}) {
|
|
|
+ detail := GetDetail(doc)
|
|
|
+ doc["detail"] = detail
|
|
|
+ toptype := qu.ObjToString(doc["toptype"])
|
|
|
+ subtype := qu.ObjToString(doc["subtype"])
|
|
|
+ if qu.ObjToString(doc["type"]) == "bid" {
|
|
|
+ toptype = "结果"
|
|
|
+ }
|
|
|
+ if toptype == "" {
|
|
|
+ toptype = "*"
|
|
|
+ }
|
|
|
+ e := &extract.ExtractTask{
|
|
|
+ TaskInfo: &extract.TaskInfo{
|
|
|
+ Version: "V3.1.2",
|
|
|
+ VersionId: "5cdd1c70e138234848c1d703",
|
|
|
+ ProcessPool: make(chan bool, 1),
|
|
|
+ },
|
|
|
+ }
|
|
|
+
|
|
|
+ e.Id = qu.ObjToString(ju.Config["udptaskid"])
|
|
|
+ e.InitTaskInfo()
|
|
|
+ //d.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
+ //d.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
|
|
|
+ e.InitRulePres()
|
|
|
+ e.InitRuleBacks()
|
|
|
+ e.InitRuleCore()
|
|
|
+ e.InitBlockRule()
|
|
|
+ e.InitTag()
|
|
|
+ e.InitClearFn()
|
|
|
+ if e.IsExtractCity { //版本上控制是否开始城市抽取
|
|
|
+ //初始化城市DFA信息
|
|
|
+ e.InitCityDFA()
|
|
|
+ e.InitAreaCode()
|
|
|
+ e.InitPostCode()
|
|
|
+ }
|
|
|
+ //质量审核
|
|
|
+ e.InitAuditFields()
|
|
|
+ e.InitAuditRule()
|
|
|
+ e.InitAuditClass()
|
|
|
+ e.InitAuditRecogField()
|
|
|
+
|
|
|
+ //品牌抽取是否开启
|
|
|
+ ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
|
|
|
+
|
|
|
+ j := &ju.Job{
|
|
|
+ SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
+ Category: toptype,
|
|
|
+ CategorySecond: subtype,
|
|
|
+ Content: qu.ObjToString(doc["detail"]),
|
|
|
+ SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
+ //Domain: qu.ObjToString(doc["domain"]),
|
|
|
+ //Href: qu.ObjToString(doc["href"]),
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
+ Data: &doc,
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
+ }
|
|
|
+ e.TaskInfo.ProcessPool <- true
|
|
|
+ pretreated.AnalyStart(j)
|
|
|
+ e.ExtractProcess(j, nil)
|
|
|
+ log.Println("=============KvTags================")
|
|
|
+ for _, v := range j.Block {
|
|
|
+ if v.ColonKV != nil {
|
|
|
+ for kk, vv := range v.ColonKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println("ColonKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if v.SpaceKV != nil {
|
|
|
+ for kk, vv := range v.SpaceKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println("SpaceKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if v.TableKV != nil {
|
|
|
+ for kk, vv := range v.TableKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println("TableKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println("=============抽取结果================")
|
|
|
+ set := (e.ResultArr[0][1]["$set"]).(map[string]interface{})
|
|
|
+ for k, v := range set {
|
|
|
+ if k == "budget" || k == "amount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {
|
|
|
+ log.Println(k, "---", v)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println("=============抽取结果 result================")
|
|
|
+ for k, v := range set["result"].(map[string][]*ju.ExtField) {
|
|
|
+ break
|
|
|
+ for _, vv := range v {
|
|
|
+ log.Println(k, fmt.Sprintf("%+v", vv))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println("=============正文================")
|
|
|
+ //log.Println(j.Content)
|
|
|
+ return
|
|
|
+ for _, v := range j.Block {
|
|
|
+ if v.ColonKV != nil && v.ColonKV.KvTags != nil {
|
|
|
+ for kk, vv := range v.ColonKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println(kk, vvv.Weight, vvv.Value)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if v.TableKV != nil && v.TableKV.KvTags != nil {
|
|
|
+ for kk, vv := range v.TableKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println(kk, vvv.Weight, vvv.Value)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
|
|
|
+ for kk, vv := range v.SpaceKV.KvTags {
|
|
|
+ for _, vvv := range vv {
|
|
|
+ log.Println(kk, vvv.Weight, vvv.Value)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println(len(j.Block))
|
|
|
+ return
|
|
|
+ for _, v := range j.Block {
|
|
|
+ if m[v.Title] || v.Title == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if !regexp.MustCompile("或|和|以?及|与|、|或").MatchString(v.Title) {
|
|
|
+ //continue
|
|
|
+ }
|
|
|
+ m[v.Title] = true
|
|
|
+ f.WriteString(j.SourceMid + "-----" + v.Title + "---" + fmt.Sprint(v.Titles) + "\n")
|
|
|
+ continue
|
|
|
+ for _, kv := range v.ColonKV.Kvs {
|
|
|
+ log.Println("\n")
|
|
|
+ log.Println(kv.Key, "---", kv.Value)
|
|
|
+ log.Println(kv.Line)
|
|
|
+ log.Println("=======================")
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+func GetDetail(doc map[string]interface{}) (detail string) {
|
|
|
+ detail = ""
|
|
|
+ d1, _ := doc["detail"].(string)
|
|
|
+ d2, _ := doc["contenthtml"].(string)
|
|
|
+ if len(d1) >= len(d2) || d2 == "" {
|
|
|
+ detail = d1
|
|
|
+ } else {
|
|
|
+ detail = d2
|
|
|
+ }
|
|
|
+ detail = ju.CutLableStr(detail)
|
|
|
+ detail = ju.NewCut().ClearHtml(detail)
|
|
|
+ tabs, ration := pretreated.ComputeConRatio(detail, 1)
|
|
|
+ if len(tabs) > 0 {
|
|
|
+ newcon, newtabs, newration := pretreated.FindBigText(detail, ration, tabs)
|
|
|
+ //log.Println(newcon, newtabs, newration)
|
|
|
+ if newcon != "" && newration == 0 {
|
|
|
+ detail = newcon
|
|
|
+ tabs = newtabs
|
|
|
+ ration = newration
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return detail
|
|
|
+}
|