package main import ( "fmt" "jy/extract" "jy/mongodbutil" "jy/pretreated" ju "jy/util" "log" "os" qu "qfw/util" "regexp" "time" ) var f *os.File var m = map[string]bool{} func main12() { //winnerorder() //return //log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动")) //return //f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777) //all() one() } func all() { m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27082", "extract_kf") sess := m.Get() defer m.Close(sess) it := sess.DB("extract_kf").C("bidding201901").Find(nil).Iter() pool := make(chan bool, 5) count := 0 for temp := make(map[string]interface{}); it.Next(&temp); { pool <- true count++ go func(d map[string]interface{}) { defer func() { <-pool }() com(d) }(temp) temp = make(map[string]interface{}) if count%200 == 0 { log.Println(count) } } log.Println("over...") time.Sleep(time.Hour) } func one() { m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw") d, _ := m.FindById("bidding", "5d424df7a5cb26b9b7b61fde", extract.Fields) com(*d) } func com(doc map[string]interface{}) { detail := GetDetail(doc) doc["detail"] = detail toptype := qu.ObjToString(doc["toptype"]) subtype := qu.ObjToString(doc["subtype"]) if qu.ObjToString(doc["type"]) == "bid" { toptype = "结果" } if toptype == "" { toptype = "*" } e := &extract.ExtractTask{ TaskInfo: &extract.TaskInfo{ Version: "V3.1.2", VersionId: "5cdd1c70e138234848c1d703", ProcessPool: make(chan bool, 1), }, } e.Id = qu.ObjToString(ju.Config["udptaskid"]) e.InitTaskInfo() //d.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB) //d.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB) e.InitSite() e.InitRulePres() e.InitRuleBacks(false) e.InitRuleBacks(true) e.InitRuleCore(false) e.InitRuleCore(true) e.InitBlockRule() e.InitPkgCore() e.InitTag(false) e.InitTag(true) e.InitClearFn(false) e.InitClearFn(true) if e.IsExtractCity { //版本上控制是否开始城市抽取 //初始化城市DFA信息 e.InitCityDFA() e.InitAreaCode() e.InitPostCode() } //质量审核 e.InitAuditFields() e.InitAuditRule() e.InitAuditClass() e.InitAuditRecogField() //品牌抽取是否开启 ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool) j := &ju.Job{ SourceMid: qu.BsonIdToSId(doc["_id"]), Category: toptype, CategorySecond: subtype, Content: qu.ObjToString(doc["detail"]), SpiderCode: qu.ObjToString(doc["spidercode"]), //Domain: qu.ObjToString(doc["domain"]), //Href: qu.ObjToString(doc["href"]), Title: qu.ObjToString(doc["title"]), Data: &doc, City: qu.ObjToString(doc["city"]), Province: qu.ObjToString(doc["area"]), Result: map[string][]*ju.ExtField{}, BuyerAddr: qu.ObjToString(doc["buyeraddr"]), RuleBlock: e.RuleBlock, } e.TaskInfo.ProcessPool <- true pretreated.AnalyStart(j,false,"") e.ExtractProcess(j, nil,false) log.Println("=============块信息================") for _, v := range j.Block { log.Println("----", v.Title, v.Titles, "----") if v.ColonKV != nil { for kk, vv := range v.ColonKV.KvTags { for _, vvv := range vv { log.Println("ColonKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight) } } } if v.SpaceKV != nil { for kk, vv := range v.SpaceKV.KvTags { for _, vvv := range vv { log.Println("SpaceKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight) } } } if v.TableKV != nil { for kk, vv := range v.TableKV.KvTags { for _, vvv := range vv { log.Println("TableKV", kk, "---", vvv.Key, "---", vvv.Value, "---", vvv.Weight) } } } //log.Println("Classify", v.Classify) //log.Println("Tag", v.Tag) } log.Println("=============抽取结果================") set := (e.ResultArr[0][1]["$set"]).(map[string]interface{}) for k, v := range set { if k == "budget" || k == "bidamount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" { log.Println(k, "---", v) } } log.Println("=============抽取结果 result================") for k, v := range set["result"].(map[string][]*ju.ExtField) { if k != "winner" { continue } for _, vv := range v { log.Println(k, fmt.Sprintf("%+v", vv)) for kkk, vvv := range vv.ScoreItem { log.Println("--", kkk, k, fmt.Sprintf("%+v", vvv)) } log.Println("\n") } } log.Println("=============中标候选人================") for _, v := range j.Winnerorder { log.Println(v) } log.Println("=============分包================") for k, v := range j.BlockPackage { log.Println(k, v) } log.Println("=============正文================") //log.Println(j.Content) return for _, v := range j.Block { if v.ColonKV != nil && v.ColonKV.KvTags != nil { for kk, vv := range v.ColonKV.KvTags { for _, vvv := range vv { log.Println(kk, vvv.Weight, vvv.Value) } } } if v.TableKV != nil && v.TableKV.KvTags != nil { for kk, vv := range v.TableKV.KvTags { for _, vvv := range vv { log.Println(kk, vvv.Weight, vvv.Value) } } } if v.SpaceKV != nil && v.SpaceKV.KvTags != nil { for kk, vv := range v.SpaceKV.KvTags { for _, vvv := range vv { log.Println(kk, vvv.Weight, vvv.Value) } } } } log.Println(len(j.Block)) return for _, v := range j.Block { if m[v.Title] || v.Title == "" { continue } if !regexp.MustCompile("或|和|以?及|与|、|或").MatchString(v.Title) { //continue } m[v.Title] = true f.WriteString(j.SourceMid + "-----" + v.Title + "---" + fmt.Sprint(v.Titles) + "\n") continue for _, kv := range v.ColonKV.Kvs { log.Println("\n") log.Println(kv.Key, "---", kv.Value) log.Println(kv.Line) log.Println("=======================") } } } func GetDetail(doc map[string]interface{}) (detail string) { detail = "" d1, _ := doc["detail"].(string) d2, _ := doc["contenthtml"].(string) if len(d1) >= len(d2) || d2 == "" { detail = d1 } else { detail = d2 } detail = ju.CutLableStr(detail) detail = ju.NewCut().ClearHtml(detail) tabs, ration := pretreated.ComputeConRatio(detail, 1) if len(tabs) > 0 { newcon, newtabs, newration := pretreated.FindBigText(detail, ration, tabs) //log.Println(newcon, newtabs, newration) if newcon != "" && newration == 0 { detail = newcon tabs = newtabs ration = newration } } return detail } func winnerorder() { text := `评审专家名单: 吴殿波、韩屹、孙胜进、郑丹、李海波 中标标的名称、规格型号、数量、单价、服务要求: 2019年沈阳惠涌供热有限责任公司、沈阳圣达热力供暖有限责任公司、沈阳惠盛供热有限责任公司PE管保温 第一入围供货商:沈阳曲暖鼎盛保温安装有限公司 、总单价:11.833300 第二入围供货商:沈阳国盛防腐保温有限公司、总单价:11.102100 第三入围供货商:沈阳泰豪管材有限公司、总单价:13.258100` log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1,false,"")) }