浏览代码

锐捷-标的物-导出

zhengkun 3 年之前
父节点
当前提交
a2d9e40425

+ 124 - 0
data_export_yx/src/cgyx.go

@@ -0,0 +1,124 @@
+package main
+
+import (
+	log "github.com/donnie4w/go-logger/logger"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+	qu "qfw/util"
+	"regexp"
+	"sync"
+)
+
+// P_NameReg matches the literal text "项目"; the match is also captured as group 1.
+var P_NameReg *regexp.Regexp = regexp.MustCompile(`(项目)`)
+
+
+func dealWithCgyxData()  {
+
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+	q := map[string]interface{}{}
+	log.Debug("查询条件~",q)
+	total,isok:= 0,0
+	repair_pool := make(chan bool, 5)
+	repair_wg := &sync.WaitGroup{}
+	it := sess.DB(save_mgo.DbName).C(save_coll).Find(&q).Sort("_id").Iter()
+	for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
+		if total%1000==0 {
+			log.Debug("curent index ",total,tmp["_id"],isok)
+		}
+		subtype := qu.ObjToString(tmp["subtype"])
+		if subtype!="采购意向" {
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		repair_pool <- true
+		repair_wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-repair_pool
+				repair_wg.Done()
+			}()
+			p_list := []map[string]interface{}{}
+			if purchasinglist, ok := tmp["purchasinglist"].(primitive.A); ok {
+				p_list = qu.ObjArrToMapArr(purchasinglist)
+			}else {
+				if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok {
+					p_list = qu.ObjArrToMapArr(purchasinglist)
+				}
+			}
+			//清理正文-标签
+			//detail := cleandetail(qu.ObjToString(tmp["detail"]))
+			//tmp["detail"] = detail
+
+			if len(p_list)>0 {
+				new_list := comparisonList(p_list,tmp)
+				if len(new_list)>0 {
+					isok+=len(new_list)
+				}
+
+
+
+
+
+			}
+		}(tmp)
+		tmp = make(map[string]interface{})
+	}
+	repair_wg.Wait()
+	log.Debug("is export over ",total,isok)
+
+}
+
+
+// comparisonList returns the entries of p_list that isValidList accepts for
+// the document tmp, preserving their original order. The result is never nil.
+func comparisonList(p_list []map[string]interface{}, tmp map[string]interface{}) []map[string]interface{} {
+	kept := []map[string]interface{}{}
+	for i := range p_list {
+		if !isValidList(p_list[i], tmp) {
+			continue
+		}
+		kept = append(kept, p_list[i])
+	}
+	return kept
+}
+
+
+func isValidList(list map[string]interface{},tmp map[string]interface{}) bool {
+	publishtime := qu.Int64All(tmp["publishtime"])
+
+	field_dataArr :=[]string{}
+	field_dataArr = append(field_dataArr,qu.ObjToString(tmp["title"]))
+	field_dataArr = append(field_dataArr,qu.ObjToString(tmp["projectname"]))
+	field_dataArr = append(field_dataArr,qu.ObjToString(list["itemname"]))
+	field_dataArr = append(field_dataArr,qu.ObjToString(list["projectname"]))
+
+	expurasingtime := qu.ObjToString(list["expurasingtime"])
+	totalprice := qu.Float64All(list["totalprice"])
+	if expurasingtime=="" {
+		return false
+	}
+	//时间格式转换
+	exp_time := cleanStrToTimestamp(expurasingtime,publishtime)
+	if exp_time>0 && exp_time<end_time && exp_time>=start_time{
+	}else {
+		return false
+	}
+	//时间格式转换
+	if totalprice<300000.0 {
+		return false
+	}
+
+	//项目名称-过滤词相关  tmp 任意可选
+	for _,v :=range words_arr {
+		projectname:=""
+		keywords := qu.ObjToString(v["keywords"])
+		outwords := qu.ObjToString(v["outwords"])
+		k_reg := regexp.MustCompile(keywords)
+		o_reg := regexp.MustCompile(outwords)
+		if k_reg.MatchString(projectname) && !o_reg.MatchString(projectname) {
+			return true
+		}
+	}
+
+	return false
+}

+ 319 - 0
data_export_yx/src/clean_detail.go

@@ -0,0 +1,319 @@
+package main
+
+import (
+	"github.com/PuerkitoBio/goquery"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+	"unicode/utf8"
+)
+const (
+	// conStr is a regexp fragment matching either one run of whitespace
+	// (\s, U+3000, U+2003, U+00A0) or a literal backslash followed by "t".
+	conStr = "([\\s\u3000\u2003\u00a0]+|\\\\t)" // all kinds of whitespace
+)
+// clearpkg matches the words "标示" and "标识"; repairCon deletes them.
+var clearpkg = regexp.MustCompile("(标示|标识)")
+// saveThead captures the inner content of a <thead>…</thead> element
+// (case-insensitive, "." also matches newlines).
+var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
+var (
+	// LReg matches conStr anchored at the start of the string.
+	LReg  = regexp.MustCompile("^" + conStr)
+	// cut is the shared Cut instance used by cleandetail.
+	cut   = newCut()
+)
+// thbf matches opening and closing <thead>, <tbody> and <tfoot> tags.
+var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
+
+// at and ed delimit an HTML entity ("&" … ";") for cutLableStr.
+var at = rune('&')
+var ed = rune(';')
+// lableMap maps the HTML entities that cutLableStr decodes to their rune.
+var lableMap = map[string]rune{
+	"&amp;":  rune('&'),
+	"&nbsp;": rune(' '),
+	"&gt;":   rune('>'),
+	"&lt;":   rune('<'),
+}
+
+// Cut bundles the compiled patterns and tag lists that ClearHtml uses to
+// reduce an HTML fragment to text while keeping table markup. Instances are
+// built by newCut.
+type Cut struct {
+	tag           *regexp.Regexp // any HTML tag
+	scripttag     *regexp.Regexp // <script>/<style> elements including their content
+	inputag       *regexp.Regexp // <input> elements carrying a value attribute
+	hiddentag     *regexp.Regexp // <input type="hidden"> elements
+	styletag      *regexp.Regexp // not set by newCut
+	colstag       *regexp.Regexp // colspan="N" attribute
+	rowstag       *regexp.Regexp // rowspan="N" attribute
+	display       *regexp.Regexp // the text "display:none"
+	multiCR       *regexp.Regexp // runs of line breaks (optionally followed by whitespace)
+	replBlankLine *regexp.Regexp // whitespace followed by a line break
+	replStartWrap *regexp.Regexp // leading or trailing whitespace of the whole text
+	replTags2CR   []string       // tag names replaced by a newline
+	retainTags2CR []string       // table-related tag names that are kept
+}
+
+
+
+func cleandetail(detail string) string {
+	detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
+	//全文的需要修复表格
+	detail = repairCon(detail)
+	detail = cutLableStr(detail)
+	detail = cut.ClearHtml(detail)
+
+	return detail
+}
+
+
+// newCut builds the Cut used by ClearHtml. All patterns are constant, so they
+// are compiled with MustCompile. The styletag field is left unset.
+func newCut() *Cut {
+	c := &Cut{}
+	// Generic tag and line-break patterns.
+	c.tag = regexp.MustCompile("<[^>]+>")
+	c.multiCR = regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
+	// A single pattern covers both <script> and <style> elements.
+	c.scripttag = regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
+	// Input elements: hidden ones, and ones whose value is captured in group 2.
+	c.hiddentag = regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
+	c.inputag = regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
+	// Table-cell attributes that ClearHtml copies onto rewritten tags.
+	c.colstag = regexp.MustCompile(`colspan="\d+"`)
+	c.rowstag = regexp.MustCompile(`rowspan="\d+"`)
+	c.display = regexp.MustCompile(`display:none`)
+	// Whitespace clean-up applied after the tags are gone.
+	c.replBlankLine = regexp.MustCompile("\\s+[\r\n]")
+	c.replStartWrap = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
+	c.replTags2CR = []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"}
+	c.retainTags2CR = []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"}
+	return c
+}
+// ClearHtml reduces the HTML fragment src to text. Script/style and input
+// elements are removed, table-related tags (retainTags2CR) are kept with only
+// their colspan/rowspan/display:none attributes, block-level tags
+// (replTags2CR, plus br/ul/center) become newlines, every other tag is
+// dropped, and finally redundant whitespace and line breaks are collapsed.
+func (c *Cut) ClearHtml(src string) string {
+	// Join a tag with the line that follows it.
+	src = strings.Replace(src, ">\n", ">", -1)
+	// Delete every occurrence of the character given in the second argument.
+	src = strings.Replace(src, " ", "", -1)
+	// Lower-case every tag.
+	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
+	// Remove script and style elements.
+	src = c.scripttag.ReplaceAllString(src, "")
+	// Remove hidden inputs; replace other inputs by their value (group 2).
+	src = c.hiddentag.ReplaceAllString(src, "")
+	src = c.inputag.ReplaceAllString(src, "$2")
+	document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
+	if err == nil {
+		// For each <td> with a non-blank title attribute, use the title as the
+		// cell text when it is longer (in runes) than the current text.
+		if tmpstr,err := document.Each(func(i int, sel *goquery.Selection) {
+			sel.Find("td").Each(func(i int, selection *goquery.Selection) {
+				val, b := selection.Attr("title")
+				if b && strings.Trim(val, " ") != "" {
+					// Trim tab (9) and space (32) from both ends of the cell text.
+					tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
+						return r == 9|| r == 32
+					})
+					if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
+						selection.SetText(strings.Trim(val, " "))
+					}
+				}
+			})
+		}).Html();err == nil{
+			// Continue with the re-serialised document.
+			src = tmpstr
+		}
+	}
+	// Rewrite or drop every remaining tag.
+	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
+		tmp = strings.Replace(tmp, " ", "", -1)
+		// Keep the table-related tags.
+		for _, v := range c.retainTags2CR {
+			if "<"+v+">" == tmp || "</"+v+">" == tmp {
+				if tmp == "</table>" {
+					return tmp + "\n"
+				}
+				return tmp
+			}
+			if strings.HasPrefix(tmp, "<"+v) {
+				// Opening tag with attributes: rebuild it with only the
+				// first colspan, rowspan and display:none found.
+				dispstrs := c.display.FindAllString(tmp, -1)
+				rowstrs := c.rowstag.FindAllString(tmp, -1)
+				colstrs := c.colstag.FindAllString(tmp, -1)
+				// NOTE: this string shadows the receiver c until the end of this if block.
+				c := "<" + v
+				if len(colstrs) > 0 { // merged columns
+					c += " " + colstrs[0]
+				}
+				if len(rowstrs) > 0 { // merged rows
+					c += " " + rowstrs[0]
+				}
+				if len(dispstrs) > 0 {
+					c += " style=\"" + dispstrs[0] + "\""
+				}
+				return c + ">"
+
+			}
+		}
+		if tmp == "<br>" || tmp == "</ul>" ||tmp == "<ul>"  || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>"{
+			return "\n"
+		}
+		if tmp[1] != 47 { // opening tag (47 is '/')
+			for _, v := range c.replTags2CR {
+				if v == tmp[1:len(tmp)-1] {
+					return "\n"
+				}
+			}
+			return ""
+		}
+		// Closing tag: compare the name after "</".
+		for _, v := range c.replTags2CR {
+			if v == tmp[2:len(tmp)-1] {
+				return "\n"
+			}
+		}
+		return ""
+	})
+	src = c.replStartWrap.ReplaceAllString(src, "")
+	src = c.replBlankLine.ReplaceAllString(src, "\n")
+	// Collapse redundant line breaks.
+	return c.multiCR.ReplaceAllString(src, "\n")
+	//return strings.Replace(src, "\n", "<br/>", -1)
+}
+
+// cutLableStr decodes the HTML entities listed in lableMap (&amp; &nbsp; &gt;
+// &lt;) inside con. Decoding is repeated up to three times so that doubly
+// escaped text such as "&amp;lt;" is fully resolved. Sequences that start
+// with '&' but are not a known entity are copied through unchanged, including
+// an unterminated one at the very end of the input.
+func cutLableStr(con string) string {
+	for i := 0; i < 3; i++ {
+		runes := make([]rune, 0, len(con))
+		pools := []rune{} // runes of the entity candidate currently being collected
+		bpool := false    // true while inside an entity candidate
+		for _, s := range con {
+			if !bpool && s == at {
+				bpool = true
+				pools = pools[:0]
+			}
+			if !bpool {
+				runes = append(runes, s)
+				continue
+			}
+			pools = append(pools, s)
+			switch {
+			case s == ed: // terminator reached: decode if known
+				if lb := lableMap[string(pools)]; lb != 0 {
+					runes = append(runes, lb)
+				} else {
+					runes = append(runes, pools...)
+				}
+				bpool = false
+			case len(pools) > 6:
+				// Longer than the longest known entity: not an entity.
+				runes = append(runes, pools...)
+				bpool = false
+			}
+		}
+		if bpool {
+			// Input ended inside a candidate (e.g. "AT&T"); keep its text
+			// instead of dropping it.
+			runes = append(runes, pools...)
+		}
+		str1 := string(runes)
+		if con == str1 {
+			// Nothing was decoded in this pass, so further passes are no-ops.
+			break
+		}
+		con = str1
+	}
+	return con
+}
+
+
+// repairCon repairs table markup in con before tags are stripped:
+// it deletes the words matched by clearpkg, removes all thead/tbody/tfoot
+// tags, wraps table rows that are not preceded by a <table> tag (see
+// findpos), and finally re-wraps the content of the first original <thead>.
+func repairCon(con string) string {
+	con = clearpkg.ReplaceAllString(con, "")
+	// Remember the inner content of the first <thead> so it can be restored.
+	res := saveThead.FindAllStringSubmatch(con, 1)
+	th := ""
+	if len(res) == 1 && len(res[0]) == 2 {
+		th = trimLeftSpace(res[0][1], "")
+	}
+	con = thbf.ReplaceAllString(con, "")
+	con = trimLeftSpace(con, "")
+	itbody := strings.Index(con, "<tr")
+	iLen := 3
+	if itbody == 0 {
+		// The text starts directly with a row.
+		con = findpos(con, iLen, itbody)
+	} else {
+		// Wrap only when no <table> tag precedes the first row.
+		itable := strings.Index(con, "<table")
+		if itable == -1 || itable > itbody {
+			con = findpos(con, iLen, itbody)
+		}
+	}
+	// Keep the first thead.
+	if th != "" {
+		con = strings.Replace(con, th, "<thead>"+th+"</thead>", 1)
+	}
+	//u.Debug(con)
+	return con
+}
+// findpos inserts missing table tags into con and returns the result; con is
+// returned unchanged when start is negative or no suitable end position is
+// found.
+//
+// start is the byte index of the element to repair and iLen the length of its
+// opening marker:
+//   - iLen == 6: the element at start is treated as "<tbody"; it and its
+//     matching "</tbody" are rewritten to "<table" / "</table".
+//   - otherwise: start points at a "<tr"; the run of rows beginning there is
+//     wrapped in "<table>" … "</table>".
+//
+// qu.Catch is deferred; presumably it recovers from panics — confirm against
+// the qfw/util package.
+func findpos(con string, iLen, start int) (newcon string) {
+	defer qu.Catch()
+	n := len(con)
+	layer := 0 // nesting depth of the element being tracked
+	pos := 0   // byte index where the closing tag belongs
+	if start >= 0 {
+		if iLen == 6 {
+			for i := iLen + start; i < len(con); i++ {
+				if con[i] == '<' && i+6 < n {
+					str := con[i : i+6]
+					if str == "</tbod" {
+						if layer == 0 {
+							pos = i
+							break
+						} else {
+							layer--
+						}
+						i += 6
+					} else if str == "<tbody" {
+						layer++
+						i += 6
+					}
+				}
+			}
+			if pos+7 <= n && start+6 < pos {
+				newcon = con[:start] + "<table" + con[start+6:pos] + "</table" + con[pos+7:]
+			}
+		} else {
+			layer++
+			nq := 0     // depth of nested tables currently being skipped
+			lasttr := 0 // index of the most recent "</tr" seen outside nested tables
+			for i := iLen + start; i < len(con); i++ {
+				if con[i] == '<' && i+4 < n {
+					if nq == 0 {
+						str := con[i : i+4]
+						if str == "</tr" {
+							if layer <= 0 {
+								pos = i // should not occur for well-formed input
+								break
+							} else {
+								layer--
+								lasttr = i
+							}
+							i += 4
+						} else if str[:3] == "<tr" {
+							layer++
+							i += 4
+						} else if str == "<tab" && i+6 < n && con[i+4:i+6] == "le" {
+							if layer == 0 {
+								break
+							} else {
+								// nested table
+								nq++
+							}
+						}
+					} else {
+						if i+6 < n {
+							str := con[i : i+6]
+							if str == "</tabl" {
+								nq--
+							} else if str == "<table" {
+								nq++
+							}
+						} else {
+							break
+						}
+					}
+				}
+			}
+			// Move pos past the 5 bytes of "</tr>".
+			if pos == 0 && lasttr > 3 {
+				pos = lasttr + 5
+			} else if pos > 0 {
+				pos += 5
+			}
+			if pos <= n && pos < len(con) && start < pos {
+				newcon = con[:start] + "<table>" + con[start:pos] + "</table>" + con[pos:]
+			}
+		}
+	}
+	if newcon == "" {
+		newcon = con
+	}
+	return
+}
+// trimLeftSpace replaces the leading whitespace run of con (as matched by
+// LReg) with repl and returns the result.
+func trimLeftSpace(con, repl string) string {
+	trimmed := LReg.ReplaceAllString(con, repl)
+	return trimmed
+}

+ 197 - 0
data_export_yx/src/clean_time.go

@@ -0,0 +1,197 @@
+package main
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+// spaces lists the whitespace-like and control characters that
+// cleanStrToTimestamp iterates over when normalising its input.
+var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n", "\u0001"}
+
+// Patterns used by cleanStrToTimestamp and chineseToNumber; compiled in init.
+var reg, regA, regB, regC, regD,regE,regF, regAfter ,regAfterBool*regexp.Regexp
+
+const (
+	// T is the number of seconds in 365 days.
+	T = 365 * 86400
+)
+
+// item maps Chinese numerals and digit characters to their ASCII digit string.
+var item = map[string]string{
+	"一": "1", "二": "2", "三": "3", "四": "4", "五": "5",
+	"六": "6", "七": "7", "八": "8","九": "9", "十": "10", "零": "0", "〇": "0",
+	"1": "1", "2": "2", "3": "3", "4": "4", "5": "5",
+	"6": "6", "7": "7", "8": "8", "9": "9", "0": "0",
+}
+
+// init compiles the package-level patterns used for time parsing. Compile
+// errors are ignored, so a malformed pattern would leave its variable nil.
+func init() {
+	// Example of a fully Chinese time: 二〇一五年十一月四日十五时
+	// reg: one run of ASCII digits.
+	reg, _ = regexp.Compile(`\d+`)
+	// regA: a single Chinese numeral or digit (the class also contains "|").
+	regA, _ = regexp.Compile(`[一|二|三|四|五|六|七|八|九|十|零|〇|1|2|3|4|5|6|7|8|9|0]`)
+	// regB: a date followed by a time range, e.g. "…日上午9:00-11:30时".
+	regB, _ = regexp.Compile(`\d+年\d+月\d+日((上|下)午)?\s*\d+[::时]\d+分?[-—]\d+[::时]\d+时?分?`)
+	// regC: the start of such a time range, up to and including the dash.
+	regC, _ = regexp.Compile(`\s*\d+[::时]\d+分?[-—]`)
+	// regD: year/month/day/hour written entirely in Chinese numerals.
+	regD, _ = regexp.Compile(`([一|二|三|四|五|六|七|八|九|十|零|〇]{4})年([一|二|三|四|五|六|七|八|九|十]{1,2})月([一|二|三|四|五|六|七|八|九|十]{1,3})日([一|二|三|四|五|六|七|八|九|十]{1,3})时`)
+	// regE: a bare month such as "3月" or "十一月份".
+	regE, _ = regexp.Compile(`^([0-9一二三四五六七八九十]+)月(份)?$`)
+	// regF: a six-digit year+month such as "202203".
+	regF, _ = regexp.Compile(`^(\d{4})(\d{2})$`)
+
+	// regAfter: an afternoon ("下午") hour with one or two digits.
+	// NOTE(review): "D?" matches an optional literal "D"; possibly `\D?` was intended — confirm.
+	regAfter, _ = regexp.Compile(`(下午D?\d{1,2}[时|:|:|h|H])`)
+	// regAfterBool: an afternoon hour already written as a two-digit value 10-29.
+	regAfterBool, _ = regexp.Compile(`(下午D?[1-2][0-9][时|:|:|h|H])`)
+}
+
+/*字符时间转时间戳
+支持全角
+20060102->时间戳
+20060102150405->时间戳
+01%02->时间戳
+2006%01%02->时间戳
+2006%01%02%15->时间戳
+2006%01%02%15%04->时间戳
+2006%01%02%15%04%05->时间戳
+*/
+func cleanStrToTimestamp(time_str string,publishtime int64) int64 {
+	tmp := time_str
+	//处理类似:二〇一五年十一月四日十五时
+	cht := regD.FindStringSubmatch(tmp)
+	if len(cht) == 5 {
+		y := chineseToNumber(cht[1])
+		m := 0
+		for _, v := range []rune(cht[2]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			m += it
+		}
+		d := 0
+		for _, v := range []rune(cht[3]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			d += it
+		}
+		M := 0
+		for _, v := range []rune(cht[4]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			M += it
+		}
+		tmp = fmt.Sprintf("%s年%d月%d日%d时", y, m, d, M)
+	}
+	//2016年12月7日上午9:00-11:30时 时间范围处理 取后面的时间
+	if regB.MatchString(tmp) {
+		tmp = regC.ReplaceAllString(tmp, "")
+	}
+	//2017年11月13日下午3时30分
+	addreptime := int64(0)
+	//2021年09月10日下午15时30分
+	if regAfter.MatchString(tmp) && !regAfterBool.MatchString(tmp) {
+		addreptime = 12 * 60 * 60
+	}
+	regRepl, _ := regexp.Compile(`[,,]`)
+	tmp = regRepl.ReplaceAllString(tmp, "")
+	for _, v := range spaces {
+		strings.Replace(tmp, v, " ", -1)
+	}
+	tmps := reg.FindAllString(chineseToNumber(tmp), -1)
+	//处理类似2016-12-0909:30:00时间
+	if len(tmps) > 2 && len(tmps[2]) > 2 {
+		newtmp := []string{}
+		for k, v := range tmps {
+			if k == 2 {
+				newtmp = append(newtmp, v[0:2], v[2:])
+			} else {
+				newtmp = append(newtmp, v)
+			}
+		}
+		tmps = newtmp
+	}
+	timestr := "" //2006-01-02 15:04:05
+	timestamp := int64(0)
+	if len(tmps) == 1 {
+		if len(tmps[0]) == 8 {
+			timestr = tmps[0][0:4] + "-" + tmps[0][4:6] + "-" + tmps[0][6:8]
+			t, _ := time.ParseInLocation("2006-01-02-15-04", timestr+"-09-00", time.Local)
+			timestamp = t.Unix()
+		} else if len(tmps[0]) == 14 {
+			timestr = tmps[0][0:4] + "-" + tmps[0][4:6] + "-" + tmps[0][6:8] + " " + tmps[0][8:10] + ":" + tmps[0][10:12] + ":" + tmps[0][12:14]
+			t, _ := time.ParseInLocation("2006-01-02 15:04:00", timestr, time.Local)
+			timestamp = t.Unix()
+		}
+	} else if len(tmps) == 2 {
+		timestr = fmt.Sprint(time.Now().Year()) + "-" + MDhmsRepair(tmps[0]) + "-" + MDhmsRepair(tmps[1])
+		t, _ := time.ParseInLocation("2006-01-02", timestr, time.Local)
+		timestamp = t.Unix()
+
+		if timestamp<=0 {
+			timestr = fmt.Sprint(MDhmsRepair(tmps[0]) + "-" + MDhmsRepair(tmps[1])+"-01")
+			t, _ := time.ParseInLocation("2006-01-02", timestr, time.Local)
+			timestamp = t.Unix()
+		}
+
+	} else if len(tmps) == 3 {
+		timestr = tmps[0] + "-" + MDhmsRepair(tmps[1]) + "-" + MDhmsRepair(tmps[2])
+		t, _ := time.ParseInLocation("2006-01-02", timestr, time.Local)
+		timestamp = t.Unix()
+	} else if len(tmps) == 4 {
+		timestr = tmps[0] + "-" + MDhmsRepair(tmps[1]) + "-" + MDhmsRepair(tmps[2]) + " " + MDhmsRepair(tmps[3])
+		t, _ := time.ParseInLocation("2006-01-02 15", timestr, time.Local)
+		timestamp = t.Unix()
+	} else if len(tmps) >= 5 {
+		timestr = tmps[0] + "-" + MDhmsRepair(tmps[1]) + "-" + MDhmsRepair(tmps[2]) + " " + MDhmsRepair(tmps[3]) + ":" + MDhmsRepair(tmps[4])
+		t, _ := time.ParseInLocation("2006-01-02 15:04", timestr, time.Local)
+		timestamp = t.Unix()
+	}
+
+	if regE.MatchString(tmp) && timestamp<=0 {
+		m := 0
+		new_str := regE.ReplaceAllString(tmp,"$1")
+		str := chineseToNumber(new_str)
+		it, _ := strconv.Atoi(str)
+		if it >100 {
+			m = 10+it%100
+		}else {
+			m = it
+		}
+		if m>0&&m<13 {
+			m_s := fmt.Sprintf("%d",m)
+			y_s := fmt.Sprintf("%d",time.Now().Year())
+			timestr = y_s + "-" + MDhmsRepair(m_s) + "-01"
+			t, _ := time.ParseInLocation("2006-01-02", timestr, time.Local)
+			if t.Unix()>publishtime {
+				timestamp = t.Unix()
+			}else {
+				timestamp = t.Unix()+int64(365*86400)
+			}
+		}
+	}
+	if regF.MatchString(tmp) && timestamp<=0 {
+		new_str := regF.ReplaceAllString(tmp,"$1-$2-01")
+		t, _ := time.ParseInLocation("2006-01-02", new_str, time.Local)
+		timestamp = t.Unix()
+	}
+
+
+	if timestamp <= 0 || timestamp > (time.Now().Unix()+T) {
+		timestamp = 0
+	} else {
+		if addreptime > 0 {
+			timestamp += addreptime
+		}
+	}
+	return timestamp
+}
+
+
+// chineseToNumber replaces every character of con that regA matches with its
+// digit string from item (e.g. "二〇一五" becomes "2015"). Matched characters
+// without an item entry are left unchanged. Note that "十" maps to "10", so
+// the conversion is character by character, not positional.
+func chineseToNumber(con string) string {
+	return regA.ReplaceAllStringFunc(con, func(key string) string {
+		if digit := item[key]; digit != "" {
+			return digit
+		}
+		return key
+	})
+}
+
+// MDhmsRepair left-pads a one-character date/time component with "0" so that
+// it has two characters; any other input is returned unchanged.
+func MDhmsRepair(t string) string {
+	if len(t) != 1 {
+		return t
+	}
+	return "0" + t
+}

+ 19 - 0
data_export_yx/src/config.json

@@ -0,0 +1,19 @@
+{
+  "save_mgodb": {
+    "addr": "127.0.0.1:27017",
+    "db": "zhengkun",
+    "coll": "zktest_cgyx_data",
+    "pool": 5
+  },
+  "export_coll": "1111122222",
+  "words_arr": [
+    {
+      "keywords": "(交换机|由器|防火墙|无线AP|智慧黑板|云终端|云课堂|云桌面|网络设备|互动教学软件|大屏|桌面虚拟化|网络信息安全|网络安全)",
+      "outwords": "(光纤交换机|智能路由器|防火隔墙|光交换机|光纤交换机|程控交换机|热交换剂|缆桥交换机|存储|磁盘阵列|服务器|防病毒软件|WAF设备)"
+    },
+    {
+      "keywords": "(智慧教室|智慧校园|班班通|智慧医院|无线覆盖|薄改|改薄|政务外网|内网|办公网|宿舍网|教育网|园区网|生产网|城域网|数据中心|网络改造|网络建设|两化融合|同步课堂|专递课堂|工业互联网|实训室|系统集成)",
+      "outwords": "(维保|服务|测评|布线|安全加固服务)"
+    }
+  ]
+}

+ 42 - 0
data_export_yx/src/main.go

@@ -0,0 +1,42 @@
+package main
+
+import (
+	"log"
+	qu "qfw/util"
+	"time"
+)
+
+// Process-wide state populated by init/initMgo from the configuration file.
+var (
+	sysconfig    				map[string]interface{} // parsed configuration file
+	save_mgo        			*MongodbSim            // MongoDB access object
+	// save_coll is the collection that is scanned; export_coll is read from
+	// the "export_coll" config key.
+	save_coll,export_coll		string
+	// start_time/end_time bound the accepted purchase time (Unix seconds, end exclusive).
+	start_time,end_time			int64
+	// words_arr holds the keyword rules ("keywords"/"outwords" regexp strings).
+	words_arr					[]map[string]interface{}
+)
+// initMgo initialises the package state from sysconfig: it opens the MongoDB
+// pool described by "save_mgodb", reads "export_coll" and "words_arr", and
+// sets the export window to [1 March of the current year, 1 January of the
+// next year). The unchecked type assertions panic when a config key is
+// missing or has an unexpected type.
+func initMgo()  {
+
+
+	saveconf := sysconfig["save_mgodb"].(map[string]interface{})
+	save_coll = qu.ObjToString(saveconf["coll"])
+	save_mgo = &MongodbSim{
+		MongodbAddr: saveconf["addr"].(string),
+		DbName:      saveconf["db"].(string),
+		Size:        qu.IntAllDef(saveconf["pool"], 5),
+	}
+	save_mgo.InitPool()
+	export_coll = qu.ObjToString(sysconfig["export_coll"])
+
+	// Export window, in local time.
+	start_time = time.Date(time.Now().Year(), 3, 1, 0, 0, 0, 0, time.Local).Unix()
+	end_time = time.Date(time.Now().Year()+1, 1, 1, 0, 0, 0, 0, time.Local).Unix()
+
+	words_arr = qu.ObjArrToMapArr(sysconfig["words_arr"].([]interface{}))
+	log.Println(sysconfig)
+}
+// init loads the configuration into sysconfig and then initialises the
+// MongoDB connection and derived settings.
+func init()  {
+	qu.ReadConfig(&sysconfig)
+	initMgo()
+}
+// main runs the purchase-intention scan once and exits.
+func main()  {
+	log.Println("测试...")
+	dealWithCgyxData()
+}

+ 329 - 0
data_export_yx/src/mgo.go

@@ -0,0 +1,329 @@
+package main
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+// MgoSess is a small query builder: the DB/C/Find/Select/Limit/Skip/Sort
+// methods record the parameters and Iter executes the query through M.
+type MgoSess struct {
+	Db     string      // database name
+	Coll   string      // collection name
+	Query  interface{} // filter passed to Find
+	Sorts  []string    // sort keys; a leading "-" means descending
+	fields interface{} // projection
+	limit  int64       // maximum number of documents (0 = unlimited)
+	skip   int64       // number of documents to skip
+	M      *MongodbSim // owning connection wrapper
+}
+
+// MgoIter wraps a driver cursor; Cursor is nil when the query failed.
+type MgoIter struct {
+	Cursor *mongo.Cursor
+}
+
+// Next decodes the next document of the cursor into result and reports
+// whether one was available. When the cursor is exhausted, or iteration or
+// decoding fails, the cursor is closed, the error (if any) is logged, and
+// false is returned. A nil iterator or cursor yields false.
+func (mt *MgoIter) Next(result interface{}) bool {
+	if mt == nil || mt.Cursor == nil {
+		return false
+	}
+	// The context package forbids passing a nil Context.
+	ctx := context.Background()
+	if !mt.Cursor.Next(ctx) {
+		// Distinguish "no more documents" from a failed iteration.
+		if err := mt.Cursor.Err(); err != nil {
+			log.Println("mgo cur err", err.Error())
+		}
+		mt.Cursor.Close(ctx)
+		return false
+	}
+	if err := mt.Cursor.Decode(result); err != nil {
+		log.Println("mgo cur err", err.Error())
+		mt.Cursor.Close(ctx)
+		return false
+	}
+	return true
+}
+
+// DB sets the database name and returns ms for chaining.
+func (ms *MgoSess) DB(name string) *MgoSess {
+	ms.Db = name
+	return ms
+}
+
+// C sets the collection name and returns ms for chaining.
+func (ms *MgoSess) C(name string) *MgoSess {
+	ms.Coll = name
+	return ms
+}
+
+// Find records the query filter q and returns ms for chaining; the query is
+// not executed until Iter is called.
+func (ms *MgoSess) Find(q interface{}) *MgoSess {
+	ms.Query = q
+	return ms
+}
+
+// Select records the projection and returns ms for chaining.
+func (ms *MgoSess) Select(fields interface{}) *MgoSess {
+	ms.fields = fields
+	return ms
+}
+
+// Limit records the maximum number of documents to return; Iter applies it
+// only when it is greater than zero.
+func (ms *MgoSess) Limit(limit int64) *MgoSess {
+	ms.limit = limit
+	return ms
+}
+// Skip records the number of documents to skip; Iter applies it only when it
+// is greater than zero.
+func (ms *MgoSess) Skip(skip int64) *MgoSess {
+	ms.skip = skip
+	return ms
+}
+
+// Sort records the sort keys and returns ms for chaining. A key prefixed
+// with "-" sorts descending; "+" or no prefix sorts ascending.
+func (ms *MgoSess) Sort(sorts ...string) *MgoSess {
+	ms.Sorts = sorts
+	return ms
+}
+
+// Iter executes the query described by ms and returns an iterator over the
+// result. On failure the error is logged and the returned iterator has a nil
+// Cursor, so its Next method immediately reports false.
+func (ms *MgoSess) Iter() *MgoIter {
+	it := &MgoIter{}
+	if ms.M == nil || ms.M.C == nil {
+		// InitPool leaves C nil when connecting failed.
+		log.Println("mgo find err", "no mongodb client")
+		return it
+	}
+	find := options.Find()
+	if ms.skip > 0 {
+		find.SetSkip(ms.skip)
+	}
+	if ms.limit > 0 {
+		find.SetLimit(ms.limit)
+	}
+	find.SetBatchSize(100)
+	if len(ms.Sorts) > 0 {
+		// bson.D keeps the keys in the order given; a bson.M would make the
+		// precedence of multiple sort keys random.
+		sort := make(bson.D, 0, len(ms.Sorts))
+		for _, k := range ms.Sorts {
+			if k == "" {
+				continue
+			}
+			dir := 1
+			switch k[0] {
+			case '-':
+				dir = -1
+				k = k[1:]
+			case '+':
+				k = k[1:]
+			}
+			if k == "" {
+				continue
+			}
+			sort = append(sort, bson.E{Key: k, Value: dir})
+		}
+		if len(sort) > 0 {
+			find.SetSort(sort)
+		}
+	}
+	if ms.fields != nil {
+		find.SetProjection(ms.fields)
+	}
+	cur, err := ms.M.C.Database(ms.Db).Collection(ms.Coll).Find(ms.M.Ctx, ms.Query, find)
+	if err != nil {
+		log.Println("mgo find err", err.Error())
+		return it
+	}
+	it.Cursor = cur
+	return it
+}
+
+// MongodbSim wraps a MongoDB client together with its connection settings.
+// Set MongodbAddr, DbName, Size (and optionally UserName/Password), then call
+// InitPool before using any other method.
+type MongodbSim struct {
+	MongodbAddr string // host:port, appended to "mongodb://"
+	Size        int    // maximum pool size and capacity of the pool channel
+	//	MinSize     int
+	DbName   string          // default database for the helper methods
+	C        *mongo.Client   // set by InitPool; nil if connecting failed
+	Ctx      context.Context // long-lived context used for operations
+	ShortCtx context.Context // one-minute context used for connecting
+	pool     chan bool       // semaphore limiting concurrent helper calls
+	UserName string
+	Password string
+}
+
+// GetMgoConn returns a new query builder bound to m. It does not take a slot
+// from the pool.
+func (m *MongodbSim) GetMgoConn() *MgoSess {
+	//m.Open()
+	return &MgoSess{M: m}
+}
+
+// DestoryMongoConn detaches the session ms from its MongodbSim so it can no
+// longer be used for queries. A nil session is ignored.
+func (m *MongodbSim) DestoryMongoConn(ms *MgoSess) {
+	//m.Close()
+	if ms == nil {
+		return
+	}
+	ms.M = nil
+}
+
+// InitPool creates the client for MongodbAddr, the pool semaphore and the
+// contexts used by the other methods. On a connection error the error is
+// logged and m.C stays nil.
+func (m *MongodbSim) InitPool() {
+	opts := options.Client()
+	opts.SetConnectTimeout(3 * time.Second)
+	opts.ApplyURI("mongodb://" + m.MongodbAddr)
+	opts.SetMaxPoolSize(uint64(m.Size))
+	m.pool = make(chan bool, m.Size)
+
+	// Credentials are applied only when both user name and password are set.
+	if m.UserName !="" && m.Password !="" {
+		cre := options.Credential{
+			Username:m.UserName,
+			Password:m.Password,
+		}
+		opts.SetAuth(cre)
+	}
+
+
+
+	opts.SetMaxConnIdleTime(2 * time.Hour)
+	// NOTE(review): both cancel functions are discarded, so the contexts'
+	// timers are only released when the timeouts expire. ShortCtx expires one
+	// minute after InitPool and is unusable afterwards.
+	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
+	m.ShortCtx, _ = context.WithTimeout(context.Background(), 1*time.Minute)
+	client, err := mongo.Connect(m.ShortCtx, opts)
+	if err != nil {
+		log.Println("mgo init error:", err.Error())
+	} else {
+		m.C = client
+		log.Println("init success")
+	}
+}
+
+// Open takes one slot from the pool semaphore, blocking while all Size slots
+// are in use.
+func (m *MongodbSim) Open() {
+	m.pool <- true
+}
+// Close returns one slot to the pool semaphore; it must pair with Open.
+func (m *MongodbSim) Close() {
+	<-m.pool
+}
+
+//批量插入
+func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[int64]interface{}, bool) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewUpdateOneModel()
+		write.SetFilter(d[0])
+		write.SetUpdate(d[1])
+		write.SetUpsert(true)
+		writes = append(writes, write)
+	}
+	r, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo upsert error:", e.Error())
+		return nil, false
+	}
+	//	else {
+	//		if r.UpsertedCount != int64(len(doc)) {
+	//			log.Println("mgo upsert uncomplete:uc/dc", r.UpsertedCount, len(doc))
+	//		}
+	//		return true
+	//	}
+	return r.UpsertedIDs, true
+}
+
+//批量插入
+func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewInsertOneModel()
+		write.SetDocument(d)
+		writes = append(writes, write)
+	}
+	_, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo savebulk error:", e.Error())
+		return false
+	}
+	return true
+}
+
+// Save inserts doc into collection c and returns the inserted _id, or nil
+// when the insert failed (the error itself is discarded).
+func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.InsertOne(m.Ctx, doc)
+	if err != nil {
+		return nil
+	}
+	return r.InsertedID
+}
+
+// UpdateById applies the update document doc to the document of collection c
+// whose _id is the hex ObjectID id. It reports whether the driver call
+// returned without error; it does not tell whether a document matched.
+func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	filter := map[string]interface{}{"_id": StringTOBsonId(id)}
+	_, err := m.C.Database(m.DbName).Collection(c).UpdateOne(m.Ctx, filter, doc)
+	return err == nil
+}
+
+// DeleteById deletes the document of collection c whose _id is the hex
+// ObjectID id and returns the number of deleted documents (0 on error).
+func (m *MongodbSim) DeleteById(c, id string) int64 {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.DeleteOne(m.Ctx, map[string]interface{}{"_id": StringTOBsonId(id)})
+	if err != nil {
+		return 0
+	}
+	return r.DeletedCount
+}
+
+// Delete removes every document of collection c that matches query and
+// returns the number of deleted documents (0 on error).
+func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.DeleteMany(m.Ctx, query)
+	if err != nil {
+		return 0
+	}
+	return r.DeletedCount
+}
+
+// FindById returns the document of collection c whose _id is the hex ObjectID
+// id. The decode error is ignored, so a missing document yields an empty,
+// non-nil map.
+func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r := coll.FindOne(m.Ctx, map[string]interface{}{"_id": StringTOBsonId(id)})
+	v := map[string]interface{}{}
+	r.Decode(&v)
+	return v
+}
+
+// FindOne returns the first document of collection c matching query. The
+// decode error is ignored, so "not found" yields an empty, non-nil map.
+func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r := coll.FindOne(m.Ctx, query)
+	v := map[string]interface{}{}
+	r.Decode(&v)
+	return v
+}
+
+// Find returns all documents of collection c matching query. sort and fields
+// are passed to the driver as sort specification and projection; nil means
+// "not set". Errors are logged and returned to the caller instead of
+// terminating the process.
+func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields interface{}) ([]map[string]interface{}, error) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	op := options.Find()
+	if sort != nil {
+		op.SetSort(sort)
+	}
+	if fields != nil {
+		op.SetProjection(fields)
+	}
+	r, err := coll.Find(m.Ctx, query, op)
+	if err != nil {
+		log.Println("mgo find error:", err.Error())
+		return nil, err
+	}
+
+	var results []map[string]interface{}
+	if err = r.All(m.Ctx, &results); err != nil {
+		log.Println("mgo find error:", err.Error())
+		return nil, err
+	}
+	return results, nil
+}
+
+// NewObjectId generates a new ObjectID for use as a document _id.
+func NewObjectId() primitive.ObjectID {
+	return primitive.NewObjectID()
+}
+
+// StringTOBsonId converts a hex string to an ObjectID. The parse error is
+// discarded, so the caller cannot detect an invalid id.
+func StringTOBsonId(id string) primitive.ObjectID {
+	objectId, _ := primitive.ObjectIDFromHex(id)
+	return objectId
+}
+
+// BsonTOStringId returns the hex form of id. It panics when id is not a
+// primitive.ObjectID, because the type assertion is unchecked.
+func BsonTOStringId(id interface{}) string {
+	return id.(primitive.ObjectID).Hex()
+}

二进制
data_export_yx/src/rjqy.xlsx


+ 1 - 1
listen_data/src/main.go

@@ -126,7 +126,7 @@ func main()  {
 	*/
 
 
-	repairRepeatTag()
+	repairCopyNextData()
 
 	return
 

+ 141 - 41
listen_data/src/zkmethod.go

@@ -53,74 +53,174 @@ func encodeJyUrl()  {
 	log.Debug(Encode)
 }
 
+
+// repairCopyNextData copies the documents created after 2022-02-19 from
+// qfw.result_20210108 into the result_20220218 collection of save_mgo. The
+// _id range is hard-coded, and up to five inserts run concurrently.
+func repairCopyNextData() {
+	// Upper bound for now is 23 February: 621508800000000000000000
+	log.Debug("620fd0900000000000000000~621508800000000000000000")
+	log.Debug("copy 之后的数据")
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+	// Select _id in (620fd09…, 6215088…].
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt": StringTOBsonId("620fd0900000000000000000"),
+			"$lte": StringTOBsonId("621508800000000000000000"),
+		},
+	}
+	log.Debug("查询条件~",q)
+	total:= 0
+	// Raise the worker count here if more throughput is needed.
+	repair_pool := make(chan bool, 5)
+	repair_wg := &sync.WaitGroup{}
+	it := sess.DB("qfw").C("result_20210108").Find(&q).Sort("_id").Iter()
+	for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
+		if total%10000==0 {
+			log.Debug("curent index ",total,tmp["_id"])
+		}
+		repair_pool <- true
+		repair_wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-repair_pool
+				repair_wg.Done()
+			}()
+			// The return value of Save (nil on failure) is not checked.
+			save_mgo.Save("result_20220218",tmp)
+
+
+		}(tmp)
+		// The worker owns the old map; decode the next document into a new one.
+		tmp = make(map[string]interface{})
+	}
+	repair_wg.Wait()
+	log.Debug("is copy over ",total)
+
+}
+
+//敏感词修复标记
+func repairSensitiveTag() {
+
+	//从11月30号开始  61a4f9000000000000000000
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gte": StringTOBsonId("620fc2800000000000000000"),
+			"$lte": StringTOBsonId("620fd0900000000000000000"),
+		},
+	}
+	total,isok1,isok2:= 0,0,0
+	it := sess.DB("qfw").C("result_20210108").Find(&q).Sort("_id").Iter()
+	//多线程处理
+	repair_pool := make(chan bool, 1)
+	repair_wg := &sync.WaitGroup{}
+	for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
+		if total%10000==0 {
+			log.Debug("curent index ",total,tmp["_id"],isok1,isok2)
+		}
+		repair_pool <- true
+		repair_wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-repair_pool
+				repair_wg.Done()
+			}()
+
+			tmpid := BsonTOStringId(tmp["_id"])
+			if tmp["log"]!=nil {
+				isok1++
+				data := save_mgo.FindById("result_20220218",tmpid)
+				if data!=nil && len(data)>2 {
+					log_dict := *qu.ObjToMap(tmp["log"])
+					if len(log_dict)>0 {
+						isok2++
+						update_dict := map[string]interface{}{}
+						for k,_ := range log_dict {
+							update_dict[k] = qu.ObjToString(tmp[k])
+						}
+						update_dict["log"] = log_dict
+
+						save_mgo.UpdateById("result_20220218",tmpid,map[string]interface{}{
+							"$set": update_dict,
+						})
+					}
+				}else {
+					log.Debug("异常-未查询到数据",tmp["_id"])
+				}
+			}
+		}(tmp)
+
+		tmp = make(map[string]interface{})
+	}
+	repair_wg.Wait()
+	log.Debug("is sensitive over ",total,isok1,isok2)
+
+}
 //修复重复标记
 func repairRepeatTag()  {
-	log.Debug("6064aa900000000000000000")
-	log.Debug("遍历-result_20220218~查询~result_20210108")
-	log.Debug("重复标记~更新~repeat,repeat_reason,repeat_id")
-	log.Debug("不重复标记~更新~repeat_ids,是否存在")
-	log.Debug("~~~~~~~~~~~~~")
-	log.Debug("~~~~~~~~~~~~~")
 	sess := save_mgo.GetMgoConn()
 	defer save_mgo.DestoryMongoConn(sess)
 	q := map[string]interface{}{
 		"_id": map[string]interface{}{
-			"$lte": StringTOBsonId("6064aa900000000000000000"),
+			"$gte": StringTOBsonId("620fc2800000000000000000"),
+			"$lte": StringTOBsonId("620fd0900000000000000000"),
 		},
 	}
-	total,ok_1,ok_2,err_3,err_4 := 0,0,0,0,0
+	total:= 0
 	it := sess.DB("qfw").C("result_20220218").Find(&q).Sort("_id").Select(map[string]interface{}{
 		"_id":1,
 	}).Iter()
+
+	//多线程处理
+	repair_pool := make(chan bool, 5)
+	repair_wg := &sync.WaitGroup{}
 	for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
 		if total%10000==0 {
-			log.Debug("curent index ",total,"~",ok_1,ok_2,err_3,err_4)
+			log.Debug("curent index ",total,tmp["_id"])
 		}
-
 		tmpid := BsonTOStringId(tmp["_id"])
+		repair_pool <- true
+		repair_wg.Add(1)
+		go func(tmpid string) {
+			defer func() {
+				<-repair_pool
+				repair_wg.Done()
+			}()
 
-		data := save_mgo.FindById("result_20210108",tmpid)
-		if data!=nil && len(data)>2 {
-			repeat := qu.IntAll(data["repeat"])
-			if repeat==1 {
-				update := map[string]interface{}{
-					"repeat":1,
-				}
-				if data["repeat_id"]!="" {
-					update["repeat_id"] = data["repeat_id"]
-				}
-				if data["repeat_reason"]!="" {
-					update["repeat_reason"] = data["repeat_reason"]
-				}
-				save_mgo.UpdateById("result_20220218",tmpid, map[string]interface{}{
-					"$set": update,
-				})
-				ok_1++
-			}else {
-				if data["repeat_ids"]!=nil {
-					ok_2++
+			data := save_mgo.FindById("result_20210108",tmpid)
+			if data!=nil && len(data)>2 {
+				repeat := qu.IntAll(data["repeat"])
+				if repeat==1 {
+					update := map[string]interface{}{
+						"repeat":1,
+					}
+					if data["repeat_id"]!="" {
+						update["repeat_id"] = data["repeat_id"]
+					}
+					if data["repeat_reason"]!="" {
+						update["repeat_reason"] = data["repeat_reason"]
+					}
 					save_mgo.UpdateById("result_20220218",tmpid, map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat_ids": data["repeat_ids"],
-						},
+						"$set": update,
 					})
 				}else {
-					err_3++
+					if data["repeat_ids"]!=nil {
+						save_mgo.UpdateById("result_20220218",tmpid, map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat_ids": data["repeat_ids"],
+							},
+						})
+					}
 				}
 			}
-		}else {
-			log.Debug("异常~",tmpid)
-			err_4++
-		}
+		}(tmpid)
+
 		tmp = make(map[string]interface{})
 	}
-
-	log.Debug("is bidding over ",total,ok_1,ok_2,err_3,err_4)
+	repair_wg.Wait()
+	log.Debug("is city over ",total)
 
 }
 
 
-
 //统计医疗器械数据
 func tongjiYLQX() {
 	//ylhydata

+ 4 - 2
process_medical/src/main.go

@@ -38,12 +38,14 @@ func init()  {
 
 
 func main()  {
-
 	//initMySqlMgoData()
 	//saveBiddingPurMysql()
+	//exportAreaMysql()
+
+
+	saveHospitalMysql()
 
 
-	//exportAreaMysql()
 }
 
 

+ 43 - 13
process_medical/src/medical_hospital.go

@@ -5,39 +5,56 @@ import (
 	qu "qfw/util"
 	"regexp"
 	"strings"
+	"unicode"
 )
 
 var hp_levelReg = regexp.MustCompile(`^[一二三]级[甲乙丙]等`)
+var hp_containsReg = regexp.MustCompile(`(-|—|-| )`)
 
 func saveHospitalMysql() {
+
+
+
 	sess := save_mgo.GetMgoConn()
 	defer save_mgo.DestoryMongoConn(sess)
 	q,total:=map[string]interface{}{},0
+	name_dict := map[string]string{}
 	it := sess.DB(save_mgo.DbName).C(o_hospital_coll).Find(&q).Iter()
 	for tmp := make(map[string]interface{}); it.Next(&tmp);total++{
-		if total%10000==0 {
+		if total%1000==0 {
 			log.Debug("curent index ",total)
 		}
 		tmpid := BsonTOStringId(tmp["_id"])
 		hospital_name :=  qu.ObjToString(tmp["name"])
-		data,alias_arr := updateHospitalData(tmp)
-		insertMysqlData("f_hospital",data,tmpid)
-		for _,v := range alias_arr{
-			if v=="" {
-				continue
-			}
-			insertMysqlData("f_hospital_history", map[string]interface{}{
-				"name":v,
-				"hospital_name":hospital_name,
-			},tmpid)
+		arr_name := strings.Split(hospital_name,"医院")
+		if len(arr_name)>2 {
+			hospital_name = arr_name[0]+"医院"
+		}
+		if hp_containsReg.MatchString(hospital_name) {
+			hospital_name = hp_containsReg.ReplaceAllString(hospital_name,"")
 		}
 
+		if name_dict[hospital_name]=="" {
+			name_dict[hospital_name] = hospital_name
+			data,alias_arr := updateHospitalData(tmp)
+			data["name"] = hospital_name
+			insertMysqlData("f_hospital",data,tmpid)
+			for _,v := range alias_arr{
+				if v=="" {
+					continue
+				}
+				insertMysqlData("f_hospital_history", map[string]interface{}{
+					"name":v,
+					"hospital_name":hospital_name,
+				},tmpid)
+			}
+		}else {
+			//重复-过滤
+		}
 		tmp = make(map[string]interface{})
 	}
 	log.Debug("is hospital over ",total)
 }
-
-
 func updateHospitalData(tmp map[string]interface{})(map[string]interface{},[]string) {
 	hospital := map[string]interface{}{}
 	alias := qu.ObjToString(tmp["alias"])
@@ -62,4 +79,17 @@ func updateHospitalData(tmp map[string]interface{})(map[string]interface{},[]str
 	hospital["alias"] = alias
 	aliasArr := strings.Split(alias,",")
 	return hospital,aliasArr
+}
+
+
+//包含非中文
+func isUnHan(str string) bool {
+	var count int
+	for _, v := range str {
+		if !unicode.Is(unicode.Han, v) {
+			count++
+			break
+		}
+	}
+	return count > 0
 }

+ 1 - 1
process_medical/src/mysql.go

@@ -351,7 +351,7 @@ func (m *Mysql) UpdateByTx(tx *sql.Tx, tableName string, query, update map[strin
 		values = append(values, v)
 	}
 	q := fmt.Sprintf("update %s set %s where %s", tableName, strings.Join(q_fs, ","), strings.Join(u_fs, " and "))
-	log.Println(q, values)
+	//log.Println(q, values)
 	return m.UpdateOrDeleteBySqlByTx(tx, q, values...) >= 0
 }