zhangjinkun · 5 years ago
commit bce7cb7ecf
3 changed files with 17 additions and 12 deletions
  1. src/config.json             +3 -3
  2. src/jy/extract/extract.go   +9 -9
  3. src/main.go                 +5 -0

+ 3 - 3
src/config.json

@@ -2,19 +2,19 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 10,
-    "dbname": "extract_kf",
+    "dbname": "extract_dev32",
     "redis": "buyer=192.168.3.207:1679,winner=192.168.3.207:1679,agency=192.168.3.207:1679",
     "elasticsearch": "http://192.168.3.11:9800",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "qualityaudit": false,
     "saveblock": false,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
-    "udptaskid": "5cdd3025698414032c8322b1",
+    "udptaskid": "5e103206234ddc34b406c5d1",
     "udpport": "1484",
     "nextNode": [
         {
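
The keys changed above are read at startup into a plain JSON-backed map (main.go further down accesses util.Config["port"] the same way). A minimal, self-contained sketch of that loading pattern, assuming nothing about the real jy/util package; loadConfig and its error handling are illustrative only:

package main

import (
	"encoding/json"
	"fmt"
	"os"
)

// loadConfig reads a JSON config file into a generic map, mirroring how the
// service exposes settings as util.Config["..."]. Hypothetical helper.
func loadConfig(path string) (map[string]interface{}, error) {
	data, err := os.ReadFile(path) // use ioutil.ReadFile on Go < 1.16
	if err != nil {
		return nil, err
	}
	cfg := map[string]interface{}{}
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}
	return cfg, nil
}

func main() {
	cfg, err := loadConfig("src/config.json")
	if err != nil {
		panic(err)
	}
	// The keys below are the ones touched in this commit.
	fmt.Println(cfg["dbname"], cfg["saveresult"], cfg["udptaskid"])
}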

+ 9 - 9
src/jy/extract/extract.go

@@ -26,13 +26,13 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //fetch the body text and clean it
-	ExtLogs map[*TaskInfo][]map[string]interface{} //extraction logs
-	TaskList      map[string]*ExtractTask          //task list
-	ClearTaskList map[string]*ClearTask            //cleanup task list
-	saveLimit     = 100                            //batch size for saving extraction logs
-	PageSize      = 5000                           //query page size
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
+	cut           = ju.NewCut()                          //fetch the body text and clean it
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //extraction logs
+	TaskList      map[string]*ExtractTask                //task list
+	ClearTaskList map[string]*ClearTask                  //cleanup task list
+	saveLimit     = 100                                  //batch size for saving extraction logs
+	PageSize      = 5000                                 //query page size
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
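
Fields above is a MongoDB projection string, now carrying bidstatus and href, and PageSize caps each query page at 5000 documents. A rough sketch of how such a projection can drive a paged query; the mgo driver, the fetchPage helper, and the collection handle are assumptions, only the projection keys and page size come from this file:

package sketch

import (
	"encoding/json"

	mgo "gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)

// Abbreviated copy of the projection; the real Fields string lists every
// column the extractor reads, including the newly added bidstatus and href.
const fields = `{"title":1,"detail":1,"bidstatus":1,"href":1}`

const pageSize = 5000 // mirrors PageSize above

// fetchPage is a hypothetical helper: parse the projection once and pull a
// single page of documents from the given collection.
func fetchPage(c *mgo.Collection, page int) ([]map[string]interface{}, error) {
	var sel bson.M
	if err := json.Unmarshal([]byte(fields), &sel); err != nil {
		return nil, err
	}
	var docs []map[string]interface{}
	err := c.Find(nil).Select(sel).Skip(page * pageSize).Limit(pageSize).All(&docs)
	return docs, err
}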
 
@@ -267,7 +267,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
 	if err == nil {
 		conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
-		if conlen < 50 {
+		if conlen < 200 {
 			if isextFile {
 				detail += qu.ObjToString(doc["detailfile"])
 				doc["detail"] = detail
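
The threshold raised here counts runes of the rendered text rather than bytes of raw HTML, so multi-byte CJK characters are counted once each. A standalone version of the same check with the new 200-rune cutoff; shortDetail is a hypothetical wrapper, not a function from the repository:

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
)

// shortDetail reports whether the visible text of an HTML fragment is under
// 200 runes, the point at which PreInfo falls back to doc["detailfile"].
func shortDetail(html string) (bool, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return false, err
	}
	n := utf8.RuneCountInString(strings.Trim(doc.Text(), " "))
	return n < 200, nil
}

func main() {
	short, _ := shortDetail("<p>招标公告正文</p>")
	fmt.Println(short) // true: far fewer than 200 runes of text
}
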
@@ -1871,7 +1871,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //look up redis
-	if i == 0 { //not found in redis, run rule matching
+	if i == 0 {                            //not found in redis, run rule matching
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //rule matching
 	} else { //found in redis, flag it and save
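
The realigned branch above encodes a lookup order: the redis key "<field>_<value>" is tried first, and the slower rule matching only runs on a cache miss. A compact sketch of that flow; lookup and ruleMatch are placeholders for the project's redis helper and (*ExtractTask).RuleMatch, and the hit branch is guessed since the hunk cuts off there:

package main

import "fmt"

// matchField mirrors the control flow of RedisMatch: try the cache first,
// fall back to rules. The two callbacks stand in for the real helpers.
func matchField(field, fv string, val map[string]interface{},
	lookup func(key string) int, ruleMatch func(field, fv string)) {
	if lookup(field+"_"+fv) == 0 { // not found in redis
		val[field+"_isredis"] = false
		ruleMatch(field, fv)
	} else { // found in redis: flag it (assumed; the real branch also stores the hit)
		val[field+"_isredis"] = true
	}
}

func main() {
	val := map[string]interface{}{}
	matchField("winner", "example-co", val,
		func(string) int { return 0 }, // simulate a cache miss
		func(f, v string) { fmt.Println("rules:", f, v) })
	fmt.Println(val)
}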

+ 5 - 0
src/main.go

@@ -9,6 +9,8 @@ import (
 	_ "jy/front"
 	. "jy/router"
 	"jy/util"
+	"net/http"
+	_ "net/http/pprof"
 	qu "qfw/util"
 	//"qfw/util/elastic"
 	"qfw/util/redis"
@@ -42,6 +44,9 @@ func main() {
 	go extract.Export()
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))
+	go func() {
+		http.ListenAndServe("localhost:10000", nil)
+	}()
 	lock := make(chan bool)
 	<-lock
 }
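
A note on the added goroutine: the blank import of net/http/pprof registers the profiling handlers on http.DefaultServeMux, and http.ListenAndServe with a nil handler serves that mux, so the profiler becomes reachable at http://localhost:10000/debug/pprof/ (for example via go tool pprof http://localhost:10000/debug/pprof/profile). A tiny illustrative probe that the endpoint answers:

package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Hits the index page registered by net/http/pprof; the port is the one
	// hard-coded in main.go above.
	resp, err := http.Get("http://localhost:10000/debug/pprof/")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("pprof status:", resp.Status)
}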