Ver código fonte

爬虫程序合并

zhangjinkun@topnet.net.cn 9 anos atrás
pai
commit
d1b499d0de

+ 4 - 1
spider2/src/client/ecps/ln/parse.go

@@ -64,7 +64,10 @@ func setJbxx(html string) {
 	infoMap = FormatJbxx(jbxx, infoMap)
 	infoMap["Area"] = Code
 	RegNo := fmt.Sprint(infoMap["RegNo"])
-	infoMap["OpLocDistrict"] = RegNo[0:6]
+	if len(RegNo) > 7 {
+		infoMap["OpLocDistrict"] = RegNo[0:6]
+	}
+
 	//infoMap["Ycml"] = true
 }
 

+ 15 - 7
spider2/src/client/ecps/tj/downcontent.go

@@ -30,11 +30,7 @@ func downloadContent(param map[string]interface{}) bool {
 	query := ""
 	querys := param["regno"]
 	if querys == nil {
-		querys = param["entname"]
-		query = querys.(string)
-	}
-	if querys == nil {
-		querys = param["word"]
+		querys = param["query"]
 		query = querys.(string)
 	}
 	if querys == nil {
@@ -45,8 +41,8 @@ func downloadContent(param map[string]interface{}) bool {
 	for { //显示数字的规则,100-10000之间,能被3整除
 		url := "http://tjcredit.gov.cn/verifycode"
 		timestamp := strconv.Itoa(int(time.Now().Unix())) + "000"
-		ret, _ := DownWithCookie(url, "get", map[string]string{"date": timestamp}, cookie, nil)
-		if ret == nil {
+		ret, _ := DownWithCookie(url, "get", map[string]string{"date": timestamp}, cookie, head)
+		if ret != nil && len(ret) < 0 {
 			continue
 		}
 		r := Clean4TJ(ret)
@@ -96,6 +92,9 @@ func downloadContent(param map[string]interface{}) bool {
 	}
 	doc, _ := goquery.NewDocumentFromReader(bytes.NewBuffer(listbs))
 	info := getListInfo(doc)
+	if info["regno"] == nil {
+		return false
+	}
 	regno := info["regno"].(string)
 	if regno == "" {
 		return false
@@ -111,6 +110,9 @@ func downloadContent(param map[string]interface{}) bool {
 	regex, _ := regexp.Compile(`url:"(.*)",`)
 	regex1, _ := regexp.Compile(`/[^"]*`)
 	params, get_sort_url := getParams(regex1.FindString(regex.FindString(string(bs))))
+	if params == nil || get_sort_url == "" {
+		return false
+	}
 	bs, err = DownWithCookie("http://tjcredit.gov.cn"+get_sort_url, "get", params, cookie, head)
 	arr := []interface{}{}
 	json.Unmarshal(bs, &arr)
@@ -189,6 +191,9 @@ func getListInfo(doc *goquery.Document) map[string]interface{} {
 }
 
 func getParams(urlstr string) (map[string]string, string) {
+	if urlstr == "" {
+		return nil, ""
+	}
 	urlPrefix := urlstr[0:strings.LastIndex(urlstr, "?")]
 	paramstr := urlstr[strings.LastIndex(urlstr, "?")+1:]
 	paramarr := strings.Split(paramstr, "&")
@@ -206,6 +211,9 @@ func getParams(urlstr string) (map[string]string, string) {
 func downTabDetail(url string) string {
 	url = "http://tjcredit.gov.cn" + url
 	params, get := getParams(url)
+	if params == nil || get == "" {
+		return ""
+	}
 	bs, _, _ := hd.Download(&RequestConfig{
 		Urlstr:     get,
 		Method:     REQ_METHOD_GET,

+ 0 - 4
spider2/src/client/ecps/tj/downlist.go

@@ -5,7 +5,6 @@ import (
 	"bytes"
 	. "common/spiderutil"
 	"github.com/PuerkitoBio/goquery"
-	"log"
 	"net/http"
 
 	"strconv"
@@ -16,7 +15,6 @@ var cookie []*http.Cookie
 
 //下载异常名录列表页
 func downloadList(p int) {
-	log.Println(p)
 	_, cookie = DownWithCookie("http://tjcredit.gov.cn/platform/saic/exclist.ftl", "get", nil, nil, nil)
 	bs, _, _ := hd.Download(&RequestConfig{
 		Urlstr: "http://tjcredit.gov.cn/platform/saic/exclist.ftl",
@@ -42,7 +40,6 @@ func downloadList(p int) {
 			"X-Requested-With": "XMLHttpRequest",
 		},
 	})
-	log.Println(string(bs))
 	godoc, _ := goquery.NewDocumentFromReader(bytes.NewReader(bs))
 	godoc.Find(".tb-b a").Each(func(index int, sel *goquery.Selection) {
 		if sel.Text() == "无" || len(sel.Text()) < 2 {
@@ -52,7 +49,6 @@ func downloadList(p int) {
 		//in["pripid"] = tmp[0][1]
 		//in["regno"] = tmp[1][1]
 		//in["etype"] = tmp[2][1]
-		in["entname"] = sel.Text()
 		in["query"] = sel.Text()
 		for {
 			b := pushJob(in)

+ 22 - 5
spider2/src/client/ecps/tj/parse.go

@@ -18,6 +18,9 @@ func Parse(htmlMap map[string]interface{}) map[string]interface{} {
 	infoMap = make(map[string]interface{})
 	if htmlMap != nil {
 		//登记信息
+		if htmlMap[JBXX] == nil {
+			return nil
+		}
 		jbxxstr := htmlMap[JBXX].(string)
 		doc, _ := goquery.NewDocumentFromReader(bytes.NewReader([]byte(jbxxstr)))
 		jbxxtable := doc.Find("table").Eq(0)
@@ -29,12 +32,23 @@ func Parse(htmlMap map[string]interface{}) map[string]interface{} {
 		regno := ReplaceSNR(regstr[0:strings.Index(regstr, "<")])
 		infoMap["ID"] = regno
 		infoMap["RegNo"] = regno
-
+		if len(regno) < 6 {
+			return nil
+		}
+		infoMap["OpLocDistrict"] = regno[0:6]
 		//股东信息
 		gdxxtable := doc.Find("table").Eq(1)
 		tmp = getInfoRMap(&P{node: gdxxtable, dir: 2, srow: 2, trow: 1, rpMap: nil, rsMap: nil})
 		ctmp := convmap(tmp)
 		gdxxlist := FormatGdxx(InvType, ctmp)
+		for _, v := range gdxxlist {
+			v1 := v.(map[string]interface{})
+			time.Sleep(1)
+			if v1["CerNo"] == nil || v1["CerNo"].(string) == "" {
+				v1["CerNo"] = "t_" + fmt.Sprint(time.Now().UnixNano())
+			}
+
+		}
 		infoMap["investor"] = gdxxlist
 		addJob(gdxxlist)
 		//变更信息
@@ -56,6 +70,9 @@ func Parse(htmlMap map[string]interface{}) map[string]interface{} {
 			infoMap["alterInfo"] = bgxxtmp
 		}
 		//行政处罚信息
+		if htmlMap[XZCF] == nil {
+			return nil
+		}
 		xzcf := htmlMap[XZCF].(string)
 		doc, _ = goquery.NewDocumentFromReader(bytes.NewReader([]byte(xzcf)))
 		xzcftable := doc.Find("table").Eq(0)
@@ -87,12 +104,12 @@ func addJob(tmplist []interface{}) {
 			break
 		default:
 			cm := make(map[string]interface{})
-			if tmp["CerNo"] != nil {
-				cm["word"] = tmp["CerNo"]
+			if tmp["CerNo"] != nil && strings.Index(fmt.Sprint(tmp["CerNo"]), "t_") == -1 {
+				cm["query"] = tmp["CerNo"]
 			} else {
-				cm["word"] = tmp["Inv"]
+				cm["query"] = tmp["Inv"]
 			}
-			if cm["word"] != nil && cm["word"] != "" {
+			if cm["query"] != nil && cm["query"] != "" {
 				cms := *new(com.CmdMsg)
 				cms.Nodetype = com.NODE_TYPE_ECPS
 				cms.Param = cm

+ 5 - 3
spider2/src/client/ecps/tj/spider.go

@@ -77,7 +77,6 @@ func downList() {
 		for e := Lists.Front(); e != nil; e = e.Next() {
 
 			tmp := e.Value.(map[string]interface{})
-			log.Println(tmp)
 			if tmp == nil || tmp["query"] == nil || tmp["query"] == "" {
 				log.Println("下载失败:", tmp)
 				putLogs("下载失败:"+fmt.Sprint(tmp), 1)
@@ -135,17 +134,20 @@ func loadCommand() {
 		//log.Println(err.Error())
 	} else {
 		json.Unmarshal(*bt, &arr)
+		//log.Println("arr", arr)
 		var rcmd com.ReturnMsg
 		for _, v := range arr {
 			tmp1, b1 := v.(map[string]interface{})
 			if b1 {
 				rcmd = tmp1
 			}
-			tmp2, b2 := v.(com.CmdMsg)
+			tmp2, b2 := v.(map[string]interface{})
 			if b2 {
-				cmd, b3 := (tmp2.Param).(map[string]interface{})
+
+				cmd, b3 := (tmp2["Param"]).(map[string]interface{})
 				if b3 {
 					Lists.PushFront(cmd)
+					log.Println("插入任务"+fmt.Sprint(cmd), 1)
 					putLogs("插入任务"+fmt.Sprint(cmd), 1)
 				}
 			}

+ 5 - 0
spider2/src/common/rpccall.go

@@ -13,6 +13,7 @@ type RpcCall struct {
 //加入
 func (rc *RpcCall) Join(p JoinMsg, r *SimpleReturnMsg) error {
 	client, err := rpc.DialHTTP("tcp", rc.Addr)
+	defer client.Close()
 	if err != nil {
 		return err
 	}
@@ -23,6 +24,7 @@ func (rc *RpcCall) Join(p JoinMsg, r *SimpleReturnMsg) error {
 //取命令
 func (rc *RpcCall) LoadCmd(p string, bt *[]byte) error {
 	client, err := rpc.DialHTTP("tcp", rc.Addr)
+	defer client.Close()
 	if err != nil {
 		return err
 	}
@@ -33,6 +35,7 @@ func (rc *RpcCall) LoadCmd(p string, bt *[]byte) error {
 //追加任务
 func (rc *RpcCall) PutCmd(p CmdMsg, r *SimpleReturnMsg) error {
 	client, err := rpc.DialHTTP("tcp", rc.Addr)
+	defer client.Close()
 	if err != nil {
 		return err
 	}
@@ -45,6 +48,7 @@ func (rc *RpcCall) PutCmd(p CmdMsg, r *SimpleReturnMsg) error {
 //发送日志
 func (rc *RpcCall) PutLog(p LogMsg, r *ReturnMsg) error {
 	client, err := rpc.DialHTTP("tcp", rc.Addr)
+	defer client.Close()
 	if err != nil {
 		return err
 	}
@@ -57,6 +61,7 @@ func (rc *RpcCall) PutLog(p LogMsg, r *ReturnMsg) error {
 //保存数据
 func (rc *RpcCall) UpdateEntpriseData(info map[string]interface{}, r *SimpleReturnMsg) error {
 	client, err := rpc.DialHTTP("tcp", rc.Addr)
+	defer client.Close()
 	if err != nil {
 		return err
 	}