123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- // service
- package spider
import (
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"sort"
	"strings"
	"time"

	mu "mfw/util"
	qu "qfw/util"
	mongodb "qfw/util/mongodb"
	util "spiderutil"

	"gopkg.in/mgo.v2/bson"
)
- //获取脚本文件
- func GetScript(code string, str ...interface{}) (script, script_list, script_content string) {
- defer mu.Catch()
- //script := ""
- luaconfig := *mongodb.FindOne("luaconfig", `{"code":"`+code+`"}`)
- //qu.Debug(code, "lua---", luaconfig)
- if luaconfig["listcheck"] == nil {
- luaconfig["listcheck"] = ""
- }
- if luaconfig["contentcheck"] == nil {
- luaconfig["contentcheck"] = ""
- }
- if luaconfig != nil && len(luaconfig) > 0 {
- common := luaconfig["param_common"].([]interface{})
- if len(str) > 0 {
- if len(common) == 15 {
- common = append(common, str[0], str[1], str[2])
- } else {
- common = append(common, false, false, str[0], str[1], str[2])
- }
- } else {
- if len(common) == 15 {
- common = append(common, "", "", "")
- } else {
- common = append(common, false, false, "", "", "")
- }
- }
- for k, v := range common {
- if k == 4 || k == 5 || k == 6 || k == 9 || k == 10 {
- common[k] = qu.IntAll(v)
- }
- }
- script, _ = GetTmpModel(map[string][]interface{}{"common": common})
- script_time := ""
- if luaconfig["type_time"] == 0 {
- time := luaconfig["param_time"].([]interface{})
- script_time, _ = GetTmpModel(map[string][]interface{}{
- "time": time,
- })
- } else {
- script_time = luaconfig["str_time"].(string)
- }
- //script_list := "" //列表页
- if luaconfig["type_list"] == 0 {
- list := luaconfig["param_list"].([]interface{})
- addrs := strings.Split(list[1].(string), "\n")
- if len(addrs) > 0 {
- for k, v := range addrs {
- addrs[k] = "'" + v + "'"
- }
- list[1] = strings.Join(addrs, ",")
- } else {
- list[1] = ""
- }
- script_list, _ = GetTmpModel(map[string][]interface{}{
- "list": list,
- "listcheck": []interface{}{luaconfig["listcheck"]},
- })
- } else {
- script_list = luaconfig["str_list"].(string)
- }
- //script_content := "" //三级页
- if luaconfig["type_content"] == 0 {
- content := luaconfig["param_content"].([]interface{})
- script_content, _ = GetTmpModel(map[string][]interface{}{
- "content": content,
- "contentcheck": []interface{}{luaconfig["contentcheck"]},
- })
- } else {
- script_content = luaconfig["str_content"].(string)
- }
- script += fmt.Sprintf(util.Tmp_Other, luaconfig["spidertype"], luaconfig["spiderhistorymaxpage"], luaconfig["spidermovevent"])
- script += `
- ` + script_time + `
- ` + script_list + `
- ` + script_content
- script = ReplaceModel(script, common, luaconfig["model"].(map[string]interface{}))
- }
- return
- }
- //保存更新脚本
- func SaveSpider(code string, param map[string]interface{}) bool {
- return mongodb.Update("luaconfig", bson.M{"code": code}, map[string]interface{}{"$set": param}, true, true)
- }
- /*获取最后发布时间
- comm 通用参数
- param 向导参数
- proficient 专家脚本
- guideType 向导类型
- */
- func GetLastPublishTime(comm, param []interface{}, proficient, downloadnode string, guideType int, scripts ...int) (timestr interface{}, err interface{}) {
- defer mu.Catch()
- var script string
- if guideType == 0 {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- "time": param,
- })
- } else {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- })
- script += proficient
- }
- if len(scripts) > 0 {
- return "", errors.New(script).Error()
- }
- if err != nil {
- return "", err
- }
- sp := CreateSpider(downloadnode, script)
- defer sp.L.Close()
- timestr, err = sp.GetLastPublishTimeTest()
- return
- }
- /*获取列表信息
- comm 通用参数
- param 向导参数
- model 补充模型
- modeltype 模型类型
- proficient 专家脚本
- guideType 向导类型
- */
- func GetPageList(comm, param []interface{}, model map[string]interface{}, listcheck interface{}, proficient, downloadnode string, guideType int, scripts ...int) (list []interface{}, err interface{}) {
- defer mu.Catch()
- var script string
- if guideType == 0 {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- "list": param,
- "listcheck": []interface{}{listcheck},
- })
- script = ReplaceModel(script, comm, model)
- } else {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- })
- script += proficient
- }
- if len(scripts) > 0 {
- return nil, errors.New(script).Error()
- }
- if err != nil {
- return nil, err
- }
- sp := CreateSpider(downloadnode, script)
- sp.SpiderMaxPage = 1
- defer sp.L.Close()
- list, err = sp.DownListPageItemTest()
- return
- }
- /*获取三级页信息
- comm 通用参数
- param 向导参数
- proficient 专家脚本
- guideType 向导类型
- */
- func GetContentInfo(comm, param []interface{}, data map[string]interface{}, contentcheck interface{}, proficient, downloadnode string, guideType int, scripts ...int) (rep map[string]interface{}, err interface{}) {
- defer mu.Catch()
- var script string
- if guideType == 0 {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- "content": param,
- "contentcheck": []interface{}{contentcheck},
- })
- } else {
- script, err = GetTmpModel(map[string][]interface{}{
- "common": comm,
- })
- script += proficient
- }
- if len(scripts) > 0 {
- return nil, errors.New(script).Error()
- }
- if err != nil {
- return nil, err
- }
- sp := CreateSpider(downloadnode, script)
- sp.SpiderMaxPage = 1
- defer sp.L.Close()
- param2 := map[string]string{}
- for k, v := range data {
- param2[k] = fmt.Sprint(v)
- }
- rep, err = sp.DownloadDetailPageTest(param2, data)
- return
- }
// ReplaceModel fills the "--Common--" and "--Model--" placeholders of script.
//
// "--Common--" becomes assignments of item["spidercode"], item["site"] and
// item["channel"] taken from comm[0], comm[1] and comm[2]; positions missing
// from comm are written as empty strings. "--Model--" becomes one
// item["<key>"]="<value>"; assignment per model entry, emitted in sorted key
// order so the same input always produces the same script.
//
// String values are inserted verbatim (no escaping is applied), nil becomes
// "", and any other value is rendered with fmt.Sprint instead of panicking.
func ReplaceModel(script string, comm []interface{}, model map[string]interface{}) string {
	// text renders a value for embedding between double quotes.
	text := func(v interface{}) string {
		switch s := v.(type) {
		case nil:
			return ""
		case string:
			return s
		default:
			return fmt.Sprint(s)
		}
	}
	// commAt tolerates a comm slice that is shorter than expected.
	commAt := func(i int) string {
		if i < len(comm) {
			return text(comm[i])
		}
		return ""
	}

	// Common spider information.
	commstr := `item["spidercode"]="` + commAt(0) + `";` +
		`item["site"]="` + commAt(1) + `";` +
		`item["channel"]="` + commAt(2) + `";`
	script = strings.Replace(script, "--Common--", commstr, -1)

	// Supplementary model information; map iteration order is random, so
	// sort the keys first.
	keys := make([]string, 0, len(model))
	for k := range model {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	assignments := make([]string, 0, len(keys))
	for _, k := range keys {
		assignments = append(assignments, `item["`+k+`"]="`+text(model[k])+`";`)
	}
	return strings.Replace(script, "--Model--", strings.Join(assignments, ""), -1)
}
- //创建爬虫
- func CreateSpider(downloadnode, script string, isfile ...string) *Spider {
- defer mu.Catch()
- sp := &Spider{}
- sp.LoadScript(downloadnode, script, isfile...)
- sp.Code = sp.GetVar("spiderCode")
- sp.SCode = sp.Code
- sp.Name = sp.GetVar("spiderName")
- sp.Channel = sp.GetVar("spiderChannel")
- sp.DownDetail = sp.GetBoolVar("spiderDownDetailPage")
- sp.Collection = sp.GetVar("spider2Collection")
- sp.SpiderStartPage = int64(sp.GetIntVar("spiderStartPage"))
- sp.SpiderMaxPage = int64(sp.GetIntVar("spiderMaxPage"))
- sp.SpiderRunRate = int64(sp.GetIntVar("spiderRunRate"))
- sp.StoreToMsgEvent = sp.GetIntVar("spiderStoreToMsgEvent")
- sp.StoreMode = sp.GetIntVar("spiderStoreMode")
- sp.CoverAttr = sp.GetVar("spiderCoverAttr")
- spiderSleepBase := sp.GetIntVar("spiderSleepBase")
- if spiderSleepBase == -1 {
- sp.SleepBase = 1000
- } else {
- sp.SleepBase = spiderSleepBase
- }
- spiderSleepRand := sp.GetIntVar("spiderSleepRand")
- if spiderSleepRand == -1 {
- sp.SleepRand = 1000
- } else {
- sp.SleepRand = spiderSleepRand
- }
- spiderTimeout := sp.GetIntVar("spiderTimeout")
- if spiderTimeout == -1 {
- sp.Timeout = 60
- } else {
- sp.Timeout = int64(spiderTimeout)
- }
- sp.TargetChannelUrl = sp.GetVar("spiderTargetChannelUrl")
- sp.SpiderIsHistoricalMend = sp.GetBoolVar("spiderIsHistoricalMend")
- sp.SpiderIsMustDownload = sp.GetBoolVar("spiderIsMustDownload")
- //qu.Debug(sp.SpiderIsHistoricalMend, sp.SpiderIsMustDownload)
- return sp
- }
- //生成爬虫脚本
- func GetTmpModel(param map[string][]interface{}) (script string, err interface{}) {
- qu.Try(func() {
- if param != nil && param["common"] != nil {
- if len(param["common"]) < 12 {
- err = "公共参数配置不全"
- } else {
- script = fmt.Sprintf(util.Tmp_common, param["common"]...)
- }
- }
- if param != nil && param["time"] != nil {
- if len(param["time"]) < 3 {
- err = "方法:time-参数配置不全"
- } else {
- script += fmt.Sprintf(util.Tmp_pubtime, param["time"]...)
- }
- }
- if param != nil && param["list"] != nil {
- if len(param["list"]) < 7 {
- err = "方法:list-参数配置不全"
- } else {
- list := []interface{}{param["listcheck"][0]}
- list = append(list, param["list"]...)
- script += fmt.Sprintf(util.Tmp_pagelist, list...)
- script = strings.Replace(script, "#pageno#", `"..tostring(pageno).."`, -1)
- }
- }
- if param != nil && param["content"] != nil {
- if len(param["content"]) < 2 {
- err = "方法:content-参数配置不全"
- } else {
- content := []interface{}{param["contentcheck"][0]}
- content = append(content, param["content"]...)
- script += fmt.Sprintf(util.Tmp_content, content...)
- }
- }
- }, func(e interface{}) {
- err = e
- })
- return script, err
- }
// CreateFile writes script to res/<yyyy>/<mm>/<dd>/spider_<code>.lua, using
// the current date, creating the directory if needed.
//
// It returns the directory path (always, for compatibility with existing
// callers) and the first error from creating the directory, creating the
// file, writing it or closing it.
func CreateFile(code, script string) (string, error) {
	dir := "res/" + time.Now().Format("2006/01/02")
	if err := os.MkdirAll(dir, 0777); err != nil {
		return dir, err
	}
	f, err := os.Create(dir + "/spider_" + code + ".lua")
	if err != nil {
		return dir, err
	}
	if _, err := f.WriteString(script); err != nil {
		f.Close() // best effort; the write error is the one worth reporting
		return dir, err
	}
	// Close can surface a delayed write failure, so report its error too.
	return dir, f.Close()
}
- //上传脚本
- func UpdateSpiderByCodeState(code, state string, event int) (bool, error) {
- msgid := mu.UUID(8)
- data := map[string]interface{}{}
- data["code"] = code
- data["state"] = state
- rep := map[string]interface{}{}
- var bs []byte
- var err error
- if util.Config.Uploadevents[fmt.Sprint(event)] == "bid" { //?
- bs, err = MsclientBid.Call("", msgid, event, mu.SENDTO_TYPE_ALL_RECIVER, data, 60)
- } else {
- bs, err = Msclient.Call("", msgid, event, mu.SENDTO_TYPE_ALL_RECIVER, data, 60)
- }
- if err != nil {
- return false, err
- } else {
- json.Unmarshal(bs, &rep)
- b, _ := rep["b"].(bool)
- if !b {
- err = errors.New(qu.ObjToString(rep["err"]))
- }
- return b, err
- }
- }
|