|
@@ -2,7 +2,7 @@ package backend
|
|
|
|
|
|
import (
|
|
|
"container/list"
|
|
|
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
+ "encoding/json"
|
|
|
)
|
|
|
|
|
|
const (
|
|
@@ -10,13 +10,18 @@ const (
|
|
|
JOB_RUNNING_EVENT_DEBUG = 0
|
|
|
)
|
|
|
|
|
|
+var (
|
|
|
+ DataResults = map[string]*list.List{} // debug-run results: one container/list per string key
|
|
|
+ VerifyResults = map[string]*SpiderConfigVerifyResult{} // debug-verification results, one per string key
|
|
|
+)
|
|
|
+
|
|
|
type (
|
|
|
//爬虫配置信息
|
|
|
SpiderConfig struct {
|
|
|
Site string `json:"site"`
|
|
|
Channel string `json:"channel"`
|
|
|
- Author string `json:"author"`
|
|
|
- Url string `json:"url"`
|
|
|
+ ModifyUser string `json:"modifyuser"`
|
|
|
+ Href string `json:"href"`
|
|
|
Code string `json:"code"`
|
|
|
ListBodyCss string `json:"listBodyCss"`
|
|
|
ListItemCss string `json:"listItemCss"`
|
|
@@ -31,7 +36,12 @@ type (
|
|
|
ListJSCode string `json:"listJs"`
|
|
|
ContentJSCode string `json:"contentJs"`
|
|
|
AttachJSCode string `json:"attachJs"` //无效
|
|
|
- ListTrunPageJSCode string `json:"listTrunPageJs"`
|
|
|
+ ListTurnPageJSCode string `json:"listTurnPageJs"`
|
|
|
+ MaxPages int64 `json:"maxPages"`
|
|
|
+ //延时
|
|
|
+ ListDelayTime int64 `json:"listDelayTime"`
|
|
|
+ ListTurnDelayTime int64 `json:"listTurnDelayTime"`
|
|
|
+ ContentDelayTime int64 `json:"contentDelayTime"`
|
|
|
}
|
|
|
//附件链接
|
|
|
AttachLink struct {
|
|
@@ -157,35 +167,41 @@ func CopyAttribute(dst *string, value1, value2 string) {
|
|
|
}
|
|
|
|
|
|
// NewSpiderConfig 生成css对象
|
|
|
-func NewSpiderConfig(param map[string]interface{}) *SpiderConfig {
|
|
|
- param_common := param["param_common"].(map[string]interface{})
|
|
|
- css_list, _ := param["css_list"].(map[string]interface{})
|
|
|
- css_content, _ := param["css_content"].(map[string]interface{})
|
|
|
- js_list, _ := param["js_list"].(map[string]interface{})
|
|
|
- js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
|
|
|
- js_content, _ := param["js_content"].(map[string]interface{})
|
|
|
- sc := &SpiderConfig{
|
|
|
- Site: qu.ObjToString(param_common["site"]),
|
|
|
- Channel: qu.ObjToString(param_common["channel"]),
|
|
|
- Author: qu.ObjToString(param_common["modifyuser"]),
|
|
|
- Url: qu.ObjToString(param_common["href"]),
|
|
|
- Code: qu.ObjToString(param_common["code"]),
|
|
|
- ListBodyCss: qu.ObjToString(css_list["body"]),
|
|
|
- ListItemCss: qu.ObjToString(css_list["title"]),
|
|
|
- ListLinkCss: qu.ObjToString(css_list["href"]),
|
|
|
- ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
|
|
|
- ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
|
|
|
- TitleCss: qu.ObjToString(css_content["title"]),
|
|
|
- PublishUnitCss: qu.ObjToString(css_content["source"]),
|
|
|
- PublishTimeCss: qu.ObjToString(css_content["ptime"]),
|
|
|
- ContentCss: qu.ObjToString(css_content["content"]),
|
|
|
- AttachCss: qu.ObjToString(css_content["file"]),
|
|
|
- ListJSCode: qu.ObjToString(js_list["js"]),
|
|
|
- ContentJSCode: qu.ObjToString(js_content["js"]),
|
|
|
- ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
|
|
|
- //AttachJSCode : string `json:"attachJs"` //无效
|
|
|
+func NewSpiderConfig(cssmark map[string]interface{}) (*SpiderConfig, error) {
|
|
|
+ sc := &SpiderConfig{}
|
|
|
+ cssmark_byte, err := json.Marshal(cssmark)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
}
|
|
|
- return sc
|
|
|
+ err = json.Unmarshal(cssmark_byte, &sc)
|
|
|
+ return sc, err
|
|
|
+ //param_common := param["param_common"].(map[string]interface{})
|
|
|
+ //css_list, _ := param["css_list"].(map[string]interface{})
|
|
|
+ //css_content, _ := param["css_content"].(map[string]interface{})
|
|
|
+ //js_list, _ := param["js_list"].(map[string]interface{})
|
|
|
+ //js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
|
|
|
+ //js_content, _ := param["js_content"].(map[string]interface{})
|
|
|
+ //sc := &SpiderConfig{
|
|
|
+ // Site: qu.ObjToString(param_common["site"]),
|
|
|
+ // Channel: qu.ObjToString(param_common["channel"]),
|
|
|
+ // Modifyuser: qu.ObjToString(param_common["modifyuser"]),
|
|
|
+ // Href: qu.ObjToString(param_common["href"]),
|
|
|
+ // Code: qu.ObjToString(param_common["code"]),
|
|
|
+ // ListBodyCss: qu.ObjToString(css_list["body"]),
|
|
|
+ // ListItemCss: qu.ObjToString(css_list["title"]),
|
|
|
+ // ListLinkCss: qu.ObjToString(css_list["href"]),
|
|
|
+ // ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
|
|
|
+ // ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
|
|
|
+ // TitleCss: qu.ObjToString(css_content["title"]),
|
|
|
+ // PublishUnitCss: qu.ObjToString(css_content["source"]),
|
|
|
+ // PublishTimeCss: qu.ObjToString(css_content["ptime"]),
|
|
|
+ // ContentCss: qu.ObjToString(css_content["content"]),
|
|
|
+ // AttachCss: qu.ObjToString(css_content["file"]),
|
|
|
+ // ListJSCode: qu.ObjToString(js_list["js"]),
|
|
|
+ // ContentJSCode: qu.ObjToString(js_content["js"]),
|
|
|
+ // ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
|
|
|
+ // //AttachJSCode : string `json:"attachJs"` //无效
|
|
|
+ //}
|
|
|
}
|
|
|
|
|
|
// MergeSpiderConfig 合并
|
|
@@ -194,8 +210,8 @@ func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
|
|
|
CopyAttribute(&nsc.Code, src2.Code, src1.Code)
|
|
|
CopyAttribute(&nsc.Site, src2.Site, src1.Site)
|
|
|
CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
|
|
|
- CopyAttribute(&nsc.Url, src2.Url, src1.Url)
|
|
|
- CopyAttribute(&nsc.Author, src2.Author, src1.Author)
|
|
|
+ CopyAttribute(&nsc.Href, src2.Href, src1.Href)
|
|
|
+ CopyAttribute(&nsc.ModifyUser, src2.ModifyUser, src1.ModifyUser)
|
|
|
CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
|
|
|
CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
|
|
|
CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
|
|
@@ -209,7 +225,7 @@ func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
|
|
|
CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
|
|
|
CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
|
|
|
CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
|
|
|
- CopyAttribute(&nsc.ListTrunPageJSCode, src2.ListTrunPageJSCode, src1.ListTrunPageJSCode)
|
|
|
+ CopyAttribute(&nsc.ListTurnPageJSCode, src2.ListTurnPageJSCode, src1.ListTurnPageJSCode)
|
|
|
return nsc
|
|
|
}
|
|
|
|
|
@@ -217,9 +233,9 @@ func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
|
|
|
func CopySpiderConfig(src1, src2 *SpiderConfig) {
|
|
|
src1.Code = src2.Code
|
|
|
src1.Site = src2.Site
|
|
|
- src1.Author = src2.Author
|
|
|
+ src1.ModifyUser = src2.ModifyUser
|
|
|
src1.Channel = src2.Channel
|
|
|
- src1.Url = src2.Url
|
|
|
+ src1.Href = src2.Href
|
|
|
src1.ListBodyCss = src2.ListBodyCss
|
|
|
src1.ListItemCss = src2.ListItemCss
|
|
|
src1.ListPubtimeCss = src2.ListPubtimeCss
|
|
@@ -231,7 +247,7 @@ func CopySpiderConfig(src1, src2 *SpiderConfig) {
|
|
|
src1.ContentCss = src2.ContentCss
|
|
|
src1.AttachCss = src2.AttachCss
|
|
|
src1.ListJSCode = src2.ListJSCode
|
|
|
- src1.ListTrunPageJSCode = src2.ListTrunPageJSCode
|
|
|
+ src1.ListTurnPageJSCode = src2.ListTurnPageJSCode
|
|
|
src1.ContentJSCode = src2.ContentJSCode
|
|
|
src1.AttachJSCode = src2.AttachJSCode
|
|
|
}
|