// Package backend declares the data model used by the spider backend:
// spider configurations, crawl results, job definitions and their runtime
// state, plus helpers to build, merge and copy spider configurations.
package backend

import (
	"container/list"
	"encoding/json"
	"regexp"
)

const (
	// Values carried in JobRunningEvent.Act.
	// NOTE(review): the association with the Act field is inferred from the
	// names — confirm against the code that emits events.
	JOB_RUNNING_EVENT_PROGRESS = 1
	JOB_RUNNING_EVENT_DEBUG    = 0

	// Action execution results.
	RUN_ACTION_NOTRUN  = 0 // not executed
	RUN_ACTION_SUCCESS = 1 // executed successfully
	RUN_ACTION_ERROR   = 2 // execution raised an error
	RUN_ACTION_TIMEOUT = 3 // execution timed out

	// Action check results.
	CHECK_ACTION_NOTCHECK = 0
	CHECK_ACTION_SUCCESS  = 1
	CHECK_ACTION_ERROR    = 2
	CHECK_ACTION_TIMEOUT  = 3

	// CHECH_RESULT is presumably the string a check script must yield to count
	// as passed — confirm against callers. The misspelt name (sic) is kept
	// because it is exported.
	CHECH_RESULT = "true"
)

var (
	// DataResults holds debug-run results, keyed by string.
	DataResults = map[string]*list.List{}
	// VerifyResults holds debug-verification results, keyed by string.
	VerifyResults = map[string]*SpiderConfigVerifyResult{}
	// RegSpace matches one or more whitespace characters, including the
	// ideographic space (U+3000), em space (U+2003) and no-break space (U+00A0).
	RegSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
)

type (
	// SpiderConfig is the configuration of a single spider.
	SpiderConfig struct {
		Site               string     `json:"site"`
		Channel            string     `json:"channel"`
		ModifyUser         string     `json:"modifyuser"`
		Href               string     `json:"href"`
		Code               string     `json:"code"`
		InitList           []*Actions `json:"initList"`
		ListBodyCss        string     `json:"listBodyCss"` // used to decide whether turning the page succeeded
		ListItemCss        string     `json:"listItemCss"`
		ListLinkCss        string     `json:"listLinkCss"`
		ListPubtimeCss     string     `json:"listPublishTimeCss"`
		ListNextPageCss    string     `json:"listNextPageCss"`
		TitleCss           string     `json:"titleCss"`
		PublishUnitCss     string     `json:"publishUnitCss"`
		PublishTimeCss     string     `json:"publishTimeCss"`
		ContentCss         string     `json:"contentCss"`
		AttachCss          string     `json:"attachCss"`
		ListJSCode         string     `json:"listJs"`
		ContentJSCode      string     `json:"contentJs"`
		AttachJSCode       string     `json:"attachJs"` // not in effect
		ListTurnPageJSCode string     `json:"listTurnPageJs"`
		MaxPages           int64      `json:"maxPages"`
		FilterResource     string     `json:"filterResource"` // resources to filter out
		// Delays.
		ListDelayTime     int64 `json:"listDelayTime"`
		ListTurnDelayTime int64 `json:"listTurnDelayTime"`
		ContentDelayTime  int64 `json:"contentDelayTime"`
	}

	// AttachLink describes one attachment link.
	AttachLink struct {
		Title    string `json:"title"`
		Href     string `json:"href"`
		FileName string `json:"fileName"`
		FileType string `json:"fileType"`
		FileSize string `json:"fileSize"`
		FilePath string `json:"filePath"`
	}

	// Actions is one entry of SpiderConfig.InitList: a script to run, a script
	// to check it with, and a sleep time.
	Actions struct {
		ActionJs  string `json:"actionJs"`
		CheckJs   string `json:"checkJs"`
		SleepTime int64  `json:"sleepTime"`
	}

	// ActionRunResult records the execution and check outcome of one action.
	ActionRunResult struct {
		ActionNum int `json:"actionNum"`
		// The tag previously lacked its closing quote, which made it invisible
		// to encoding/json and serialised the field as "ActionJs".
		ActionJs    string `json:"actionJs"`
		CheckJs     string `json:"checkJs"`
		Result      bool   `json:"result"`
		RunResult   int    `json:"runResult"`
		CheckResult int    `json:"checkResult"`
	}

	// ResultItem is a single crawled item.
	ResultItem struct {
		No          int           `json:"no"` // sequence number
		Site        string        `json:"site"`
		Channel     string        `json:"channel"`
		Href        string        `json:"href"`
		ListTitle   string        `json:"listTitle"`
		ListPubTime string        `json:"listPublishTime"`
		Title       string        `json:"title"`
		PublishUnit string        `json:"publishUnit"`
		PublishTime string        `json:"publishTime"`
		Content     string        `json:"content"`
		ContentHtml string        `json:"contentHtml"`
		AttachLinks []*AttachLink `json:"attachLinks"` // attachment titles and links
		AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments
	}

	// ResultItems is a list of crawled items.
	ResultItems []*ResultItem

	// SpiderConfiges is a list of spider configurations; it provides the
	// Len/Swap/Less methods required by sort.Interface.
	SpiderConfiges []*SpiderConfig

	// JobItem is one spider entry inside a Job.
	JobItem struct {
		// Code and site match the spider configuration.
		SpiderCode string `json:"code"`
		SpiderSite string `json:"site"`
		// Runtime parameters; when not configured, the job's common
		// parameters are used by default.
		Channel              string `json:"channel"`
		Url                  string `json:"url"`
		ProxyServe           string `json:"proxyServe"`
		MaxPages             int    `json:"maxPages"`
		Threads              int    `json:"threads"`
		ListDelay            int64  `json:"listDelay"`
		TrunPageDelay        int64  `json:"trunPageDelay"`
		ContentDelay         int64  `json:"contentDelay"`
		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
	}

	// Job is a job definition.
	Job struct {
		Code  string     `json:"code"`
		Name  string     `json:"name"`
		Items []*JobItem `json:"items"`
		// Common parameters.
		ProxyServe           string `json:"proxyServe"`
		MaxPages             int    `json:"maxPages"`
		Threads              int    `json:"threads"`
		ListDelay            int64  `json:"listDelay"`
		TrunPageDelay        int64  `json:"trunPageDelay"`
		ContentDelay         int64  `json:"contentDelay"`
		State                int    `json:"state"`
		StateType            string `json:"stateType"`
		Progress             int    `json:"progress"`
		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
	}

	// Jobs is a list of jobs; it provides the Len/Swap/Less methods required
	// by sort.Interface.
	Jobs []*Job

	// JobRunningEvent is an event pushed while a job runs.
	JobRunningEvent struct {
		Progress int    `json:"progress"`
		Msg      string `json:"msg"`
		Act      int    `json:"act"`
		Code     string `json:"code"`
	}

	// JobRunningState is the runtime state and intermediate results of a job.
	JobRunningState struct {
		Code        string
		State       int
		Progress    int
		ResultCache *list.List // cache
		ExitCh      chan bool
	}

	// SpiderConfigVerifyResult is the outcome of verifying a spider
	// configuration, one flag per verified aspect.
	SpiderConfigVerifyResult struct {
		Title        bool `json:"title"`
		PublishUnit  bool `json:"publishUnit"`
		PublishTime  bool `json:"publishTime"`
		Content      bool `json:"content"`
		Attaches     bool `json:"attaches"`
		ListItems    bool `json:"listItems"`
		ListTrunPage bool `json:"listTrunPage"`
	}
)

// Len returns the number of configurations.
func (sc SpiderConfiges) Len() int { return len(sc) }

// Swap exchanges the elements at i and j.
func (sc SpiderConfiges) Swap(i, j int) { sc[i], sc[j] = sc[j], sc[i] }

// Less reports whether element i has a greater Code than element j, so
// sorting yields descending Code order.
func (sc SpiderConfiges) Less(i, j int) bool { return sc[i].Code > sc[j].Code }

// Len returns the number of jobs.
func (js Jobs) Len() int { return len(js) }

// Swap exchanges the elements at i and j.
func (js Jobs) Swap(i, j int) { js[i], js[j] = js[j], js[i] }

// Less reports whether element i has a greater Code than element j, so
// sorting yields descending Code order.
func (js Jobs) Less(i, j int) bool { return js[i].Code > js[j].Code }

// CopyAttribute stores into *dst the first applicable value:
//   - "" when value1 is the special marker "EMPTY" (forces a reset);
//   - value1 when it is non-empty;
//   - value2 when it is non-empty.
//
// When both values are empty, *dst is left untouched.
func CopyAttribute(dst *string, value1, value2 string) {
	switch {
	case value1 == "EMPTY": // special marker: force the attribute to be cleared
		*dst = ""
	case value1 != "":
		*dst = value1
	case value2 != "":
		*dst = value2
	}
}

// NewSpiderConfig builds a SpiderConfig from a generic map by round-tripping
// it through JSON, so keys are matched against the struct's json tags.
//
// The returned config is never nil when marshalling succeeds: a nil map
// yields an empty config. If unmarshalling fails, the (possibly partially
// populated) config is returned together with the error.
func NewSpiderConfig(cssmark map[string]interface{}) (*SpiderConfig, error) {
	raw, err := json.Marshal(cssmark)
	if err != nil {
		return nil, err
	}
	sc := &SpiderConfig{}
	// Decode into sc itself rather than &sc: with a **SpiderConfig target a
	// JSON null (what a nil map marshals to) would reset sc to nil.
	err = json.Unmarshal(raw, sc)
	return sc, err

	// Legacy manual mapping from the nested parameter layout, retained for
	// reference only:
	//
	//param_common := param["param_common"].(map[string]interface{})
	//css_list, _ := param["css_list"].(map[string]interface{})
	//css_content, _ := param["css_content"].(map[string]interface{})
	//js_list, _ := param["js_list"].(map[string]interface{})
	//js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
	//js_content, _ := param["js_content"].(map[string]interface{})
	//sc := &SpiderConfig{
	//	Site:               qu.ObjToString(param_common["site"]),
	//	Channel:            qu.ObjToString(param_common["channel"]),
	//	Modifyuser:         qu.ObjToString(param_common["modifyuser"]),
	//	Href:               qu.ObjToString(param_common["href"]),
	//	Code:               qu.ObjToString(param_common["code"]),
	//	ListBodyCss:        qu.ObjToString(css_list["body"]),
	//	ListItemCss:        qu.ObjToString(css_list["title"]),
	//	ListLinkCss:        qu.ObjToString(css_list["href"]),
	//	ListPubtimeCss:     qu.ObjToString(css_list["ptime"]),
	//	ListNextPageCss:    qu.ObjToString(css_list["nextpage"]),
	//	TitleCss:           qu.ObjToString(css_content["title"]),
	//	PublishUnitCss:     qu.ObjToString(css_content["source"]),
	//	PublishTimeCss:     qu.ObjToString(css_content["ptime"]),
	//	ContentCss:         qu.ObjToString(css_content["content"]),
	//	AttachCss:          qu.ObjToString(css_content["file"]),
	//	ListJSCode:         qu.ObjToString(js_list["js"]),
	//	ContentJSCode:      qu.ObjToString(js_content["js"]),
	//	ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
	//	//AttachJSCode : string `json:"attachJs"` // not in effect
	//}
}

// MergeSpiderConfig returns a new SpiderConfig whose string attributes are
// merged from src1 and src2 using CopyAttribute: a non-empty value in src2
// wins, the marker "EMPTY" in src2 clears the attribute, otherwise the value
// from src1 is used.
//
// Only the string attributes listed below are merged; InitList, MaxPages,
// FilterResource and the three delay fields are left at their zero values.
// Both arguments must be non-nil.
func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
	merged := new(SpiderConfig)

	// Identity.
	CopyAttribute(&merged.Code, src2.Code, src1.Code)
	CopyAttribute(&merged.Site, src2.Site, src1.Site)
	CopyAttribute(&merged.Channel, src2.Channel, src1.Channel)
	CopyAttribute(&merged.Href, src2.Href, src1.Href)
	CopyAttribute(&merged.ModifyUser, src2.ModifyUser, src1.ModifyUser)

	// List-page selectors.
	CopyAttribute(&merged.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
	CopyAttribute(&merged.ListItemCss, src2.ListItemCss, src1.ListItemCss)
	CopyAttribute(&merged.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
	CopyAttribute(&merged.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
	CopyAttribute(&merged.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)

	// Content-page selectors.
	CopyAttribute(&merged.TitleCss, src2.TitleCss, src1.TitleCss)
	CopyAttribute(&merged.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
	CopyAttribute(&merged.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
	CopyAttribute(&merged.ContentCss, src2.ContentCss, src1.ContentCss)
	CopyAttribute(&merged.AttachCss, src2.AttachCss, src1.AttachCss)

	// Scripts.
	CopyAttribute(&merged.ListJSCode, src2.ListJSCode, src1.ListJSCode)
	CopyAttribute(&merged.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
	CopyAttribute(&merged.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
	CopyAttribute(&merged.ListTurnPageJSCode, src2.ListTurnPageJSCode, src1.ListTurnPageJSCode)

	return merged
}

// CopySpiderConfig overwrites the string attributes of src1 with those of
// src2 (src1 is the destination, src2 the source).
//
// InitList, MaxPages, FilterResource and the three delay fields of src1 are
// not touched. Both arguments must be non-nil.
func CopySpiderConfig(src1, src2 *SpiderConfig) {
	// Identity.
	src1.Code = src2.Code
	src1.Site = src2.Site
	src1.ModifyUser = src2.ModifyUser
	src1.Channel = src2.Channel
	src1.Href = src2.Href

	// List-page selectors.
	src1.ListBodyCss = src2.ListBodyCss
	src1.ListItemCss = src2.ListItemCss
	src1.ListPubtimeCss = src2.ListPubtimeCss
	src1.ListNextPageCss = src2.ListNextPageCss
	src1.ListLinkCss = src2.ListLinkCss

	// Content-page selectors.
	src1.TitleCss = src2.TitleCss
	src1.PublishTimeCss = src2.PublishTimeCss
	src1.PublishUnitCss = src2.PublishUnitCss
	src1.ContentCss = src2.ContentCss
	src1.AttachCss = src2.AttachCss

	// Scripts.
	src1.ListJSCode = src2.ListJSCode
	src1.ListTurnPageJSCode = src2.ListTurnPageJSCode
	src1.ContentJSCode = src2.ContentJSCode
	src1.AttachJSCode = src2.AttachJSCode
}