123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254 |
- package backend
- import (
- "container/list"
- "encoding/json"
- "regexp"
- )
// Kinds of job-running event, listed in ascending numeric order.
// NOTE(review): presumably these are the values carried in
// JobRunningEvent.Act — confirm against the callers.
const (
	JOB_RUNNING_EVENT_DEBUG    = 0
	JOB_RUNNING_EVENT_PROGRESS = 1
)
- var (
- DataResults = map[string]*list.List{} //调试运行结果
- VerifyResults = map[string]*SpiderConfigVerifyResult{} //调试验证结果
- RegSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
- )
// SpiderConfig is the configuration of a single spider.
type SpiderConfig struct {
	Site               string `json:"site"`
	Channel            string `json:"channel"`
	ModifyUser         string `json:"modifyuser"`
	Href               string `json:"href"`
	Code               string `json:"code"`
	ListBodyCss        string `json:"listBodyCss"` // used to decide whether turning the page succeeded
	ListItemCss        string `json:"listItemCss"`
	ListLinkCss        string `json:"listLinkCss"`
	ListPubtimeCss     string `json:"listPublishTimeCss"`
	ListNextPageCss    string `json:"listNextPageCss"`
	TitleCss           string `json:"titleCss"`
	PublishUnitCss     string `json:"publishUnitCss"`
	PublishTimeCss     string `json:"publishTimeCss"`
	ContentCss         string `json:"contentCss"`
	AttachCss          string `json:"attachCss"`
	ListJSCode         string `json:"listJs"`
	ContentJSCode      string `json:"contentJs"`
	AttachJSCode       string `json:"attachJs"` // not in effect
	ListTurnPageJSCode string `json:"listTurnPageJs"`
	MaxPages           int64  `json:"maxPages"`

	// Delays.
	ListDelayTime     int64 `json:"listDelayTime"`
	ListTurnDelayTime int64 `json:"listTurnDelayTime"`
	ContentDelayTime  int64 `json:"contentDelayTime"`
}

// SpiderConfiges is a list of spider configurations.
type SpiderConfiges []*SpiderConfig

// AttachLink describes a link to an attachment.
type AttachLink struct {
	Title    string `json:"title"`
	Href     string `json:"href"`
	FileName string `json:"fileName"`
	FileType string `json:"fileType"`
	FileSize string `json:"fileSize"`
	FilePath string `json:"filePath"`
}

// ResultItem is the information crawled for a single entry.
type ResultItem struct {
	No          int           `json:"no"` // sequence number
	Site        string        `json:"site"`
	Channel     string        `json:"channel"`
	Href        string        `json:"href"`
	ListTitle   string        `json:"listTitle"`
	ListPubTime string        `json:"listPublishTime"`
	Title       string        `json:"title"`
	PublishUnit string        `json:"publishUnit"`
	PublishTime string        `json:"publishTime"`
	Content     string        `json:"content"`
	ContentHtml string        `json:"contentHtml"`
	AttachLinks []*AttachLink `json:"attachLinks"` // titles and links of the attachments
	AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments
}

// ResultItems is a list of crawl results.
type ResultItems []*ResultItem

// JobItem is one entry of a Job.
type JobItem struct {
	// Code and site match the spider configuration.
	SpiderCode string `json:"code"`
	SpiderSite string `json:"site"`

	// Run parameters; when one is not configured, the job's common
	// configuration is used by default.
	Channel              string `json:"channel"`
	Url                  string `json:"url"`
	ProxyServe           string `json:"proxyServe"`
	MaxPages             int    `json:"maxPages"`
	Threads              int    `json:"threads"`
	ListDelay            int64  `json:"listDelay"`
	TrunPageDelay        int64  `json:"trunPageDelay"`
	ContentDelay         int64  `json:"contentDelay"`
	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
}

// Job is the definition of a job.
type Job struct {
	Code  string     `json:"code"`
	Name  string     `json:"name"`
	Items []*JobItem `json:"items"`

	// Common parameters.
	ProxyServe           string `json:"proxyServe"`
	MaxPages             int    `json:"maxPages"`
	Threads              int    `json:"threads"`
	ListDelay            int64  `json:"listDelay"`
	TrunPageDelay        int64  `json:"trunPageDelay"`
	ContentDelay         int64  `json:"contentDelay"`
	State                int    `json:"state"`
	StateType            string `json:"stateType"`
	Progress             int    `json:"progress"`
	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
}

// Jobs is a list of jobs.
type Jobs []*Job

// JobRunningEvent is an event that gets pushed.
type JobRunningEvent struct {
	Progress int    `json:"progress"`
	Msg      string `json:"msg"`
	Act      int    `json:"act"`
	Code     string `json:"code"`
}

// JobRunningState is the runtime state and intermediate results of a job.
type JobRunningState struct {
	Code        string
	State       int
	Progress    int
	ResultCache *list.List // cache
	ExitCh      chan bool
}

// SpiderConfigVerifyResult is the result of verifying a spider configuration.
type SpiderConfigVerifyResult struct {
	Title        bool `json:"title"`
	PublishUnit  bool `json:"publishUnit"`
	PublishTime  bool `json:"publishTime"`
	Content      bool `json:"content"`
	Attaches     bool `json:"attaches"`
	ListItems    bool `json:"listItems"`
	ListTrunPage bool `json:"listTrunPage"`
}
- func (sc SpiderConfiges) Len() int {
- return len(sc)
- }
- func (sc SpiderConfiges) Swap(i, j int) {
- sc[i], sc[j] = sc[j], sc[i]
- }
- func (sc SpiderConfiges) Less(i, j int) bool {
- return sc[i].Code > sc[j].Code
- }
- func (js Jobs) Len() int {
- return len(js)
- }
- func (js Jobs) Swap(i, j int) {
- js[i], js[j] = js[j], js[i]
- }
- func (js Jobs) Less(i, j int) bool {
- return js[i].Code > js[j].Code
- }
// CopyAttribute stores into *dst the first applicable of the two candidates:
//
//   - value1 equal to the special marker "EMPTY" forces *dst to "";
//   - otherwise a non-empty value1 is stored;
//   - otherwise a non-empty value2 is stored;
//   - when both values are empty, *dst is left untouched.
func CopyAttribute(dst *string, value1, value2 string) {
	switch {
	case value1 == "EMPTY": // special marker: force the attribute to be cleared
		*dst = ""
	case value1 != "":
		*dst = value1
	case value2 != "":
		*dst = value2
	}
}
- // NewSpiderConfig 生成css对象
- func NewSpiderConfig(cssmark map[string]interface{}) (*SpiderConfig, error) {
- sc := &SpiderConfig{}
- cssmark_byte, err := json.Marshal(cssmark)
- if err != nil {
- return nil, err
- }
- err = json.Unmarshal(cssmark_byte, &sc)
- return sc, err
- //param_common := param["param_common"].(map[string]interface{})
- //css_list, _ := param["css_list"].(map[string]interface{})
- //css_content, _ := param["css_content"].(map[string]interface{})
- //js_list, _ := param["js_list"].(map[string]interface{})
- //js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
- //js_content, _ := param["js_content"].(map[string]interface{})
- //sc := &SpiderConfig{
- // Site: qu.ObjToString(param_common["site"]),
- // Channel: qu.ObjToString(param_common["channel"]),
- // Modifyuser: qu.ObjToString(param_common["modifyuser"]),
- // Href: qu.ObjToString(param_common["href"]),
- // Code: qu.ObjToString(param_common["code"]),
- // ListBodyCss: qu.ObjToString(css_list["body"]),
- // ListItemCss: qu.ObjToString(css_list["title"]),
- // ListLinkCss: qu.ObjToString(css_list["href"]),
- // ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
- // ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
- // TitleCss: qu.ObjToString(css_content["title"]),
- // PublishUnitCss: qu.ObjToString(css_content["source"]),
- // PublishTimeCss: qu.ObjToString(css_content["ptime"]),
- // ContentCss: qu.ObjToString(css_content["content"]),
- // AttachCss: qu.ObjToString(css_content["file"]),
- // ListJSCode: qu.ObjToString(js_list["js"]),
- // ContentJSCode: qu.ObjToString(js_content["js"]),
- // ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
- // //AttachJSCode : string `json:"attachJs"` //无效
- //}
- }
- // MergeSpiderConfig 合并
- func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
- nsc := new(SpiderConfig)
- CopyAttribute(&nsc.Code, src2.Code, src1.Code)
- CopyAttribute(&nsc.Site, src2.Site, src1.Site)
- CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
- CopyAttribute(&nsc.Href, src2.Href, src1.Href)
- CopyAttribute(&nsc.ModifyUser, src2.ModifyUser, src1.ModifyUser)
- CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
- CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
- CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
- CopyAttribute(&nsc.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
- CopyAttribute(&nsc.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)
- CopyAttribute(&nsc.TitleCss, src2.TitleCss, src1.TitleCss)
- CopyAttribute(&nsc.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
- CopyAttribute(&nsc.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
- CopyAttribute(&nsc.ContentCss, src2.ContentCss, src1.ContentCss)
- CopyAttribute(&nsc.AttachCss, src2.AttachCss, src1.AttachCss)
- CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
- CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
- CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
- CopyAttribute(&nsc.ListTurnPageJSCode, src2.ListTurnPageJSCode, src1.ListTurnPageJSCode)
- return nsc
- }
- // CopySpiderConfig 复制
- func CopySpiderConfig(src1, src2 *SpiderConfig) {
- src1.Code = src2.Code
- src1.Site = src2.Site
- src1.ModifyUser = src2.ModifyUser
- src1.Channel = src2.Channel
- src1.Href = src2.Href
- src1.ListBodyCss = src2.ListBodyCss
- src1.ListItemCss = src2.ListItemCss
- src1.ListPubtimeCss = src2.ListPubtimeCss
- src1.ListNextPageCss = src2.ListNextPageCss
- src1.ListLinkCss = src2.ListLinkCss
- src1.TitleCss = src2.TitleCss
- src1.PublishTimeCss = src2.PublishTimeCss
- src1.PublishUnitCss = src2.PublishUnitCss
- src1.ContentCss = src2.ContentCss
- src1.AttachCss = src2.AttachCss
- src1.ListJSCode = src2.ListJSCode
- src1.ListTurnPageJSCode = src2.ListTurnPageJSCode
- src1.ContentJSCode = src2.ContentJSCode
- src1.AttachJSCode = src2.AttachJSCode
- }
|