types.go 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. package backend
  2. import (
  3. "container/list"
  4. "encoding/json"
  5. )
  // Kinds of job-running events, listed in numeric order.
  //
  // NOTE(review): presumably these are the values carried in
  // JobRunningEvent.Act; confirm against the code that emits events.
  const (
  	JOB_RUNNING_EVENT_DEBUG    = 0
  	JOB_RUNNING_EVENT_PROGRESS = 1
  )
  10. var (
  11. DataResults = map[string]*list.List{} //调试运行结果
  12. VerifyResults = map[string]*SpiderConfigVerifyResult{} //调试验证结果
  13. )
  // SpiderConfig is the configuration of a spider. Every field is serialized
  // under the JSON key given in its tag; note that ListPubtimeCss uses the key
  // "listPublishTimeCss" and the *JSCode fields use the shorter "...Js" keys.
  type SpiderConfig struct {
  	Site               string `json:"site"`
  	Channel            string `json:"channel"`
  	ModifyUser         string `json:"modifyuser"`
  	Href               string `json:"href"`
  	Code               string `json:"code"`
  	ListBodyCss        string `json:"listBodyCss"`
  	ListItemCss        string `json:"listItemCss"`
  	ListLinkCss        string `json:"listLinkCss"`
  	ListPubtimeCss     string `json:"listPublishTimeCss"`
  	ListNextPageCss    string `json:"listNextPageCss"`
  	TitleCss           string `json:"titleCss"`
  	PublishUnitCss     string `json:"publishUnitCss"`
  	PublishTimeCss     string `json:"publishTimeCss"`
  	ContentCss         string `json:"contentCss"`
  	AttachCss          string `json:"attachCss"`
  	ListJSCode         string `json:"listJs"`
  	ContentJSCode      string `json:"contentJs"`
  	AttachJSCode       string `json:"attachJs"` // Marked by the author as having no effect.
  	ListTurnPageJSCode string `json:"listTurnPageJs"`
  	MaxPages           int64  `json:"maxPages"`
  	// Delays. NOTE(review): the unit is not visible in this file.
  	ListDelayTime     int64 `json:"listDelayTime"`
  	ListTurnDelayTime int64 `json:"listTurnDelayTime"`
  	ContentDelayTime  int64 `json:"contentDelayTime"`
  }

  // AttachLink describes one attachment link.
  type AttachLink struct {
  	Title    string `json:"title"`
  	Href     string `json:"href"`
  	FileName string `json:"fileName"`
  	FileType string `json:"fileType"`
  	FileSize string `json:"fileSize"`
  	FilePath string `json:"filePath"`
  }

  // ResultItem is one crawl result.
  //
  // ListPubTime is serialized as "listPubishTime" (sic). The misspelling is part
  // of the wire format, so it must not be corrected without migrating every
  // consumer of the JSON.
  type ResultItem struct {
  	No          int           `json:"no"` // Sequence number.
  	Site        string        `json:"site"`
  	Channel     string        `json:"channel"`
  	Href        string        `json:"href"`
  	ListTitle   string        `json:"listTitle"`
  	ListPubTime string        `json:"listPubishTime"`
  	Title       string        `json:"title"`
  	PublishUnit string        `json:"publishUnit"`
  	PublishTime string        `json:"publishTime"`
  	Content     string        `json:"content"`
  	ContentHtml string        `json:"contentHtml"`
  	AttachLinks []*AttachLink `json:"attachLinks"` // Titles and links of the attachments.
  	AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments.
  }

  // ResultItems is a list of crawl results.
  type ResultItems []*ResultItem

  // SpiderConfiges is a list of spider configurations.
  type SpiderConfiges []*SpiderConfig

  // JobItem is one entry of a Job.
  type JobItem struct {
  	// Code and site; they match the spider configuration.
  	SpiderCode string `json:"code"`
  	SpiderSite string `json:"site"`
  	// Run parameters. When one is not set, the job's common configuration is
  	// used by default.
  	Channel              string `json:"channel"`
  	Url                  string `json:"url"`
  	ProxyServe           string `json:"proxyServe"`
  	MaxPages             int    `json:"maxPages"`
  	Threads              int    `json:"threads"`
  	ListDelay            int64  `json:"listDelay"`
  	TrunPageDelay        int64  `json:"trunPageDelay"`
  	ContentDelay         int64  `json:"contentDelay"`
  	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  }

  // Job is a job definition: a named list of items plus the parameters they
  // share.
  type Job struct {
  	Code  string     `json:"code"`
  	Name  string     `json:"name"`
  	Items []*JobItem `json:"items"`
  	// Common parameters.
  	ProxyServe           string `json:"proxyServe"`
  	MaxPages             int    `json:"maxPages"`
  	Threads              int    `json:"threads"`
  	ListDelay            int64  `json:"listDelay"`
  	TrunPageDelay        int64  `json:"trunPageDelay"`
  	ContentDelay         int64  `json:"contentDelay"`
  	State                int    `json:"state"`
  	StateType            string `json:"stateType"`
  	Progress             int    `json:"progress"`
  	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  }

  // Jobs is a list of job definitions.
  type Jobs []*Job

  // JobRunningEvent is a pushed event.
  //
  // NOTE(review): Act presumably holds one of the JOB_RUNNING_EVENT_* constants;
  // confirm against the code that emits events.
  type JobRunningEvent struct {
  	Progress int    `json:"progress"`
  	Msg      string `json:"msg"`
  	Act      int    `json:"act"`
  	Code     string `json:"code"`
  }

  // JobRunningState is the runtime state of a job together with its
  // intermediate results. It carries no JSON tags.
  type JobRunningState struct {
  	Code        string
  	State       int
  	Progress    int
  	ResultCache *list.List // Cache.
  	ExitCh      chan bool
  }

  // SpiderConfigVerifyResult is the outcome of verifying a spider
  // configuration, one boolean per checked item.
  type SpiderConfigVerifyResult struct {
  	Title        bool `json:"title"`
  	PublishUnit  bool `json:"publishUnit"`
  	PublishTime  bool `json:"publishTime"`
  	Content      bool `json:"content"`
  	Attaches     bool `json:"attaches"`
  	ListItems    bool `json:"listItems"`
  	ListTrunPage bool `json:"listTrunPage"`
  }
  130. func (sc SpiderConfiges) Len() int {
  131. return len(sc)
  132. }
  133. func (sc SpiderConfiges) Swap(i, j int) {
  134. sc[i], sc[j] = sc[j], sc[i]
  135. }
  136. func (sc SpiderConfiges) Less(i, j int) bool {
  137. return sc[i].Code > sc[j].Code
  138. }
  139. func (js Jobs) Len() int {
  140. return len(js)
  141. }
  142. func (js Jobs) Swap(i, j int) {
  143. js[i], js[j] = js[j], js[i]
  144. }
  145. func (js Jobs) Less(i, j int) bool {
  146. return js[i].Code > js[j].Code
  147. }
  // CopyAttribute stores into *dst the preferred of two candidate values.
  //
  // The first matching rule wins:
  //   - value1 is the sentinel "EMPTY": *dst is cleared;
  //   - value1 is non-empty: *dst becomes value1;
  //   - value2 is non-empty: *dst becomes value2;
  //   - otherwise *dst is left as it was.
  //
  // The sentinel is only recognized in value1; an "EMPTY" passed as value2 is
  // copied verbatim. dst is dereferenced whenever a value is stored, so it must
  // be non-nil in those cases.
  func CopyAttribute(dst *string, value1, value2 string) {
  	switch {
  	case value1 == "EMPTY": // Sentinel: force the attribute to be blank.
  		*dst = ""
  	case value1 != "":
  		*dst = value1
  	case value2 != "":
  		*dst = value2
  	}
  }
  158. // NewSpiderConfig 生成css对象
  159. func NewSpiderConfig(cssmark map[string]interface{}) (*SpiderConfig, error) {
  160. sc := &SpiderConfig{}
  161. cssmark_byte, err := json.Marshal(cssmark)
  162. if err != nil {
  163. return nil, err
  164. }
  165. err = json.Unmarshal(cssmark_byte, &sc)
  166. return sc, err
  167. //param_common := param["param_common"].(map[string]interface{})
  168. //css_list, _ := param["css_list"].(map[string]interface{})
  169. //css_content, _ := param["css_content"].(map[string]interface{})
  170. //js_list, _ := param["js_list"].(map[string]interface{})
  171. //js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
  172. //js_content, _ := param["js_content"].(map[string]interface{})
  173. //sc := &SpiderConfig{
  174. // Site: qu.ObjToString(param_common["site"]),
  175. // Channel: qu.ObjToString(param_common["channel"]),
  176. // Modifyuser: qu.ObjToString(param_common["modifyuser"]),
  177. // Href: qu.ObjToString(param_common["href"]),
  178. // Code: qu.ObjToString(param_common["code"]),
  179. // ListBodyCss: qu.ObjToString(css_list["body"]),
  180. // ListItemCss: qu.ObjToString(css_list["title"]),
  181. // ListLinkCss: qu.ObjToString(css_list["href"]),
  182. // ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
  183. // ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
  184. // TitleCss: qu.ObjToString(css_content["title"]),
  185. // PublishUnitCss: qu.ObjToString(css_content["source"]),
  186. // PublishTimeCss: qu.ObjToString(css_content["ptime"]),
  187. // ContentCss: qu.ObjToString(css_content["content"]),
  188. // AttachCss: qu.ObjToString(css_content["file"]),
  189. // ListJSCode: qu.ObjToString(js_list["js"]),
  190. // ContentJSCode: qu.ObjToString(js_content["js"]),
  191. // ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
  192. // //AttachJSCode : string `json:"attachJs"` //无效
  193. //}
  194. }
  195. // MergeSpiderConfig 合并
  196. func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
  197. nsc := new(SpiderConfig)
  198. CopyAttribute(&nsc.Code, src2.Code, src1.Code)
  199. CopyAttribute(&nsc.Site, src2.Site, src1.Site)
  200. CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
  201. CopyAttribute(&nsc.Href, src2.Href, src1.Href)
  202. CopyAttribute(&nsc.ModifyUser, src2.ModifyUser, src1.ModifyUser)
  203. CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
  204. CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
  205. CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
  206. CopyAttribute(&nsc.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
  207. CopyAttribute(&nsc.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)
  208. CopyAttribute(&nsc.TitleCss, src2.TitleCss, src1.TitleCss)
  209. CopyAttribute(&nsc.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
  210. CopyAttribute(&nsc.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
  211. CopyAttribute(&nsc.ContentCss, src2.ContentCss, src1.ContentCss)
  212. CopyAttribute(&nsc.AttachCss, src2.AttachCss, src1.AttachCss)
  213. CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
  214. CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
  215. CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
  216. CopyAttribute(&nsc.ListTurnPageJSCode, src2.ListTurnPageJSCode, src1.ListTurnPageJSCode)
  217. return nsc
  218. }
  219. // CopySpiderConfig 复制
  220. func CopySpiderConfig(src1, src2 *SpiderConfig) {
  221. src1.Code = src2.Code
  222. src1.Site = src2.Site
  223. src1.ModifyUser = src2.ModifyUser
  224. src1.Channel = src2.Channel
  225. src1.Href = src2.Href
  226. src1.ListBodyCss = src2.ListBodyCss
  227. src1.ListItemCss = src2.ListItemCss
  228. src1.ListPubtimeCss = src2.ListPubtimeCss
  229. src1.ListNextPageCss = src2.ListNextPageCss
  230. src1.ListLinkCss = src2.ListLinkCss
  231. src1.TitleCss = src2.TitleCss
  232. src1.PublishTimeCss = src2.PublishTimeCss
  233. src1.PublishUnitCss = src2.PublishUnitCss
  234. src1.ContentCss = src2.ContentCss
  235. src1.AttachCss = src2.AttachCss
  236. src1.ListJSCode = src2.ListJSCode
  237. src1.ListTurnPageJSCode = src2.ListTurnPageJSCode
  238. src1.ContentJSCode = src2.ContentJSCode
  239. src1.AttachJSCode = src2.AttachJSCode
  240. }