types.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. package backend
  2. import (
  3. "container/list"
  4. )
  // Kinds of job-running events. Both are untyped integer constants; the
  // underscore-style names are kept as-is because they are exported and renaming
  // them would break existing callers.
  // NOTE(review): presumably these are the values carried in
  // JobRunningEvent.Act — confirm against the code that emits events.
  const (
  	// JOB_RUNNING_EVENT_PROGRESS is the event kind with value 1.
  	JOB_RUNNING_EVENT_PROGRESS = 1
  	// JOB_RUNNING_EVENT_DEBUG is the event kind with value 0.
  	JOB_RUNNING_EVENT_DEBUG = 0
  )
  // Data types shared by the spider (crawler) backend: spider configuration,
  // crawl results, job definitions, pushed events and per-job runtime state.
  //
  // The JSON tags are the wire format and are reproduced exactly, including the
  // spellings "listPubishTime", "listTrunPageJs" and "trunPageDelay".
  type (
  	// SpiderConfig is the configuration of one spider.
  	// NOTE(review): the *Css fields presumably hold CSS selectors and the
  	// *JSCode fields JavaScript snippets — confirm against the code that
  	// consumes them.
  	SpiderConfig struct {
  		Site               string `json:"site"`
  		Channel            string `json:"channel"`
  		Author             string `json:"author"`
  		Url                string `json:"url"`
  		Code               string `json:"code"`
  		ListBodyCss        string `json:"listBodyCss"`
  		ListItemCss        string `json:"listItemCss"`
  		ListLinkCss        string `json:"listLinkCss"`
  		ListPubtimeCss     string `json:"listPublishTimeCss"`
  		ListNextPageCss    string `json:"listNextPageCss"`
  		TitleCss           string `json:"titleCss"`
  		PublishUnitCss     string `json:"publishUnitCss"`
  		PublishTimeCss     string `json:"publishTimeCss"`
  		ContentCss         string `json:"contentCss"`
  		AttachCss          string `json:"attachCss"`
  		ListJSCode         string `json:"listJs"`
  		ContentJSCode      string `json:"contentJs"`
  		AttachJSCode       string `json:"attachJs"` // Marked by the original author as not in effect.
  		ListTrunPageJSCode string `json:"listTrunPageJs"`
  	}

  	// AttachLink describes one attachment link.
  	AttachLink struct {
  		Title    string `json:"title"`
  		Href     string `json:"href"`
  		FileName string `json:"fileName"`
  		FileType string `json:"fileType"`
  		FileSize string `json:"fileSize"`
  		FilePath string `json:"filePath"`
  	}

  	// ResultItem is a single crawl result.
  	ResultItem struct {
  		No          int           `json:"no"` // Sequence number.
  		Site        string        `json:"site"`
  		Channel     string        `json:"channel"`
  		Href        string        `json:"href"`
  		ListTitle   string        `json:"listTitle"`
  		ListPubTime string        `json:"listPubishTime"`
  		Title       string        `json:"title"`
  		PublishUnit string        `json:"publishUnit"`
  		PublishTime string        `json:"publishTime"`
  		Content     string        `json:"content"`
  		ContentHtml string        `json:"contentHtml"`
  		AttachLinks []*AttachLink `json:"attachLinks"` // Titles and links of the attachments.
  		AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments.
  	}

  	// ResultItems is a list of crawl results.
  	ResultItems []*ResultItem

  	// SpiderConfiges is a list of spider configurations.
  	SpiderConfiges []*SpiderConfig

  	// JobItem is one entry of a Job.
  	JobItem struct {
  		// SpiderCode and SpiderSite match the code/site of the spider configuration.
  		SpiderCode string `json:"code"`
  		SpiderSite string `json:"site"`
  		// Run parameters; when not configured, the job's common settings are
  		// used by default.
  		Channel              string `json:"channel"`
  		Url                  string `json:"url"`
  		ProxyServe           string `json:"proxyServe"`
  		MaxPages             int    `json:"maxPages"`
  		Threads              int    `json:"threads"`
  		ListDelay            int64  `json:"listDelay"`
  		TrunPageDelay        int64  `json:"trunPageDelay"`
  		ContentDelay         int64  `json:"contentDelay"`
  		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  	}

  	// Job is a job definition.
  	Job struct {
  		Code  string     `json:"code"`
  		Name  string     `json:"name"`
  		Items []*JobItem `json:"items"`
  		// Common parameters.
  		ProxyServe           string `json:"proxyServe"`
  		MaxPages             int    `json:"maxPages"`
  		Threads              int    `json:"threads"`
  		ListDelay            int64  `json:"listDelay"`
  		TrunPageDelay        int64  `json:"trunPageDelay"`
  		ContentDelay         int64  `json:"contentDelay"`
  		State                int    `json:"state"`
  		StateType            string `json:"stateType"`
  		Progress             int    `json:"progress"`
  		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  	}

  	// Jobs is a list of job definitions.
  	Jobs []*Job

  	// JobRunningEvent is a pushed event about a running job.
  	JobRunningEvent struct {
  		Progress int    `json:"progress"`
  		Msg      string `json:"msg"`
  		Act      int    `json:"act"`
  		Code     string `json:"code"`
  	}

  	// JobRunningState holds the runtime state and intermediate results of a job.
  	JobRunningState struct {
  		Code        string
  		State       int
  		Progress    int
  		ResultCache *list.List // Cache.
  		ExitCh      chan bool  // NOTE(review): presumably signals the job to exit — confirm.
  	}
  )
  110. func (sc SpiderConfiges) Len() int {
  111. return len(sc)
  112. }
  113. func (sc SpiderConfiges) Swap(i, j int) {
  114. sc[i], sc[j] = sc[j], sc[i]
  115. }
  116. func (sc SpiderConfiges) Less(i, j int) bool {
  117. return sc[i].Code > sc[j].Code
  118. }
  119. func (js Jobs) Len() int {
  120. return len(js)
  121. }
  122. func (js Jobs) Swap(i, j int) {
  123. js[i], js[j] = js[j], js[i]
  124. }
  125. func (js Jobs) Less(i, j int) bool {
  126. return js[i].Code > js[j].Code
  127. }
  // CopyAttribute stores into *dst the first applicable of the two candidate
  // values:
  //
  //   - value1 == "EMPTY": *dst is forced to the empty string;
  //   - value1 != "":      *dst becomes value1;
  //   - value2 != "":      *dst becomes value2;
  //   - otherwise:         *dst is left unchanged.
  //
  // dst must be non-nil.
  func CopyAttribute(dst *string, value1, value2 string) {
  	// forceEmpty is the special marker that clears the attribute even though
  	// a fallback value may be available.
  	const forceEmpty = "EMPTY"

  	switch {
  	case value1 == forceEmpty:
  		*dst = ""
  	case value1 != "":
  		*dst = value1
  	case value2 != "":
  		*dst = value2
  	}
  }
  138. // MergeSpiderConfig 合并
  139. func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
  140. nsc := new(SpiderConfig)
  141. CopyAttribute(&nsc.Code, src2.Code, src1.Code)
  142. CopyAttribute(&nsc.Site, src2.Site, src1.Site)
  143. CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
  144. CopyAttribute(&nsc.Url, src2.Url, src1.Url)
  145. CopyAttribute(&nsc.Author, src2.Author, src1.Author)
  146. CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
  147. CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
  148. CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
  149. CopyAttribute(&nsc.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
  150. CopyAttribute(&nsc.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)
  151. CopyAttribute(&nsc.TitleCss, src2.TitleCss, src1.TitleCss)
  152. CopyAttribute(&nsc.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
  153. CopyAttribute(&nsc.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
  154. CopyAttribute(&nsc.ContentCss, src2.ContentCss, src1.ContentCss)
  155. CopyAttribute(&nsc.AttachCss, src2.AttachCss, src1.AttachCss)
  156. CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
  157. CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
  158. CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
  159. CopyAttribute(&nsc.ListTrunPageJSCode, src2.ListTrunPageJSCode, src1.ListTrunPageJSCode)
  160. return nsc
  161. }
  162. // CopySpiderConfig 复制
  163. func CopySpiderConfig(src1, src2 *SpiderConfig) {
  164. src1.Code = src2.Code
  165. src1.Site = src2.Site
  166. src1.Author = src2.Author
  167. src1.Channel = src2.Channel
  168. src1.Url = src2.Url
  169. src1.ListBodyCss = src2.ListBodyCss
  170. src1.ListItemCss = src2.ListItemCss
  171. src1.ListPubtimeCss = src2.ListPubtimeCss
  172. src1.ListNextPageCss = src2.ListNextPageCss
  173. src1.ListLinkCss = src2.ListLinkCss
  174. src1.TitleCss = src2.TitleCss
  175. src1.PublishTimeCss = src2.PublishTimeCss
  176. src1.PublishUnitCss = src2.PublishUnitCss
  177. src1.ContentCss = src2.ContentCss
  178. src1.AttachCss = src2.AttachCss
  179. src1.ListJSCode = src2.ListJSCode
  180. src1.ListTrunPageJSCode = src2.ListTrunPageJSCode
  181. src1.ContentJSCode = src2.ContentJSCode
  182. src1.AttachJSCode = src2.AttachJSCode
  183. }