types.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. package backend
  2. import (
  3. "container/list"
  4. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  5. )
  // Kinds of job-running events.
  // NOTE(review): presumably these are the values carried in JobRunningEvent.Act;
  // confirm against the code that emits the events.
  const (
  	// JOB_RUNNING_EVENT_DEBUG is the event kind with value 0 (by its name, a debug message).
  	JOB_RUNNING_EVENT_DEBUG = 0
  	// JOB_RUNNING_EVENT_PROGRESS is the event kind with value 1 (by its name, a progress update).
  	JOB_RUNNING_EVENT_PROGRESS = 1
  )
  // SpiderConfig holds the configuration of one spider (crawler): identifying
  // metadata, the CSS selectors used on list and content pages, and optional
  // JavaScript snippets.
  type SpiderConfig struct {
  	Site               string `json:"site"`
  	Channel            string `json:"channel"`
  	Author             string `json:"author"`
  	Url                string `json:"url"`
  	Code               string `json:"code"`
  	ListBodyCss        string `json:"listBodyCss"`
  	ListItemCss        string `json:"listItemCss"`
  	ListLinkCss        string `json:"listLinkCss"`
  	ListPubtimeCss     string `json:"listPublishTimeCss"`
  	ListNextPageCss    string `json:"listNextPageCss"`
  	TitleCss           string `json:"titleCss"`
  	PublishUnitCss     string `json:"publishUnitCss"`
  	PublishTimeCss     string `json:"publishTimeCss"`
  	ContentCss         string `json:"contentCss"`
  	AttachCss          string `json:"attachCss"`
  	ListJSCode         string `json:"listJs"`
  	ContentJSCode      string `json:"contentJs"`
  	AttachJSCode       string `json:"attachJs"` // marked as not in effect
  	ListTrunPageJSCode string `json:"listTrunPageJs"`
  }

  // AttachLink describes a single attachment link.
  type AttachLink struct {
  	Title    string `json:"title"`
  	Href     string `json:"href"`
  	FileName string `json:"fileName"`
  	FileType string `json:"fileType"`
  	FileSize string `json:"fileSize"`
  	FilePath string `json:"filePath"`
  }

  // ResultItem is one crawl result.
  type ResultItem struct {
  	No          int           `json:"no"` // sequence number
  	Site        string        `json:"site"`
  	Channel     string        `json:"channel"`
  	Href        string        `json:"href"`
  	ListTitle   string        `json:"listTitle"`
  	ListPubTime string        `json:"listPubishTime"`
  	Title       string        `json:"title"`
  	PublishUnit string        `json:"publishUnit"`
  	PublishTime string        `json:"publishTime"`
  	Content     string        `json:"content"`
  	ContentHtml string        `json:"contentHtml"`
  	AttachLinks []*AttachLink `json:"attachLinks"` // titles and links of the attachments
  	AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments
  }

  // ResultItems is a list of crawl results.
  type ResultItems []*ResultItem

  // SpiderConfiges is a list of spider configurations.
  type SpiderConfiges []*SpiderConfig

  // JobItem is one entry of a Job.
  type JobItem struct {
  	// SpiderCode and SpiderSite match the code/site of the spider configuration.
  	SpiderCode string `json:"code"`
  	SpiderSite string `json:"site"`
  	// The fields below are run parameters; when they are not configured,
  	// the job's common settings are used by default.
  	Channel              string `json:"channel"`
  	Url                  string `json:"url"`
  	ProxyServe           string `json:"proxyServe"`
  	MaxPages             int    `json:"maxPages"`
  	Threads              int    `json:"threads"`
  	ListDelay            int64  `json:"listDelay"`
  	TrunPageDelay        int64  `json:"trunPageDelay"`
  	ContentDelay         int64  `json:"contentDelay"`
  	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  }

  // Job is the definition of a crawl job.
  type Job struct {
  	Code  string     `json:"code"`
  	Name  string     `json:"name"`
  	Items []*JobItem `json:"items"`
  	// Common parameters.
  	ProxyServe           string `json:"proxyServe"`
  	MaxPages             int    `json:"maxPages"`
  	Threads              int    `json:"threads"`
  	ListDelay            int64  `json:"listDelay"`
  	TrunPageDelay        int64  `json:"trunPageDelay"`
  	ContentDelay         int64  `json:"contentDelay"`
  	State                int    `json:"state"`
  	StateType            string `json:"stateType"`
  	Progress             int    `json:"progress"`
  	NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  }

  // Jobs is a list of jobs.
  type Jobs []*Job

  // JobRunningEvent is an event that is pushed while a job runs.
  type JobRunningEvent struct {
  	Progress int    `json:"progress"`
  	Msg      string `json:"msg"`
  	Act      int    `json:"act"`
  	Code     string `json:"code"`
  }

  // JobRunningState is the runtime state of a job together with its
  // intermediate results.
  type JobRunningState struct {
  	Code        string
  	State       int
  	Progress    int
  	ResultCache *list.List // cache
  	ExitCh      chan bool
  }

  // SpiderConfigVerifyResult is the outcome of verifying a spider
  // configuration, one flag per checked part.
  type SpiderConfigVerifyResult struct {
  	Title        bool `json:"title"`
  	PublishUnit  bool `json:"publishUnit"`
  	PublishTime  bool `json:"publishTime"`
  	Content      bool `json:"content"`
  	Attaches     bool `json:"attaches"`
  	ListItems    bool `json:"listItems"`
  	ListTrunPage bool `json:"listTrunPage"`
  }
  121. func (sc SpiderConfiges) Len() int {
  122. return len(sc)
  123. }
  124. func (sc SpiderConfiges) Swap(i, j int) {
  125. sc[i], sc[j] = sc[j], sc[i]
  126. }
  127. func (sc SpiderConfiges) Less(i, j int) bool {
  128. return sc[i].Code > sc[j].Code
  129. }
  130. func (js Jobs) Len() int {
  131. return len(js)
  132. }
  133. func (js Jobs) Swap(i, j int) {
  134. js[i], js[j] = js[j], js[i]
  135. }
  136. func (js Jobs) Less(i, j int) bool {
  137. return js[i].Code > js[j].Code
  138. }
  // CopyAttribute stores into *dst the first applicable of its two candidates.
  //
  //   - value1 == "EMPTY": *dst is forced to the empty string ("EMPTY" is a
  //     special marker meaning "clear this attribute").
  //   - value1 != "": *dst becomes value1.
  //   - value2 != "": *dst becomes value2.
  //   - otherwise *dst is left untouched.
  //
  // dst must not be nil.
  func CopyAttribute(dst *string, value1, value2 string) {
  	switch {
  	case value1 == "EMPTY":
  		*dst = ""
  	case value1 != "":
  		*dst = value1
  	case value2 != "":
  		*dst = value2
  	}
  }
  149. // NewSpiderConfig 生成css对象
  150. func NewSpiderConfig(param map[string]interface{}) *SpiderConfig {
  151. param_common := param["param_common"].(map[string]interface{})
  152. css_list, _ := param["css_list"].(map[string]interface{})
  153. css_content, _ := param["css_content"].(map[string]interface{})
  154. js_list, _ := param["js_list"].(map[string]interface{})
  155. js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
  156. js_content, _ := param["js_content"].(map[string]interface{})
  157. sc := &SpiderConfig{
  158. Site: qu.ObjToString(param_common["site"]),
  159. Channel: qu.ObjToString(param_common["channel"]),
  160. Author: qu.ObjToString(param_common["modifyuser"]),
  161. Url: qu.ObjToString(param_common["href"]),
  162. Code: qu.ObjToString(param_common["code"]),
  163. ListBodyCss: qu.ObjToString(css_list["body"]),
  164. ListItemCss: qu.ObjToString(css_list["title"]),
  165. ListLinkCss: qu.ObjToString(css_list["href"]),
  166. ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
  167. ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
  168. TitleCss: qu.ObjToString(css_content["title"]),
  169. PublishUnitCss: qu.ObjToString(css_content["source"]),
  170. PublishTimeCss: qu.ObjToString(css_content["ptime"]),
  171. ContentCss: qu.ObjToString(css_content["content"]),
  172. AttachCss: qu.ObjToString(css_content["file"]),
  173. ListJSCode: qu.ObjToString(js_list["js"]),
  174. ContentJSCode: qu.ObjToString(js_content["js"]),
  175. ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
  176. //AttachJSCode : string `json:"attachJs"` //无效
  177. }
  178. return sc
  179. }
  180. // MergeSpiderConfig 合并
  181. func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
  182. nsc := new(SpiderConfig)
  183. CopyAttribute(&nsc.Code, src2.Code, src1.Code)
  184. CopyAttribute(&nsc.Site, src2.Site, src1.Site)
  185. CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
  186. CopyAttribute(&nsc.Url, src2.Url, src1.Url)
  187. CopyAttribute(&nsc.Author, src2.Author, src1.Author)
  188. CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
  189. CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
  190. CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
  191. CopyAttribute(&nsc.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
  192. CopyAttribute(&nsc.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)
  193. CopyAttribute(&nsc.TitleCss, src2.TitleCss, src1.TitleCss)
  194. CopyAttribute(&nsc.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
  195. CopyAttribute(&nsc.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
  196. CopyAttribute(&nsc.ContentCss, src2.ContentCss, src1.ContentCss)
  197. CopyAttribute(&nsc.AttachCss, src2.AttachCss, src1.AttachCss)
  198. CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
  199. CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
  200. CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
  201. CopyAttribute(&nsc.ListTrunPageJSCode, src2.ListTrunPageJSCode, src1.ListTrunPageJSCode)
  202. return nsc
  203. }
  204. // CopySpiderConfig 复制
  205. func CopySpiderConfig(src1, src2 *SpiderConfig) {
  206. src1.Code = src2.Code
  207. src1.Site = src2.Site
  208. src1.Author = src2.Author
  209. src1.Channel = src2.Channel
  210. src1.Url = src2.Url
  211. src1.ListBodyCss = src2.ListBodyCss
  212. src1.ListItemCss = src2.ListItemCss
  213. src1.ListPubtimeCss = src2.ListPubtimeCss
  214. src1.ListNextPageCss = src2.ListNextPageCss
  215. src1.ListLinkCss = src2.ListLinkCss
  216. src1.TitleCss = src2.TitleCss
  217. src1.PublishTimeCss = src2.PublishTimeCss
  218. src1.PublishUnitCss = src2.PublishUnitCss
  219. src1.ContentCss = src2.ContentCss
  220. src1.AttachCss = src2.AttachCss
  221. src1.ListJSCode = src2.ListJSCode
  222. src1.ListTrunPageJSCode = src2.ListTrunPageJSCode
  223. src1.ContentJSCode = src2.ContentJSCode
  224. src1.AttachJSCode = src2.AttachJSCode
  225. }