types.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. package backend
  2. import (
  3. "container/list"
  4. "encoding/json"
  5. "regexp"
  6. )
  // Kinds of job running event.
  // NOTE(review): presumably the values carried in JobRunningEvent.Act — confirm
  // against the code that emits the events.
  const (
  	JOB_RUNNING_EVENT_DEBUG    = 0
  	JOB_RUNNING_EVENT_PROGRESS = 1
  )

  // Outcome of executing an action.
  const (
  	RUN_ACTION_NOTRUN  = 0 // not executed
  	RUN_ACTION_SUCCESS = 1 // executed successfully
  	RUN_ACTION_ERROR   = 2 // execution failed with an error
  	RUN_ACTION_TIMEOUT = 3 // execution timed out
  )

  // Outcome of checking an action after it was executed.
  const (
  	CHECK_ACTION_NOTCHECK = 0
  	CHECK_ACTION_SUCCESS  = 1
  	CHECK_ACTION_ERROR    = 2
  	CHECK_ACTION_TIMEOUT  = 3
  )

  // CHECH_RESULT is the literal string "true".
  // NOTE(review): presumably the value a check script has to produce to count as
  // passed, and the name looks like a typo of CHECK_RESULT; it is kept as-is
  // because other files may reference it.
  const CHECH_RESULT = "true"
  22. var (
  23. DataResults = map[string]*list.List{} //调试运行结果
  24. VerifyResults = map[string]*SpiderConfigVerifyResult{} //调试验证结果
  25. RegSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
  26. )
  type (
  	// SpiderConfig is the crawler configuration: identifying fields, CSS
  	// selectors and JS snippets for the list and detail pages, plus paging
  	// and delay settings.
  	SpiderConfig struct {
  		Site               string     `json:"site"`
  		Channel            string     `json:"channel"`
  		ModifyUser         string     `json:"modifyuser"`
  		Href               string     `json:"href"`
  		Code               string     `json:"code"`
  		InitList           []*Actions `json:"initList"`
  		ListBodyCss        string     `json:"listBodyCss"` // used to decide whether turning the page succeeded
  		ListItemCss        string     `json:"listItemCss"`
  		ListLinkCss        string     `json:"listLinkCss"`
  		ListPubtimeCss     string     `json:"listPublishTimeCss"`
  		ListNextPageCss    string     `json:"listNextPageCss"`
  		TitleCss           string     `json:"titleCss"`
  		PublishUnitCss     string     `json:"publishUnitCss"`
  		PublishTimeCss     string     `json:"publishTimeCss"`
  		ContentCss         string     `json:"contentCss"`
  		AttachCss          string     `json:"attachCss"`
  		ListJSCode         string     `json:"listJs"`
  		ContentJSCode      string     `json:"contentJs"`
  		AttachJSCode       string     `json:"attachJs"` // marked as not in effect by the original author
  		ListTurnPageJSCode string     `json:"listTurnPageJs"`
  		MaxPages           int64      `json:"maxPages"`
  		FilterResource     string     `json:"filterResource"` // resources to filter out

  		// Delays. NOTE(review): the time unit is not stated in this file.
  		ListDelayTime     int64 `json:"listDelayTime"`
  		ListTurnDelayTime int64 `json:"listTurnDelayTime"`
  		ContentDelayTime  int64 `json:"contentDelayTime"`
  	}

  	// AttachLink describes one attachment link.
  	AttachLink struct {
  		Title    string `json:"title"`
  		Href     string `json:"href"`
  		FileName string `json:"fileName"`
  		FileType string `json:"fileType"`
  		FileSize string `json:"fileSize"`
  		FilePath string `json:"filePath"`
  	}

  	// Actions pairs an action script with a check script and a sleep time.
  	// NOTE(review): the unit of SleepTime is not stated in this file.
  	Actions struct {
  		ActionJs  string `json:"actionJs"`
  		CheckJs   string `json:"checkJs"`
  		SleepTime int64  `json:"sleepTime"`
  	}

  	// ActionRunResult records the execution and check outcome of one action.
  	ActionRunResult struct {
  		ActionNum int `json:"actionNum"`
  		// The tag previously lacked its closing quote, which made it
  		// unparsable, so the field was serialized as "ActionJs".
  		ActionJs    string `json:"actionJs"`
  		CheckJs     string `json:"checkJs"`
  		Result      bool   `json:"result"`
  		RunResult   int    `json:"runResult"`   // presumably one of RUN_ACTION_* — confirm
  		CheckResult int    `json:"checkResult"` // presumably one of CHECK_ACTION_* — confirm
  	}

  	// ResultItem is one crawled result.
  	ResultItem struct {
  		No          int           `json:"no"` // sequence number
  		Site        string        `json:"site"`
  		Channel     string        `json:"channel"`
  		Href        string        `json:"href"`
  		ListTitle   string        `json:"listTitle"`
  		ListPubTime string        `json:"listPublishTime"`
  		Title       string        `json:"title"`
  		PublishUnit string        `json:"publishUnit"`
  		PublishTime string        `json:"publishTime"`
  		Content     string        `json:"content"`
  		ContentHtml string        `json:"contentHtml"`
  		AttachLinks []*AttachLink `json:"attachLinks"` // attachment titles and links
  		AttachJson  string        `json:"attachJson"`  // OSS metadata of the attachments
  	}

  	// ResultItems is a list of crawled results.
  	ResultItems []*ResultItem

  	// SpiderConfiges is a list of crawler configurations.
  	SpiderConfiges []*SpiderConfig

  	// JobItem is one entry of a Job.
  	JobItem struct {
  		// Code and site match the crawler configuration.
  		SpiderCode string `json:"code"`
  		SpiderSite string `json:"site"`

  		// Run parameters; when not set, the job's common settings are used.
  		Channel              string `json:"channel"`
  		Url                  string `json:"url"`
  		ProxyServe           string `json:"proxyServe"`
  		MaxPages             int    `json:"maxPages"`
  		Threads              int    `json:"threads"`
  		ListDelay            int64  `json:"listDelay"`
  		TrunPageDelay        int64  `json:"trunPageDelay"`
  		ContentDelay         int64  `json:"contentDelay"`
  		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  	}

  	// Job is a job definition: its items plus the common parameters.
  	Job struct {
  		Code  string     `json:"code"`
  		Name  string     `json:"name"`
  		Items []*JobItem `json:"items"`

  		// Common parameters.
  		ProxyServe           string `json:"proxyServe"`
  		MaxPages             int    `json:"maxPages"`
  		Threads              int    `json:"threads"`
  		ListDelay            int64  `json:"listDelay"`
  		TrunPageDelay        int64  `json:"trunPageDelay"`
  		ContentDelay         int64  `json:"contentDelay"`
  		State                int    `json:"state"`
  		StateType            string `json:"stateType"`
  		Progress             int    `json:"progress"`
  		NeedDownloadAttaches bool   `json:"needDownloadAttaches"`
  	}

  	// Jobs is a list of job definitions.
  	Jobs []*Job

  	// JobRunningEvent is a pushed event.
  	JobRunningEvent struct {
  		Progress int    `json:"progress"`
  		Msg      string `json:"msg"`
  		Act      int    `json:"act"` // presumably one of JOB_RUNNING_EVENT_* — confirm
  		Code     string `json:"code"`
  	}

  	// JobRunningState is the runtime state and intermediate results of a job.
  	JobRunningState struct {
  		Code        string
  		State       int
  		Progress    int
  		ResultCache *list.List // cache
  		ExitCh      chan bool  // presumably signals the job to exit — confirm
  	}

  	// SpiderConfigVerifyResult is the verification result of a crawler
  	// configuration, one flag per verified part.
  	SpiderConfigVerifyResult struct {
  		Title        bool `json:"title"`
  		PublishUnit  bool `json:"publishUnit"`
  		PublishTime  bool `json:"publishTime"`
  		Content      bool `json:"content"`
  		Attaches     bool `json:"attaches"`
  		ListItems    bool `json:"listItems"`
  		ListTrunPage bool `json:"listTrunPage"`
  	}
  )
  159. func (sc SpiderConfiges) Len() int {
  160. return len(sc)
  161. }
  162. func (sc SpiderConfiges) Swap(i, j int) {
  163. sc[i], sc[j] = sc[j], sc[i]
  164. }
  165. func (sc SpiderConfiges) Less(i, j int) bool {
  166. return sc[i].Code > sc[j].Code
  167. }
  168. func (js Jobs) Len() int {
  169. return len(js)
  170. }
  171. func (js Jobs) Swap(i, j int) {
  172. js[i], js[j] = js[j], js[i]
  173. }
  174. func (js Jobs) Less(i, j int) bool {
  175. return js[i].Code > js[j].Code
  176. }
  // CopyAttribute stores into *dst the first applicable value:
  //   - value1 == "EMPTY": *dst is cleared to "" (sentinel that forces a reset);
  //   - value1 is non-empty: *dst becomes value1;
  //   - value2 is non-empty: *dst becomes value2;
  //   - otherwise *dst is left untouched.
  //
  // dst must not be nil.
  func CopyAttribute(dst *string, value1, value2 string) {
  	switch {
  	case value1 == "EMPTY":
  		*dst = ""
  	case value1 != "":
  		*dst = value1
  	case value2 != "":
  		*dst = value2
  	}
  }
  187. // NewSpiderConfig 生成css对象
  188. func NewSpiderConfig(cssmark map[string]interface{}) (*SpiderConfig, error) {
  189. sc := &SpiderConfig{}
  190. cssmark_byte, err := json.Marshal(cssmark)
  191. if err != nil {
  192. return nil, err
  193. }
  194. err = json.Unmarshal(cssmark_byte, &sc)
  195. return sc, err
  196. //param_common := param["param_common"].(map[string]interface{})
  197. //css_list, _ := param["css_list"].(map[string]interface{})
  198. //css_content, _ := param["css_content"].(map[string]interface{})
  199. //js_list, _ := param["js_list"].(map[string]interface{})
  200. //js_nextpage, _ := param["js_nextpage"].(map[string]interface{})
  201. //js_content, _ := param["js_content"].(map[string]interface{})
  202. //sc := &SpiderConfig{
  203. // Site: qu.ObjToString(param_common["site"]),
  204. // Channel: qu.ObjToString(param_common["channel"]),
  205. // Modifyuser: qu.ObjToString(param_common["modifyuser"]),
  206. // Href: qu.ObjToString(param_common["href"]),
  207. // Code: qu.ObjToString(param_common["code"]),
  208. // ListBodyCss: qu.ObjToString(css_list["body"]),
  209. // ListItemCss: qu.ObjToString(css_list["title"]),
  210. // ListLinkCss: qu.ObjToString(css_list["href"]),
  211. // ListPubtimeCss: qu.ObjToString(css_list["ptime"]),
  212. // ListNextPageCss: qu.ObjToString(css_list["nextpage"]),
  213. // TitleCss: qu.ObjToString(css_content["title"]),
  214. // PublishUnitCss: qu.ObjToString(css_content["source"]),
  215. // PublishTimeCss: qu.ObjToString(css_content["ptime"]),
  216. // ContentCss: qu.ObjToString(css_content["content"]),
  217. // AttachCss: qu.ObjToString(css_content["file"]),
  218. // ListJSCode: qu.ObjToString(js_list["js"]),
  219. // ContentJSCode: qu.ObjToString(js_content["js"]),
  220. // ListTrunPageJSCode: qu.ObjToString(js_nextpage["js"]),
  221. // //AttachJSCode : string `json:"attachJs"` //无效
  222. //}
  223. }
  224. // MergeSpiderConfig 合并
  225. func MergeSpiderConfig(src1, src2 *SpiderConfig) *SpiderConfig {
  226. nsc := new(SpiderConfig)
  227. CopyAttribute(&nsc.Code, src2.Code, src1.Code)
  228. CopyAttribute(&nsc.Site, src2.Site, src1.Site)
  229. CopyAttribute(&nsc.Channel, src2.Channel, src1.Channel)
  230. CopyAttribute(&nsc.Href, src2.Href, src1.Href)
  231. CopyAttribute(&nsc.ModifyUser, src2.ModifyUser, src1.ModifyUser)
  232. CopyAttribute(&nsc.ListBodyCss, src2.ListBodyCss, src1.ListBodyCss)
  233. CopyAttribute(&nsc.ListItemCss, src2.ListItemCss, src1.ListItemCss)
  234. CopyAttribute(&nsc.ListLinkCss, src2.ListLinkCss, src1.ListLinkCss)
  235. CopyAttribute(&nsc.ListPubtimeCss, src2.ListPubtimeCss, src1.ListPubtimeCss)
  236. CopyAttribute(&nsc.ListNextPageCss, src2.ListNextPageCss, src1.ListNextPageCss)
  237. CopyAttribute(&nsc.TitleCss, src2.TitleCss, src1.TitleCss)
  238. CopyAttribute(&nsc.PublishTimeCss, src2.PublishTimeCss, src1.PublishTimeCss)
  239. CopyAttribute(&nsc.PublishUnitCss, src2.PublishUnitCss, src1.PublishUnitCss)
  240. CopyAttribute(&nsc.ContentCss, src2.ContentCss, src1.ContentCss)
  241. CopyAttribute(&nsc.AttachCss, src2.AttachCss, src1.AttachCss)
  242. CopyAttribute(&nsc.ListJSCode, src2.ListJSCode, src1.ListJSCode)
  243. CopyAttribute(&nsc.ContentJSCode, src2.ContentJSCode, src1.ContentJSCode)
  244. CopyAttribute(&nsc.AttachJSCode, src2.AttachJSCode, src1.AttachJSCode)
  245. CopyAttribute(&nsc.ListTurnPageJSCode, src2.ListTurnPageJSCode, src1.ListTurnPageJSCode)
  246. return nsc
  247. }
  248. // CopySpiderConfig 复制
  249. func CopySpiderConfig(src1, src2 *SpiderConfig) {
  250. src1.Code = src2.Code
  251. src1.Site = src2.Site
  252. src1.ModifyUser = src2.ModifyUser
  253. src1.Channel = src2.Channel
  254. src1.Href = src2.Href
  255. src1.ListBodyCss = src2.ListBodyCss
  256. src1.ListItemCss = src2.ListItemCss
  257. src1.ListPubtimeCss = src2.ListPubtimeCss
  258. src1.ListNextPageCss = src2.ListNextPageCss
  259. src1.ListLinkCss = src2.ListLinkCss
  260. src1.TitleCss = src2.TitleCss
  261. src1.PublishTimeCss = src2.PublishTimeCss
  262. src1.PublishUnitCss = src2.PublishUnitCss
  263. src1.ContentCss = src2.ContentCss
  264. src1.AttachCss = src2.AttachCss
  265. src1.ListJSCode = src2.ListJSCode
  266. src1.ListTurnPageJSCode = src2.ListTurnPageJSCode
  267. src1.ContentJSCode = src2.ContentJSCode
  268. src1.AttachJSCode = src2.AttachJSCode
  269. }