creater.go 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. package browser
  2. import (
  3. "KeyWebsiteMonitor/spider/db"
  4. . "KeyWebsiteMonitor/spider/types"
  5. "bytes"
  6. "container/list"
  7. _ "embed"
  8. "fmt"
  9. "log"
  10. "text/template"
  11. "time"
  12. . "KeyWebsiteMonitor/spider/util"
  13. qu "app.yhyue.com/moapp/jybase/common"
  14. "app.yhyue.com/moapp/jybase/date"
  15. "github.com/chromedp/chromedp"
  16. )
  17. const (
  18. MAX_TRUN_PAGE = 1000
  19. )
  20. var (
  21. //go:embed js/load_list_items.js
  22. loadListItemsJS string
  23. //go:embed js/load_content.js
  24. loadContentJS string
  25. currentResult = make(ResultItems, 0)
  26. AttachesDir = ""
  27. )
  28. // renderJavascriptCoder
  29. func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
  30. t, err := template.New("").Parse(tpl)
  31. if err != nil {
  32. log.Println("创建JS代码模板失败", err.Error())
  33. return ""
  34. }
  35. buf := new(bytes.Buffer)
  36. err = t.Execute(buf, sc)
  37. if err != nil {
  38. log.Println("执行JS代码模板失败", err.Error())
  39. return ""
  40. }
  41. return buf.String()
  42. }
  43. var (
  44. exitCh chan bool
  45. currentResults = list.New() //b.ResultItems = make(b.ResultItems, 0)
  46. )
  47. func DoTask() {
  48. query := map[string]interface{}{
  49. "cssmark": map[string]interface{}{
  50. "$exists": true,
  51. },
  52. }
  53. sess := db.Mgo.GetMgoConn()
  54. defer db.Mgo.DestoryMongoConn(sess)
  55. it := sess.DB("zxl").C("luaconfig").Find(query).Sort("_id").Select(map[string]interface{}{
  56. "code": 1,
  57. "href": 1,
  58. "cssmark": 1,
  59. "site": 1,
  60. "channel": 1,
  61. }).Iter()
  62. total := 0
  63. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  64. if total%5000 == 0 {
  65. log.Println("cur index ", total)
  66. }
  67. cssmark := qu.ObjToMap(tmp["cssmark"])
  68. href := qu.ObjToString(tmp["href"])
  69. site := qu.ObjToString(tmp["site"])
  70. channel := qu.ObjToString(tmp["channel"])
  71. code := qu.ObjToString(tmp["code"])
  72. listDealy := qu.Int64All((*cssmark)["listDelayTime"])
  73. trunPageDelay := qu.Int64All((*cssmark)["listTurnDelayTime"])
  74. ///---可视化爬虫平台v1.0
  75. listBodyCss := qu.ObjToString((*cssmark)["listBodyCss"])
  76. listItemCss := qu.ObjToString((*cssmark)["listItemCss"])
  77. listLinkCss := qu.ObjToString((*cssmark)["listLinkCss"])
  78. listPubtimeCss := qu.ObjToString((*cssmark)["listPublishTimeCss"])
  79. listNextPageCss := qu.ObjToString((*cssmark)["listNextPageCss"])
  80. titleCss := qu.ObjToString((*cssmark)["titleCss"])
  81. publishUnitCss := qu.ObjToString((*cssmark)["publishUnitCss"])
  82. publishTimeCss := qu.ObjToString((*cssmark)["publishTimeCss"])
  83. contentCss := qu.ObjToString((*cssmark)["contentCss"])
  84. attachCss := qu.ObjToString((*cssmark)["attachCss"])
  85. listJSCode := qu.ObjToString((*cssmark)["listJSCode"])
  86. contentJSCode := qu.ObjToString((*cssmark)["contentJSCode"])
  87. attachJSCode := qu.ObjToString((*cssmark)["attachJSCode"])
  88. listTrunPageJSCode := qu.ObjToString((*cssmark)["listTrunPageJSCode"])
  89. //
  90. headless := false
  91. showImage := false
  92. //获取重点网站
  93. exitCh = make(chan bool, 1)
  94. //
  95. currentSpiderConfig := &SpiderConfig{
  96. Site: site,
  97. Channel: channel,
  98. Url: href,
  99. Code: code,
  100. ListBodyCss: listBodyCss,
  101. ListItemCss: listItemCss,
  102. ListLinkCss: listLinkCss,
  103. ListPubtimeCss: listPubtimeCss,
  104. ListNextPageCss: listNextPageCss,
  105. TitleCss: titleCss,
  106. PublishUnitCss: publishUnitCss,
  107. PublishTimeCss: publishTimeCss,
  108. ContentCss: contentCss,
  109. AttachCss: attachCss,
  110. ListJSCode: listJSCode,
  111. ContentJSCode: contentJSCode,
  112. AttachJSCode: attachJSCode,
  113. ListTrunPageJSCode: listTrunPageJSCode,
  114. }
  115. //
  116. c, lrt := CountYestodayArts(href, listDealy, trunPageDelay, headless, showImage, exitCh, currentSpiderConfig)
  117. result := map[string]interface{}{
  118. "count": c,
  119. "todayHasData": c > 0,
  120. }
  121. todayData := map[string]interface{}{}
  122. for k, v := range lrt {
  123. todayData[fmt.Sprint(k+1)] = map[string]interface{}{
  124. "text": v.Title,
  125. "date": v.ListPubTime,
  126. "href": v.Href,
  127. }
  128. }
  129. result["todayData"] = todayData
  130. //存库
  131. day := time.Now().AddDate(0, 0, -1).Format(date.Date_Short_Layout)
  132. db.Mgo.Save("checklist", map[string]interface{}{
  133. "day": day,
  134. "site": site,
  135. "spidercode": code,
  136. "channel": channel,
  137. "channelurl": href,
  138. "comeintime": time.Now().Unix(),
  139. "result": result,
  140. })
  141. tmp = make(map[string]interface{})
  142. }
  143. }
  144. // CountYestodayArts 统计昨日信息发布量
  145. func CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  146. headless bool, showImage bool, exit chan bool, currentSpiderConfig *SpiderConfig) (count int, lrt []*ResultItem) {
  147. sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
  148. _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
  149. log.Println("1浏览器打开")
  150. defer func() {
  151. cancel()
  152. baseCancel()
  153. log.Println("0浏览器已经销毁")
  154. log.Println(fmt.Sprintf("99 昨日信息发布量:%d ", count))
  155. close(exit)
  156. }()
  157. //时间比较
  158. // now := time.Unix(1721836800, 0)
  159. now := time.Now() /*.AddDate(0, 0, 1)*/
  160. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  161. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  162. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  163. log.Println(startOfYesterday)
  164. log.Println(endOfYesterday)
  165. //TODO 1.
  166. chromedp.Run(ctx, chromedp.Tasks{
  167. chromedp.Navigate(sc.Url),
  168. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  169. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  170. })
  171. log.Println("2页面打开")
  172. //TODO 2. 执行JS代码,获取列表页信息
  173. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  174. tmp := map[string]bool{}
  175. //最多翻页1000页
  176. for i := 0; i < MAX_TRUN_PAGE; i++ {
  177. select {
  178. case <-exit:
  179. log.Println("$$$")
  180. return
  181. default:
  182. log.Println("3执行列表页JS")
  183. listResult := make(ResultItems, 0)
  184. err := chromedp.Run(ctx, chromedp.Tasks{
  185. chromedp.Evaluate(runJs, &listResult),
  186. })
  187. if err != nil {
  188. log.Println("执行JS代码失败", err.Error())
  189. return
  190. }
  191. //采集列表页发布时间转换
  192. //TODO 日期统计
  193. for _, r := range listResult {
  194. day, err := time.Parse("2006-01-02", r.ListPubTime)
  195. if err != nil {
  196. continue
  197. }
  198. if _, ok := tmp[r.Href]; ok { //去重
  199. continue
  200. }
  201. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  202. count += 1
  203. lrt = append(lrt, &ResultItem{
  204. SpiderCode: r.SpiderCode,
  205. Site: r.Site,
  206. Href: r.Href,
  207. ListPubTime: r.ListPubTime,
  208. Title: RemoveNewlinesAndSpaces(r.ListTitle),
  209. })
  210. } else if day.Before(startOfYesterday) {
  211. return
  212. }
  213. }
  214. log.Println(fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  215. //TODO 翻页
  216. if currentSpiderConfig.ListBodyCss == "" || sc.ListNextPageCss == "" {
  217. log.Println("当前爬虫配置,不具备翻页条件")
  218. return
  219. }
  220. var runJs, result string = currentSpiderConfig.ListTrunPageJSCode, ""
  221. if runJs == "" {
  222. runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
  223. }
  224. var result1, result2 string
  225. var checkRunJs = fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
  226. log.Println("检查翻页是否成功,执行的JS", checkRunJs)
  227. err = chromedp.Run(ctx, chromedp.Tasks{
  228. chromedp.Evaluate(checkRunJs, &result1),
  229. })
  230. if err != nil {
  231. log.Println("翻页检查1失败,", checkRunJs)
  232. return
  233. }
  234. //可能就没有分页
  235. err = chromedp.Run(ctx, chromedp.Tasks{
  236. chromedp.Evaluate(runJs, &result),
  237. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  238. })
  239. if err != nil {
  240. log.Println("翻页操作失败,", runJs)
  241. return
  242. }
  243. err = chromedp.Run(ctx, chromedp.Tasks{
  244. chromedp.Evaluate(checkRunJs, &result2),
  245. })
  246. if err != nil {
  247. log.Println("翻页检查2失败,", checkRunJs)
  248. return
  249. }
  250. if result1 == "" || result2 == "" || result1 == result2 {
  251. log.Println("翻页失败,两次翻页获取到的列表区域块不符合要求")
  252. return
  253. }
  254. }
  255. }
  256. return
  257. }