spider.go

/*
Spider: scripting interface for the crawler; still needs to be extended.
*/
package spider

import (
	"crypto/sha1"
	"fmt"
	"io"
	"log"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/donnie4w/go-logger/logger"
	elc "github.com/olivere/elastic/v7"
	"github.com/yuin/gopher-lua"
	mgo "mongodb"
	qu "qfw/util"
	es "qfw/util/elastic.v7"
	util "spiderutil"
)
// Heart is the spider heartbeat record
type Heart struct {
	DetailHeart        int64  // heartbeat: detail-page (third-level page) stage executed
	DetailExecuteHeart int64  // heartbeat: detail-page stage actually produced data
	FindListHeart      int64  // heartbeat: findListHtml executed
	ListHeart          int64  // heartbeat: list-page stage executed
	FirstPageHeart     int64  // heartbeat: first page crawled
	ModifyUser         string // spider maintainer
	Site               string // site
	Channel            string // channel
}

// SpiderFlow records traffic
type SpiderFlow struct {
	Flow       int64  // traffic
	ModifyUser string // spider maintainer
	Site       string // site
	Channel    string // channel
	//Code string
}
// Spider is one crawler instance
type Spider struct {
	Script
	Code                   string // spider code
	Name                   string // name (used as the site name)
	Channel                string // channel
	DownDetail             bool   // whether to download detail pages
	Stop                   bool   // stop flag
	Pass                   bool   // pause flag
	LastPubshTime          int64  // last publish time
	LastHeartbeat          int64  // last heartbeat time
	SpiderRunRate          int64  // run frequency (minutes)
	ExecuteOkTime          int64  // time the job finished successfully
	Collection             string // target collection name
	Thread                 int64  // thread count
	LastExecTime           int64  // last execution time
	LastDowncount          int32  // download count of the last run
	TodayDowncount         int32  // today's download count
	YesterdayDowncount     int32  // yesterday's download count
	TotalDowncount         int32  // total download count
	RoundCount             int32  // number of rounds executed
	StoreMode              int    // storage mode
	StoreToMsgEvent        int    // message event type
	CoverAttr              string // attribute used for dedup/overwrite
	SleepBase              int    // base delay
	SleepRand              int    // random delay
	TargetChannelUrl       string // channel page URL
	UpperLimit, LowerLimit int    // upper/lower bounds of the normal range
	//UserName, UserEmail, UploadTime string // developer name, developer email, script upload time
	MUserName, MUserEmail string // maintainer, maintainer email
	//Index int // array index
	// history backfill
	IsHistoricalMend    bool // whether this is a history-backfill spider
	IsMustDownload      bool // whether download is forced
	IsCompete           bool // distinguishes new vs. old spiders
	Infoformat          int  // spider category: 1 tender; 2 proposed/approval; 3 property rights
	IsMainThread        bool // whether this is the main thread (distinguishes the main thread in multi-threaded crawling)
	ListParallelTaskNum int  // number of parallel list-page tasks
}
var (
	Es                *es.Elastic
	EsIndex           string
	EsType            string
	MgoS              *mgo.MongodbSim
	MgoEB             *mgo.MongodbSim
	TimeChan          = make(chan bool, 1)
	Reg               = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
	RestrictAccessReg = regexp.MustCompile(`访问被拒绝`) // matches "访问被拒绝" (access denied) pages
	AllThreadNum      int64
	ListAllThreadNum  int64
	DelaySiteMap      map[string]*DelaySite                        // sites whose crawling is delayed
	UpdataHeartCache  = make(chan []map[string]interface{}, 1000)  // spider heartbeat updates waiting to be written
	SPH               = make(chan bool, 5)
	DataBakSaveCache  = make(chan map[string]interface{}, 1000)    // detail-page crawl records waiting to be saved
	DB_CH             = make(chan bool, 5)
)
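// Note on the channels above: UpdataHeartCache and DataBakSaveCache are buffered
// hand-off queues (capacity 1000) for heartbeat updates and detail-page records,
// while SPH and DB_CH (capacity 5) look like semaphores bounding the goroutines
// that drain them. The consumers live outside this file, so the exact draining
// logic described here is an assumption.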
type DelaySite struct {
	DelayTime int
	Compete   bool
}

// StartJob starts a crawl job
func (s *Spider) StartJob() {
	s.Stop = false
	s.Pass = false
	s.RoundCount++
	go s.ExecJob(false)
}
// ExecJob runs a single crawl round
func (s *Spider) ExecJob(reload bool) {
	defer func() {
		s.ExecuteOkTime = time.Now().Unix()
		util.TimeSleepFunc(5*time.Second, TimeSleepChan)
		if util.Config.Working == 1 {
			s.Stop = true
			if _, b := Allspiders.Load(s.Code); b {
				Allspiders.Store(s.Code, s)
			}
			// release resources (when a spider is taken offline in queue mode nothing is released explicitly; resources are freed automatically once execution finishes)
			s.L.Close()
			CC <- s.L
		}
	}()
	// high-performance mode: reload the script when rescheduled by polling
	if reload && util.Config.Working == 0 {
		s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
	}
	logger.Debug(s.Code, s.Name, "rate:", s.SpiderRunRate, ",", s.Timeout)
	s.LastDowncount = 0
	s.LastExecTime = time.Now().Unix()
	s.LastHeartbeat = time.Now().Unix()
	s.ExecuteOkTime = 0
	//err := s.GetLastPublishTime() // fetch the latest publish time as the last update time
	//if err != nil {
	//	logger.Error(s.Code, err)
	//}
	// decide whether detail pages are downloaded with high concurrency
	var err interface{}
	if Supplement {
		err = s.SupplementDownListPageItem() // supplementary crawl: download list pages
	} else if util.Config.PageTurnInfo.ListThreadsNum > 1 {
		err = s.DownListPageItemByThreads() // download list pages concurrently
	} else {
		err = s.DownListPageItem() // download list pages
	}
	//if util.Config.Working == 0 && util.Config.Modal == 1 && !util.Config.IsHistoryEvent {
	//	err = s.DownListPageItemByThreads() // download list pages
	//} else {
	//	err = s.DownListPageItem() // download list pages
	//}
	if err != nil {
		logger.Error(s.Code, err)
	}
	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { // is this spider incremental or historical (currently only node 7000 runs historical spiders)
		UpdateSpiderByCodeState(s.Code, "6") // take the spider offline on this node
		SpiderCodeSendToEditor(s.Code)       // a history spider switching to incremental is sent to the editor so it can be taken down/up on another node
		return
	} else {
		if util.Config.Working == 0 && !Supplement { // high-performance mode
			/*
				for !s.Stop && s.Pass {
					util.TimeSleepFunc(2*time.Second, TimeSleepChan)
				}
				if s.Stop {
					return
				}
			*/
			//if s.IsMustDownload { // historical download runs only one round
			if s.IsHistoricalMend && util.Config.IsHistoryEvent { // history node 7000, high-performance mode: history backfill downloads only one round
				UpdateSpiderByCodeState(s.Code, "6") // take the spider offline on this node
				b := MgoEB.Update("luaconfig", map[string]interface{}{"code": s.Code}, map[string]interface{}{"$set": map[string]interface{}{"state": 6}}, false, false)
				logger.Info("Delete History Code:", s.Code, b)
			} else {
				if !s.Stop { // still online: schedule the next run
					util.TimeAfterFunc(time.Duration(s.SpiderRunRate)*time.Minute, func() {
						s.ExecJob(true)
					}, TimeChan)
				} else { // taken offline: let the child goroutine exit
					return
				}
			}
		} else { // queue mode or supplementary crawl
			return
		}
	}
}
// GetLastPublishTime fetches the latest publish time, used as the last update time
func (s *Spider) GetLastPublishTime() (errs interface{}) {
	defer qu.Catch()
	var lastpublishtime string
	// fetch the last publish time
	if err := s.L.CallByParam(lua.P{
		Fn:      s.L.GetGlobal("getLastPublishTime"),
		NRet:    1,
		Protect: true,
	}); err != nil {
		//panic(s.Code + "," + err.Error())
		log.Println(s.Code + "," + err.Error())
		errs = err.Error()
		atomic.AddInt32(&s.Script.ErrorNum, 1)
		return errs
	}
	ret := s.L.Get(-1)
	s.L.Pop(1)
	if str, ok := ret.(lua.LString); ok {
		lastpublishtime = string(str)
	}
	if s.LastPubshTime < util.ParseDate2Int64(lastpublishtime) {
		// guard against publish times in the future
		if util.ParseDate2Int64(lastpublishtime) > time.Now().Unix() {
			s.LastPubshTime = time.Now().Unix()
		} else {
			s.LastPubshTime = util.ParseDate2Int64(lastpublishtime)
		}
	}
	return nil
}
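// All Lua entry points in this file follow the same calling convention: look up a
// global function on the spider's LState, call it with CallByParam (NRet: 1,
// Protect: true), then read the single return value with s.L.Get(-1) and drop it
// with s.L.Pop(1). As an illustrative sketch only (not the production script), a
// Lua script satisfying getLastPublishTime could be as small as:
//
//	function getLastPublishTime()
//		return "2006-01-02 15:04:05"
//	end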
// DownListPageItem downloads list pages (compared with DownListPageItemBack, the
// retries on empty pages and the repeat-page bookkeeping have been removed)
func (s *Spider) DownListPageItem() (errs interface{}) {
	defer qu.Catch()
	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") // start page, max page
	s.MaxPage = max
	repeatAllNum := 0   // total number of duplicates in this round
	downloadAllNum := 0 // total number collected in this round
	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { // node 7000: spider crawls history
		max = s.GetIntVar("spiderHistoryMaxPage")
	}
	downtimes := 0           // retry count for the current page (3 at most for now)
	repeatPageTimes := 0     // number of consecutive pages judged fully duplicate (paging stops after 5 consecutive duplicate pages for now)
	isRunRepeatList := false // whether to run consecutive list-page dedup
	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { // all nodes except sequential-crawl mode and the history node run consecutive-page dedup on list pages
		isRunRepeatList = true
		max = util.Config.PageTurnInfo.TurnPageMaxLimit // max page is 100 in high-performance mode, 50 in queue mode
	}
	// child-task check
	if s.ContinueDownListChildTask {
		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1                                       // child-task start page
		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit // child-task max page
	}
	for ; start <= max && !s.Stop; start++ {
		if !s.Stop && !s.ContinueDownListChildTask { // if the spider was taken offline while downloading detail pages, stop recording heartbeats
			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) // record the list-page heartbeat on all nodes
		}
		//qu.Debug("spider:", s.Code, " configured max page:", s.MaxPage, " final max page:", max, " current page:", start, "repeat times:", repeatPageTimes)
		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { // consecutive-duplicate limit reached: stop paging
			break
		}
		if err := s.L.CallByParam(lua.P{
			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
			NRet:    1,
			Protect: true,
		}, lua.LNumber(start)); err != nil {
			//panic(s.Code + "," + err.Error())
			logger.Error("list page crawl error", start, s.Code+","+err.Error())
			errs = err.Error()
			atomic.AddInt32(&s.Script.ErrorNum, 1)
			// retry the page on error; once the retry limit is exceeded the page is treated as already crawled (script errors exit directly instead of retrying internally)
			if downtimes < 3 {
				downtimes++
				start--
			} else if isRunRepeatList { // retry limit exceeded: treat this page as duplicate
				repeatPageTimes++ // count +1
				downtimes = 0
			}
			continue
		}
		lv := s.L.Get(-1)
		s.L.Pop(1)
		if tbl, ok := lv.(*lua.LTable); ok {
			//list := []map[string]interface{}{}
			//qu.Debug("items on this page:", tbl.Len())
			if tabLen := tbl.Len(); tabLen > 0 { // the list page has data; download detail pages from the list info
				repeatListNum := 0 // number of duplicate links on the current list page
				for i := 1; i <= tabLen; i++ {
					v := tbl.RawGetInt(i).(*lua.LTable)
					tmp := util.TableToMap(v)
					if !s.IsHistoricalMend { // not history backfill
						tmp["dataging"] = 0 // tag the data with dataging=0
						if s.DownDetail {
							s.DownloadDetailItem(tmp, &repeatListNum)
						}
					} else { // history backfill
						s.HistoricalMendDownloadDetailItem(tmp) // history-backfill detail-page download
					}
				}
				repeatAllNum += repeatListNum
				downloadAllNum += tabLen
				if isRunRepeatList { // run consecutive-page dedup
					if repeatListNum >= tabLen { // every item on the current page has been crawled already
						repeatPageTimes++ // count +1
					} else { // the current page has new data: reset the counter
						repeatPageTimes = 0
					}
				}
			} else if isRunRepeatList {
				repeatPageTimes++ // count +1
			}
		} else if isRunRepeatList {
			repeatPageTimes++ // count +1
		}
		downtimes = 0 // current page downloaded fine: reset the retry counter
		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
	}
	logger.Info(s.Code, "list crawl summary for this round:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, start, s.Stop)
	if !util.Config.IsHistoryEvent && !s.Stop { // non-history nodes record the download rate
		nowTime := time.Now()
		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
		set := map[string]interface{}{
			"site":       s.Name,
			"channel":    s.Channel,
			"spidercode": s.Code,
			"updatetime": nowTime.Unix(),
			"event":      util.Config.Uploadevent,
			"modifyuser": s.MUserName,
			"maxpage":    s.MaxPage,
			"runrate":    s.SpiderRunRate,
			"endpage":    start,
			"date":       sDate,
		}
		inc := map[string]interface{}{
			"alltimes": 1,
		}
		// record whether paging succeeded
		if s.MaxPage > 1 {
			if s.PageOneTextHash != "" {
				if s.PageTwoTextHash != "" {
					if s.PageOneTextHash != s.PageTwoTextHash {
						inc["page_success"] = 1
					} else {
						inc["page_fail"] = 1
					}
				} else {
					inc["page_fail"] = 1
				}
			} else if s.PageTwoTextHash != "" {
				inc["page_onefail"] = 1
			}
		}
		if downloadAllNum > 0 {
			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
			if rate == 1.0 {
				if downloadAllNum == 1 { // list-page filtering left only one new item
					inc["oh_percent_onenum"] = 1
				} else {
					inc["oh_percent"] = 1
				}
				//} else if rate >= 0.9 {
				//	inc["nt_percent"] = 1
				//} else if rate >= 0.8 {
				//	inc["et_percent"] = 1
				//} else {
				//	inc["other_percent"] = 1
			}
			if isRunRepeatList && start > max { // consecutive paging exceeded the limit
				if !s.ContinueDownListChildTask {
					go ContinueDownListPageItem(s) // start a child task to keep crawling
				} else {
					inc["uplimit"] = 1
				}
			}
		} else {
			inc["zero"] = 1
		}
		query := map[string]interface{}{
			"date":       sDate,
			"spidercode": s.Code,
		}
		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
			"$set": set,
			"$inc": inc,
		}, true, false)
	}
	// reset the page hashes
	s.PageOneTextHash = ""
	s.PageTwoTextHash = ""
	return errs
}
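// The Lua side's downloadAndParseListPage(page) is expected to return an array-style
// table with one entry per list item; the Go side relies on the "href", "title" and
// "publishtime" fields (see DownloadDetailItem and SupplementDownListPageItem) and
// adds "dataging" itself. Illustrative sketch only, not the production script:
//
//	function downloadAndParseListPage(page)
//		local items = {}
//		-- download the list page, then for each row:
//		items[#items+1] = {
//			href = "http://example.com/notice/1.html",
//			title = "...",
//			publishtime = "2006-01-02 15:04:05",
//		}
//		return items
//	end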
func (s *Spider) DownListPageItemBack() (errs interface{}) {
	defer qu.Catch()
	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") // start page, max page
	s.MaxPage = max
	//tmpMax := max // temporary record of the max page
	repeatAllNum := 0   // total number of duplicates in this round
	downloadAllNum := 0 // total number collected in this round
	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { // node 7000: spider crawls history
		max = s.GetIntVar("spiderHistoryMaxPage")
	}
	downtimes := 0           // retry count for the current page (3 at most for now)
	repeatPageNum := 0       // page number whose list links were all duplicates
	repeatPageTimes := 0     // number of consecutive pages judged fully duplicate (paging stops after 5 consecutive duplicate pages for now)
	isRunRepeatList := false // whether to run consecutive list-page dedup
	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { // all nodes except sequential-crawl mode and the history node run consecutive-page dedup on list pages
		isRunRepeatList = true
		max = util.Config.PageTurnInfo.TurnPageMaxLimit // max page is 100 in high-performance mode, 50 in queue mode
	}
	// child-task check
	if s.ContinueDownListChildTask {
		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
	}
	for ; start <= max && !s.Stop; start++ {
		if !s.Stop && !s.ContinueDownListChildTask { // if the spider was taken offline while downloading detail pages, stop recording heartbeats
			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) // record the list-page heartbeat on all nodes
		}
		//qu.Debug("spider:", s.Code, "repeat page:", repeatPageNum, " configured max page:", s.MaxPage, " final max page:", max, " current page:", start, "repeat times:", repeatPageTimes)
		//if start > tmpMax && isRunRepeatList && repeatPageTimes >= 5 { // more than 5 consecutive duplicate pages: stop paging
		//	break
		//}
		if isRunRepeatList && repeatPageTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { // consecutive-duplicate limit reached: stop paging
			break
		}
		if err := s.L.CallByParam(lua.P{
			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
			NRet:    1,
			Protect: true,
		}, lua.LNumber(start)); err != nil {
			//panic(s.Code + "," + err.Error())
			logger.Error("list page crawl error", start, s.Code+","+err.Error())
			errs = err.Error()
			atomic.AddInt32(&s.Script.ErrorNum, 1)
			// retry the page on error; once the retry limit is exceeded the page is treated as already crawled
			if downtimes < 2 {
				downtimes++
				start--
			} else if isRunRepeatList { // retry limit exceeded: treat this page as duplicate
				if repeatPageNum+1 == start {
					repeatPageTimes++ // count +1
				} else {
					repeatPageTimes = 0 // reset the counter
				}
				repeatPageNum = start // remember the page number
				downtimes = 0
			}
			continue
		}
		lv := s.L.Get(-1)
		s.L.Pop(1)
		if tbl, ok := lv.(*lua.LTable); ok {
			//list := []map[string]interface{}{}
			//qu.Debug("items on this page:", tbl.Len())
			if tabLen := tbl.Len(); tabLen > 0 { // the list page has data; download detail pages from the list info
				repeatListNum := 0 // number of duplicate links on the current list page
				for i := 1; i <= tabLen; i++ {
					v := tbl.RawGetInt(i).(*lua.LTable)
					tmp := util.TableToMap(v)
					if !s.IsHistoricalMend { // not history backfill
						tmp["dataging"] = 0 // tag the data with dataging=0
						if s.DownDetail {
							s.DownloadDetailItem(tmp, &repeatListNum)
						} /*else { // no spiders of this kind at the moment
							tmp["comeintime"] = time.Now().Unix()
							//atomic.AddInt32(&s.LastDowncount, 1)
							//atomic.AddInt32(&s.TodayDowncount, 1)
							//atomic.AddInt32(&s.TotalDowncount, 1)
							href := fmt.Sprint(tmp["href"])
							if len(href) > 5 { // valid data
								hashHref := util.HexText(href)
								util.RedisClusterSet(hashHref, "", -1) // full-volume redis
								list = append(list, tmp)
							}
						}*/
					} else { // history backfill
						s.HistoricalMendDownloadDetailItem(tmp) // history-backfill detail-page download
					}
				}
				repeatAllNum += repeatListNum
				downloadAllNum += tabLen
				if isRunRepeatList { // run consecutive-page dedup
					if repeatListNum >= tabLen { // every item on the current page has been crawled already
						//qu.Debug("repeat page:", repeatPageNum, "current page:", start)
						if repeatPageNum+1 == start || repeatPageNum == 0 {
							repeatPageTimes++ // count +1
						} else {
							repeatPageTimes = 0 // reset the counter
						}
						repeatPageNum = start // remember the page number
					} else { // the current page still has new data
						repeatPageTimes = 0
						repeatPageNum = 0
					}
				}
				//if !s.IsHistoricalMend && !s.DownDetail {
				//	if len(list) > 0 { // save the records
				//		StoreBlak(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, list)
				//	}
				//}
			} else { // the list page may be empty because of network problems, so request it again (all of its items may also have been filtered out)
				if downtimes < 2 {
					downtimes++
					start--
					continue
				} else if isRunRepeatList { // retry limit exceeded: treat this page as duplicate
					if repeatPageNum+1 == start {
						repeatPageTimes++ // count +1
					} else {
						repeatPageTimes = 0 // reset the counter
					}
					repeatPageNum = start // remember the page number
				}
			}
		} else { // requesting the current list page failed
			if downtimes < 2 {
				downtimes++
				start--
				continue
			} else if isRunRepeatList { // retry limit exceeded: treat this page as duplicate
				if repeatPageNum+1 == start {
					repeatPageTimes++ // count +1
				} else {
					repeatPageTimes = 0 // reset the counter
				}
				repeatPageNum = start // remember the page number
			}
		}
		downtimes = 0 // current page downloaded fine: reset the retry counter
		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
	}
	logger.Info(s.Code, "list crawl summary for this round:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, start, s.Stop)
	if !util.Config.IsHistoryEvent && !s.Stop { // non-history nodes record the download rate
		nowTime := time.Now()
		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
		set := map[string]interface{}{
			"site":       s.Name,
			"channel":    s.Channel,
			"spidercode": s.Code,
			"updatetime": nowTime.Unix(),
			"event":      util.Config.Uploadevent,
			"modifyuser": s.MUserName,
			"maxpage":    s.MaxPage,
			"runrate":    s.SpiderRunRate,
			"endpage":    start,
			"date":       sDate,
		}
		inc := map[string]interface{}{
			"alltimes": 1,
		}
		// record whether paging succeeded
		if s.MaxPage > 1 {
			if s.PageOneTextHash != "" {
				if s.PageTwoTextHash != "" {
					if s.PageOneTextHash != s.PageTwoTextHash {
						inc["page_success"] = 1
					} else {
						inc["page_fail"] = 1
					}
				} else {
					inc["page_fail"] = 1
				}
			} else if s.PageTwoTextHash != "" {
				inc["page_onefail"] = 1
			}
		}
		if downloadAllNum > 0 {
			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
			if rate == 1.0 {
				if downloadAllNum == 1 { // list-page filtering left only one new item
					inc["oh_percent_onenum"] = 1
				} else {
					inc["oh_percent"] = 1
				}
				//} else if rate >= 0.9 {
				//	inc["nt_percent"] = 1
				//} else if rate >= 0.8 {
				//	inc["et_percent"] = 1
				//} else {
				//	inc["other_percent"] = 1
			}
			if isRunRepeatList && start > max { // consecutive paging exceeded the limit
				if !s.ContinueDownListChildTask {
					go ContinueDownListPageItem(s) // start a child task to keep crawling
				} else {
					inc["uplimit"] = 1
				}
			}
		} else {
			inc["zero"] = 1
		}
		query := map[string]interface{}{
			"date":       sDate,
			"spidercode": s.Code,
		}
		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
			"$set": set,
			"$inc": inc,
		}, true, false)
	}
	// reset the page hashes
	s.PageOneTextHash = ""
	s.PageTwoTextHash = ""
	return errs
}
// DownListPageItemByThreads downloads list pages concurrently
func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
	defer qu.Catch()
	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") // start page, max page
	s.MaxPage = max            // record the configured max page
	repeatAllNum := int64(0)   // total number of duplicates in this round
	downloadAllNum := int64(0) // total number collected in this round
	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { // node 7000: pick the max page according to the spider type
		max = s.GetIntVar("spiderHistoryMaxPage") // history spiders use the spiderHistoryMaxPage setting
	}
	isRunRepeatList := false // whether to run consecutive list-page dedup
	// decide whether consecutive-page dedup applies and adjust the max page
	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { // all nodes except sequential-crawl mode and the history node run consecutive-page dedup on list pages
		isRunRepeatList = true
		max = util.Config.PageTurnInfo.TurnPageMaxLimit // max page is 100 in high-performance mode, 50 in queue mode
	}
	// child-task check
	//if s.ContinueDownListChildTask {
	//	start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
	//	max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
	//}
	// create the pool of concurrent Spider objects
	spChan := make(chan *Spider, 1)
	if isRunRepeatList && util.Config.PageTurnInfo.ListThreadsNum > 1 { // unlimited-paging mode: size spChan by the concurrency setting
		spChan = make(chan *Spider, util.Config.PageTurnInfo.ListThreadsNum)
		spChan <- s // put the current spider into the pool
		NewSpiderByScript(util.Config.PageTurnInfo.ListThreadsNum-1, s.Code, s.ScriptFile, spChan) // create the extra Spider objects
	} else {
		spChan <- s // put the current spider into the pool
	}
	endPage := 0     // last page reached
	repeatTimes := 0 // consecutive-duplicate counter
	for ; start <= max && !s.Stop; start += util.Config.PageTurnInfo.ListThreadsNum {
		if !s.Stop && !s.ContinueDownListChildTask { // if the spider was taken offline while downloading detail pages, stop recording heartbeats
			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) // record the list-page heartbeat on all nodes
		}
		listWg := &sync.WaitGroup{}
		pageMap := map[int]bool{} // whether each page turned out to be fully duplicate
		pageArr := []int{}
		lock := &sync.Mutex{}
		// download list pages concurrently
		for listThreadNum := 0; listThreadNum < util.Config.PageTurnInfo.ListThreadsNum; listThreadNum++ {
			pagenum := start + listThreadNum // actual page number for this worker
			if pagenum > max { // each batch starts ListThreadsNum concurrent workers, which may overshoot max
				break
			}
			spTmp := <-spChan // take a Spider object out of the pool
			listWg.Add(1)
			atomic.AddInt64(&ListAllThreadNum, 1)
			endPage = pagenum + 1
			go func(sp *Spider, pagenum int) {
				defer func() {
					spChan <- sp // put the Spider object back once the page is processed
					listWg.Done()
					atomic.AddInt64(&ListAllThreadNum, -1)
				}()
				// download one page
				downnum, repeatnum := sp.DownListOnePage(pagenum)
				//logger.Info(sp.Code, "pagenum", pagenum, "repeat", downnum == repeatnum, downnum, repeatnum, &sp)
				// accumulate the counters
				atomic.AddInt64(&downloadAllNum, int64(downnum))
				atomic.AddInt64(&repeatAllNum, int64(repeatnum))
				lock.Lock()
				pageMap[pagenum] = downnum == repeatnum // whether page pagenum was fully duplicate
				pageArr = append(pageArr, pagenum)
				lock.Unlock()
				if downnum > 0 {
					// with concurrent list crawling the Spider objects differ, so the hashes are copied back afterwards
					if pagenum == 1 { // copy the first page's text hash back to s
						s.PageOneTextHash = sp.PageOneTextHash
					} else if pagenum == 2 { // copy the second page's text hash back to s
						s.PageTwoTextHash = sp.PageTwoTextHash
					}
				}
			}(spTmp, pagenum)
		}
		listWg.Wait()
		if isRunRepeatList {
			sort.Ints(pageArr) // sort page numbers ascending
			for _, page := range pageArr {
				if pageMap[page] {
					repeatTimes++
				} else {
					repeatTimes = 0
				}
			}
			if repeatTimes >= util.Config.PageTurnInfo.RepeatPageTimesLimit { // consecutive-duplicate limit reached: stop paging
				break
			}
		}
	}
	//close(spChan) // close the channel and release resources
	logger.Info(s.Code, "list crawl summary for this round:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, endPage, s.Stop)
	if !util.Config.IsHistoryEvent && !s.Stop { // non-history nodes record the download rate
		nowTime := time.Now()
		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
		set := map[string]interface{}{
			"site":       s.Name,
			"channel":    s.Channel,
			"spidercode": s.Code,
			"updatetime": nowTime.Unix(),
			"event":      util.Config.Uploadevent,
			"modifyuser": s.MUserName,
			"maxpage":    s.MaxPage,
			"runrate":    s.SpiderRunRate,
			"endpage":    endPage,
			"date":       sDate,
		}
		inc := map[string]interface{}{
			"alltimes": 1,
		}
		// record whether paging succeeded
		if s.MaxPage > 1 { // when the max page is 1, list-page health is used to reflect the spider's state
			if s.PageOneTextHash != "" {
				if s.PageTwoTextHash != "" {
					if s.PageOneTextHash != s.PageTwoTextHash {
						inc["page_success"] = 1
					} else {
						inc["page_fail"] = 1
					}
				} else {
					inc["page_fail"] = 1
				}
			} else if s.PageTwoTextHash != "" {
				inc["page_onefail"] = 1
			}
		}
		if downloadAllNum > 0 {
			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
			if rate == 1.0 {
				if downloadAllNum == 1 { // list-page filtering left only one new item
					inc["oh_percent_onenum"] = 1
				} else {
					inc["oh_percent"] = 1
				}
				//} else if rate >= 0.9 {
				//	inc["nt_percent"] = 1
				//} else if rate >= 0.8 {
				//	inc["et_percent"] = 1
				//} else {
				//	inc["other_percent"] = 1
			}
			if isRunRepeatList && endPage > max { // consecutive paging exceeded the limit
				if !s.ContinueDownListChildTask {
					go ContinueDownListPageItem(s) // start a child task to keep crawling
				} else {
					inc["uplimit"] = 1
				}
			}
		} else {
			inc["zero"] = 1
		}
		query := map[string]interface{}{
			"date":       sDate,
			"spidercode": s.Code,
		}
		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
			"$set": set,
			"$inc": inc,
		}, true, false)
	}
	// reset the page hashes
	s.PageOneTextHash = ""
	s.PageTwoTextHash = ""
	return errs
}
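// Design note: the buffered spChan above acts as a pool of Spider objects, one per
// concurrent page, because each Spider wraps its own lua.LState and a gopher-lua
// LState must not be used from multiple goroutines at once. A worker checks an
// object out, parses one page, and puts it back, so at most ListThreadsNum Lua
// states run in parallel while the shared counters are updated atomically.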
// Download list pages concurrently (older variant)
func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
	defer qu.Catch()
	start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") // start page, max page
	s.MaxPage = max            // record the configured max page
	repeatAllNum := int64(0)   // total number of duplicates in this round
	downloadAllNum := int64(0) // total number collected in this round
	if util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history" { // node 7000: pick the max page according to the spider type
		max = s.GetIntVar("spiderHistoryMaxPage") // history spiders use the spiderHistoryMaxPage setting
	}
	repeatPageTimesLimit := 1 // upper bound on consecutive fully-duplicate pages (default 1: equivalent to plain paging, the loop always advances by at least 1)
	isRunRepeatList := false  // whether to run consecutive list-page dedup
	// decide whether consecutive-page dedup applies and adjust the max page
	if !util.Config.IsHistoryEvent && util.Config.Modal == 1 && max > 1 && max < 101 { // all nodes except sequential-crawl mode and the history node run consecutive-page dedup on list pages
		isRunRepeatList = true
		repeatPageTimesLimit = util.Config.PageTurnInfo.RepeatPageTimesLimit
		max = util.Config.PageTurnInfo.TurnPageMaxLimit // max page is 100 in high-performance mode, 50 in queue mode
	}
	// child-task check
	if s.ContinueDownListChildTask {
		start = util.Config.PageTurnInfo.TurnPageMaxLimit + 1
		max = util.Config.PageTurnInfo.TurnPageMaxLimit + util.Config.PageTurnInfo.NextPageMaxLimit
	}
	// create the pool of concurrent Spider objects
	spChan := make(chan *Spider, 1)
	if isRunRepeatList { // unlimited-paging mode: size spChan by the dedup limit
		spChan = make(chan *Spider, repeatPageTimesLimit) // the concurrency equals the consecutive-dedup page count
		spChan <- s                                       // put the current spider into the pool
		NewSpiderByScript(repeatPageTimesLimit-1, s.Code, s.ScriptFile, spChan) // create the extra Spider objects
	} else {
		spChan <- s // put the current spider into the pool
	}
	endPage := 0 // last page reached
	for ; start <= max && !s.Stop; start += repeatPageTimesLimit {
		if !s.Stop && !s.ContinueDownListChildTask { // if the spider was taken offline while downloading detail pages, stop recording heartbeats
			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "list", start == 1) // record the list-page heartbeat on all nodes
		}
		listWg := &sync.WaitGroup{}
		isContinue := false // whether to keep crawling
		// download list pages concurrently
		for listThreadNum := 0; listThreadNum < repeatPageTimesLimit; listThreadNum++ {
			pagenum := start + listThreadNum // actual page number for this worker
			if pagenum > max { // each batch starts repeatPageTimesLimit concurrent workers, which may overshoot max
				break
			}
			spTmp := <-spChan // take a Spider object out of the pool
			listWg.Add(1)
			atomic.AddInt64(&ListAllThreadNum, 1)
			endPage = pagenum + 1
			go func(sp *Spider, pagenum int) {
				defer func() {
					spChan <- sp // put the Spider object back once the page is processed
					listWg.Done()
					atomic.AddInt64(&ListAllThreadNum, -1)
				}()
				// download one page
				downnum, repeatnum := sp.DownListOnePage(pagenum)
				// accumulate the counters
				atomic.AddInt64(&downloadAllNum, int64(downnum))
				atomic.AddInt64(&repeatAllNum, int64(repeatnum))
				if downnum > 0 {
					if downnum-repeatnum > 0 { // this page has new data
						isContinue = true
					}
					// with concurrent list crawling the Spider objects differ, so the hashes are copied back afterwards
					if pagenum == 1 { // copy the first page's text hash back to s
						s.PageOneTextHash = sp.PageOneTextHash
					} else if pagenum == 2 { // copy the second page's text hash back to s
						s.PageTwoTextHash = sp.PageTwoTextHash
					}
				}
				//qu.Debug("page", pagenum, "crawl info:", downnum, repeatnum)
			}(spTmp, pagenum)
		}
		listWg.Wait()
		if !isContinue { // if no page in this batch produced new data, stop; otherwise keep paging up to the limit
			break
		}
	}
	close(spChan) // close the channel and release resources
	logger.Info(s.Code, "list crawl summary for this round:", s.ContinueDownListChildTask, downloadAllNum, repeatAllNum, endPage, s.Stop)
	if !util.Config.IsHistoryEvent && !s.Stop { // non-history nodes record the download rate
		nowTime := time.Now()
		sDate := qu.FormatDate(&nowTime, qu.Date_Short_Layout)
		set := map[string]interface{}{
			"site":       s.Name,
			"channel":    s.Channel,
			"spidercode": s.Code,
			"updatetime": nowTime.Unix(),
			"event":      util.Config.Uploadevent,
			"modifyuser": s.MUserName,
			"maxpage":    s.MaxPage,
			"runrate":    s.SpiderRunRate,
			"endpage":    endPage,
			"date":       sDate,
		}
		inc := map[string]interface{}{
			"alltimes": 1,
		}
		// record whether paging succeeded
		if s.MaxPage > 1 { // when the max page is 1, list-page health is used to reflect the spider's state
			if s.PageOneTextHash != "" {
				if s.PageTwoTextHash != "" {
					if s.PageOneTextHash != s.PageTwoTextHash {
						inc["page_success"] = 1
					} else {
						inc["page_fail"] = 1
					}
				} else {
					inc["page_fail"] = 1
				}
			} else if s.PageTwoTextHash != "" {
				inc["page_onefail"] = 1
			}
		}
		if downloadAllNum > 0 {
			rate := float64(downloadAllNum-repeatAllNum) / float64(downloadAllNum)
			rate, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
			if rate == 1.0 {
				if downloadAllNum == 1 { // list-page filtering left only one new item
					inc["oh_percent_onenum"] = 1
				} else {
					inc["oh_percent"] = 1
				}
				//} else if rate >= 0.9 {
				//	inc["nt_percent"] = 1
				//} else if rate >= 0.8 {
				//	inc["et_percent"] = 1
				//} else {
				//	inc["other_percent"] = 1
			}
			if isRunRepeatList && endPage > max { // consecutive paging exceeded the limit
				if !s.ContinueDownListChildTask {
					go ContinueDownListPageItem(s) // start a child task to keep crawling
				} else {
					inc["uplimit"] = 1
				}
			}
		} else {
			inc["zero"] = 1
		}
		query := map[string]interface{}{
			"date":       sDate,
			"spidercode": s.Code,
		}
		MgoS.Update("spider_downloadrate", query, map[string]interface{}{
			"$set": set,
			"$inc": inc,
		}, true, false)
	}
	// reset the page hashes
	s.PageOneTextHash = ""
	s.PageTwoTextHash = ""
	return errs
}
// SupplementDownListPageItem downloads list pages for the supplementary crawl
func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
	defer qu.Catch()
	var (
		errtimes       int    // number of consecutive abnormal pages (10 at most for now)
		errPageNum     int    // page number of the current abnormal page
		downtimes      int    // retry count for the current page (3 at most for now)
		downloadAllNum int    // total number of items crawled in this run
		saveAllNum     int    // total number of items saved in this run
		repeatAllNum   int    // total number of duplicates in this run
		pageTitleHash  string // hash of all title text on the current page
		finishText     = "正常退出" // "normal exit"
		start          = 1    // start page
	)
	for {
		if errtimes >= Supplement_MaxErrorTimes { // more than 10 consecutive abnormal pages: stop paging
			finishText = "异常退出" // "abnormal exit"
			logger.Info(s.Code + " 10 consecutive abnormal list pages")
			break
		}
		if err := s.L.CallByParam(lua.P{
			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
			NRet:    1,
			Protect: true,
		}, lua.LNumber(start)); err != nil {
			//panic(s.Code + "," + err.Error())
			logger.Error("list page crawl error", start, s.Code+","+err.Error())
			errs = err.Error()
			if downtimes < 3 {
				downtimes++
			} else if errtimes == 0 || start == errPageNum+1 {
				errtimes++
				errPageNum = start
				start++
				downtimes = 0
			}
			continue
		}
		lv := s.L.Get(-1)
		s.L.Pop(1)
		if tbl, ok := lv.(*lua.LTable); ok {
			if tabLen := tbl.Len(); tabLen > 0 { // the list page has data; download detail pages from the list info
				var (
					publishtimeErrTimes int
					text                string
					repeatListNum       int // number of duplicate links on the current list page
					num                 = 1
					isBreak             = false
				)
				for ; num <= tabLen; num++ {
					v := tbl.RawGetInt(num).(*lua.LTable)
					tmp := util.TableToMap(v)
					tmp["dataging"] = 0 // tag the data with dataging=0
					s.DownloadDetailItem(tmp, &repeatListNum)
					pTmp := qu.ObjToString(tmp["publishtime"])
					title := qu.ObjToString(tmp["title"])
					text += title
					pTime, _ := time.ParseInLocation(qu.Date_Full_Layout, pTmp, time.Local)
					publishtime := pTime.Unix()
					if publishtime > 1000000000 && publishtime < Supplement_Publishtime { // normal exit condition
						isBreak = true
						//break
					} else if publishtime <= 1000000000 { // abnormal publish time
						publishtimeErrTimes++
					}
				}
				logger.Info(s.Code, start, tabLen, repeatListNum)
				downloadAllNum += tabLen              // accumulate the crawl total
				repeatAllNum += repeatListNum         // accumulate the duplicate total
				saveAllNum += num - 1 - repeatListNum // accumulate the saved total
				tmpPageTitleHash := pageTitleHash
				pageTitleHash = util.HexText(text)
				if tabLen == publishtimeErrTimes || tmpPageTitleHash == pageTitleHash { // every publish time on this page is abnormal, or the page content equals the previous page
					//if errtimes == 0 || start == errPageNum+1 {
					errtimes++
					errPageNum = start
					start++
					//}
					continue
				} else if isBreak { // stop crawling
					start++
					break
				}
			} else {
				if downtimes < 3 {
					downtimes++
				} else if errtimes == 0 || start == errPageNum+1 {
					errtimes++
					errPageNum = start
					start++
					downtimes = 0
				}
				continue
			}
		} else {
			if downtimes < 3 {
				downtimes++
			} else if errtimes == 0 || start == errPageNum+1 {
				errtimes++
				errPageNum = start
				start++
				downtimes = 0
			}
			continue
		}
		start++
		downtimes = 0
		errtimes = 0
		errPageNum = 0
		util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
	}
	logger.Info(s.Code, "list crawl summary for this round:", downloadAllNum, repeatAllNum, saveAllNum, finishText)
	save := map[string]interface{}{
		"site":       s.Name,
		"channel":    s.Channel,
		"spidercode": s.Code,
		"comeintime": time.Now().Unix(),
		"modifyuser": s.MUserName,
		"endpage":    start,
		"finish":     finishText,
		"savenum":    saveAllNum,
		"count":      downloadAllNum,
		"repeat":     repeatAllNum,
	}
	MgoS.Save("spider_supplement", save)
	return errs
}
// DownListOnePage downloads a single list page
func (s *Spider) DownListOnePage(pagenum int) (downnum, repeatnum int) {
	defer qu.Catch()
	downtimes := 0
	for downtimes < 3 { // retry up to 3 times on error
		if err := s.L.CallByParam(lua.P{
			Fn:      s.L.GetGlobal("downloadAndParseListPage"),
			NRet:    1,
			Protect: true,
		}, lua.LNumber(pagenum)); err != nil {
			//panic(s.Code + "," + err.Error())
			logger.Error("list page crawl error", pagenum, s.Code+","+err.Error())
			atomic.AddInt32(&s.Script.ErrorNum, 1)
			// retry the page on error
			downtimes++
			continue
		}
		lv := s.L.Get(-1)
		s.L.Pop(1)
		if tbl, ok := lv.(*lua.LTable); ok {
			//list := []map[string]interface{}{}
			//qu.Debug("items on this page:", tbl.Len())
			if tabLen := tbl.Len(); tabLen > 0 { // the list page has data; download detail pages from the list info
				repeatListNum := 0 // number of duplicate links on the current list page
				for i := 1; i <= tabLen; i++ {
					v := tbl.RawGetInt(i).(*lua.LTable)
					tmp := util.TableToMap(v)
					if !s.IsHistoricalMend { // not history backfill
						tmp["dataging"] = 0 // tag the data with dataging=0
						if s.DownDetail {
							s.DownloadDetailItem(tmp, &repeatListNum)
						}
					} else { // history backfill
						s.HistoricalMendDownloadDetailItem(tmp) // history-backfill detail-page download
					}
				}
				repeatnum = repeatListNum
				downnum = tabLen
				return
				//if !s.IsHistoricalMend && !s.DownDetail {
				//	if len(list) > 0 { // save the records
				//		StoreBlak(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, list)
				//	}
				//}
			} else { // the list page may be empty because of network problems, so request it again (all of its items may also have been filtered out)
				downtimes++
				continue
			}
		} else { // requesting the current list page failed
			downtimes++
			continue
		}
	}
	return
}
// ContinueDownListPageItem keeps crawling list pages on a separate thread
func ContinueDownListPageItem(s *Spider) {
	defer qu.Catch()
	spTmp, errstr := CreateSpider(s.SCode, s.ScriptFile, true, true) // build a new spider
	logger.Info(s.SCode, "supplementary paging started...")
	if errstr == "" && spTmp != nil && spTmp.Code != "nil" { // script loaded successfully
		spTmp.ContinueDownListChildTask = true
		defer spTmp.L.Close()
		err := spTmp.DownListPageItem() // download list pages
		logger.Info(s.SCode, "supplementary paging finished...")
		if err != nil {
			logger.Error(spTmp.Code, err)
		}
	}
}
// HistoricalMendDownloadDetailItem iterates list items and downloads detail pages (history backfill)
func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
	//qu.Debug("--------------history download-----------------")
	defer qu.Catch()
	var err interface{}
	data := map[string]interface{}{}
	paramdata := p.(map[string]interface{})
	for k, v := range paramdata {
		data[k] = v
	}
	href := qu.ObjToString(data["href"])
	if len(href) <= 5 { // invalid data
		return
	}
	hashHref := util.HexText(href)
	isExist := util.RedisExist("list", "list_"+hashHref)
	//logger.Debug("full href:", href, " isExist:", isExist)
	if !s.IsMustDownload { // download not forced
		if isExist { // the data already exists: return
			return
		} else if util.Config.IsHistoryEvent { // 1. history backfill on node 7000 (history node): data goes into spider_historydata
			num := 0
			SaveHighListPageData(paramdata, hashHref, &num)
			return
		}
	} else { // forced download is not supported at the moment
		return
	}
	// 2. history backfill off the history node (7000): crawl details right after the list, then take the spider offline (no such spiders at the moment)
	id := ""
	isEsRepeat := false
	if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
		title := qu.ObjToString(paramdata["title"])
		eTime := time.Now().Unix()
		sTime := eTime - int64(7*86400)
		//esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
		esQuery := elc.NewBoolQuery().Must(elc.NewRangeQuery("comeintime").Gte(sTime).Lte(eTime)).Must(elc.NewTermQuery("title.mtitle", title))
		if Es.Count(EsIndex, EsType, esQuery) > 0 { // es already has this title: skip it and update the state in the list table
			isEsRepeat = true
		}
	}
	SaveListPageData(paramdata, &id, isEsRepeat) // save the crawl record
	if isEsRepeat { // title-duplicate (competing-product class) data is added to redis
		util.RedisSet("list", "list_"+hashHref, "", 86400*365*2)
		util.AddBloomRedis("href", href)
		return
	}
	//qu.Debug("----------------download, parse, save--------------------")
	// download the detail page
	data, err = s.DownloadDetailPage(paramdata, data)
	if err != nil || data == nil { // download failed: stop
		if err != nil {
			logger.Error(s.Code, err, paramdata)
		}
		// mark the spider_listdata record as failed
		MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
		return
	}
	util.RedisSet("list", "list_"+hashHref, "", 86400*365*2) // crawl succeeded: add to the list-page redis
	// dedup check based on the publish time
	tmphref := qu.ObjToString(data["href"]) // use tmphref: the detail page may have replaced the href
	publishtime := qu.Int64All(data["l_np_publishtime"])
	if publishtime < time.Now().AddDate(-1, 0, 0).Unix() { // data older than a year goes through the full-volume bloom redis href dedup
		isExist, _ = util.ExistsBloomRedis("href", tmphref)
		if isExist {
			MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "bloom_href", "tmphref": tmphref, "updatetime": time.Now().Unix()}})
			return
		}
	}
	// data filtered out by the detail page
	set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
	if data["delete"] != nil {
		//util.AddBloomRedis("href", tmphref) // "delete" may mark data from a redirecting site; adding it to the full-volume redis could make that site impossible to crawl
		set["exist"] = "delete"
		//MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
		MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
		return
	}
	// mark the spider_listdata record as downloaded (update by href; it may be updated again by a later successful download)
	MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
	// the detail page may replace the href; after a successful crawl the original href is added to the full-volume redis
	//if tmphref := qu.ObjToString(data["href"]); tmphref != href {
	//	util.AddBloomRedis("href", href)
	//}
	flag := true
	//publishtime := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) // publishtime
	if s.IsMustDownload { // forced download
		if isExist && publishtime < time.Now().AddDate(0, 0, -5).Unix() {
			//qu.Debug("forced download, href already in redis")
			data["dataging"] = 1 // dataging=1 makes the save service update by the id stored for this href in redis (redis no longer stores ids, so this has no effect)
			flag = false
		} else {
			//qu.Debug("forced download, href not in redis")
			data["dataging"] = 0
		}
	} else { // download not forced
		if !isExist {
			//qu.Debug("not forced, href not in redis")
			data["dataging"] = 0
		}
	}
	//if publishtime > time.Now().Unix() { // guard against publish times in the future
	//	data["publishtime"] = time.Now().Unix()
	//}
	delete(data, "state")
	delete(data, "exit")
	delete(data, "checkpublishtime")
	data["comeintime"] = time.Now().Unix()
	data["spidercode"] = s.Code
	//qu.Debug("--------------start saving---------------")
	data["iscompete"] = s.IsCompete // spiders added after 2021-11-01 no longer show the original link (checked by the save service)
	Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, flag)
	//qu.Debug("--------------saving done---------------")
}
// DownloadDetailItem iterates list items and downloads detail pages (incremental)
func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
	defer qu.Catch()
	var err interface{}
	data := map[string]interface{}{}
	paramdata := p.(map[string]interface{})
	for k, v := range paramdata {
		data[k] = v
	}
	href := qu.ObjToString(data["href"])
	if len(href) <= 5 { // invalid data
		*num++ // count it as already crawled
		return
	}
	hashHref := util.HexText(href)
	// list-page redis dedup
	isExist := util.RedisExist("list", "list_"+hashHref)
	if Supplement && !isExist { // supplementary crawl: also check the full-volume redis
		isExist, _ = util.ExistsBloomRedis("href", href)
	}
	if isExist {
		*num++ // already crawled
		return
	}
	id := "" // id of the record saved to spider_listdata, used to update its state after a successful download
	//if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { // nodes with separate list/detail crawling and data from new spiders on node 7000
	if util.Config.Modal == 1 || util.Config.IsHistoryEvent || Supplement { // separate crawl mode and the history node (7000)
		SaveHighListPageData(paramdata, hashHref, num) // save to the table
		return
	} else {
		if !s.Stop {
			UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail", false) // record the detail-page heartbeat of the old modal=0 mode
		}
		isEsRepeat := false
		if delaySite := DelaySiteMap[s.Name]; delaySite != nil && delaySite.Compete {
			title := qu.ObjToString(paramdata["title"])
			eTime := time.Now().Unix()
			sTime := eTime - int64(7*86400)
			//esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
			esQuery := elc.NewBoolQuery().Must(elc.NewRangeQuery("comeintime").Gte(sTime).Lte(eTime)).Must(elc.NewTermQuery("title.mtitle", title))
			if Es.Count(EsIndex, EsType, esQuery) > 0 { // es already has this title: skip it and update the state in the list table
				isEsRepeat = true
			}
		}
		SaveListPageData(paramdata, &id, isEsRepeat) // save list-page data crawled by nodes 7000, 7410, 7500, 7510, 7520, 7700
		if isEsRepeat { // title-duplicate (competing-product class) data is added to redis
			util.RedisSet("list", "list_"+hashHref, "", 86400*365*2)
			util.AddBloomRedis("href", href)
			return
		}
	}
	// download the detail page
	data, err = s.DownloadDetailPage(paramdata, data)
	if err != nil || data == nil {
		*num++ // in sequential mode a failed download counts as a duplicate (otherwise permanently failing items would bump the full-crawl counter every round)
		if err != nil {
			logger.Error(s.Code, err, paramdata)
			//if len(paramdata) > 0 {
			//	SaveErrorData(s.MUserName, paramdata, err) // save the error info
			//}
		}
		// mark the spider_listdata record as failed
		MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}})
		return
	} /*else if tmphref := qu.ObjToString(data["href"]); tmphref != href { // the detail page replaced the href
		util.RedisClusterSet(hashHref, "", -1) // store the list-page href in the full-volume redis
	}*/
	util.RedisSet("list", "list_"+hashHref, "", 86400*365*2) // add to the list-page redis
	// dedup check based on the publish time
	tmphref := qu.ObjToString(data["href"])
	publishtime := qu.Int64All(data["l_np_publishtime"])
	// node 7410 (href-changing node) or data older than a year goes through the full-volume bloom redis href dedup
	if util.Config.Uploadevent == 7410 || publishtime < time.Now().AddDate(-1, 0, 0).Unix() {
		isExist, _ = util.ExistsBloomRedis("href", tmphref)
		if isExist {
			//MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "bloom_href", "tmphref": tmphref, "updatetime": time.Now().Unix()}})
			MgoS.Update("spider_listdata", map[string]interface{}{"href": tmphref}, map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "bloom_href", "byid": id, "tmphref": tmphref, "updatetime": time.Now().Unix()}}, false, true)
			return
		}
	}
	// heartbeat: detail page downloaded successfully
	if !s.Stop {
		UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) // record the data heartbeat of the old modal=0 mode
	}
	set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
	// data filtered out by the detail page
	if data["delete"] != nil {
		//util.AddBloomRedis("href", tmphref) // "delete" may mark data from a redirecting site; adding it to the full-volume redis could make that site impossible to crawl
		set["exist"] = "delete"
		//MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
		MgoS.UpdateById("spider_listdata", id, map[string]interface{}{"$set": set})
		return
	}
	set["byid"] = id
	// mark the spider_listdata record as downloaded (update by href; it may be updated again by a later successful download)
	MgoS.Update("spider_listdata", map[string]interface{}{"href": href}, map[string]interface{}{"$set": set}, false, true)
	// the detail page may replace the href; after a successful crawl the original href is added to the full-volume redis
	//if tmphref := qu.ObjToString(data["href"]); tmphref != href {
	//	util.AddBloomRedis("href", href)
	//}
	delete(data, "state")
	delete(data, "exit")
	delete(data, "checkpublishtime")
	data["comeintime"] = time.Now().Unix()
	data["spidercode"] = s.Code
	data["iscompete"] = s.IsCompete   // spiders added after 2021-11-01 no longer show the original link (checked by the save service)
	data["infoformat"] = s.Infoformat // spider category
	Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
}
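// Dedup layering used by the two functions above, summarised for reference: a
// list-page redis key ("list_"+util.HexText(href)) filters items seen recently, a
// full-volume bloom-filter redis keyed by href catches old or cross-node
// duplicates, and for "competing" sites an Elasticsearch title query over the last
// 7 days suppresses re-posts whose href changed. Only items that pass every
// applicable layer reach DownloadDetailPage and Store.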
// DownloadDetailByNames iterates and downloads directory (name-list) entries
func (s *Spider) DownloadDetailByNames(p interface{}) {
	defer qu.Catch()
	var err interface{}
	/*
		if s.Stop {
			return
		}
		for s.Pass {
			util.TimeSleepFunc(2*time.Second, TimeSleepChan)
		}
	*/
	data := map[string]interface{}{}
	paramdata := p.(map[string]interface{})
	for k, v := range paramdata {
		data[k] = v
	}
	if s.DownDetail {
		href := qu.ObjToString(data["href"])
		if href == "" || len(href) < 5 { // invalid data
			return
		}
		// download, parse, save
		data, err = s.DownloadDetailPage(paramdata, data)
		if err != nil {
			logger.Error(s.Code, paramdata, err)
			return
		}
	}
	data["comeintime"] = time.Now().Unix()
	//atomic.AddInt32(&s.LastDowncount, 1)
	//atomic.AddInt32(&s.TodayDowncount, 1)
	//atomic.AddInt32(&s.TotalDowncount, 1)
	Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
}
// Download and parse the detail page
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
	defer qu.Catch()
	s.LastHeartbeat = time.Now().Unix()
	util.TimeSleepFunc((time.Duration(s.SleepBase+util.GetRandMath(s.SleepRand)))*time.Millisecond, TimeSleepChan)
	//copy the param map into a Lua table; only string, integer, float and bool values are passed through
	tab := s.L.NewTable()
	for k, v := range param {
		if val, ok := v.(string); ok {
			tab.RawSet(lua.LString(k), lua.LString(val))
		} else if val, ok := v.(int64); ok {
			tab.RawSet(lua.LString(k), lua.LNumber(val))
		} else if val, ok := v.(int32); ok {
			tab.RawSet(lua.LString(k), lua.LNumber(val))
		} else if val, ok := v.(float64); ok {
			tab.RawSet(lua.LString(k), lua.LNumber(val))
		} else if val, ok := v.(float32); ok {
			tab.RawSet(lua.LString(k), lua.LNumber(val))
		} else if val, ok := v.(bool); ok {
			tab.RawSet(lua.LString(k), lua.LBool(val))
		}
	}
	var err error
	if err = s.L.CallByParam(lua.P{
		Fn:      s.L.GetGlobal("downloadDetailPage"),
		NRet:    1,
		Protect: true,
	}, tab); err != nil {
		//panic(s.Code + "," + err.Error())
		log.Println(s.Code + "," + err.Error())
		atomic.AddInt32(&s.Script.ErrorNum, 1)
		return data, err
	}
	lv := s.L.Get(-1)
	s.L.Pop(1)
	//convert the returned Lua table back into the result map
	if v3, ok := lv.(*lua.LTable); ok {
		v3.ForEach(func(k, v lua.LValue) {
			if tmp, ok := k.(lua.LString); ok {
				key := string(tmp)
				if value, ok := v.(lua.LString); ok {
					data[key] = string(value)
				} else if value, ok := v.(lua.LNumber); ok {
					data[key] = int64(value)
				} else if value, ok := v.(*lua.LTable); ok {
					tmp := util.TableToMap(value)
					data[key] = tmp
				}
			}
		})
		return data, err
	} else {
		return nil, err
	}
}
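
// Usage sketch (hypothetical values; mirrors the call sites in DownloadDetailByNames and DownloadDetail):
//
//	param := map[string]interface{}{"href": "http://example.com/notice/1.html", "title": "some notice"}
//	data, err := s.DownloadDetailPage(param, map[string]interface{}{"href": "http://example.com/notice/1.html"})
//
// The Lua script loaded into s.L is expected to define a global function named "downloadDetailPage"
// that takes one table argument and returns one table; if the call fails, or the return value is not
// a table, the Go side returns an error or nil data respectively.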
// In high-performance mode, periodically crawl detail (third-level) pages
func DetailData() {
	defer qu.Catch()
	<-InitAllLuaOver //wait until all Lua scripts have been loaded
	if util.Config.Working == 0 && !util.Config.IsHistoryEvent { //high-performance mode and not the 7000 node; only the 7000 node has util.Config.IsHistoryEvent == true
		GetListDataDownloadDetail()
	}
}
func GetListDataDownloadDetail() {
	defer qu.Catch()
	logger.Info("+++++++++++++++++++Download Detail+++++++++++++++++++")
	Allspiders2.Range(func(k, v interface{}) bool {
		sp := v.(*Spider)
		go sp.DownloadHighDetail(true)
		time.Sleep(200 * time.Millisecond) //stagger goroutine start-up
		return true
	})
}
// In high-performance mode, download detail pages based on list-page data
func (s *Spider) DownloadHighDetail(reload bool) {
	defer qu.Catch()
	for {
		logger.Info("Detail Running Code:", s.Code, " Stop:", s.Stop)
		if !s.Stop { //spider is online and running
			s.DownloadDetail(reload, false)
		} else {
			break
		}
	}
}
// In queue mode, download detail pages based on list-page data
func (s *Spider) DownloadListDetail(reload bool) {
	defer qu.Catch()
	s.DownloadDetail(reload, false)
	//in queue mode, once the spider has finished its detail downloads (or has nothing to download), it is closed after use
	s.Stop = true
	if _, b := Allspiders2.Load(s.Code); b {
		Allspiders2.Store(s.Code, s)
	}
	s.L.Close()
	CC2 <- s.L
}
// Download detail pages
func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
	defer qu.Catch()
	coll := "spider_highlistdata"
	isEsRepeat := false //whether to deduplicate against es
	q := map[string]interface{}{
		"spidercode": s.Code,
		"state":      0, //0: pending; -1: download failed; 1: success
	}
	o := map[string]interface{}{"_id": -1}
	if !isHistory { //not a history download: add a comeintime filter
		comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //only pick up data from the last util.Config.DayNum days, so records that never download do not pile up
		if delaySite := DelaySiteMap[s.Name]; delaySite != nil {
			isEsRepeat = delaySite.Compete
			if delaySite.DelayTime <= util.Config.DayNum*24 { //site configured for delayed crawling: data is crawled DelayTime hours late (nodes 7410, 7500 and 7700 crawl sequentially and cannot be delayed)
				//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
				comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delaySite.DelayTime)
			}
		}
		q["comeintime"] = comeintimeQuery
	} else {
		coll = "spider_historydata"
		o["_id"] = 1 //history data in ascending order
	}
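	// For a regular (non-history) run the query sent to MongoDB is roughly (illustrative shape):
	//   {"spidercode": s.Code, "state": 0, "comeintime": {"$gte": <now - DayNum days>, "$lte": <now - DelayTime hours, delayed sites only>}}
	// while a history run queries spider_historydata for {"spidercode": s.Code, "state": 0} in ascending _id order.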
	f := map[string]interface{}{
		"state":      0,
		"comeintime": 0,
		"event":      0,
	}
	if !isHistory && !s.Stop { //if the spider has been taken offline while downloading detail pages, stop recording heartbeats
		UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail", false) //record the modal=1 detail-page heartbeat
	}
	countNum := MgoS.Count(coll, q) //count the records not yet downloaded within the util.Config.DayNum-day window
	if isHistory && countNum == 0 { //no history data left to download: stop manually
		s.Stop = true
		return
	}
	//logger.Info("Thread Info: Code:", s.SCode, " count:", countNum)
	if countNum > 0 {
		threadNum := countNum / util.Config.ThreadBaseNum //number of worker threads
		if threadNum > util.Config.ThreadUpperLimit {     //cap the per-spider thread count
			threadNum = util.Config.ThreadUpperLimit
		}
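		// Worked example (hypothetical config values): with countNum = 1000 pending records,
		// ThreadBaseNum = 100 and ThreadUpperLimit = 5, threadNum starts at 1000/100 = 10
		// and is then capped at 5.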
		logger.Info("Thread Info: Code:", s.SCode, " count:", countNum, " thread num:", threadNum)
		list, _ := MgoS.Find(coll, q, o, f, false, 0, 200)
		if list != nil && len(*list) > 0 {
			spChan := make(chan *Spider, threadNum+1) //initialize the worker channel (+1 for the main-thread spider)
			if threadNum > 1 { //initialize additional sp objects
				if !isHistory {
					//read the spider info from LoopListPath so that newly created spider objects always use the latest script (LoopListPath is refreshed when a spider is re-published)
					if v, ok := LoopListPath.Load(s.Code); ok && v != nil {
						if info, ok := v.(map[string]string); ok {
							NewSpiderByScript(threadNum, s.Code, info["script"], spChan)
						} else {
							logger.Debug("LoopListPath Not Has Code:", s.Code)
							spChan = make(chan *Spider, 1) //cannot create extra sp objects; only the main-thread sp can be used
						}
					} else {
						logger.Debug("LoopListPath Not Has Code:", s.Code)
						spChan = make(chan *Spider, 1) //cannot create extra sp objects; only the main-thread sp can be used
					}
				} else {
					NewSpiderByScript(threadNum, s.Code, s.ScriptFile, spChan)
				}
			}
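			// spChan acts as a pool of reusable Spider instances: each goroutine below takes a *Spider
			// from the channel before downloading and puts it back when it is done, so at most
			// threadNum+1 downloads (the extra slot being the main-thread sp) run concurrently.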
			spChan <- s //put the main-thread sp into the channel
			wg := &sync.WaitGroup{}
			spLock := &sync.Mutex{}
			updateArr := [][]map[string]interface{}{}
			for _, tmp := range *list {
				spTmp := <-spChan //take an sp object from the channel
				wg.Add(1)
				atomic.AddInt64(&AllThreadNum, 1)
				go func(tmp map[string]interface{}, sp *Spider) {
					defer func() {
						spChan <- sp //put the sp object back into the channel once the record has been handled
						wg.Done()
						atomic.AddInt64(&AllThreadNum, -1)
					}()
					if s.Stop || sp == nil { //spider taken offline, or sp failed to initialize: do not download
						return
					}
					_id := tmp["_id"]
					query := map[string]interface{}{"_id": _id}
					href := qu.ObjToString(tmp["href"])
					//hashHref := util.HexText(href)
					update := []map[string]interface{}{}
					if isEsRepeat { //deduplicate by title against es
						title := qu.ObjToString(tmp["title"])
						eTime := time.Now().Unix()
						sTime := eTime - int64(7*86400)
						//esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
						esQuery := elc.NewBoolQuery().Must(elc.NewRangeQuery("comeintime").Gte(sTime).Lte(eTime)).Must(elc.NewTermQuery("title.mtitle", title))
						count := Es.Count(EsIndex, EsType, esQuery)
						if count > 0 { //es already contains this title: skip crawling and update the state of the list record
							util.AddBloomRedis("href", href)
							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "es", "updatetime": time.Now().Unix()}} //already exists: set state to 1
							update = append(update, query)
							update = append(update, set)
							spLock.Lock()
							updateArr = append(updateArr, update)
							spLock.Unlock()
							return
						}
					}
					times := qu.IntAll(tmp["times"]) //number of download attempts so far
					success := true                  //whether this record downloaded successfully
					delete(tmp, "_id")
					delete(tmp, "times")
					data := map[string]interface{}{}
					var err interface{}
					for k, v := range tmp {
						data[k] = v
					}
					//download, parse, store
					data, err = sp.DownloadDetailPage(tmp, data)
					if !isHistory && !sp.Stop && sp.IsMainThread { //if the spider was taken offline while downloading detail pages, stop recording heartbeats
						UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute", false) //record the modal=1 download heartbeat
					}
					if err != nil || data == nil {
						success = false
						times++
						if err != nil {
							logger.Error(s.Code, err, tmp)
							//if len(tmp) > 0 && !isHistory { //do not save error info when downloading history data
							//	SaveErrorData(s.MUserName, tmp, err) //save error info
							//}
						} /*else if data == nil && times >= 3 { //download problem: create an editor task
							DownloadErrorData(s.Code, tmp)
						}*/
					} /*else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail-page href was rewritten and differs from the original
						util.RedisClusterSet(hashHref, "", -1)
					}*/
					if !success { //download failed: update the attempt count and state
						ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
						if times >= 3 { //after 3 failed attempts, do not retry today: set state to -1
							ss["state"] = -1
						}
						set := map[string]interface{}{"$set": ss}
						update = append(update, query)
						update = append(update, set)
						spLock.Lock()
						updateArr = append(updateArr, update)
						spLock.Unlock()
						return
					} else if data["delete"] != nil { //detail page filtered out
						//util.AddBloomRedis("href", tmphref) //"delete" may mark data from a redirected site; adding it to the full redis could make that site impossible to crawl
						//records marked for deletion get spider_highlistdata state=1 so they are not downloaded again; redis is updated accordingly
						set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "delete", "updatetime": time.Now().Unix()}}
						update = append(update, query)
						update = append(update, set)
						spLock.Lock()
						updateArr = append(updateArr, update)
						spLock.Unlock()
						return
					}
					//analyze the body text and attachments; abnormal data is downloaded again
					if r := AnalysisProjectInfo(data); r != "" { //sequential crawling skips this check for now (abnormal data is not added to redis and would otherwise be downloaded forever)
						times++
						ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
						if times >= 3 { //after 3 failed attempts, do not retry today: set state to -1
							ss["state"] = -1
							ss["detailfilerr"] = r
						}
						set := map[string]interface{}{"$set": ss}
						update = append(update, query)
						update = append(update, set)
						spLock.Lock()
						updateArr = append(updateArr, update)
						spLock.Unlock()
						return
					}
					//the record was crawled successfully
					//deduplicate by publish time against the bloom redis
					tmphref := qu.ObjToString(data["href"])
					publishtime := qu.Int64All(data["l_np_publishtime"])
					if publishtime < time.Now().AddDate(-1, 0, 0).Unix() { //published more than a year ago
						isExist, _ := util.ExistsBloomRedis("href", tmphref)
						if isExist {
							set := map[string]interface{}{"$set": map[string]interface{}{
								"state":      1,
								"updatetime": time.Now().Unix(),
								"exist":      "bloom_href",
								"tmphref":    tmphref,
							}}
							update = append(update, query)
							update = append(update, set)
							spLock.Lock()
							updateArr = append(updateArr, update)
							spLock.Unlock()
							return
						}
					}
					delete(data, "exit")
					delete(data, "checkpublishtime")
					data["comeintime"] = time.Now().Unix()
					data["spidercode"] = s.Code
					data["dataging"] = 0
					data["iscompete"] = s.IsCompete   //spiders added after 2021-11-01 no longer display the original link (checked by the save service)
					data["infoformat"] = s.Infoformat //spider type
					Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //download succeeded: set state to 1
					update = append(update, query)
					update = append(update, set)
					spLock.Lock()
					updateArr = append(updateArr, update)
					spLock.Unlock()
					//download of this record is complete
				}(tmp, spTmp)
			}
			wg.Wait()
			//flush the accumulated state updates in one bulk write
			if len(updateArr) > 0 {
				MgoS.UpdateBulk(coll, updateArr...)
				updateArr = [][]map[string]interface{}{}
			}
			close(spChan) //close the channel
			//release the sp objects (keep the main-thread sp, IsMainThread=true)
			for sp := range spChan {
				if sp != nil && !sp.IsMainThread {
					sp.L.Close()
				}
			}
			if !s.Stop && reload { //in high-performance mode, reload the sp object after the detail downloads finish
				//reload the main-thread sp
				s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
			}
		}
	} else if reload { //high-performance mode with no pending data: sleep
		time.Sleep(30 * time.Second)
	}
}
// Initialize sp objects
func NewSpiderByScript(num int, code, script string, spChan chan *Spider) {
	for i := 1; i <= num; i++ {
		spTmp, errstr := CreateSpider(code, script, true, true)
		if errstr == "" && spTmp != nil { //script loaded successfully
			spChan <- spTmp
		} else {
			spChan <- nil
		}
	}
}
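
// Note: when CreateSpider fails, a nil placeholder is still pushed onto spChan so that the channel
// holds exactly num values and receivers never block; the download goroutines in DownloadDetail
// check for a nil sp and simply skip the record.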
// If detail is only the placeholder "详情请访问原网页!" and no attachment was uploaded successfully, the record is not counted as a successful download
func AnalysisProjectInfo(data map[string]interface{}) string {
	defer qu.Catch()
	detail := qu.ObjToString(data["detail"])
	if RestrictAccessReg.MatchString(detail) { //access restricted by the site
		return "ip"
	}
	if detail == "详情请访问原网页!" || detail == "<br/>详情请访问原网页!" { //exact match only: some records are assembled from json with missing fields and legitimately append the placeholder text
		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
				for _, data := range attachments {
					if d, ok := data.(map[string]interface{}); ok {
						fid := qu.ObjToString(d["fid"])
						if fid != "" { //attachment uploaded successfully
							return ""
						}
					}
				}
				return "detail_file"
			} else {
				return "detail_file"
			}
		} else {
			return "detail_file"
		}
	}
	return ""
}
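
// Behaviour sketch (hypothetical inputs): a record whose detail is exactly "详情请访问原网页!" and whose
// projectinfo.attachments contains no entry with a non-empty "fid" yields "detail_file"; the same record
// with a successfully uploaded attachment (non-empty "fid") yields ""; a detail matching RestrictAccessReg
// yields "ip".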
// Log the current thread counts once a minute
func AllThreadLog() {
	logger.Info("List Download All Thread:", ListAllThreadNum)
	logger.Info("Detail Download All Thread:", AllThreadNum)
	time.AfterFunc(1*time.Minute, AllThreadLog)
}
// Build a hash code: the portion of the URL matched by Reg (with a trailing slash) plus the sha1 hex digest of the full string
func GetHas1(data string) string {
	t := sha1.New()
	io.WriteString(t, data)
	hf := Reg.FindString(data)
	if !strings.HasSuffix(hf, "/") {
		hf = hf + "/"
	}
	return hf + fmt.Sprintf("%x", t.Sum(nil))
}
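
// Usage sketch (hypothetical URL, assuming Reg matches the scheme-and-host prefix of the URL):
//
//	key := GetHas1("http://example.com/notice/1.html")
//	// => "http://example.com/" followed by the 40-character sha1 hex digest of the full URL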