123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285 |
- package main
- import (
- "bufio"
- "bytes"
- "encoding/json"
- "fmt"
- "github.com/xuri/excelize/v2"
- "io"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "net/http"
- "os"
- "spider_creator/backend/script"
- "time"
- )
- // QlmStopDownloadData 终止下载
- func (a *App) QlmStopDownloadData(record map[string]interface{}) *Result {
- r := &Result{}
- qu.Debug(record)
- glvm.CloseTabs() //关闭浏览器资源
- r.Err = 1
- return r
- }
- // QlmListDataDownload 千里马列表页数据下载
- func (a *App) QlmListDataDownload(param map[string]interface{}, record map[string]interface{}) *Result {
- qu.Debug(param, record)
- r := &Result{}
- if User != nil {
- if !glvm.ScriptRunning {
- page := "list"
- detailScript := glvm.LoadScript("list")
- if detailScript != "" {
- getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
- if r.Err == 1 {
- go DownloadData(record, detailScript, page) //下载
- }
- } else {
- r.Msg = "详情页采集脚本加载失败!"
- }
- } else {
- r.Msg = "同时只能执行一个脚本,请稍后再试!"
- }
- } else {
- r.Msg = "用户登录异常,请重新登录!"
- }
- return r
- }
- // QlmDetailDataDownload 千里马详情页数据下载
- func (a *App) QlmDetailDataDownload(param map[string]interface{}, record map[string]interface{}) *Result {
- qu.Debug(param, record)
- r := &Result{}
- if User != nil {
- if !glvm.ScriptRunning {
- page := "detail"
- detailScript := glvm.LoadScript("detail")
- if detailScript != "" {
- script.Datas = []map[string]interface{}{}
- getData(nil, qu.ObjToString(record["recordid"]), "json", "download", &script.Datas)
- if len(script.Datas) > 0 {
- r.Err = 1
- go DownloadData(record, detailScript, page) //下载
- } else {
- r.Msg = "无可采集数据!"
- }
- //getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
- } else {
- r.Msg = "详情页采集脚本加载失败!"
- }
- } else {
- r.Msg = "同时只能执行一个脚本,请稍后再试!"
- }
- } else {
- r.Msg = "用户登录异常,请重新登录!"
- qu.Debug(r.Msg)
- }
- return r
- }
- // DownloadData 执行脚本下载数据
- func DownloadData(record map[string]interface{}, scriptText, page string) {
- defer qu.Catch()
- glvm.ScriptRunning = true
- defer func() {
- glvm.ScriptRunning = false
- }()
- glvm.ProxyServer, _ = record["proxyServer"].(bool)
- glvm.Headless, _ = record["headless"].(bool)
- glvm.ShowImage, _ = record["showImage"].(bool)
- recordId := qu.ObjToString(record["recordid"])
- //执行脚本
- state := 0
- err := glvm.RunScript(scriptText, recordId)
- if err == nil {
- for len(script.DataCache) > 0 {
- qu.Debug("当前待保存数据量:", len(script.DataCache))
- time.Sleep(time.Second * 1)
- }
- }
- if page == "list" {
- state = 2
- } else if page == "detail" {
- state = 5
- }
- r := &Result{}
- getResult(map[string]interface{}{"param": map[string]interface{}{"recordid": recordId, "state": state}}, r, "qlm/updateRecord")
- }
- // QlmExportExcelFile 导出excel
- func (a *App) QlmExportExcelFile(filepath, recordId string) map[string]interface{} {
- qu.Debug("filepath---", filepath)
- var msg string
- var errType int
- if err := runExportExcelFile(filepath, recordId); err != nil {
- msg = err.Error()
- } else {
- msg = "导出成功"
- errType = 1
- }
- return map[string]interface{}{"err": errType, "msg": msg}
- }
- // QlmExportJsonFile 导出json
- func (a *App) QlmExportJsonFile(filepath, recordId string) map[string]interface{} {
- qu.Debug("filepath---", filepath)
- var msg string
- var errType int
- if err := runQlmExportJsonFile(filepath, recordId); err != nil {
- msg = err.Error()
- } else {
- msg = "导出成功"
- errType = 1
- }
- return map[string]interface{}{"err": errType, "msg": msg}
- }
- func runExportExcelFile(filepath, recordId string) error {
- f := excelize.NewFile()
- defer f.Close()
- f.SetCellStr("Sheet1", "A1", "ID")
- f.SetCellStr("Sheet1", "B1", "标题")
- f.SetCellStr("Sheet1", "C1", "链接")
- f.SetCellStr("Sheet1", "D1", "发布时间")
- f.SetCellStr("Sheet1", "E1", "重复")
- f.SetCellStr("Sheet1", "F1", "详情页采集")
- f.SetCellStr("Sheet1", "G1", "采集账号")
- f.SetCellStr("Sheet1", "H1", "推送状态")
- f.SetCellStr("Sheet1", "I1", "正文")
- getData(f, recordId, "excel", "export", nil)
- err := f.SaveAs(filepath)
- if err != nil {
- return err
- }
- return nil
- }
- func runQlmExportJsonFile(filepath, recordId string) error {
- var result []map[string]interface{}
- getData(nil, recordId, "json", "export", &result)
- jsonData, err := json.MarshalIndent(result, "", " ")
- if err != nil {
- return err
- }
- fo, err := os.Create(filepath)
- if err != nil {
- return err
- }
- defer fo.Close()
- if _, err := fo.Write(jsonData); err != nil {
- return fmt.Errorf("failed to write data to file: %w", err)
- }
- return nil
- }
- func getData(file *excelize.File, recordId, exportStype, from string, result *[]map[string]interface{}) {
- // 将数据编码为JSON格式
- param := map[string]interface{}{
- "recordid": recordId, "from": from,
- }
- jsonData, err := json.Marshal(map[string]interface{}{"param": param})
- if err != nil {
- qu.Debug(err)
- }
- // 创建一个HTTP POST请求
- req, err := http.NewRequest("POST", fmt.Sprintf(serverAddress, "qlm/getData"), bytes.NewBuffer(jsonData))
- if err != nil {
- qu.Debug("Error creating request:", err)
- }
- // 设置请求头,表明发送的是JSON数据
- req.Header.Set("Content-Type", "application/json")
- // 发送HTTP请求并获取响应
- client := &http.Client{}
- resp, err := client.Do(req)
- if err != nil {
- qu.Debug("Error making request:", err)
- }
- defer resp.Body.Close()
- // 检查响应状态码
- if resp.StatusCode != http.StatusOK {
- qu.Debug("Error: server returned status:", resp.StatusCode)
- }
- // 创建一个bufio.Reader来逐行读取响应体(这里假设服务器发送的是逐条JSON对象)
- reader := bufio.NewReader(resp.Body)
- decoder := json.NewDecoder(reader)
- // 逐条读取并处理JSON数据
- n := 0
- index := 0
- for {
- var tmp map[string]interface{}
- // 尝试解码下一条JSON数据
- if err := decoder.Decode(&tmp); err != nil {
- // 检查是否是io.EOF错误,表示已经读取完所有数据
- if err == io.EOF {
- break
- }
- // 对于其他错误,打印错误信息并退出
- qu.Debug(err)
- } else {
- n++
- index++
- if exportStype == "excel" {
- indexStr := fmt.Sprint(index + 1)
- file.SetCellStr("Sheet1", "A"+indexStr, qu.ObjToString(tmp["_id"]))
- file.SetCellStr("Sheet1", "B"+indexStr, qu.ObjToString(tmp["title"]))
- file.SetCellStr("Sheet1", "C"+indexStr, qu.ObjToString(tmp["href"]))
- if ptime, ok := tmp["publishtime"].(string); ok {
- file.SetCellStr("Sheet1", "D"+indexStr, ptime)
- } else {
- publishtime := qu.Int64All(tmp["publishtime"])
- if publishtime == 0 {
- file.SetCellStr("Sheet1", "D"+indexStr, "")
- } else {
- file.SetCellStr("Sheet1", "D"+indexStr, qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout))
- }
- }
- repeatText := "未判重"
- if repeat := tmp["rp"]; repeat != nil {
- if repeatTmp, ok := repeat.(bool); ok && repeatTmp {
- repeatText = "重复"
- } else {
- repeatText = "不重复"
- }
- }
- file.SetCellStr("Sheet1", "E"+indexStr, repeatText)
- stateText := "未采集"
- state := qu.IntAll(tmp["state"])
- if state == 1 {
- stateText = "采集成功"
- } else if state == -1 {
- stateText = "采集失败"
- }
- file.SetCellStr("Sheet1", "F"+indexStr, stateText)
- file.SetCellStr("Sheet1", "G"+indexStr, qu.ObjToString(tmp["username"]))
- pushstateText := "未推送"
- if qu.IntAll(tmp["pushstate"]) == 1 {
- pushstateText = "推送成功"
- }
- file.SetCellStr("Sheet1", "H"+indexStr, pushstateText)
- file.SetCellStr("Sheet1", "I"+indexStr, qu.ObjToString(tmp["detail"]))
- } else if exportStype == "json" {
- *result = append(*result, tmp)
- }
- }
- }
- qu.Debug(recordId, "共获取数据量:", n)
- }
- // 保存数据
- func updateData() {
- for {
- select {
- case data := <-script.DataCache:
- r := &Result{}
- getResult(map[string]interface{}{"param": data}, r, "qlm/updateData")
- if r.Err == 1 {
- qu.Debug("保存成功:", data["href"], data["title"])
- } else {
- qu.Debug("保存失败:", data["href"], data["title"])
- }
- }
- }
- }
|