123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213 |
- package db
- import (
- "container/list"
- "encoding/json"
- "fmt"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "log"
- "os"
- "sort"
- be "spider_creator/backend"
- "strconv"
- "strings"
- "github.com/bmaupin/go-epub"
- "github.com/boltdb/bolt"
- "github.com/xuri/excelize/v2"
- )
- // Load
- func (s *SpiderDb) Load(code string) *be.SpiderConfig {
- var req *be.SpiderConfig = new(be.SpiderConfig)
- err := s.db.View(func(tx *bolt.Tx) error {
- bucket := tx.Bucket([]byte("myBucket"))
- value := bucket.Get([]byte(code))
- if value != nil && len(value) > 0 {
- _ = json.Unmarshal(value, req)
- }
- return nil
- })
- if err != nil {
- log.Fatal(err)
- }
- return req
- }
- // SaveOrUpdate
- func (s *SpiderDb) SaveOrUpdate(sc *be.SpiderConfig) {
- //加载原始数据
- var sc1 *be.SpiderConfig = new(be.SpiderConfig)
- var sc2 *be.SpiderConfig
- err := s.db.View(func(tx *bolt.Tx) error {
- bucket := tx.Bucket([]byte("myBucket"))
- value := bucket.Get([]byte(sc.Code))
- if value != nil && len(value) > 0 {
- _ = json.Unmarshal(value, sc1)
- }
- return nil
- })
- if err != nil {
- qu.Debug(err.Error())
- return
- }
- //更新
- if sc1 != nil {
- sc2 = be.MergeSpiderConfig(sc1, sc)
- value, _ := json.Marshal(sc2)
- err = s.db.Update(func(tx *bolt.Tx) error {
- bucket := tx.Bucket([]byte("myBucket"))
- err := bucket.Put([]byte(sc.Code), value)
- return err
- })
- if err != nil {
- qu.Debug(err.Error())
- return
- }
- }
- }
- // LoadAll,默认按照代码排序
- func (s *SpiderDb) LoadSpiderConfigAll() be.SpiderConfiges {
- ret := make(be.SpiderConfiges, 0)
- // 开始读取事务
- err := s.db.View(func(tx *bolt.Tx) error {
- // 遍历数据库中的所有桶
- bucket := tx.Bucket([]byte("myBucket"))
- // 遍历桶中的所有键/值对
- return bucket.ForEach(func(k, v []byte) error {
- var sf *be.SpiderConfig = new(be.SpiderConfig)
- json.Unmarshal(v, sf)
- if sf != nil {
- ret = append(ret, sf)
- }
- return nil
- })
- })
- sort.Sort(ret)
- if err != nil {
- qu.Debug(err.Error())
- }
- return ret
- }
- // Delete
- func (s *SpiderDb) DeleteSpiderConfig(code string) {
- err := s.db.Update(func(tx *bolt.Tx) error {
- bucket := tx.Bucket([]byte("myBucket"))
- err := bucket.Delete([]byte(code))
- return err
- })
- if err != nil {
- qu.Debug(err.Error())
- return
- }
- }
- // 批量导入
- func (s *SpiderDb) BatchImport(filepath string) error {
- f, err := excelize.OpenFile(filepath)
- if err != nil {
- return err
- }
- defer f.Close()
- for _, sheetName := range f.GetSheetList() {
- // 获取工作表的所有行
- rows, err := f.GetRows(sheetName)
- if err != nil {
- continue
- }
- //
- for index, row := range rows {
- if index == 0 || len(row) < 5 || row[0] == "" || row[3] == "" {
- continue
- }
- sc := &be.SpiderConfig{
- Code: row[0],
- Site: row[1],
- Channel: row[2],
- Href: row[3],
- ModifyUser: row[4],
- }
- value, _ := json.Marshal(sc)
- err = s.db.Update(func(tx *bolt.Tx) error {
- bucket := tx.Bucket([]byte("myBucket"))
- err := bucket.Put([]byte(sc.Code), value)
- return err
- })
- if err != nil {
- continue
- }
- }
- }
- return nil
- }
- // ExportEpubFile 导出epub文件
- func (db *SpiderDb) ExportEpubFile(bookname, filepath string,
- currentResult *list.List) error {
- output := epub.NewEpub(bookname)
- output.SetTitle(bookname)
- output.SetDescription(bookname)
- output.SetAuthor("unknow")
- i := 1
- for el := currentResult.Front(); el != nil; el = el.Next() {
- art, _ := el.Value.(*be.ResultItem)
- body := "<h2>" + art.Title + "</h2><p>" + strings.Join(strings.Split(art.Content, "\n"), "</p><p>") + "</p>"
- output.AddSection(body, art.Title, fmt.Sprintf("%06d.xhtml", i+1), "")
- i += 1
- }
- fo, err := os.Create(filepath)
- if err != nil {
- db.enf.Dispatch("debug_event", err.Error())
- }
- output.WriteTo(fo)
- fo.Close()
- return nil
- }
- // ExportExcelFile数据集导出到excel文件中
- func (db *SpiderDb) ExportExcelFile(filepath, site, channel string,
- currentResult *list.List) error {
- f := excelize.NewFile()
- defer f.Close()
- f.SetCellStr("Sheet1", "A1", "站点")
- f.SetCellStr("Sheet1", "B1", "栏目")
- //写入数据
- f.SetCellStr("Sheet1", "C1", "标题")
- f.SetCellStr("Sheet1", "D1", "链接")
- f.SetCellStr("Sheet1", "E1", "发布单位")
- f.SetCellStr("Sheet1", "F1", "发布时间")
- f.SetCellStr("Sheet1", "G1", "正文")
- f.SetCellStr("Sheet1", "H1", "附件")
- i := 0
- for el := currentResult.Front(); el != nil; el = el.Next() {
- r, _ := el.Value.(*be.ResultItem)
- //写入站点信息
- iStr := strconv.Itoa(i + 2)
- f.SetCellStr("Sheet1", "A"+iStr, site)
- f.SetCellStr("Sheet1", "B"+iStr, channel)
- //写入数据
- f.SetCellStr("Sheet1", "C"+iStr, r.Title)
- f.SetCellStr("Sheet1", "D"+iStr, r.Href)
- f.SetCellStr("Sheet1", "E"+iStr, r.PublishUnit)
- f.SetCellStr("Sheet1", "F"+iStr, r.ListPubTime)
- f.SetCellStr("Sheet1", "G"+iStr, r.Content)
- f.SetCellStr("Sheet1", "H"+iStr, "")
- if len(r.AttachLinks) > 0 {
- bs, err := json.Marshal(r.AttachLinks)
- if err == nil {
- f.SetCellStr("Sheet1", "H"+iStr, string(bs))
- }
- }
- i += 1
- }
- err := f.SaveAs(filepath)
- if err != nil {
- return err
- }
- return nil
- }
|