123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185 |
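/**
 * Wrapper around the goquery API, with support for direct attribute lookup
 * and eq() syntax parsing.
 * Takes a query expression; several alternatives may be supplied at once.
 **/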
- package spider_com
- import (
- "fmt"
- "strings"
- "github.com/PuerkitoBio/goquery"
- "github.com/yuin/gopher-lua"
- )
- var cut *Cut
- func init() {
- cut = NewCut()
- }
- // 过滤script style等html标签
- func FilterNodes(nodetype string, sel *goquery.Selection) *goquery.Selection {
- for i := 0; i < len((*sel).Nodes); i++ {
- if (*sel).Nodes[i].Data == nodetype {
- (*sel).ReplaceWithNodes((*sel).Nodes[i])
- }
- }
- sel.Children().Each(func(n int, gq *goquery.Selection) {
- for i := 0; i < len((*gq).Nodes); i++ {
- if (*gq).Nodes[i].Data == nodetype {
- (*gq).ReplaceWithNodes((*gq).Nodes[i])
- }
- }
- gq.Children().Each(func(n2 int, gq2 *goquery.Selection) {
- FilterNodes(nodetype, gq2)
- })
- })
- return sel
- }
- // 查询一个文本,支持标签、属性查询
- func FindOneText(qpath, content, fnode string) lua.LString {
- if fnode == "" {
- fnode = "script,style"
- }
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
- tmp := find(qpath, doc)
- ret := ""
- if v, ok := tmp.(*goquery.Selection); ok && v != nil {
- if fnode != "" {
- fnodes := strings.Split(fnode, ",")
- for _, node := range fnodes {
- v = FilterNodes(node, v)
- }
- }
- ret = v.Text()
- } else if v, ok := tmp.(string); ok {
- ret = v
- }
- return lua.LString(ret)
- }
- // 查询一个html,很少用到
- func FindOneHtml(qpath, content, fnode string) lua.LString {
- if fnode == "" {
- fnode = "script"
- }
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
- tmp := find(qpath, doc)
- ret := ""
- if v, ok := tmp.(*goquery.Selection); ok && v != nil {
- if fnode != "" {
- fnodes := strings.Split(fnode, ",")
- for _, node := range fnodes {
- v = FilterNodes(node, v)
- }
- }
- ret, _ = v.Html()
- }
- return lua.LString(ret)
- }
- // 返回列表Text,基本用不上,如果要取文本,直接用FindOneText
- func FindListText(qpath string, content string, ret *lua.LTable) {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
- tmp := find(qpath, doc)
- if v, ok := tmp.(*goquery.Selection); ok && v != nil {
- v.Each(func(i int, gq *goquery.Selection) {
- text := gq.Text()
- ret.Append(lua.LString(text))
- })
- }
- }
- // 返回列表Html
- func FindListHtml(qpath string, content string, ret *lua.LTable) {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
- tmp := find(qpath, doc)
- if v, ok := tmp.(*goquery.Selection); ok && v != nil {
- v.Each(func(i int, gqs *goquery.Selection) {
- html, _ := gqs.Html()
- ret.Append(lua.LString(html))
- })
- }
- }
- // 正文清理
- func FindContentText(qpath, content string) lua.LString {
- tmp := FindOneHtml(qpath, content, "")
- ret := cut.ClearHtml(string(tmp))
- return lua.LString(ret)
- }
- // 批量查询,一个文档
- func FindMap(q *lua.LTable, content string, ret *lua.LTable) {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
- q.ForEach(func(l1, l2 lua.LValue) {
- value := l2.String()
- tmp := find(value, doc)
- if v, ok := tmp.(*goquery.Selection); ok && v != nil {
- ret.RawSet(l1, lua.LString(v.Text()))
- } else if v, ok := tmp.(string); ok {
- ret.RawSet(l1, lua.LString(v))
- }
- })
- }
// splitFn1 reports whether s is the ':' separator. find passes it to
// strings.FieldsFunc to break one query alternative into its steps.
func splitFn1(s rune) bool {
	const stepSeparator = ':'
	if s == stepSeparator {
		return true
	}
	return false
}
// splitFn2 reports whether s is an opening or closing parenthesis. find
// passes it to strings.FieldsFunc to pull the name out of an "attr(name)"
// step.
func splitFn2(s rune) bool {
	switch s {
	case '(', ')':
		return true
	default:
		return false
	}
}
- // 查询,
- func find(paths string, doc *goquery.Document) interface{} {
- var sel *goquery.Selection
- //支持传入多个表达式组合,有一个匹配上,即退出
- for _, path := range strings.Split(paths, "|") {
- sel = nil
- ws := strings.FieldsFunc(path, splitFn1)
- L:
- for _, v := range ws {
- if strings.HasPrefix(v, "eq(") {
- index := -1
- fmt.Sscanf(v, "eq(%d)", &index)
- if index > -1 && index < sel.Length() {
- sel = sel.Eq(index)
- } else {
- sel = nil
- break L
- }
- //断词后包含查询
- if eq := fmt.Sprintf("eq(%d)", index); len(v) > len(eq) {
- sel = sel.Find(v[len(eq):])
- }
- } else if strings.HasPrefix(v, "attr(") {
- tmp := strings.FieldsFunc(v, splitFn2)
- if len(tmp) > 1 {
- ret, exists := sel.Attr(tmp[1])
- if exists {
- return ret
- } else {
- sel = nil
- break L
- }
- }
- } else {
- if sel == nil {
- sel = doc.Find(v)
- } else {
- sel = sel.Find(v)
- }
- }
- }
- //第一条没找到,接着找下一跳表达式
- if sel != nil && sel.Length() > 0 {
- break
- }
- }
- return sel
- }
|