/** *goquery接口封装,支持直接属性、EQ语法解析 *传入查询表达式,支持多项(多选) **/ package spider_com import ( "fmt" "strings" "github.com/PuerkitoBio/goquery" "github.com/yuin/gopher-lua" ) var cut *Cut func init() { cut = NewCut() } // 过滤script style等html标签 func FilterNodes(nodetype string, sel *goquery.Selection) *goquery.Selection { for i := 0; i < len((*sel).Nodes); i++ { if (*sel).Nodes[i].Data == nodetype { (*sel).ReplaceWithNodes((*sel).Nodes[i]) } } sel.Children().Each(func(n int, gq *goquery.Selection) { for i := 0; i < len((*gq).Nodes); i++ { if (*gq).Nodes[i].Data == nodetype { (*gq).ReplaceWithNodes((*gq).Nodes[i]) } } gq.Children().Each(func(n2 int, gq2 *goquery.Selection) { FilterNodes(nodetype, gq2) }) }) return sel } // 查询一个文本,支持标签、属性查询 func FindOneText(qpath, content, fnode string) lua.LString { if fnode == "" { fnode = "script,style" } doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) tmp := find(qpath, doc) ret := "" if v, ok := tmp.(*goquery.Selection); ok && v != nil { if fnode != "" { fnodes := strings.Split(fnode, ",") for _, node := range fnodes { v = FilterNodes(node, v) } } ret = v.Text() } else if v, ok := tmp.(string); ok { ret = v } return lua.LString(ret) } // 查询一个html,很少用到 func FindOneHtml(qpath, content, fnode string) lua.LString { if fnode == "" { fnode = "script" } doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) tmp := find(qpath, doc) ret := "" if v, ok := tmp.(*goquery.Selection); ok && v != nil { if fnode != "" { fnodes := strings.Split(fnode, ",") for _, node := range fnodes { v = FilterNodes(node, v) } } ret, _ = v.Html() } return lua.LString(ret) } // 返回列表Text,基本用不上,如果要取文本,直接用FindOneText func FindListText(qpath string, content string, ret *lua.LTable) { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) tmp := find(qpath, doc) if v, ok := tmp.(*goquery.Selection); ok && v != nil { v.Each(func(i int, gq *goquery.Selection) { text := gq.Text() ret.Append(lua.LString(text)) }) } } // 返回列表Html func FindListHtml(qpath string, content string, ret *lua.LTable) { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) tmp := find(qpath, doc) if v, ok := tmp.(*goquery.Selection); ok && v != nil { v.Each(func(i int, gqs *goquery.Selection) { html, _ := gqs.Html() ret.Append(lua.LString(html)) }) } } // 正文清理 func FindContentText(qpath, content string) lua.LString { tmp := FindOneHtml(qpath, content, "") ret := cut.ClearHtml(string(tmp)) return lua.LString(ret) } // 批量查询,一个文档 func FindMap(q *lua.LTable, content string, ret *lua.LTable) { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content)) q.ForEach(func(l1, l2 lua.LValue) { value := l2.String() tmp := find(value, doc) if v, ok := tmp.(*goquery.Selection); ok && v != nil { ret.RawSet(l1, lua.LString(v.Text())) } else if v, ok := tmp.(string); ok { ret.RawSet(l1, lua.LString(v)) } }) } // 分词1 func splitFn1(s rune) bool { return s == ':' } // 分词2 func splitFn2(s rune) bool { return s == '(' || s == ')' } // 查询, func find(paths string, doc *goquery.Document) interface{} { var sel *goquery.Selection //支持传入多个表达式组合,有一个匹配上,即退出 for _, path := range strings.Split(paths, "|") { sel = nil ws := strings.FieldsFunc(path, splitFn1) L: for _, v := range ws { if strings.HasPrefix(v, "eq(") { index := -1 fmt.Sscanf(v, "eq(%d)", &index) if index > -1 && index < sel.Length() { sel = sel.Eq(index) } else { sel = nil break L } //断词后包含查询 if eq := fmt.Sprintf("eq(%d)", index); len(v) > len(eq) { sel = sel.Find(v[len(eq):]) } } else if strings.HasPrefix(v, "attr(") { tmp := strings.FieldsFunc(v, splitFn2) if len(tmp) > 1 { ret, exists := sel.Attr(tmp[1]) if exists { return ret } else { sel = nil break L } } } else { if sel == nil { sel = doc.Find(v) } else { sel = sel.Find(v) } } } //第一条没找到,接着找下一跳表达式 if sel != nil && sel.Length() > 0 { break } } return sel }