quyer.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
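// Package spider_com wraps goquery lookups behind a small query-expression
// syntax that supports direct attribute extraction ("attr(name)") and
// positional selection ("eq(n)").
// A query may hold several alternative expressions separated by "|".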
  5. package spider_com
  6. import (
  7. "fmt"
  8. "strings"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/yuin/gopher-lua"
  11. )
  12. var cut *Cut
  13. func init() {
  14. cut = NewCut()
  15. }
  16. // 过滤script style等html标签
  17. func FilterNodes(nodetype string, sel *goquery.Selection) *goquery.Selection {
  18. for i := 0; i < len((*sel).Nodes); i++ {
  19. if (*sel).Nodes[i].Data == nodetype {
  20. (*sel).ReplaceWithNodes((*sel).Nodes[i])
  21. }
  22. }
  23. sel.Children().Each(func(n int, gq *goquery.Selection) {
  24. for i := 0; i < len((*gq).Nodes); i++ {
  25. if (*gq).Nodes[i].Data == nodetype {
  26. (*gq).ReplaceWithNodes((*gq).Nodes[i])
  27. }
  28. }
  29. gq.Children().Each(func(n2 int, gq2 *goquery.Selection) {
  30. FilterNodes(nodetype, gq2)
  31. })
  32. })
  33. return sel
  34. }
  35. // 查询一个文本,支持标签、属性查询
  36. func FindOneText(qpath, content, fnode string) lua.LString {
  37. if fnode == "" {
  38. fnode = "script,style"
  39. }
  40. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
  41. tmp := find(qpath, doc)
  42. ret := ""
  43. if v, ok := tmp.(*goquery.Selection); ok && v != nil {
  44. if fnode != "" {
  45. fnodes := strings.Split(fnode, ",")
  46. for _, node := range fnodes {
  47. v = FilterNodes(node, v)
  48. }
  49. }
  50. ret = v.Text()
  51. } else if v, ok := tmp.(string); ok {
  52. ret = v
  53. }
  54. return lua.LString(ret)
  55. }
  56. // 查询一个html,很少用到
  57. func FindOneHtml(qpath, content, fnode string) lua.LString {
  58. if fnode == "" {
  59. fnode = "script"
  60. }
  61. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
  62. tmp := find(qpath, doc)
  63. ret := ""
  64. if v, ok := tmp.(*goquery.Selection); ok && v != nil {
  65. if fnode != "" {
  66. fnodes := strings.Split(fnode, ",")
  67. for _, node := range fnodes {
  68. v = FilterNodes(node, v)
  69. }
  70. }
  71. ret, _ = v.Html()
  72. }
  73. return lua.LString(ret)
  74. }
  75. // 返回列表Text,基本用不上,如果要取文本,直接用FindOneText
  76. func FindListText(qpath string, content string, ret *lua.LTable) {
  77. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
  78. tmp := find(qpath, doc)
  79. if v, ok := tmp.(*goquery.Selection); ok && v != nil {
  80. v.Each(func(i int, gq *goquery.Selection) {
  81. text := gq.Text()
  82. ret.Append(lua.LString(text))
  83. })
  84. }
  85. }
  86. // 返回列表Html
  87. func FindListHtml(qpath string, content string, ret *lua.LTable) {
  88. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
  89. tmp := find(qpath, doc)
  90. if v, ok := tmp.(*goquery.Selection); ok && v != nil {
  91. v.Each(func(i int, gqs *goquery.Selection) {
  92. html, _ := gqs.Html()
  93. ret.Append(lua.LString(html))
  94. })
  95. }
  96. }
  97. // 正文清理
  98. func FindContentText(qpath, content string) lua.LString {
  99. tmp := FindOneHtml(qpath, content, "")
  100. ret := cut.ClearHtml(string(tmp))
  101. return lua.LString(ret)
  102. }
  103. // 批量查询,一个文档
  104. func FindMap(q *lua.LTable, content string, ret *lua.LTable) {
  105. doc, _ := goquery.NewDocumentFromReader(strings.NewReader(content))
  106. q.ForEach(func(l1, l2 lua.LValue) {
  107. value := l2.String()
  108. tmp := find(value, doc)
  109. if v, ok := tmp.(*goquery.Selection); ok && v != nil {
  110. ret.RawSet(l1, lua.LString(v.Text()))
  111. } else if v, ok := tmp.(string); ok {
  112. ret.RawSet(l1, lua.LString(v))
  113. }
  114. })
  115. }
  116. // 分词1
  117. func splitFn1(s rune) bool {
  118. return s == ':'
  119. }
  120. // 分词2
  121. func splitFn2(s rune) bool {
  122. return s == '(' || s == ')'
  123. }
  124. // 查询,
  125. func find(paths string, doc *goquery.Document) interface{} {
  126. var sel *goquery.Selection
  127. //支持传入多个表达式组合,有一个匹配上,即退出
  128. for _, path := range strings.Split(paths, "|") {
  129. sel = nil
  130. ws := strings.FieldsFunc(path, splitFn1)
  131. L:
  132. for _, v := range ws {
  133. if strings.HasPrefix(v, "eq(") {
  134. index := -1
  135. fmt.Sscanf(v, "eq(%d)", &index)
  136. if index > -1 && index < sel.Length() {
  137. sel = sel.Eq(index)
  138. } else {
  139. sel = nil
  140. break L
  141. }
  142. //断词后包含查询
  143. if eq := fmt.Sprintf("eq(%d)", index); len(v) > len(eq) {
  144. sel = sel.Find(v[len(eq):])
  145. }
  146. } else if strings.HasPrefix(v, "attr(") {
  147. tmp := strings.FieldsFunc(v, splitFn2)
  148. if len(tmp) > 1 {
  149. ret, exists := sel.Attr(tmp[1])
  150. if exists {
  151. return ret
  152. } else {
  153. sel = nil
  154. break L
  155. }
  156. }
  157. } else {
  158. if sel == nil {
  159. sel = doc.Find(v)
  160. } else {
  161. sel = sel.Find(v)
  162. }
  163. }
  164. }
  165. //第一条没找到,接着找下一跳表达式
  166. if sel != nil && sel.Length() > 0 {
  167. break
  168. }
  169. }
  170. return sel
  171. }