mirror of https://github.com/misterzym/jdocset.git
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
189 lines
5.0 KiB
189 lines
5.0 KiB
// Package scrape provides a searching api on top of golang.org/x/net/html |
|
package scrape |
|
|
|
import ( |
|
"strings" |
|
|
|
"golang.org/x/net/html" |
|
"golang.org/x/net/html/atom" |
|
) |
|
|
|
// Matcher is a predicate over HTML nodes: it should return true when the
// given node is one the caller is searching for. Matchers are consumed by
// Find, FindAll, and the other traversal helpers in this package.
type Matcher func(node *html.Node) bool
|
|
|
// FindAll returns all nodes in the tree rooted at node which match the
// provided Matcher, in depth-first preorder. After discovering a matching
// node, it will _not_ discover matching subnodes of that node; use
// FindAllNested to include those as well.
func FindAll(node *html.Node, matcher Matcher) []*html.Node {
	// Delegate to the shared traversal with nested matching disabled.
	return findAllInternal(node, matcher, false)
}
|
|
|
// FindAllNested returns all nodes which match the provided Matcher and
// _will_ discover matching subnodes of matching nodes, so results may be
// nested inside one another.
func FindAllNested(node *html.Node, matcher Matcher) []*html.Node {
	// Delegate to the shared traversal with nested matching enabled.
	return findAllInternal(node, matcher, true)
}
|
|
|
// Find returns the first node which matches the matcher using depth-first search. |
|
// If no node is found, ok will be false. |
|
// |
|
// root, err := html.Parse(resp.Body) |
|
// if err != nil { |
|
// // handle error |
|
// } |
|
// matcher := func(n *html.Node) bool { |
|
// return n.DataAtom == atom.Body |
|
// } |
|
// body, ok := scrape.Find(root, matcher) |
|
func Find(node *html.Node, matcher Matcher) (n *html.Node, ok bool) { |
|
if matcher(node) { |
|
return node, true |
|
} |
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling { |
|
n, ok := Find(c, matcher) |
|
if ok { |
|
return n, true |
|
} |
|
} |
|
return nil, false |
|
} |
|
|
|
// FindParent searches up HTML tree from the current node until either a |
|
// match is found or the top is hit. |
|
func FindParent(node *html.Node, matcher Matcher) (n *html.Node, ok bool) { |
|
for p := node.Parent; p != nil; p = p.Parent { |
|
if matcher(p) { |
|
return p, true |
|
} |
|
} |
|
return nil, false |
|
} |
|
|
|
// Text returns text from all descendant text nodes joined. |
|
// For control over the join function, see TextJoin. |
|
func Text(node *html.Node) string { |
|
joiner := func(s []string) string { |
|
n := 0 |
|
for i := range s { |
|
trimmed := strings.TrimSpace(s[i]) |
|
if trimmed != "" { |
|
s[n] = trimmed |
|
n++ |
|
} |
|
} |
|
return strings.Join(s[:n], " ") |
|
} |
|
return TextJoin(node, joiner) |
|
} |
|
|
|
// TextJoin returns a string from all descendant text nodes joined by a |
|
// caller provided join function. |
|
func TextJoin(node *html.Node, join func([]string) string) string { |
|
nodes := FindAll(node, func(n *html.Node) bool { return n.Type == html.TextNode }) |
|
parts := make([]string, len(nodes)) |
|
for i, n := range nodes { |
|
parts[i] = n.Data |
|
} |
|
return join(parts) |
|
} |
|
|
|
// Attr returns the value of an HTML attribute. |
|
func Attr(node *html.Node, key string) string { |
|
for _, a := range node.Attr { |
|
if a.Key == key { |
|
return a.Val |
|
} |
|
} |
|
return "" |
|
} |
|
|
|
// ByTag returns a Matcher which matches all nodes of the provided tag type.
//
//	root, err := html.Parse(resp.Body)
//	if err != nil {
//		// handle error
//	}
//	title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
func ByTag(a atom.Atom) Matcher {
	return func(node *html.Node) bool { return node.DataAtom == a }
}
|
|
|
// ById returns a Matcher which matches all nodes with the provided id.
// Matching compares the node's "id" attribute for exact equality; nodes
// without an "id" attribute only match the empty string.
func ById(id string) Matcher {
	return func(node *html.Node) bool { return Attr(node, "id") == id }
}
|
|
|
// ByClass returns a Matcher which matches all nodes with the provided class. |
|
func ByClass(class string) Matcher { |
|
return func(node *html.Node) bool { |
|
classes := strings.Fields(Attr(node, "class")) |
|
for _, c := range classes { |
|
if c == class { |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
} |
|
|
|
// findAllInternal encapsulates the node tree traversal |
|
func findAllInternal(node *html.Node, matcher Matcher, searchNested bool) []*html.Node { |
|
matched := []*html.Node{} |
|
|
|
if matcher(node) { |
|
matched = append(matched, node) |
|
|
|
if !searchNested { |
|
return matched |
|
} |
|
} |
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling { |
|
found := findAllInternal(c, matcher, searchNested) |
|
if len(found) > 0 { |
|
matched = append(matched, found...) |
|
} |
|
} |
|
return matched |
|
} |
|
|
|
// Find returns the first node which matches the matcher using next sibling search. |
|
// If no node is found, ok will be false. |
|
// |
|
// root, err := html.Parse(resp.Body) |
|
// if err != nil { |
|
// // handle error |
|
// } |
|
// matcher := func(n *html.Node) bool { |
|
// return n.DataAtom == atom.Body |
|
// } |
|
// body, ok := scrape.FindNextSibling(root, matcher) |
|
func FindNextSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) { |
|
|
|
for s := node.NextSibling; s != nil; s = s.NextSibling { |
|
if matcher(s) { |
|
return s, true |
|
} |
|
} |
|
return nil, false |
|
} |
|
|
|
// Find returns the first node which matches the matcher using previous sibling search. |
|
// If no node is found, ok will be false. |
|
// |
|
// root, err := html.Parse(resp.Body) |
|
// if err != nil { |
|
// // handle error |
|
// } |
|
// matcher := func(n *html.Node) bool { |
|
// return n.DataAtom == atom.Body |
|
// } |
|
// body, ok := scrape.FindPrevSibling(root, matcher) |
|
func FindPrevSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) { |
|
for s := node.PrevSibling; s != nil; s = s.PrevSibling { |
|
if matcher(s) { |
|
return s, true |
|
} |
|
} |
|
return nil, false |
|
}
|
|
|