/
extract.go
149 lines (138 loc) · 3.14 KB
/
extract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Copyright 2014 Hari haran. All rights reserved.
// Use of this source code is governed by a MIT
// license that can be found in the LICENSE file.
package extract
import (
	"net/http"
	"net/url"
	"strings"

	"code.google.com/p/cascadia"
	"golang.org/x/net/html"
)
// selection pairs a CSS selector with the URL of the web page the
// selector is run against.
type selection struct {
	// Selector is the CSS selector string.
	Selector string
	// URL is the page address, stored as the raw, unparsed string.
	URL string
}
// newSelection allocates a selection holding the given CSS selector
// string and page URL. Neither argument is validated here.
func newSelection(selector, url string) *selection {
	sel := new(selection)
	sel.Selector = selector
	sel.URL = url
	return sel
}
// Links fetches the web page at u and returns the absolute URLs of all
// references found in it.
//
// The page is requested with http.Get, the response body is parsed as
// HTML, and the href value of every element matching the CSS selector
// "a[href]" is resolved to an absolute URL. Relative references are
// resolved against the URL the document was finally served from, so the
// results stay correct when the request was redirected.
//
// A nil slice and a non-nil error are returned if u cannot be parsed, the
// request fails, the HTML or the selector cannot be processed, or any
// href value is not a parsable URL. The response status code is not
// inspected, so the body of an error page is processed like any other.
func Links(u string) ([]string, error) {
	s := newSelection("a[href]", u)
	link, err := url.Parse(s.URL)
	if err != nil {
		return nil, err
	}

	resp, err := http.Get(link.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// http.Get follows redirects, so the document may come from a
	// different URL than the one requested. References inside the page
	// are relative to that final URL, not to the original one.
	base := link
	if resp.Request != nil && resp.Request.URL != nil {
		base = resp.Request.URL
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	sel, err := cascadia.Compile(s.Selector)
	if err != nil {
		return nil, err
	}

	// result stays nil when nothing matches, as before.
	var result []string
	for _, m := range sel.MatchAll(doc) {
		abs, err := resolveURL(hrefString(m), base)
		if err != nil {
			return nil, err
		}
		result = append(result, abs)
	}
	return result, nil
}
// hrefString returns the value of the href attribute of n.
// Only element nodes are inspected; for any other node type, or when
// the element has no href attribute, the result is the empty string.
func hrefString(n *html.Node) string {
	if n.Type != html.ElementNode {
		return ""
	}
	// attribute works on tokens, so present the node as a start tag.
	tok := html.Token{
		Type: html.StartTagToken,
		Data: n.Data,
		Attr: n.Attr,
	}
	return attribute(tok, "href")
}
// imageString returns the value of the src attribute of n.
// Only element nodes are inspected; for any other node type, or when
// the element has no src attribute, the result is the empty string.
func imageString(n *html.Node) string {
	if n.Type != html.ElementNode {
		return ""
	}
	// attribute works on tokens, so present the node as a start tag.
	tok := html.Token{
		Type: html.StartTagToken,
		Data: n.Data,
		Attr: n.Attr,
	}
	return attribute(tok, "src")
}
// attribute looks up the attribute with key a on token t and returns
// its value. When several attributes share the key the first one wins;
// when none has it the empty string is returned.
func attribute(t html.Token, a string) string {
	attrs := t.Attr
	for i := 0; i < len(attrs); i++ {
		if attrs[i].Key != a {
			continue
		}
		return attrs[i].Val
	}
	return ""
}
// Images fetches the web page at u and returns the absolute URLs of all
// images found in it.
//
// The page is requested with http.Get, the response body is parsed as
// HTML, and the src value of every element matching the CSS selector
// "img[src]" is resolved to an absolute URL. Relative references are
// resolved against the URL the document was finally served from, so the
// results stay correct when the request was redirected.
//
// A nil slice and a non-nil error are returned if u cannot be parsed, the
// request fails, the HTML or the selector cannot be processed, or any
// src value is not a parsable URL. The response status code is not
// inspected, so the body of an error page is processed like any other.
func Images(u string) ([]string, error) {
	s := newSelection("img[src]", u)
	link, err := url.Parse(s.URL)
	if err != nil {
		return nil, err
	}

	resp, err := http.Get(link.String())
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// http.Get follows redirects, so the document may come from a
	// different URL than the one requested. References inside the page
	// are relative to that final URL, not to the original one.
	base := link
	if resp.Request != nil && resp.Request.URL != nil {
		base = resp.Request.URL
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}

	sel, err := cascadia.Compile(s.Selector)
	if err != nil {
		return nil, err
	}

	// result stays nil when nothing matches, as before.
	var result []string
	for _, m := range sel.MatchAll(doc) {
		abs, err := resolveURL(imageString(m), base)
		if err != nil {
			return nil, err
		}
		result = append(result, abs)
	}
	return result, nil
}
// resolveURL resolves the reference s, as taken from an HTML attribute,
// against link and returns the resulting URL as a string.
//
// s may be an absolute or a relative URL. Leading and trailing ASCII
// whitespace (space, tab, LF, FF, CR) is removed before parsing. The
// result is absolute whenever link is absolute. An error is returned if
// the trimmed s cannot be parsed as a URL. link must not be nil.
func resolveURL(s string, link *url.URL) (string, error) {
	// HTML tolerates ASCII whitespace around URL attribute values, but
	// url.Parse does not: it rejects control characters such as newlines
	// and tabs, and it keeps plain spaces as part of the path.
	ref, err := url.Parse(strings.Trim(s, " \t\n\f\r"))
	if err != nil {
		return "", err
	}
	return link.ResolveReference(ref).String(), nil
}