/
html.go
47 lines (37 loc) · 892 Bytes
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package nlp
import "html"
import "io"
import "os"
import "bytes"
import "strings"
import "unicode"
// HTMLScrubbedReader holds a parsed HTML tree together with a buffer of the
// text extracted from it. NewHTMLScrubbedReader fills both fields and Read
// serves the buffer's contents.
type HTMLScrubbedReader struct {
// node is the value returned by html.Parse in NewHTMLScrubbedReader.
node *html.Node
// buf accumulates the Data of the text nodes found by parse and is
// what Read reads from.
buf *bytes.Buffer
}
// scrub returns data with every leading and trailing character that is not a
// Unicode letter removed. Characters between the first and last letter are
// left as they are. The receiver is not consulted.
func (r *HTMLScrubbedReader) scrub(data string) string {
	// Trimming stops at the first letter found from either end.
	notLetter := func(ch int) bool {
		return !unicode.IsLetter(ch)
	}
	return strings.TrimFunc(data, notLetter)
}
// parse walks the tree rooted at node and appends the Data of each text node
// to buf. A node is handled before its children, and children are visited in
// the order they appear in node.Child. Nothing is written between text
// nodes, so their contents end up directly adjacent in buf.
func (r *HTMLScrubbedReader) parse(node *html.Node, buf *bytes.Buffer) {
	if node.Type == html.TextNode {
		buf.WriteString(node.Data)
	}
	// Recurse into every child, whatever the type of the current node.
	for i := 0; i < len(node.Child); i++ {
		r.parse(node.Child[i], buf)
	}
}
// NewHTMLScrubbedReader passes reader to html.Parse, gathers the text nodes of
// the resulting tree into an in-memory buffer, and returns an
// HTMLScrubbedReader whose Read serves that buffer.
//
// If html.Parse reports an error, that error is returned unchanged together
// with a nil *HTMLScrubbedReader.
func NewHTMLScrubbedReader(reader io.Reader) (*HTMLScrubbedReader, os.Error) {
	root, err := html.Parse(reader)
	if err != nil {
		return nil, err
	}

	// The text is extracted once, up front; Read only drains the buffer.
	text := new(bytes.Buffer)
	scrubbed := &HTMLScrubbedReader{node: root, buf: text}
	scrubbed.parse(root, text)
	return scrubbed, nil
}
// Read fills p from the buffered text by delegating to the underlying
// bytes.Buffer, and returns that call's byte count and error unchanged.
func (r *HTMLScrubbedReader) Read(p []byte) (int, os.Error) {
	n, err := r.buf.Read(p)
	return n, err
}