forked from lestrrat-go/libxml2
/
html.go
55 lines (45 loc) · 1.04 KB
/
html.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
package libxml2
/*
#cgo pkg-config: libxml-2.0
#include <stdlib.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
*/
import "C"

import (
	"bytes"
	"errors"
	"io"
	"unsafe"
)
// HTML parser option bits. These mirror libxml2's htmlParserOption enum
// (HTML_PARSE_*) and are handed to the C parser unchanged, so every value
// must be the exact single bit used by the C header.
//
// The values are spelled out instead of being generated with iota for two
// reasons: the C enum is not contiguous (HTML_PARSE_COMPACT is bit 16), and
// the previous expression `1<<iota + 5` parsed as `(1<<iota) + 5` because
// `<<` binds tighter than `+` in Go, which produced multi-bit garbage
// (7, 9, 13, ...) rather than flags.
const (
	HtmlParseRecover   = 1 << 0  // HTML_PARSE_RECOVER
	HtmlParseNoError   = 1 << 5  // HTML_PARSE_NOERROR
	HtmlParseNoWarning = 1 << 6  // HTML_PARSE_NOWARNING
	HtmlParsePedantic  = 1 << 7  // HTML_PARSE_PEDANTIC
	HtmlParseNoBlanks  = 1 << 8  // HTML_PARSE_NOBLANKS
	HtmlParseNoNet     = 1 << 9  // HTML_PARSE_NONET
	HtmlParseCompact   = 1 << 16 // HTML_PARSE_COMPACT
)
const DefaultHtmlParseFlags = HtmlParseCompact | HtmlParseNoBlanks | HtmlParseNoError | HtmlParseNoWarning
func htmlReadDoc(content, url, encoding string, opts int) *C.xmlDoc {
return C.htmlReadDoc(
C.xmlCharStrdup(C.CString(content)),
C.CString(url),
C.CString(encoding),
C.int(opts),
)
}
func ParseHTMLString(content string) (*Document, error) {
d := htmlReadDoc(content, "", "", DefaultHtmlParseFlags)
root, err := C.xmlDocGetRootElement(d)
if err != nil || root == nil {
C.xmlFreeDoc(d)
return nil, err
}
return &Document{ptr: d, root: root}, nil
}
// ParseHTML drains in completely and parses the collected bytes as an HTML
// document via ParseHTMLString. If reading fails, that error is returned
// as-is and no parse of the partial input is attempted.
func ParseHTML(in io.Reader) (*Document, error) {
	var raw bytes.Buffer
	_, err := raw.ReadFrom(in)
	if err != nil {
		return nil, err
	}
	return ParseHTMLString(raw.String())
}