forked from juju/httprequest
/
checkisjson.go
155 lines (145 loc) · 4.45 KB
/
checkisjson.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package httprequest
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"mime"
"net/http"
"unicode"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"gopkg.in/errgo.v1"
)
// maxErrorBodySize holds the maximum amount of body that
// we try to read for an error before extracting text from it.
// It's reasonably large because:
// a) HTML often has large embedded scripts which we want
// to skip and
// b) it should be an relatively unusual case so the size
// shouldn't harm.
//
// It's defined as a variable so that it can be redefined in tests.
var maxErrorBodySize = 200 * 1024
// checkIsJSON checks that the content type of the given header implies
// that the content is JSON. If it is not, then reads from the body to
// try to make a useful error message.
func checkIsJSON(header http.Header, body io.Reader) error {
contentType := header.Get("Content-Type")
mediaType, _, err := mime.ParseMediaType(contentType)
if mediaType == "application/json" {
return nil
}
if err != nil {
// Even if there's no media type, we want to see something useful.
mediaType = fmt.Sprintf("%q", contentType)
}
// TODO use charset.NewReader to convert from non-utf8 content?
// Read the body ignoring any errors - we'll just make do with what we've got.
bodyData, _ := ioutil.ReadAll(io.LimitReader(noErrorReader{body}, int64(maxErrorBodySize)))
switch mediaType {
case "text/html":
text, err := htmlToText(bytes.NewReader(bodyData))
if err != nil {
// Note: it seems that this can never actually
// happen - the only way that the HTML parser
// can fail is if there's a read error and we've
// removed that possibility by using
// noErrorReader above.
return errgo.Notef(err, "unexpected (and invalid) content text/html; want application/json; content: %q", sizeLimit(bodyData))
}
if len(text) == 0 {
return errgo.Newf(`unexpected content type text/html; want application/json; content: %q`, sizeLimit(bodyData))
}
return errgo.Newf(`unexpected content type text/html; want application/json; content: %s`, sizeLimit(text))
case "text/plain":
return errgo.Newf(`unexpected content type text/plain; want application/json; content: %s`, sizeLimit(sanitizeText(string(bodyData), true)))
default:
return errgo.Newf(`unexpected content type %s; want application/json; content: %q`, mediaType, sizeLimit(bodyData))
}
}
// noErrorReader wraps a reader, turning any errors into io.EOF
// so that we can extract some content even if we get an io error.
type noErrorReader struct {
r io.Reader
}
func (r noErrorReader) Read(buf []byte) (int, error) {
n, err := r.r.Read(buf)
if err != nil {
err = io.EOF
}
return n, err
}
func sizeLimit(data []byte) []byte {
const max = 1024
if len(data) < max {
return data
}
return append(data[0:max], fmt.Sprintf(" ... [%d bytes omitted]", len(data)-max)...)
}
// htmlToText attempts to return some relevant textual content
// from the HTML content in the given reader, formatted
// as a single line.
func htmlToText(r io.Reader) ([]byte, error) {
n, err := html.Parse(r)
if err != nil {
return nil, err
}
var buf bytes.Buffer
htmlNodeToText(&buf, n)
return buf.Bytes(), nil
}
func htmlNodeToText(w *bytes.Buffer, n *html.Node) {
for ; n != nil; n = n.NextSibling {
switch n.Type {
case html.TextNode:
data := sanitizeText(n.Data, false)
if len(data) == 0 {
break
}
if w.Len() > 0 {
w.WriteString("; ")
}
w.Write(data)
case html.ElementNode:
if n.DataAtom != atom.Script {
htmlNodeToText(w, n.FirstChild)
}
case html.DocumentNode:
htmlNodeToText(w, n.FirstChild)
}
}
}
// sanitizeText tries to make the given string easier to read when presented
// as a single line. It squashes each run of white space into a single
// space, trims leading and trailing white space and trailing full
// stops. If newlineSemi is true, any newlines will be replaced with a
// semicolon.
func sanitizeText(s string, newlineSemi bool) []byte {
out := make([]byte, 0, len(s))
prevWhite := false
for _, r := range s {
if newlineSemi && r == '\n' && len(out) > 0 {
out = append(out, ';')
prevWhite = true
continue
}
if unicode.IsSpace(r) {
if len(out) > 0 {
prevWhite = true
}
continue
}
if prevWhite {
out = append(out, ' ')
prevWhite = false
}
out = append(out, string(r)...)
}
// Remove final space, any full stops and any final semicolon
// we might have added.
out = bytes.TrimRightFunc(out, func(r rune) bool {
return r == '.' || r == ' ' || r == ';'
})
return out
}