// fruitsJSON.go
//
// Scrapes a Sainsbury's product listing page and prints the products it
// finds, together with their total unit price, as JSON.
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"net/http/cookiejar"
"net/url"
"strconv"
"strings"
"text/scanner"
"golang.org/x/net/publicsuffix"
// third-party libraries
"github.com/PuerkitoBio/goquery"
iconv "github.com/djimenez/iconv-go"
)
// Number is a float32 whose JSON encoding always carries exactly two digits
// after the decimal point (for example 1.5 is written as 1.50).
type Number float32

// MarshalJSON implements json.Marshaler by rendering n in fixed-point
// notation with two fractional digits. The returned error is always nil.
func (n Number) MarshalJSON() ([]byte, error) {
	encoded := fmt.Sprintf("%.2f", n)
	return []byte(encoded), nil
}
// FruitItem describes one scraped product. It is the value passed through
// the scraping channels and the element type of the "results" array in the
// JSON output.
type FruitItem struct {
// Product name as shown on the listing page.
Title string `json:"title"`
// Size of the product's details page in kilobytes, formatted with two
// decimals and a "kb" suffix; "0kb" until the details page has been fetched.
Size string `json:"size"`
// Price parsed from the listing page's price-per-unit text.
UnitPrice Number `json:"unit_price"`
// First paragraph of the product text on the details page; empty until
// the details page has been fetched.
Description string `json:"description"`
// Link to the product's details page, taken from the listing page anchor.
DetailsUri string `json:"-"` // the "-" tag keeps this field out of the JSON
}
// checkErr aborts the program through log.Fatal when err is non-nil and
// does nothing otherwise.
func checkErr(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
// prettyProgess writes one line of a simulated progress bar to standard
// output: start, then mid repeated max times, then end followed by a
// newline. Nothing is repeated when max is zero or negative.
func prettyProgess(start, mid, end string, max int) {
	fmt.Print(start)
	for remaining := max; remaining > 0; remaining-- {
		fmt.Print(mid)
	}
	fmt.Println(end)
}
// extractFloat32 returns the first number embedded in value, converted to
// Number. The string is tokenized with a text/scanner.Scanner in ScanFloats
// mode and the first token that strconv.ParseFloat accepts is returned.
// It returns 0 when value is empty or contains no parseable number.
//
// Example: "some4.56more" yields 4.56.
func extractFloat32(value string) Number {
	if value == "" {
		return 0
	}

	var lexer scanner.Scanner
	lexer.Init(strings.NewReader(value))
	lexer.Mode = scanner.ScanFloats

	for {
		tok := lexer.Scan()
		// Every token is tried, including the empty text at EOF, which
		// simply fails to parse.
		if parsed, err := strconv.ParseFloat(lexer.TokenText(), 64); err == nil {
			return Number(parsed)
		}
		if tok == scanner.EOF {
			return 0
		}
	}
}
// getUri returns the value of the href attribute of the anchor held by sel,
// after passing it through url.Parse and back to a string. The href is
// expected to be an absolute URL. It returns the empty string when sel is
// nil or carries no href attribute; an href that url.Parse rejects ends the
// program through checkErr.
func getUri(sel *goquery.Selection) string {
	if sel == nil {
		return ""
	}

	href, ok := sel.Attr("href")
	if !ok {
		return ""
	}

	parsed, err := url.Parse(href)
	checkErr(err)
	return parsed.String()
}
// createFruitItem builds a partially filled FruitItem from listing-page data:
// title is stored as given, priceStr is parsed with extractFloat32 and the
// details URI is read from uriSel with getUri. Size starts as "0kb" and
// Description as the empty string. It also prints a progress bar line whose
// length is the incremented counter, and returns the new item together with
// iter+1.
func createFruitItem(title, priceStr string, uriSel *goquery.Selection, iter int) (*FruitItem, int) {
	next := iter + 1

	item := &FruitItem{
		Title:       title,
		Size:        "0kb",
		UnitPrice:   extractFloat32(priceStr),
		Description: "",
		DetailsUri:  getUri(uriSel),
	}

	// Progress of the producer side of the in channel.
	prettyProgess("Found stuff", "=", ">", next)

	return item, next
}
// fruitInitScrape downloads the product listing page at uri and sends one
// partially filled FruitItem (title, unit price, details URI) to fruitInQueue
// for every product found, plus one for each cross-selling product that has
// a title. fruitInQueue is closed when the function returns.
//
// The downloaded HTML is converted from windows-1252 to utf-8 before it is
// parsed. Any download, conversion or parse error ends the program through
// checkErr.
func fruitInitScrape(client *http.Client, uri string, fruitInQueue chan *FruitItem) {
	var iter int

	// Closing tells the consumer ranging over the channel that no more
	// items will arrive.
	defer close(fruitInQueue)

	res, err := client.Get(uri)
	checkErr(err)
	defer res.Body.Close()

	utfBody, err := iconv.NewReader(res.Body, "windows-1252", "utf-8")
	checkErr(err)

	doc, err := goquery.NewDocumentFromReader(utfBody)
	checkErr(err)

	// fmt.Print with explicit newlines emits the same bytes as
	// Println("...\n") without tripping vet's redundant-newline check.
	fmt.Print("about to find stuff\n\n")

	doc.Find("ul.productLister li").Each(func(i int, s *goquery.Selection) {
		product := s.Find(".productInner h3 a")
		title := strings.TrimSpace(product.Text())
		priceStr := strings.TrimSpace(s.Find(".productInner p.pricePerUnit").Text())

		addProduct := s.Find(".crossSellInner h4.crossSellName a")
		addTitle := strings.TrimSpace(addProduct.Text())
		addPriceStr := strings.TrimSpace(s.Find(".crossSellInner p.pricePerUnit").Text())

		// An entry without a product anchor has no details URI; queuing it
		// would make the details download fail and end the whole run, so
		// such entries are skipped.
		if product.Length() > 0 {
			var item *FruitItem
			item, iter = createFruitItem(title, priceStr, product, iter)
			fruitInQueue <- item
		}

		// Cross-selling products are optional and only present when titled.
		if len(addTitle) > 0 {
			var item *FruitItem
			item, iter = createFruitItem(addTitle, addPriceStr, addProduct, iter)
			fruitInQueue <- item
		}
	})

	fmt.Print("\nfinished finding stuff ... closing channel\n\n")
}
// fruitFinishScrape completes the items received from fruitInQueue. For each
// item it downloads the page at DetailsUri, records the size of the HTML
// document (without assets) in kilobytes, and scrapes the description from
// the first paragraph of ".productText" inside "#information". The completed
// item is sent to fruitOutQueue once per "#information" match, so an item
// whose page has no such element is not forwarded. fruitOutQueue is closed
// when fruitInQueue has been drained.
//
// Download and parse errors end the program through checkErr.
func fruitFinishScrape(client *http.Client, fruitInQueue, fruitOutQueue chan *FruitItem) {
	var iter int

	// Closing tells the consumer ranging over the channel that no more
	// items will arrive.
	defer close(fruitOutQueue)

	for fruitItem := range fruitInQueue {
		// fmt.Print with explicit newlines emits the same bytes as
		// Println("...\n") without tripping vet's redundant-newline check.
		fmt.Print("about to find MORE stuff\n\n")
		iter++

		body := fetchDetailsBody(client, fruitItem.DetailsUri)

		// Bytes to kilobytes, formatted with two decimals.
		size := float32(len(body)) / float32(1024)
		sizeText := strconv.FormatFloat(float64(size), 'f', 2, 32) + "kb"

		// The bytes are already in memory, so parse them directly.
		doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
		checkErr(err)

		item := fruitItem
		progress := iter
		doc.Find("#information").Each(func(i int, s *goquery.Selection) {
			item.Description = strings.TrimSpace(s.Find(".productText p").First().Text())
			item.Size = sizeText

			// Progress of the producer side of the out channel.
			prettyProgess("<", "=", "Found more stuff", progress)

			fruitOutQueue <- item
		})
	}

	fmt.Print("\nfinished finding MORE stuff ... closing channel\n\n")
}

// fetchDetailsBody downloads uri with client and returns the complete
// response body. The body is closed before returning, so connections are
// released per request rather than when the calling loop ends.
func fetchDetailsBody(client *http.Client, uri string) []byte {
	res, err := client.Get(uri)
	checkErr(err)
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	checkErr(err)
	return body
}
// getFruitsJSON drains fruitOutQueue and returns the collected items as an
// indented JSON document of the form {"results": [...], "total": <sum>},
// where total is the sum of the items' unit prices. A progress bar line is
// printed for every item received and the total is printed once the channel
// is closed. A marshaling error ends the program through checkErr.
func getFruitsJSON(fruitOutQueue chan *FruitItem) []byte {
	// Shape of the emitted document.
	type report struct {
		Results    []*FruitItem `json:"results"`
		TotalPrice Number       `json:"total"`
	}

	// A non-nil empty slice makes "results" encode as [] rather than null
	// when no items arrive.
	out := report{Results: []*FruitItem{}}

	received := 0
	for item := range fruitOutQueue {
		received++
		// Progress of the consumer side of the out channel.
		prettyProgess("Thanks <", "-", "> stuff", received)

		out.Results = append(out.Results, item)
		out.TotalPrice += item.UnitPrice
	}

	fmt.Println("Total Unit Price is: ", out.TotalPrice)

	encoded, err := json.MarshalIndent(out, "", " ")
	checkErr(err)
	return encoded
}
// main wires the scraping pipeline together: fruitInitScrape feeds partially
// filled items into a buffered channel, fruitFinishScrape completes them and
// forwards them on an unbuffered channel, and getFruitsJSON collects the
// results, which are then printed. Both scrapers share one HTTP client with
// a cookie jar.
func main() {
	const uri = "http://www.sainsburys.co.uk/webapp/wcs/stores/servlet" +
		"/CategoryDisplay?listView=true&orderBy=FAVOURITES_FIRST&" +
		"parent_category_rn=12518&top_category=12518&langId=44&" +
		"beginIndex=0&pageSize=20&catalogId=10137&searchTerm=&" +
		"categoryId=185749&listId=&storeId=10151&promotionId=#" +
		"langId=44&storeId=10151&catalogId=10137&categoryId=185749&" +
		"parent_category_rn=12518&top_category=12518&pageSize=20&" +
		"orderBy=FAVOURITES_FIRST&searchTerm=&beginIndex=0&" +
		"hideFilters=true"

	jar, err := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})
	checkErr(err)
	client := &http.Client{Jar: jar}

	fmt.Println("the uri is: ", uri)

	var (
		fruitInQueue  = make(chan *FruitItem, 2)
		fruitOutQueue = make(chan *FruitItem)
	)

	go fruitInitScrape(client, uri, fruitInQueue)
	go fruitFinishScrape(client, fruitInQueue, fruitOutQueue)

	// Blocks until the out channel is closed by fruitFinishScrape.
	fmt.Println("JSON is: ", string(getFruitsJSON(fruitOutQueue)))
}