/
spider_ScienceNews.go
executable file
·126 lines (111 loc) · 2.47 KB
/
spider_ScienceNews.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
package main
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/boltdb/bolt"
"github.com/celrenheit/spider"
)
// Main data structures
type (
	// ScienceNewsGroup is a collection of scraped news entries. It is the
	// unit that gets JSON-encoded and stored under a single database key.
	ScienceNewsGroup struct {
		Data []ScienceNews
	}

	// ScienceNews is a single news entry: its headline, a summary text and
	// the link to the article.
	ScienceNews struct {
		Title, Summary, Url string
	}
)
// Database functions
// Get loads the ScienceNewsGroup stored in the bucket named bucketName and
// decodes it into p.
//
// It returns an error when the database is not open, when db.View fails, or
// when the stored value cannot be decoded. A missing bucket or a missing
// "ScienceNewsGroup" key is not treated as an error: Get returns nil and
// leaves p untouched.
func (p *ScienceNewsGroup) Get(bucketName string) error {
	// open and db are package-level state declared outside this block.
	if !open {
		return fmt.Errorf("db must be opened before reading!")
	}
	err := db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(bucketName))
		if b == nil {
			// Nothing has been stored under this bucket name.
			return nil
		}
		val := b.Get([]byte("ScienceNewsGroup"))
		if val == nil {
			// The bucket exists but holds no news group.
			return nil
		}
		return p.decode(val)
	})
	if err != nil {
		// Terminate the line so the message does not run into later output.
		fmt.Printf("Could not get ScienceNewsGroup: %s\n", err)
		return err
	}
	return nil
}
// save JSON-encodes p and writes it under the "ScienceNewsGroup" key of the
// bucket named by Today(), creating that bucket when it does not exist yet.
//
// It returns an error when the database is not open, when the bucket cannot
// be created, when encoding fails, or when the write itself fails.
func (p *ScienceNewsGroup) save() error {
	day := Today()
	if !open {
		return fmt.Errorf("db must be opened before saving!")
	}
	// The transaction's outcome is the outcome of save.
	return db.Update(func(tx *bolt.Tx) error {
		target, bucketErr := tx.CreateBucketIfNotExists([]byte(day))
		if bucketErr != nil {
			return fmt.Errorf("create bucket: %s", bucketErr)
		}
		payload, encodeErr := p.encode()
		if encodeErr != nil {
			return fmt.Errorf("could not encode ScienceNewsGroup: %s", encodeErr)
		}
		return target.Put([]byte("ScienceNewsGroup"), payload)
	})
}
// encode serializes p as JSON. It returns exactly what json.Marshal returns:
// the encoded bytes on success, or a nil slice and the error on failure.
func (p *ScienceNewsGroup) encode() ([]byte, error) {
	return json.Marshal(p)
}
// decode parses data as JSON into p and returns any error reported by
// json.Unmarshal. Calling decode on a nil receiver is reported as an error
// rather than decoding into a value the caller can never see.
func (p *ScienceNewsGroup) decode(data []byte) error {
	// p is already a pointer to the target struct, so it is handed to
	// json.Unmarshal as-is.
	return json.Unmarshal(data, p)
}
// Define the spider
// ScienceNewsSpider is the spider for the Science news page. It is assigned
// in init.
var ScienceNewsSpider spider.Spider
// init builds ScienceNewsSpider: a spider for http://www.sciencemag.org/news
// whose handler fetches the page, collects one ScienceNews entry per
// div[class="media__body"] element and saves the resulting group.
//
// The handler returns an error when the request fails, when the HTML parser
// cannot be obtained, or when saving fails; the save error is included in
// the returned message.
func init() {
	ScienceNewsSpider = spider.Get("http://www.sciencemag.org/news", func(ctx *spider.Context) error {
		// Log each run as "<timestamp> ScienceNewsSpider".
		fmt.Println(time.Now(), "ScienceNewsSpider")

		if _, err := ctx.DoRequest(); err != nil {
			return err
		}
		htmlparser, err := ctx.HTMLParser()
		if err != nil {
			return err
		}

		var p ScienceNewsGroup
		// Start from an empty, non-nil slice so that a page without matches
		// is encoded as a JSON [] rather than null.
		p.Data = []ScienceNews{}
		htmlparser.Find(`div[class="media__body"]`).Each(func(i int, s *goquery.Selection) {
			title := strings.TrimSpace(s.Find("h2").Text())

			// Only build a link when the headline actually carries an href;
			// otherwise leave it empty instead of storing the bare site root.
			link := ""
			if href, ok := s.Find("h2 > a").Attr("href"); ok && href != "" {
				if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
					// Already absolute: prefixing the host would corrupt it.
					link = href
				} else {
					link = "http://www.sciencemag.org" + href
				}
			}

			// The page listing provides no summary; "None" is the placeholder.
			p.Data = append(p.Data, ScienceNews{title, "None", link})
		})

		if err := p.save(); err != nil {
			// Keep the underlying cause so the failure can be diagnosed.
			return fmt.Errorf("error saving ScienceNewsGroup: %v", err)
		}
		return nil
	})
}