/
subclub.go
137 lines (112 loc) · 3.11 KB
/
subclub.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package subclub
import (
"errors"
"fmt"
"io"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// Result holds the data scraped from a single row of the subtitle listing.
type Result struct {
	// ID is the identifier extracted from the subtitle link.
	ID string
	// Name is the text of the title link with runs of whitespace collapsed.
	Name string
	// SubName is the trimmed episode-info text shown next to the title.
	SubName string
	// Views is the integer scanned from the row's views cell.
	Views int
	// Author is the trimmed text of the row's author cell.
	Author string
	// FPS is the floating point number scanned from the row's FPS cell.
	FPS float64
	// SubtitleLink is the trimmed href of the title link.
	SubtitleLink string
	// Links collects the href of every anchor in the row's links cell.
	Links []string
	// Genres lists the genre names split out of the row's genre cell.
	Genres []string
	// Created is the date parsed from the row's first cell (day.month.year);
	// it is the zero time when that text cannot be parsed.
	Created time.Time
}

// String renders the result as a compact single-line summary. SubtitleLink
// and Links are not part of the output.
func (r Result) String() string {
	const layout = "->%s [%s(%s) - %d - %0.2f by %s %v %s]"
	return fmt.Sprintf(layout,
		r.ID, r.Name, r.SubName, r.Views, r.FPS, r.Author, r.Genres, r.Created)
}
// parse converts every row of the "#tale_list" table body in d into a Result.
// The returned slice has one entry per matched <tr>, in document order.
func parse(d *goquery.Document) []Result {
	trs := d.Find("#tale_list > tbody:nth-child(2) > tr")
	results := make([]Result, trs.Length())

	trs.Each(func(idx int, tr *goquery.Selection) {
		// The direct children of a row are its cells, addressed by position.
		cells := tr.Children()
		// The second cell holds the title link, which feeds the ID, both
		// names and the subtitle link.
		title := cells.Eq(1)

		var res Result
		res.ID = getMovieID(title)
		res.Name = getMovieName(title)
		res.SubName = getMovieSubName(title)
		res.Views = getViews(cells.Eq(4))
		res.Author = getAuthor(cells.Eq(8))
		res.FPS = getFPS(cells.Eq(6))
		res.SubtitleLink = getSubtitleLink(title)
		res.Links = getMovieLinks(cells.Eq(3))
		res.Genres = getGenres(cells.Eq(2))
		res.Created = getDate(cells.Eq(0))

		results[idx] = res
	})

	return results
}
// getAuthor returns the text of td with surrounding whitespace removed.
func getAuthor(td *goquery.Selection) (author string) {
	return strings.TrimSpace(td.Text())
}
// getFPS scans the first floating point number out of the text of td.
// The lookup is best effort: when nothing can be scanned the result is 0.
func getFPS(td *goquery.Selection) (fps float64) {
	if _, err := fmt.Sscan(td.Text(), &fps); err != nil {
		// A failed scan leaves the value unset, so report the zero value.
		return 0
	}
	return fps
}
// getViews scans the first integer out of the text of td.
// The lookup is best effort: when nothing can be scanned the result is 0.
func getViews(td *goquery.Selection) (views int) {
	if _, err := fmt.Sscan(td.Text(), &views); err != nil {
		// A failed scan leaves the value unset, so report the zero value.
		return 0
	}
	return views
}
// getGenres splits the comma-separated text of td into individual genre
// names.
//
// Each name is trimmed of surrounding whitespace and blank entries are
// skipped, so an empty cell yields a nil slice rather than a slice holding a
// single empty string, and names never carry stray newlines or padding from
// the markup.
func getGenres(td *goquery.Selection) (genres []string) {
	for _, part := range strings.Split(td.Text(), ",") {
		if name := strings.TrimSpace(part); name != "" {
			genres = append(genres, name)
		}
	}
	return genres
}
// getDate parses the trimmed text of the last <font> element inside td as a
// day.month.year date. The zero time is returned when the text does not
// match that layout.
func getDate(td *goquery.Selection) (date time.Time) {
	const layout = "02.01.2006"

	text := strings.TrimSpace(td.Find("font").Last().Text())
	parsed, err := time.Parse(layout, text)
	if err != nil {
		return time.Time{}
	}
	return parsed
}
// getMovieSubName returns the trimmed text of the bold element inside the
// "episode_info" span of td.
func getMovieSubName(td *goquery.Selection) (name string) {
	return strings.TrimSpace(td.Find("span.episode_info > b").Text())
}
// getMovieName returns the text of the "sc_link" anchor inside td, with
// leading and trailing whitespace removed and inner runs of whitespace
// collapsed to single spaces.
func getMovieName(td *goquery.Selection) (name string) {
	// Read the visible text of the title anchor.
	raw := td.Find("span a.sc_link").Text()
	// Splitting into fields and re-joining normalises all whitespace.
	words := strings.Fields(raw)
	return strings.Join(words, " ")
}
// movieIDPattern matches a trailing "?id=<digits>" in a subtitle link and
// captures the digits. It is compiled once instead of on every call.
var movieIDPattern = regexp.MustCompile(`\?id=(?P<Id>\d+)$`)

// getMovieID extracts the numeric ID from the subtitle link found in td.
//
// It returns the empty string when the link is missing or does not end in
// "?id=<digits>", so a malformed row cannot cause an out-of-range panic.
func getMovieID(td *goquery.Selection) (ID string) {
	link := getSubtitleLink(td)

	// FindStringSubmatch returns nil when there is no match; on a match,
	// index 0 is the whole match and index 1 the captured digits.
	matches := movieIDPattern.FindStringSubmatch(link)
	if len(matches) < 2 {
		return ""
	}
	return matches[1]
}
// getSubtitleLink returns the trimmed href attribute of the "sc_link" anchor
// inside td, or the empty string when no href is available.
func getSubtitleLink(td *goquery.Selection) (link string) {
	anchor := td.Find("span a.sc_link")
	href := anchor.AttrOr("href", "")
	return strings.TrimSpace(href)
}
// getMovieLinks collects the href attribute of every anchor inside td, one
// entry per anchor; an anchor without an href contributes an empty string.
func getMovieLinks(td *goquery.Selection) []string {
	hrefOf := func(_ int, a *goquery.Selection) string {
		return a.AttrOr("href", "")
	}
	// A cell may hold several links, so map over all of them.
	return td.Find("a").Map(hrefOf)
}
// ExtractFromFile reads an HTML document from r and returns the results
// produced by parse for it.
//
// When the document cannot be built from r, a nil slice is returned together
// with an error whose message starts with "Could not parse file" and includes
// the underlying cause.
func ExtractFromFile(r io.Reader) ([]Result, error) {
	doc, err := goquery.NewDocumentFromReader(r)
	if err != nil {
		// Keep the cause in the message so read failures can be diagnosed.
		return nil, errors.New("Could not parse file: " + err.Error())
	}
	return parse(doc), nil
}