// Command urlclassify reads a list of URLs and a keyword-to-category CSV
// dictionary, matches the keywords against each lower-cased URL, and prints one
// "url,category" line for every distinct category found in a URL.
package main
import (
"os"
"log"
"fmt"
"io"
"bufio"
"strings"
"encoding/csv"
"github.com/cloudflare/ahocorasick"
"github.com/deckarep/golang-set"
)
// createKeywordDictionaryFromCSVFile loads a keyword-to-category dictionary
// from the CSV file at name.
//
// The first field of each record is taken as a keyword and the second as its
// category; any further fields are ignored. Both values are lower-cased, and
// the category is additionally mapped to itself so that it can be matched like
// any other keyword. Records with fewer than two fields carry no category and
// are skipped.
//
// The process is terminated through log.Fatal/log.Fatalf when the file cannot
// be opened or a record cannot be read or parsed.
func createKeywordDictionaryFromCSVFile(name string) map[string]string {
	f, err := os.Open(name)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	r := csv.NewReader(bufio.NewReader(f))
	// Rows may have differing field counts; only the first two fields are used,
	// so a field-count mismatch must not be reported as an error.
	r.FieldsPerRecord = -1

	m := make(map[string]string)
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any other error yields no usable record; indexing it would panic.
			log.Fatalf("reading %s: %v", name, err)
		}
		if len(record) < 2 {
			// No category column on this row.
			continue
		}

		category := strings.ToLower(record[1])
		// Map the keyword to the category word...
		m[strings.ToLower(record[0])] = category
		// ...and the category word to itself.
		m[category] = category
	}
	return m
}
// createListFromCSVFile returns the first field of every record in the CSV
// file at name, in file order. Any further fields on a row are ignored.
//
// The returned slice is non-nil even when the file holds no records. The
// process is terminated through log.Fatal/log.Fatalf when the file cannot be
// opened or a record cannot be read or parsed.
func createListFromCSVFile(name string) []string {
	f, err := os.Open(name)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	r := csv.NewReader(bufio.NewReader(f))
	// Only the first field is used, so rows with differing field counts are
	// accepted rather than reported as errors.
	r.FieldsPerRecord = -1

	a := make([]string, 0)
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			// Any other error yields no usable record; indexing it would panic.
			log.Fatalf("reading %s: %v", name, err)
		}
		a = append(a, record[0])
	}
	return a
}
// getKeys collects every key of m into a new slice. The order of the keys
// follows Go's map iteration order and is therefore unspecified. The result is
// non-nil even for an empty map.
func getKeys(m map[string]string) []string {
	keys := make([]string, 0, len(m))
	for key := range m {
		keys = append(keys, key)
	}
	return keys
}
// getKeywords returns the keywords of the dictionary m, i.e. its keys, as
// produced by getKeys.
func getKeywords(m map[string]string) []string {
	keywords := getKeys(m)
	return keywords
}
// getUniqueCategoriesFromHits resolves each hit to a category and returns the
// distinct categories.
//
// Every element of hits is used as an index into keywords; the keyword found
// there is looked up in dict to obtain its category. Categories are gathered
// in a set, so each appears once in the result.
func getUniqueCategoriesFromHits(hits []int, keywords []string, dict map[string]string) []interface{} {
	categories := mapset.NewSet()
	for i := range hits {
		keyword := keywords[hits[i]]
		categories.Add(dict[keyword])
	}
	return categories.ToSlice()
}
// main loads the URL list and the keyword dictionary from fixed paths under
// ./data, matches the dictionary keywords against each lower-cased URL, and
// prints one "url,category" line to standard output per distinct category
// found in that URL.
func main() {
	// Inputs: the URL list is read first, then the keyword dictionary.
	urls := createListFromCSVFile("./data/newurls.csv")
	dict := createKeywordDictionaryFromCSVFile("./data/keywords.csv")

	// The same keywords slice is handed to the matcher and used afterwards to
	// resolve hits, which are treated as indexes into it.
	keywords := getKeywords(dict)
	matcher := ahocorasick.NewStringMatcher(keywords)

	for i := range urls {
		rawURL := urls[i]
		// Dictionary keys are lower-cased, so the URL is lower-cased before matching.
		lowered := []byte(strings.ToLower(rawURL))
		hits := matcher.Match(lowered)
		for _, category := range getUniqueCategoriesFromHits(hits, keywords, dict) {
			fmt.Printf("%s,%s\n", rawURL, category)
		}
	}
}