// tagger.go
//
// Trains a "POS" HMM from the file given by -train, then tags tokens read from
// stdin (one token per line) and prints "token<TAB>tag" lines to stdout.
package main
import (
"bufio"
"container/list"
"flag"
"fmt"
"io"
"os"
"strings"
)
// ReadSentences reads tokens from stdin, one token per line, and groups them
// into sentences that are delivered on the returned channel.
//
// A sentence ends after a token that is exactly one of ".", "?", "!" or ":".
// Blank lines are skipped. Line endings may be "\n" or "\r\n". Whatever tokens
// remain when the input ends are delivered as a final sentence; an empty
// trailing sentence is never sent. The channel is closed once the input is
// exhausted or a read error occurs (the error is reported on stderr).
//
// The returned error is always nil; it is kept so the signature stays
// compatible with existing callers.
func ReadSentences() (sentences chan *list.List, err error) {
	sentences = make(chan *list.List)
	go func() {
		defer close(sentences)

		reader := bufio.NewReader(os.Stdin)
		sentence := list.New()
		for {
			line, readErr := reader.ReadString('\n')

			// ReadString returns the data read so far even when it fails, so
			// the token must be extracted before the error is examined;
			// otherwise a last line without a trailing newline is lost.
			token := strings.TrimRight(line, "\r\n")
			if token != "" {
				sentence.PushBack(token)
			}

			if readErr != nil {
				if readErr != io.EOF {
					// stdout carries the tagged output, so diagnostics go to stderr.
					fmt.Fprintln(os.Stderr, "Error reading tokens from stdin:", readErr)
				}
				// Only flush a sentence that actually holds tokens.
				if sentence.Len() > 0 {
					sentences <- sentence
				}
				return
			}

			// A lone punctuation token closes the current sentence.
			if len(token) == 1 && strings.ContainsAny(token, ".?!:") {
				sentences <- sentence
				sentence = list.New()
			}
		}
	}()
	return sentences, nil
}
// main trains the "POS" HMM from the file given by -train, then tags every
// sentence read from stdin and prints one "token<TAB>tag" line per token to
// stdout.
//
// Flags:
//
//	-train   path to the training data (required)
//	-window  context window width passed to NewHMM (default 3)
//
// All diagnostics and the final token count go to stderr so that stdout
// contains only tagger output. The process exits with status 1 on any error.
func main() {
	trainPath := flag.String("train", "", "Path to training data")
	window := flag.Int("window", 3, "Context window width")
	flag.Parse()

	if *trainPath == "" {
		fmt.Fprint(os.Stderr, "\nYou will have to at least supply a path to the training data.\nThe input to tag will be read from stdin and printed to stdout.\n")
		os.Exit(1)
	}

	posModel := NewHMM("POS", *window)
	instances, trainErrs := ReadTrainingData(*trainPath)
	if err := posModel.Train(instances, trainErrs); err != nil {
		fmt.Fprintf(os.Stderr, "Training error: %v\n", err)
		os.Exit(1)
	}

	// Disabled debugging / tokenizer code, kept from the original for reference.
	//posModel.PrintModel()
	//os.Exit(0)
	// tokenizer, err := NewTokenizer()
	// if err != nil {
	// fmt.Printf("Tokenizer error: %v\n", err)
	// os.Exit(1)
	// }
	// tokens, errors, err := tokenizer.TokenizeFile(*tagPath)
	// if err != nil {
	// fmt.Printf("Tokenizing error: %v\n", err)
	// os.Exit(1)
	// }

	sentences, err := ReadSentences()
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error reading input tokens:", err)
		os.Exit(1)
	}

	var allprocessedtokens int
	for sentence := range sentences {
		tags, processedtokens, err := posModel.TagViterbi(sentence)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Testing error: %v\n", err)
			os.Exit(1)
		}
		allprocessedtokens += processedtokens

		// Walk tokens and tags in lockstep. The tag cursor is checked as well
		// so that a short tag list yields a clear error instead of a nil
		// pointer dereference.
		token, tag := sentence.Front(), tags.Front()
		for token != nil && tag != nil {
			fmt.Printf("%s\t%s\n", token.Value, tag.Value)
			token, tag = token.Next(), tag.Next()
		}
		if token != nil {
			fmt.Fprintf(os.Stderr, "Testing error: tagger returned fewer tags than tokens (%d < %d)\n", tags.Len(), sentence.Len())
			os.Exit(1)
		}
	}
	fmt.Fprintf(os.Stderr, "Number of processed tokens: %v\n", allprocessedtokens)
}