/
ner.go
143 lines (123 loc) · 3.42 KB
/
ner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package ner
/*
#cgo LDFLAGS: -lmitie
#include <stdlib.h>
#include <stdio.h>
#include "mitie.h"

// ner_arr_make allocates a zero-initialized array of size char pointers.
// Returns NULL when size is not positive or when the allocation fails.
static char** ner_arr_make(int size) {
	if (size <= 0) {
		return NULL;
	}
	// calloc takes (element count, element size), in that order.
	return calloc((size_t)size, sizeof(char*));
}

// ner_arr_set stores s in slot n of a. No bounds checking is performed.
static void ner_arr_set(char **a, char *s, int n) {
	a[n] = s;
}

// ner_arr_free frees the first size entries of a and then a itself.
// A NULL array is ignored; NULL entries are harmless since free(NULL) is a
// no-op.
static void ner_arr_free(char **a, int size) {
	int i;
	if (a == NULL) {
		return;
	}
	for (i = 0; i < size; i++) {
		free(a[i]);
	}
	free(a);
}
*/
import "C"
import (
"errors"
"strings"
"unsafe"
)
var (
	// ErrMemory reports that an underlying C structure could not be
	// allocated.
	ErrMemory = errors.New("Could not allocate memory")

	// ErrCantOpen is the error NewExtractor returns when the language model
	// file cannot be loaded.
	ErrCantOpen = errors.New("Unable to open model file")
)
// Tokenize returns a slice that contains a tokenized copy of the input text.
//
// The text is handed to mitie_tokenize and every resulting C string is copied
// into a Go string, so the returned slice does not reference C memory. If
// mitie_tokenize returns NULL, Tokenize returns nil.
func Tokenize(text string) []string {
	cs := C.CString(text)
	defer C.free(unsafe.Pointer(cs))

	ctokens := C.mitie_tokenize(cs)
	if ctokens == nil {
		// Nothing to walk and nothing to free.
		return nil
	}
	defer C.mitie_free(unsafe.Pointer(ctokens))

	tokens := make([]string, 0, 20)
	// The result is a NULL-terminated array of C strings. Walk it one pointer
	// at a time instead of casting to a giant fixed-size array type, which
	// does not compile on 32-bit targets.
	stride := unsafe.Sizeof(*ctokens)
	for p := unsafe.Pointer(ctokens); ; p = unsafe.Pointer(uintptr(p) + stride) {
		cstr := *(**C.char)(p)
		if cstr == nil {
			break
		}
		tokens = append(tokens, C.GoString(cstr))
	}
	return tokens
}
// Range locates an Entity inside a token slice.
type Range struct {
	// Start is the index of the entity's first token. End, as filled in by
	// Extract, is Start plus the number of tokens, i.e. one past the last.
	Start, End int
}
// Entity is a detected entity, as produced by Extractor.Extract.
type Entity struct {
// Score is the detection score reported by MITIE for this entity.
Score float64
// Tag is the numeric tag reported by MITIE for this entity.
Tag int
// TagString is presumably the textual name of Tag (compare Extractor.Tags).
// NOTE(review): confirm where this field is populated.
TagString string
// Name is the entity text: its tokens joined by single spaces.
Name string
// Range locates the entity's tokens within the slice passed to Extract.
Range Range
}
// Extractor detects entities based on a language model file.
//
// It wraps a MITIE named entity extractor handle created by NewExtractor.
// The handle lives in C memory; call Free to release it.
type Extractor struct {
// ner is the C handle returned by mitie_load_named_entity_extractor.
ner *C.mitie_named_entity_extractor
}
// NewExtractor loads the language model stored at path and wraps it in an
// Extractor. It returns ErrCantOpen when MITIE hands back no extractor.
func NewExtractor(path string) (*Extractor, error) {
	cpath := C.CString(path)
	defer C.free(unsafe.Pointer(cpath))

	ext := &Extractor{ner: C.mitie_load_named_entity_extractor(cpath)}
	if ext.ner == nil {
		return nil, ErrCantOpen
	}
	return ext, nil
}
// Free frees the underlying used C memory.
//
// Free is safe to call more than once and on a nil receiver: the handle is
// cleared after it has been released, so a repeated call is a no-op instead
// of a double free. The Extractor must not be used after Free.
func (ext *Extractor) Free() {
	if ext == nil || ext.ner == nil {
		return
	}
	C.mitie_free(unsafe.Pointer(ext.ner))
	ext.ner = nil
}
// Tags lists the tag names known to this language model, ordered by tag
// index (for example PERSON or LOCATION).
func (ext *Extractor) Tags() []string {
	count := int(C.mitie_get_num_possible_ner_tags(ext.ner))
	names := make([]string, 0, count)
	for idx := 0; idx < count; idx++ {
		names = append(names, ext.tagString(idx))
	}
	return names
}
// tagString returns the name of the tag at the given index, copied into a Go
// string. The C string returned by MITIE is not freed here.
func (ext *Extractor) tagString(index int) string {
	ctag := C.mitie_get_named_entity_tagstr(ext.ner, C.ulong(index))
	return C.GoString(ctag)
}
// Extract runs the extractor and returns a slice of Entities found in the
// given tokens.
//
// The tokens are copied into a NULL-terminated C string array for the
// duration of the call. Each returned Entity carries its score, numeric tag,
// tag name, the matched tokens joined by spaces, and the half-open token
// range [Start, End) it covers.
//
// Extract returns ErrMemory when the C token array cannot be allocated or
// when MITIE returns no detections object.
func (ext *Extractor) Extract(tokens []string) ([]Entity, error) {
	// One extra slot for the NULL terminator; ner_arr_make zero-initializes
	// the array, so the last slot stays NULL.
	size := C.int(len(tokens)) + 1
	ctokens := C.ner_arr_make(size)
	if ctokens == nil {
		return nil, ErrMemory
	}
	defer C.ner_arr_free(ctokens, size)
	for i, t := range tokens {
		// The C copy of each token is released by ner_arr_free.
		C.ner_arr_set(ctokens, C.CString(t), C.int(i))
	}

	dets := C.mitie_extract_entities(ext.ner, ctokens)
	if dets == nil {
		return nil, ErrMemory
	}
	defer C.mitie_free(unsafe.Pointer(dets))

	n := int(C.mitie_ner_get_num_detections(dets))
	entities := make([]Entity, n)
	for i := range entities {
		idx := C.ulong(i)
		pos := int(C.mitie_ner_get_detection_position(dets, idx))
		length := int(C.mitie_ner_get_detection_length(dets, idx))
		tag := int(C.mitie_ner_get_detection_tag(dets, idx))
		entities[i] = Entity{
			Score:     float64(C.mitie_ner_get_detection_score(dets, idx)),
			Tag:       tag,
			TagString: ext.tagString(tag),
			Name:      strings.Join(tokens[pos:pos+length], " "),
			Range:     Range{Start: pos, End: pos + length},
		}
	}
	return entities, nil
}