/
rebucket.go
160 lines (127 loc) · 3.25 KB
/
rebucket.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Package rebucket implements the ReBucket duplicate crash report clustering algorithm
/*
http://research.microsoft.com/en-us/groups/sa/rebucket-icse2012.pdf
*/
package rebucket
import (
"math"
"github.com/bugsnag/bugsnag-go/errors"
)
// distance computes a position-weighted dissimilarity between the stack
// traces of e1 and e2. Smaller results mean the traces are more alike.
//
// Frames are considered matching when their Name fields are equal. A match
// between frame i of e1 and frame j of e2 contributes
// exp(-c*min(i, j)) * exp(-o*|i-j|), so c discounts matches that sit further
// from the first frame and o discounts matches whose positions are offset
// from each other. A dynamic-programming table picks the best-scoring
// in-order alignment; that score is normalised by the sum of exp(-c*k) for k
// over the length of the shorter trace, and the result is one minus the
// normalised score.
//
// If either error has no stack frames the distance is 1.
func distance(e1, e2 *errors.Error, c, o float64) float64 {
	frames1 := e1.StackFrames()
	frames2 := e2.StackFrames()

	if len(frames1) == 0 || len(frames2) == 0 {
		return 1
	}

	// score[i][j] is the best alignment score of the first i frames of e1
	// against the first j frames of e2; row 0 and column 0 stay zero.
	rows, cols := len(frames1)+1, len(frames2)+1
	score := make([][]float64, rows)
	for r := range score {
		score[r] = make([]float64, cols)
	}

	for i := 1; i < rows; i++ {
		for j := 1; j < cols; j++ {
			var gain float64
			// TODO(dgryski): better 'equality' comparison here
			if frames1[i-1].Name == frames2[j-1].Name {
				depth := fmin(i-1, j-1)
				offset := fabs(i - j)
				gain = math.Exp(-c*depth) * math.Exp(-o*offset)
			}
			score[i][j] = fmax3(score[i-1][j-1]+gain, score[i-1][j], score[i][j-1])
		}
	}

	// Normalising term: the per-position weights summed over the length of
	// the shorter trace.
	shorter := min(len(frames1), len(frames2))
	var norm float64
	for k := 0; k < shorter; k++ {
		norm += math.Exp(-c * float64(k))
	}

	similarity := score[len(frames1)][len(frames2)] / norm
	return 1 - similarity
}
// Cluster is a group of errors that ClusterErrors considers similar.
type Cluster struct {
	// Idx holds the members of the cluster as indices into the errs slice
	// that was passed to ClusterErrors.
	Idx []int
}
// pair is an ordered pair of integer indices. It is used as the key of
// distanceCache (indices of two errors) and, inside ClusterErrors, to record
// the positions of the two clusters chosen for merging.
type pair struct {
	i, j int
}
// distanceCache memoises the distances already computed for pairs of error
// indices. The key is order-sensitive: pair{i, j} and pair{j, i} are stored
// as separate entries. The coefficients c and o are not part of the key, so
// a single cache should only be used with one set of coefficients.
type distanceCache map[pair]float64

// distance returns the distance recorded for p. On a miss it computes the
// distance between e1 and e2 with coefficients c and o, stores it under p
// and returns it.
func (dcache distanceCache) distance(p pair, e1, e2 *errors.Error, c, o float64) float64 {
	if cached, hit := dcache[p]; hit {
		return cached
	}

	computed := distance(e1, e2, c, o)
	dcache[p] = computed
	return computed
}
// clusterDistance returns the largest pairwise distance between any member
// of c1 and any member of c2, i.e. the distance between the two clusters is
// that of their least similar members. Members are indices into errs, c and
// o are forwarded to the pairwise distance, and dCache is consulted and
// filled for every pair that is examined.
//
// If either cluster has no members the result is negative infinity.
func clusterDistance(errs []*errors.Error, c1, c2 Cluster, c, o float64, dCache distanceCache) float64 {
	farthest := math.Inf(-1)

	for _, left := range c1.Idx {
		for _, right := range c2.Idx {
			d := dCache.distance(pair{left, right}, errs[left], errs[right], c, o)
			if d > farthest {
				farthest = d
			}
		}
	}

	return farthest
}
// ClusterErrors groups the stack traces in errs into a set of clusters.
// dthresh is the distance threshold below which two clusters are considered
// 'similar', c is a coefficient for the distance to the top frame, and o is
// a coefficient for the alignment offset.
//
// Every error starts in its own cluster. The two closest clusters whose
// distance is below dthresh are merged, and this repeats until no such pair
// remains. Each returned Cluster lists its members as indices into errs.
func ClusterErrors(errs []*errors.Error, dthresh, c, o float64) []Cluster {
	// Seed one singleton cluster per error.
	clusters := make([]Cluster, 0, len(errs))
	for i := range errs {
		clusters = append(clusters, Cluster{Idx: []int{i}})
	}

	// TODO(dgryski): Need a better algorithm for this.
	// Until we get that, cache cluster distances
	cache := make(distanceCache)

	for {
		// Scan every pair of clusters for the closest one under the threshold.
		closest := math.Inf(1)
		keep, drop := -1, -1
		for i := 0; i < len(clusters); i++ {
			for j := i + 1; j < len(clusters); j++ {
				d := clusterDistance(errs, clusters[i], clusters[j], c, o, cache)
				if d < dthresh && d < closest {
					closest, keep, drop = d, i, j
				}
			}
		}

		// Nothing left that is close enough to merge.
		if keep < 0 {
			break
		}

		// Fold the members of clusters[drop] into clusters[keep].
		clusters[keep].Idx = append(clusters[keep].Idx, clusters[drop].Idx...)

		// Remove clusters[drop] by moving the last cluster into its slot and
		// shrinking the slice by one.
		last := len(clusters) - 1
		clusters[drop] = clusters[last]
		clusters = clusters[:last]
	}

	// Copy into a right-sized slice so the caller does not hold on to the
	// leftover tail of the working slice.
	result := make([]Cluster, len(clusters))
	copy(result, clusters)
	return result
}
// fmax3 returns the largest of the three values x, y and z, using math.Max
// for each comparison.
func fmax3(x, y, z float64) float64 {
	tail := math.Max(y, z)
	return math.Max(x, tail)
}
// min returns the smaller of the two integers i and j.
func min(i, j int) int {
	if j <= i {
		return j
	}
	return i
}
// fmin returns the smaller of the two integers i and j, converted to a
// float64.
func fmin(i, j int) float64 {
	smaller := j
	if i < j {
		smaller = i
	}
	return float64(smaller)
}
// fabs returns the absolute value of the integer i as a float64.
//
// The value is converted to float64 before its sign is dropped. Negating
// first (-i) silently wraps around for the smallest representable int and
// would yield a negative result for that input.
func fabs(i int) float64 {
	return math.Abs(float64(i))
}