This repository has been archived by the owner on Sep 25, 2019. It is now read-only.
forked from facebookarchive/dvara
/
healthcheck.go
118 lines (104 loc) · 3.03 KB
/
healthcheck.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package dvara
import (
"errors"
"fmt"
"strings"
"time"
"gopkg.in/mgo.v2"
corelog "github.com/intercom/gocore/log"
)
// HealthChecker runs a periodic health check to verify that dvara is still
// connected to the replica set.
type HealthChecker struct {
	// consecutiveFailures counts failed checks in a row. HealthCheck resets it
	// to zero after a successful check and after the threshold is reached.
	consecutiveFailures uint
	// HealthCheckInterval is the ticker period used by HealthCheck. It is also
	// passed to Check as the timeout for each individual check.
	HealthCheckInterval time.Duration
	// FailedHealthCheckThreshold is the number of consecutive failures at which
	// HealthCheck calls HandleFailure. The comparison is ">=", so a threshold
	// of zero triggers HandleFailure on every tick.
	FailedHealthCheckThreshold uint
	// Cancel makes HealthCheck return once it is observed to be true; it is
	// only examined after a tick has been processed.
	// NOTE(review): this is a plain bool with no synchronization visible in
	// this file; confirm how callers set it from other goroutines.
	Cancel bool
	// syncTryChan, when non-nil, receives a non-blocking signal on every tick
	// (see tryRunReplicaChecker). It is set from HealthCheck's argument.
	syncTryChan chan<- struct{}
}
// HealthCheck runs checkable.Check once per HealthCheckInterval until Cancel
// is observed to be true. It blocks the calling goroutine for its whole
// lifetime.
//
// On every tick it first offers a non-blocking signal on the sync channel (if
// one is configured) and then calls checkable.Check, passing
// HealthCheckInterval as the timeout. Failures are counted consecutively and a
// success resets the count. When the count reaches FailedHealthCheckThreshold
// the count is reset and checkable.HandleFailure is called.
//
// A non-nil syncTryChan replaces the channel stored on the checker; nil keeps
// whatever was stored before. Cancel is only examined after a tick has been
// processed, so cancellation takes effect after the next completed check.
// time.NewTicker panics if HealthCheckInterval is not positive.
func (checker *HealthChecker) HealthCheck(checkable CheckableMongoConnector, syncTryChan chan<- struct{}) {
	ticker := time.NewTicker(checker.HealthCheckInterval)
	// Release the ticker when the loop exits; otherwise it keeps firing
	// after cancellation.
	defer ticker.Stop()

	if syncTryChan != nil {
		checker.syncTryChan = syncTryChan
	}

	// ticker.C is never closed, so this loop only ends through the return below.
	for range ticker.C {
		checker.tryRunReplicaChecker()

		if err := checkable.Check(checker.HealthCheckInterval); err != nil {
			checker.consecutiveFailures++
		} else {
			checker.consecutiveFailures = 0
		}

		if checker.consecutiveFailures >= checker.FailedHealthCheckThreshold {
			// Reset before handing over so that a HandleFailure which returns
			// starts a fresh count.
			checker.consecutiveFailures = 0
			checkable.HandleFailure()
		}

		if checker.Cancel {
			return
		}
	}
}
// tryRunReplicaChecker offers a signal on syncTryChan without ever blocking.
// It does nothing when no channel is configured, and the signal is dropped
// when the channel cannot accept it right now.
func (checker *HealthChecker) tryRunReplicaChecker() {
	if checker.syncTryChan == nil {
		return
	}

	select {
	case checker.syncTryChan <- struct{}{}:
		// Signal delivered.
	default:
		// Receiver not ready or buffer full; skip this round.
	}
}
// CheckableMongoConnector is the target driven by HealthChecker.HealthCheck:
// something that can be probed and that reacts to repeated failed probes.
type CheckableMongoConnector interface {
	// Check runs a single health check. HealthCheck passes its
	// HealthCheckInterval as timeout and counts a non-nil error as a failure.
	Check(timeout time.Duration) error
	// HandleFailure is called by HealthCheck once the number of consecutive
	// failed checks reaches FailedHealthCheckThreshold.
	HandleFailure()
}
// Check attempts to connect to Mongo through Dvara and waits at most timeout
// for the outcome.
//
// The probe itself runs in a separate goroutine (runCheck). Check returns nil
// when the probe finished without error before the timeout. Otherwise it bumps
// the "healthcheck.failed" stat, logs the reason, and returns either the
// probe's error or a timeout error.
func (r *ReplicaSet) Check(timeout time.Duration) error {
	// Capacity 1 guarantees that runCheck's single non-blocking send always
	// succeeds: the result is kept even if it is produced before this
	// goroutine is parked in the select below, and the probe goroutine can
	// still finish after a timeout when nobody is receiving any more.
	errChan := make(chan error, 1)
	go r.runCheck(errChan)

	// An explicit timer (rather than time.After) lets us release it as soon
	// as the probe answers.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	// Blocking wait for whichever comes first: probe result or timeout.
	select {
	case err := <-errChan:
		if err != nil {
			r.Stats.BumpSum("healthcheck.failed", 1)
			corelog.LogErrorMessage(fmt.Sprintf("Failed healthcheck due to %s", err))
		}
		return err
	case <-timer.C:
		r.Stats.BumpSum("healthcheck.failed", 1)
		corelog.LogErrorMessage(fmt.Sprintf("Failed healthcheck due to timeout %s", timeout))
		return errors.New("Failed due to timeout")
	}
}
// HandleFailure deliberately crashes the process: it logs the reason, bumps
// the "healthcheck.failed.panic" stat, and then panics. It never returns
// normally.
func (r *ReplicaSet) HandleFailure() {
	const (
		logMessage  = "Crashing dvara due to consecutive failed healthchecks"
		statName    = "healthcheck.failed.panic"
		panicReason = "failed healthchecks"
	)

	// Log and record the stat before panicking so both are emitted first.
	corelog.LogErrorMessage(logMessage)
	r.Stats.BumpSum(statName, 1)
	panic(panicReason)
}
// runCheck attempts to connect to Mongo through Dvara and reports the outcome
// on errChan. The connection attempt blocks; the report does not: if errChan
// cannot accept the value immediately, the result is discarded.
func (r *ReplicaSet) runCheck(errChan chan<- error) {
	// dvara opens one local port per replica set member, and we do not expect
	// more than 5 members, so probe PortStart .. PortStart+4.
	const localAddrsFormat = "127.0.0.1:%d,127.0.0.1:%d,127.0.0.1:%d,127.0.0.1:%d,127.0.0.1:%d"

	base := r.PortStart
	joined := fmt.Sprintf(localAddrsFormat, base, base+1, base+2, base+3, base+4)
	candidates := strings.Split(joined, ",")

	result := checkReplSetStatus(candidates, r.Name)

	select {
	case errChan <- result:
	default:
		// Nobody can take the result right now (e.g. the caller gave up);
		// drop it rather than block this goroutine forever.
	}
}
// checkReplSetStatus dials the given addresses as the named replica set and
// asks the resulting session for its replica set status. It returns the dial
// error if the connection cannot be established, otherwise whatever error
// replSetGetStatus reports (nil on success). The session is closed before
// returning.
func checkReplSetStatus(addrs []string, replicaSetName string) error {
	session, dialErr := mgo.DialWithInfo(&mgo.DialInfo{
		Addrs:          addrs,
		ReplicaSetName: replicaSetName,
		FailFast:       true,
		// Without the direct option, the healthcheck fails when the replica
		// set consists of secondaries only.
		Direct: true,
	})
	if dialErr != nil {
		return dialErr
	}
	defer session.Close()

	// Only the error matters here; the status document itself is ignored.
	_, statusErr := replSetGetStatus(session)
	return statusErr
}