// main starts the logaggregator server, first attempting to load a snapshot
// of buffered messages from the current cluster leader (if one exists).
func main() {
	defer shutdown.Exit()

	apiPort := os.Getenv("PORT")
	if apiPort == "" {
		apiPort = "5000"
	}

	logAddr := flag.String("logaddr", ":3000", "syslog input listen address")
	apiAddr := flag.String("apiaddr", ":"+apiPort, "api listen address")
	flag.Parse()

	conf := ServerConfig{
		SyslogAddr:  *logAddr,
		ApiAddr:     *apiAddr,
		Discoverd:   discoverd.DefaultClient,
		ServiceName: "logaggregator",
	}

	srv := NewServer(conf)
	shutdown.BeforeExit(srv.Shutdown)

	// get leader for snapshot (if any)
	leader, err := conf.Discoverd.Service(conf.ServiceName).Leader()
	if err == nil {
		host, _, _ := net.SplitHostPort(leader.Addr)
		log15.Info("loading snapshot from leader", "leader", host)

		c, err := client.New("http://" + host)
		if err != nil {
			log15.Error("error creating client for leader", "error", err)
		} else {
			snapshot, err := c.GetSnapshot()
			if err == nil {
				if err := srv.LoadSnapshot(snapshot); err != nil {
					log15.Error("error receiving snapshot from leader", "error", err)
				}
				snapshot.Close()
			} else {
				log15.Error("error getting snapshot from leader", "error", err)
			}
		}
	} else {
		log15.Info("error finding leader for snapshot", "error", err)
	}

	if err := srv.Start(); err != nil {
		shutdown.Fatal(err)
	}

	// block forever
	<-make(chan struct{})
}
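// addAggregator dials the aggregator at addr, replays any buffered log files
// the aggregator has not yet seen (based on the cursors it reports), and then
// streams live messages from the firehose, skipping any that were already
// sent during the file replay.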
func (m *Mux) addAggregator(addr string) {
	l := m.logger.New("fn", "addAggregator", "addr", addr)
	// TODO(titanous): add dial timeout
	conn, err := net.Dial("tcp", addr)
	if err != nil {
		l.Error("failed to connect to aggregator", "error", err)
		return
	}
	l.Info("connected to aggregator")

	host, _, _ := net.SplitHostPort(addr)
	c, err := client.New("http://" + host)
	if err != nil {
		l.Error("failed to create aggregator client", "error", err)
		conn.Close()
		return
	}
	cursors, err := c.GetCursors()
	if err != nil {
		// TODO(titanous): retry
		l.Error("failed to get cursors from aggregator", "error", err)
		conn.Close()
		return
	}

	var aggCursor *utils.HostCursor
	if c, ok := cursors[m.hostID]; ok {
		aggCursor = &c
	}
	if aggCursor != nil {
		l.Info("got cursor", "cursor.timestamp", aggCursor.Time, "cursor.seq", aggCursor.Seq)
	} else {
		l.Info("no cursor for host")
	}

	appLogs, err := m.logFiles("")
	if err != nil {
		l.Error("failed to get local log files", "error", err)
		conn.Close()
		return
	}

	bufferedMessages := make(chan message)
	firehose := make(chan message)
	done := make(chan struct{})

	// subscribe to all messages
	unsubscribe := m.subscribe(firehoseApp, firehose)

	bufferCursors := make(map[string]utils.HostCursor)
	var bufferCursorsMtx sync.Mutex
	go func() {
		l := m.logger.New("fn", "sendToAggregator", "addr", addr)
		defer unsubscribe()
		defer conn.Close()
		defer close(done)
		bm := bufferedMessages // make a copy so we can nil it later
		for {
			var m message
			var ok bool
			select {
			case m, ok = <-bm:
				if !ok {
					// file replay is finished, only stream the firehose now
					bm = nil
					continue
				}
			case m, ok = <-firehose:
				if !ok {
					return
				}
				// if the app's logs are being replayed from files, skip
				// firehose messages already covered by the file cursor
				appID := string(m.Message.AppName)
				if _, ok := appLogs[appID]; ok {
					bufferCursorsMtx.Lock()
					c, ok := bufferCursors[appID]
					bufferCursorsMtx.Unlock()
					if !ok || c.After(*m.HostCursor) {
						continue
					}
				}
			}
			if _, err := conn.Write(rfc6587.Bytes(m.Message)); err != nil {
				l.Error("failed to write message", "error", err)
				return
			}
		}
	}()

	for appID, logs := range appLogs {
		for i, name := range logs {
			func() {
				l := l.New("log", name)
				f, err := os.Open(name)
				if err != nil {
					l.Error("failed to open log file", "error", err)
					return
				}
				defer f.Close()

				sc := bufio.NewScanner(f)
				sc.Split(rfc6587.SplitWithNewlines)
				var cursor *utils.HostCursor
				cursorSaved := false
			scan:
				for sc.Scan() {
					msgBytes := sc.Bytes()
					// slice in msgBytes could get modified on next Scan(),
					// need to copy it (dropping the trailing newline)
					msgCopy := make([]byte, len(msgBytes)-1)
					copy(msgCopy, msgBytes)
					var msg *rfc5424.Message
					msg, cursor, err = utils.ParseMessage(msgCopy)
					if err != nil {
						l.Error("failed to parse message", "msg", string(msgCopy), "error", err)
						continue
					}
					// skip messages the aggregator has already seen
					if aggCursor != nil && !cursor.After(*aggCursor) {
						continue
					}
					select {
					case bufferedMessages <- message{cursor, msg}:
					case <-done:
						return
					}
				}
				if err := sc.Err(); err != nil {
					l.Error("failed to scan message", "error", err)
					return
				}
				// cursor is nil if no messages were parsed from this file
				if !cursorSaved && cursor != nil && i == len(appLogs[appID])-1 {
					// last file, send cursor to processing goroutine
					bufferCursorsMtx.Lock()
					bufferCursors[appID] = *cursor
					bufferCursorsMtx.Unlock()
					cursorSaved = true
					// read to end of file again
					goto scan
				}
			}()
		}
	}
	close(bufferedMessages)
}
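// TestReplication scales the logaggregator service down and back up, killing
// the running instance along the way, and verifies that buffered logs are
// replayed to replacement aggregators so no lines are lost.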
func (s *LogAggregatorSuite) TestReplication(t *c.C) {
	app := s.newCliTestApp(t)
	app.flynn("scale", "ish=1")
	defer app.flynn("scale", "ish=0")
	defer app.cleanup()

	aggHost := "logaggregator.discoverd"
	waitForAggregator := func(wantUp bool) func() {
		ch := make(chan *discoverd.Event)
		stream, err := app.disc.Service("logaggregator").Watch(ch)
		t.Assert(err, c.IsNil)
		up := make(chan struct{})
		go func() {
			timeout := time.After(60 * time.Second)
			defer close(up)
			defer stream.Close()
			var current bool
			for {
				select {
				case <-timeout:
					t.Error("logaggregator did not come back within a minute")
					return
				case event := <-ch:
					switch {
					case event.Kind == discoverd.EventKindCurrent:
						current = true
					case !wantUp && current && event.Kind == discoverd.EventKindDown:
						return
					case wantUp && current && event.Kind == discoverd.EventKindUp:
						aggHost, _, _ = net.SplitHostPort(event.Instance.Addr)
						return
					}
				}
			}
		}()
		return func() { <-up }
	}

	// lines longer than 10000 bytes are split by the aggregator
	longLine := strings.Repeat("a", 10050)
	longLine0 := longLine[:10000]
	longLine1 := longLine[10000:]

	aggregators, err := app.disc.Instances("logaggregator", time.Second)
	t.Assert(err, c.IsNil)
	if len(aggregators) == 0 || len(aggregators) > 2 {
		t.Errorf("unexpected number of aggregators: %d", len(aggregators))
	} else if len(aggregators) == 2 {
		// scale down to a single aggregator before testing
		wait := waitForAggregator(false)
		flynn(t, "/", "-a", "logaggregator", "scale", "app=1")
		wait()
	}

	instances, err := app.disc.Instances(app.name, time.Second*100)
	t.Assert(err, c.IsNil)
	ish := instances[0]
	cc := s.controllerClient(t)

	readLines := func(expectedLines ...string) {
		lineCount := 10
		lc, err := client.New("http://" + aggHost)
		t.Assert(err, c.IsNil)
		out, err := lc.GetLog(app.id, &logagg.LogOpts{Follow: true, Lines: &lineCount})
		t.Assert(err, c.IsNil)

		done := make(chan struct{})
		var lines []string
		go func() {
			defer close(done)
			dec := json.NewDecoder(out)
			for {
				var msg client.Message
				if err := dec.Decode(&msg); err != nil {
					return
				}
				lines = append(lines, msg.Msg)
				if reflect.DeepEqual(lines, expectedLines) {
					return
				}
			}
		}()

		select {
		case <-time.After(60 * time.Second):
		case <-done:
		}
		out.Close()

		t.Assert(lines, c.DeepEquals, expectedLines)
	}

	runIshCommand(ish, "echo line1")
	runIshCommand(ish, "echo line2")
	runIshCommand(ish, "echo "+longLine)
	readLines("line1", "line2", longLine0, longLine1)

	// kill the logaggregator
	wait := waitForAggregator(true)
	jobs, err := cc.JobList("logaggregator")
	t.Assert(err, c.IsNil)
	for _, j := range jobs {
		if j.State == ct.JobStateUp {
			t.Assert(cc.DeleteJob(app.name, j.ID), c.IsNil)
		}
	}
	wait()

	// confirm that logs are replayed when it comes back
	runIshCommand(ish, "echo line3")
	readLines("line1", "line2", longLine0, longLine1, "line3")

	// start a new logaggregator
	wait = waitForAggregator(true)
	flynn(t, "/", "-a", "logaggregator", "scale", "app=2")
	wait()

	// confirm that logs show up in the new aggregator
	runIshCommand(ish, "echo line4")
	readLines("line1", "line2", longLine0, longLine1, "line3", "line4")
}