Ejemplo n.º 1
0
// main runs the log aggregator process.
//
// It resolves the listen addresses from the PORT environment variable and the
// -logaddr / -apiaddr flags, builds the server, makes a best-effort attempt to
// seed it with a snapshot fetched from the current leader registered in
// discoverd, and then starts the server and blocks.
func main() {
	defer shutdown.Exit()

	// PORT only provides the default API port; -apiaddr can still override it.
	apiPort := os.Getenv("PORT")
	if apiPort == "" {
		apiPort = "5000"
	}

	logAddr := flag.String("logaddr", ":3000", "syslog input listen address")
	apiAddr := flag.String("apiaddr", ":"+apiPort, "api listen address")
	flag.Parse()

	conf := ServerConfig{
		SyslogAddr:  *logAddr,
		ApiAddr:     *apiAddr,
		Discoverd:   discoverd.DefaultClient,
		ServiceName: "logaggregator",
	}

	srv := NewServer(conf)
	shutdown.BeforeExit(srv.Shutdown)

	// Get leader for snapshot (if any). Loading the snapshot is best-effort:
	// every failure is logged and startup continues without it.
	func() {
		leader, err := conf.Discoverd.Service(conf.ServiceName).Leader()
		if err != nil {
			log15.Info("error finding leader for snapshot", "error", err)
			return
		}

		host, _, err := net.SplitHostPort(leader.Addr)
		if err != nil {
			log15.Error("error parsing leader address", "addr", leader.Addr, "error", err)
			return
		}
		log15.Info("loading snapshot from leader", "leader", host)

		// Only the host part is used, so the request goes to the default
		// HTTP port of the leader rather than to the port in leader.Addr.
		c, err := client.New("http://" + host)
		if err != nil {
			log15.Error("error creating client for leader", "error", err)
			return
		}

		snapshot, err := c.GetSnapshot()
		if err != nil {
			log15.Error("error getting snapshot from leader", "error", err)
			return
		}
		defer snapshot.Close()

		if err := srv.LoadSnapshot(snapshot); err != nil {
			log15.Error("error receiving snapshot from leader", "error", err)
		}
	}()

	if err := srv.Start(); err != nil {
		shutdown.Fatal(err)
	}

	// Nothing ever sends on this channel, so this blocks the main goroutine
	// indefinitely.
	<-make(chan struct{})
}
Ejemplo n.º 2
0
// addAggregator connects to the log aggregator at addr and streams this
// host's log messages to it over TCP.
//
// The steps are:
//
//  1. Dial addr and fetch the aggregator's cursors over HTTP (using only the
//     host part of addr).
//  2. Subscribe to the firehose of live messages and start a goroutine that
//     owns the connection and performs every write to it.
//  3. Replay the local log files, sending messages that are after the
//     aggregator's cursor for this host (or all of them when it has none).
//
// While the files of an app are still being replayed, live messages for that
// app are dropped by the sender goroutine; once the last file of the app has
// been read, the cursor reached is recorded and live messages that are not
// older than it are forwarded.
//
// Errors are logged, not returned. The sender goroutine closes the connection
// and unsubscribes when a write fails or the firehose channel is closed.
func (m *Mux) addAggregator(addr string) {
	l := m.logger.New("fn", "addAggregator", "addr", addr)
	// TODO(titanous): add dial timeout
	conn, err := net.Dial("tcp", addr)
	if err != nil {
		l.Error("failed to connect to aggregator", "error", err)
		return
	}
	l.Info("connected to aggregator")

	host, _, err := net.SplitHostPort(addr)
	if err != nil {
		l.Error("failed to parse aggregator address", "error", err)
		conn.Close()
		return
	}
	c, err := client.New("http://" + host)
	if err != nil {
		l.Error("failed to create aggregator client", "error", err)
		conn.Close()
		return
	}
	cursors, err := c.GetCursors()
	if err != nil {
		// TODO(titanous): retry
		l.Error("failed to get cursors from aggregator", "error", err)
		conn.Close()
		return
	}

	// aggCursor stays nil when the aggregator has no cursor for this host, in
	// which case nothing is filtered out during the file replay below.
	var aggCursor *utils.HostCursor
	if c, ok := cursors[m.hostID]; ok {
		aggCursor = &c
	}
	if aggCursor != nil {
		l.Info("got cursor", "cursor.timestamp", aggCursor.Time, "cursor.seq", aggCursor.Seq)
	} else {
		l.Info("no cursor for host")
	}

	appLogs, err := m.logFiles("")
	if err != nil {
		l.Error("failed to get local log files", "error", err)
		conn.Close()
		return
	}

	bufferedMessages := make(chan message)
	firehose := make(chan message)
	done := make(chan struct{})

	// subscribe to all messages
	unsubscribe := m.subscribe(firehoseApp, firehose)

	// bufferCursors maps an app ID to the cursor reached when its log files
	// were replayed. It is written by the replay loop and read by the sender
	// goroutine, hence the mutex.
	bufferCursors := make(map[string]utils.HostCursor)
	var bufferCursorsMtx sync.Mutex
	go func() {
		l := m.logger.New("fn", "sendToAggregator", "addr", addr)
		defer unsubscribe()
		defer conn.Close()
		defer close(done)
		bm := bufferedMessages // make a copy so we can nil it later
		for {
			var m message
			var ok bool
			select {
			case m, ok = <-bm:
				if !ok {
					// Replay finished: a nil channel is never selected, so
					// only the firehose is served from now on.
					bm = nil
					continue
				}
			case m, ok = <-firehose:
				if !ok {
					return
				}

				// if app in list of app logs and cursor from reading files, skip
				appID := string(m.Message.AppName)
				if _, ok := appLogs[appID]; ok {
					bufferCursorsMtx.Lock()
					c, ok := bufferCursors[appID]
					bufferCursorsMtx.Unlock()
					// Skip when the app's files have not been fully replayed
					// yet (no cursor recorded) or when the recorded cursor is
					// after this message.
					if !ok || c.After(*m.HostCursor) {
						continue
					}
				}
			}
			if _, err := conn.Write(rfc6587.Bytes(m.Message)); err != nil {
				l.Error("failed to write message", "error", err)
				return
			}
		}
	}()

replay:
	for appID, logs := range appLogs {
		for i, name := range logs {
			// The closure scopes the deferred f.Close() to a single file. It
			// reports true when the sender goroutine has exited, in which
			// case replaying further files would be pointless.
			aborted := func() bool {
				l := l.New("log", name)
				f, err := os.Open(name)
				if err != nil {
					l.Error("failed to open log file", "error", err)
					return false
				}
				defer f.Close()
				sc := bufio.NewScanner(f)
				sc.Split(rfc6587.SplitWithNewlines)
				// lastCursor is the cursor of the most recent message that
				// parsed successfully. It is kept by value so that an empty
				// file, or a file whose final message fails to parse, cannot
				// lead to a nil pointer dereference when it is saved below.
				var lastCursor utils.HostCursor
				cursorSaved := false
			scan:
				for sc.Scan() {
					msgBytes := sc.Bytes()
					if len(msgBytes) == 0 {
						// Guards the len-1 below against a negative size.
						continue
					}
					// slice in msgBytes could get modified on next Scan(), need to copy it.
					// The copy omits the final byte (presumably the trailing
					// newline kept by SplitWithNewlines — confirm).
					msgCopy := make([]byte, len(msgBytes)-1)
					copy(msgCopy, msgBytes)
					msg, cursor, err := utils.ParseMessage(msgCopy)
					if err != nil {
						l.Error("failed to parse message", "msg", string(msgCopy), "error", err)
						continue
					}
					if cursor != nil {
						lastCursor = *cursor
					}
					// Skip messages that are not after the aggregator's cursor.
					if aggCursor != nil && !cursor.After(*aggCursor) {
						continue
					}
					select {
					case bufferedMessages <- message{cursor, msg}:
					case <-done:
						return true
					}
				}
				if err := sc.Err(); err != nil {
					l.Error("failed to scan message", "error", err)
					return false
				}
				if !cursorSaved && i == len(logs)-1 {
					// last file, send cursor to processing goroutine.
					// When no message was parsed this stores the zero cursor,
					// which assumes a zero HostCursor is never After a real
					// one so live messages are then forwarded — TODO confirm
					// against utils.HostCursor.After.
					bufferCursorsMtx.Lock()
					bufferCursors[appID] = lastCursor
					bufferCursorsMtx.Unlock()
					cursorSaved = true
					// read to end of file again
					goto scan
				}
				return false
			}()
			if aborted {
				break replay
			}
		}
	}
	close(bufferedMessages)
}
Ejemplo n.º 3
0
// TestReplication checks that log lines written by a test app are available
// from the log aggregator, are still available after the aggregator's jobs
// are deleted and a replacement comes up, and are also served by an
// additional aggregator started afterwards.
func (s *LogAggregatorSuite) TestReplication(t *c.C) {
	app := s.newCliTestApp(t)
	app.flynn("scale", "ish=1")
	defer app.flynn("scale", "ish=0")
	defer app.cleanup()

	// aggHost is the host queried by readLines. It starts as the discoverd
	// name and is replaced by the address of each aggregator seen coming up.
	aggHost := "logaggregator.discoverd"

	// waitForAggregator starts watching logaggregator service events and
	// returns a function that blocks until an up event (wantUp) or a down
	// event (!wantUp) has been seen, or 60 seconds have passed. It must be
	// called before triggering the change so the event is not missed. Only
	// events received after the EventKindCurrent event are considered.
	waitForAggregator := func(wantUp bool) func() {
		ch := make(chan *discoverd.Event)
		stream, err := app.disc.Service("logaggregator").Watch(ch)
		t.Assert(err, c.IsNil)
		up := make(chan struct{})
		go func() {
			timeout := time.After(60 * time.Second)
			defer close(up)
			defer stream.Close()
			var current bool
			for {
				select {
				case <-timeout:
					t.Error("logaggregator did not come back within a minute")
					return
				case event, ok := <-ch:
					if !ok {
						// A closed channel would otherwise yield a nil event
						// and panic on event.Kind below.
						t.Error("logaggregator event stream closed unexpectedly")
						return
					}
					switch {
					case event.Kind == discoverd.EventKindCurrent:
						current = true
					case !wantUp && current && event.Kind == discoverd.EventKindDown:
						return
					case wantUp && current && event.Kind == discoverd.EventKindUp:
						// The write is published to the caller by close(up).
						aggHost, _, _ = net.SplitHostPort(event.Instance.Addr)
						return
					}
				}
			}
		}()
		return func() {
			<-up
		}
	}

	// The long line is expected back as two log lines: the first 10000
	// characters and the remaining 50 (presumably 10000 is the logger's
	// maximum line length — confirm).
	longLine := strings.Repeat("a", 10050)
	longLine0 := longLine[:10000]
	longLine1 := longLine[10000:]

	// Start from a single aggregator: scale down if two are running.
	aggregators, err := app.disc.Instances("logaggregator", time.Second)
	t.Assert(err, c.IsNil)
	if len(aggregators) == 0 || len(aggregators) > 2 {
		t.Errorf("unexpected number of aggregators: %d", len(aggregators))
	} else if len(aggregators) == 2 {
		wait := waitForAggregator(false)
		flynn(t, "/", "-a", "logaggregator", "scale", "app=1")
		wait()
	}

	instances, err := app.disc.Instances(app.name, time.Second*100)
	t.Assert(err, c.IsNil)
	if len(instances) == 0 {
		t.Errorf("no instances found for app %v", app.name)
		return
	}
	ish := instances[0]
	cc := s.controllerClient(t)

	// readLines follows the app's log on aggHost and asserts that the
	// messages received are exactly expectedLines. It stops reading as soon
	// as they match, when decoding fails, or after 60 seconds.
	readLines := func(expectedLines ...string) {
		lineCount := 10
		lc, err := client.New("http://" + aggHost)
		t.Assert(err, c.IsNil)
		out, err := lc.GetLog(app.id, &logagg.LogOpts{Follow: true, Lines: &lineCount})
		t.Assert(err, c.IsNil)

		done := make(chan struct{})
		var lines []string
		go func() {
			defer close(done)
			dec := json.NewDecoder(out)
			for {
				var msg client.Message
				if err := dec.Decode(&msg); err != nil {
					return
				}
				lines = append(lines, msg.Msg)
				if reflect.DeepEqual(lines, expectedLines) {
					return
				}
			}
		}()

		select {
		case <-time.After(60 * time.Second):
		case <-done:
		}
		out.Close()

		// NOTE(review): on the timeout path the decoder goroutine may still
		// be appending to lines while it is read here; waiting on done after
		// out.Close() would remove that race if Close reliably unblocks
		// Decode — confirm before changing.
		t.Assert(lines, c.DeepEquals, expectedLines)
	}

	runIshCommand(ish, "echo line1")
	runIshCommand(ish, "echo line2")
	runIshCommand(ish, "echo "+longLine)
	readLines("line1", "line2", longLine0, longLine1)

	// kill logaggregator
	wait := waitForAggregator(true)
	jobs, err := cc.JobList("logaggregator")
	t.Assert(err, c.IsNil)
	for _, j := range jobs {
		if j.State == ct.JobStateUp {
			// NOTE(review): the jobs are listed for "logaggregator" but
			// deleted under app.name — verify this is the intended app.
			t.Assert(cc.DeleteJob(app.name, j.ID), c.IsNil)
		}
	}
	wait()

	// confirm that logs are replayed when it comes back
	runIshCommand(ish, "echo line3")
	readLines("line1", "line2", longLine0, longLine1, "line3")

	// start new logaggregator
	wait = waitForAggregator(true)
	flynn(t, "/", "-a", "logaggregator", "scale", "app=2")
	wait()

	// confirm that logs show up in the new aggregator
	runIshCommand(ish, "echo line4")
	readLines("line1", "line2", longLine0, longLine1, "line3", "line4")
}