Example #1
// TODO: handle "no such file"
func (input *S3SplitFileInput) readS3File(runner pipeline.InputRunner, d *pipeline.Deliverer, sr *pipeline.SplitterRunner, s3Key string) (err error) {
	runner.LogMessage(fmt.Sprintf("Preparing to read: %s", s3Key))
	if input.bucket == nil {
		runner.LogMessage(fmt.Sprintf("Dude, where's my bucket: %s", s3Key))
		return
	}

	var lastGoodOffset uint64
	var attempt uint32

	// Retry from the last known-good offset if a read fails partway through.
RetryS3:
	for attempt = 1; attempt <= input.S3Retries; attempt++ {
		for r := range S3FileIterator(input.bucket, s3Key, lastGoodOffset) {
			record := r.Record
			// Assign to the named return value so the last error (if any)
			// propagates to the caller instead of being lost in a shadowed local.
			err = r.Err

			if err != nil && err != io.EOF {
				runner.LogError(fmt.Errorf("Error in attempt %d reading %s at offset %d: %s", attempt, s3Key, lastGoodOffset, err))
				atomic.AddInt64(&input.processMessageFailures, 1)
				continue RetryS3
			}
			if len(record) > 0 {
				lastGoodOffset += uint64(r.BytesRead)
				atomic.AddInt64(&input.processMessageCount, 1)
				atomic.AddInt64(&input.processMessageBytes, int64(len(record)))
				(*sr).DeliverRecord(record, *d)
			}
		}
		break
	}

	return
}
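
The loop above consumes a channel produced by S3FileIterator, which is not shown in this listing. Judging from the fields used (Record, BytesRead, Err), each result carries one parsed record plus progress information. The standalone sketch below reproduces that iterator shape over a plain io.Reader; the type and function names here are assumptions for illustration, not the plugin's actual API.

// A minimal sketch of the channel-based iterator pattern readS3File consumes.
// Field names (Record, BytesRead, Err) are inferred from the usage above.
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
)

type ReadResult struct {
	Record    []byte // one parsed record
	BytesRead int    // bytes consumed from the stream for this record
	Err       error  // io.EOF on clean end-of-stream, anything else is a real error
}

// lineIterator stands in for S3FileIterator: it streams newline-delimited
// records from r and reports how many bytes each record consumed, so a
// caller can track a resume offset.
func lineIterator(r io.Reader) <-chan ReadResult {
	out := make(chan ReadResult)
	go func() {
		defer close(out)
		br := bufio.NewReader(r)
		for {
			line, err := br.ReadBytes('\n')
			if len(line) > 0 {
				out <- ReadResult{Record: line, BytesRead: len(line)}
			}
			if err != nil {
				out <- ReadResult{Err: err} // io.EOF ends the stream cleanly
				return
			}
		}
	}()
	return out
}

func main() {
	var offset uint64
	for r := range lineIterator(bytes.NewReader([]byte("a\nb\nc\n"))) {
		if r.Err != nil && r.Err != io.EOF {
			fmt.Println("read error:", r.Err)
			break
		}
		offset += uint64(r.BytesRead)
	}
	fmt.Println("bytes consumed:", offset)
}
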
Example #2
func (rpsi *RedisPubSubInput) Run(ir pipeline.InputRunner, h pipeline.PluginHelper) error {
	var (
		dRunner pipeline.DecoderRunner
		decoder pipeline.Decoder
		pack    *pipeline.PipelinePack
		e       error
		ok      bool
	)
	// Get the InputRunner's chan to receive empty PipelinePacks
	packSupply := ir.InChan()

	if rpsi.conf.DecoderName != "" {
		if dRunner, ok = h.DecoderRunner(rpsi.conf.DecoderName, fmt.Sprintf("%s-%s", ir.Name(), rpsi.conf.DecoderName)); !ok {
			return fmt.Errorf("Decoder not found: %s", rpsi.conf.DecoderName)
		}
		decoder = dRunner.Decoder()
	}

	// Subscribe to the configured channel pattern.
	psc := redis.PubSubConn{Conn: rpsi.conn}
	if err := psc.PSubscribe(rpsi.conf.Channel); err != nil {
		return fmt.Errorf("Can't subscribe to '%s': %s", rpsi.conf.Channel, err)
	}

	for {
		switch n := psc.Receive().(type) {
		case redis.PMessage:
			// Grab an empty PipelinePack from the InputRunner
			pack = <-packSupply
			pack.Message.SetType("redis_pub_sub")
			pack.Message.SetLogger(n.Channel)
			pack.Message.SetPayload(string(n.Data))
			pack.Message.SetTimestamp(time.Now().UnixNano())
			var packs []*pipeline.PipelinePack
			if decoder == nil {
				packs = []*pipeline.PipelinePack{pack}
			} else {
				packs, e = decoder.Decode(pack)
			}
			if packs != nil {
				for _, p := range packs {
					ir.Inject(p)
				}
			} else {
				if e != nil {
					ir.LogError(fmt.Errorf("Couldn't parse Redis message: %s", n.Data))
				}
				pack.Recycle(nil)
			}
		case redis.Subscription:
			ir.LogMessage(fmt.Sprintf("Subscription: %s %s %d\n", n.Kind, n.Channel, n.Count))
			if n.Count == 0 {
				// No subscriptions remain, so there is nothing left to receive.
				return errors.New("no channels subscribed")
			}
		case error:
			ir.LogError(fmt.Errorf("Error receiving from Redis: %s", n))
			return n
		}
	}

	return nil
}
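
For context, here is a standalone sketch of the same redigo receive loop outside of Heka. It assumes the older github.com/garyburd/redigo/redis API used above, where pattern-matched messages arrive as redis.PMessage; the address and channel pattern are placeholders.

// A stripped-down redigo pub/sub consumer showing the same Receive() type switch.
package main

import (
	"fmt"

	"github.com/garyburd/redigo/redis"
)

func main() {
	conn, err := redis.Dial("tcp", "localhost:6379") // placeholder address
	if err != nil {
		fmt.Println("dial:", err)
		return
	}
	defer conn.Close()

	psc := redis.PubSubConn{Conn: conn}
	if err := psc.PSubscribe("logs.*"); err != nil { // hypothetical channel pattern
		fmt.Println("psubscribe:", err)
		return
	}

	for {
		switch n := psc.Receive().(type) {
		case redis.PMessage:
			// One published message matching the pattern.
			fmt.Printf("%s: %s\n", n.Channel, n.Data)
		case redis.Subscription:
			// Subscribe/unsubscribe confirmations; Count is the number of active subscriptions.
			if n.Count == 0 {
				return
			}
		case error:
			fmt.Println("receive:", n)
			return
		}
	}
}
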
Example #3
func (s *SandboxInput) Run(ir pipeline.InputRunner, h pipeline.PluginHelper) (err error) {
	s.sb.InjectMessage(func(payload, payload_type, payload_name string) int {
		pack := <-ir.InChan()
		if err := proto.Unmarshal([]byte(payload), pack.Message); err != nil {
			pack.Recycle()
			return 1
		}
		if s.tz != time.UTC {
			const layout = "2006-01-02T15:04:05.999999999" // remove the incorrect UTC tz info
			t := time.Unix(0, pack.Message.GetTimestamp())
			t = t.In(time.UTC)
			ct, _ := time.ParseInLocation(layout, t.Format(layout), s.tz)
			pack.Message.SetTimestamp(ct.UnixNano())
		}
		ir.Inject(pack)
		atomic.AddInt64(&s.processMessageCount, 1)
		atomic.AddInt64(&s.processMessageBytes, int64(len(payload)))
		return 0
	})

	ticker := ir.Ticker()

	for {
		retval := s.sb.ProcessMessage(nil)
		if retval <= 0 { // Sandbox is in polling mode
			if retval < 0 {
				atomic.AddInt64(&s.processMessageFailures, 1)
				em := s.sb.LastError()
				if len(em) > 0 {
					ir.LogError(errors.New(em))
				}
			}
			if ticker == nil {
				ir.LogMessage("single run completed")
				break
			}
			select { // block until stop or poll interval
			case <-s.stopChan:
			case <-ticker:
			}
		} else { // Sandbox is shutting down
			em := s.sb.LastError()
			if !strings.HasSuffix(em, "shutting down") {
				ir.LogError(errors.New(em))
			}
			break
		}
	}

	s.reportLock.Lock()
	if s.sbc.PreserveData {
		err = s.sb.Destroy(s.preservationFile)
	} else {
		err = s.sb.Destroy("")
	}
	s.sb = nil
	s.reportLock.Unlock()
	return
}
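
The timezone block above relies on a round trip through a zone-less layout: format the UTC instant without zone information, then re-parse the same digits in the configured location, which shifts the underlying instant while keeping the wall-clock fields. A minimal standard-library illustration follows; America/New_York is just an example stand-in for s.tz.

// Reinterpret the same wall-clock time in a different location.
package main

import (
	"fmt"
	"time"
)

func main() {
	const layout = "2006-01-02T15:04:05.999999999"

	loc, err := time.LoadLocation("America/New_York") // example zone
	if err != nil {
		panic(err)
	}

	t := time.Unix(0, 1421341245000000000).In(time.UTC) // 2015-01-15T17:00:45Z
	ct, _ := time.ParseInLocation(layout, t.Format(layout), loc)

	fmt.Println(t)                            // the original UTC instant
	fmt.Println(ct)                           // same wall-clock digits, now in America/New_York
	fmt.Println(ct.UnixNano() - t.UnixNano()) // differs by the zone offset (5h in January)
}
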
Example #4
func (input *S3SplitFileInput) Run(runner pipeline.InputRunner, helper pipeline.PluginHelper) error {
	// Begin listing the files (either straight from S3 or from a cache)
	// Write matching filenames on a "lister" channel
	// Read from the lister channel:
	//   - fetch the filename
	//   - read records from it
	//   - write them to a "reader" channel

	var (
		wg sync.WaitGroup
		i  uint32
	)

	wg.Add(1)
	go func() {
		runner.LogMessage("Starting S3 list")
	iteratorLoop:
		for r := range S3Iterator(input.bucket, input.S3BucketPrefix, input.schema) {
			select {
			case <-input.stop:
				runner.LogMessage("Stopping S3 list")
				break iteratorLoop
			default:
			}
			if r.Err != nil {
				runner.LogError(fmt.Errorf("Error getting S3 list: %s", r.Err))
			} else {
				basename := r.Key.Key[strings.LastIndex(r.Key.Key, "/")+1:]
				if input.objectMatch == nil || input.objectMatch.MatchString(basename) {
					runner.LogMessage(fmt.Sprintf("Found: %s", r.Key.Key))
					input.listChan <- r.Key.Key
				} else {
					runner.LogMessage(fmt.Sprintf("Skipping: %s", r.Key.Key))
				}
			}
		}
		// All done listing, close the channel
		runner.LogMessage("All done listing. Closing channel")
		close(input.listChan)
		wg.Done()
	}()

	// Run a pool of concurrent readers.
	for i = 0; i < input.S3WorkerCount; i++ {
		wg.Add(1)
		go input.fetcher(runner, &wg, i)
	}
	wg.Wait()

	return nil
}
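
Run wires one listing goroutine to a pool of fetchers through input.listChan, and closing that channel is what lets the workers exit. Below is a generic sketch of the same producer/worker-pool pattern, with illustrative names rather than the plugin's.

// One producer closes a shared channel when it is done; a fixed pool of
// workers drains it; a WaitGroup waits for all of them.
package main

import (
	"fmt"
	"sync"
)

func main() {
	const workerCount = 4
	keys := make(chan string)
	var wg sync.WaitGroup

	// Producer ("lister"): emits work items, then closes the channel so the
	// workers' range loops terminate.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 10; i++ {
			keys <- fmt.Sprintf("bucket/key-%d", i)
		}
		close(keys)
	}()

	// Worker pool ("fetchers"): each worker processes keys until the channel closes.
	for w := 0; w < workerCount; w++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for k := range keys {
				fmt.Printf("worker %d handling %s\n", id, k)
			}
		}(w)
	}

	wg.Wait()
}
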
Example #5
func (input *S3SplitFileInput) fetcher(runner pipeline.InputRunner, wg *sync.WaitGroup, workerId uint32) {
	var (
		s3Key     string
		startTime time.Time
		duration  float64
	)

	fetcherName := fmt.Sprintf("S3Reader%d", workerId)
	deliverer := runner.NewDeliverer(fetcherName)
	defer deliverer.Done()
	splitterRunner := runner.NewSplitterRunner(fetcherName)

	ok := true
	for ok {
		select {
		case s3Key, ok = <-input.listChan:
			if !ok {
				// Channel is closed => we're shutting down, exit cleanly.
				// runner.LogMessage("Fetcher all done! shutting down.")
				break
			}

			startTime = time.Now().UTC()
			err := input.readS3File(runner, &deliverer, &splitterRunner, s3Key)
			atomic.AddInt64(&input.processFileCount, 1)
			leftovers := splitterRunner.GetRemainingData()
			lenLeftovers := len(leftovers)
			if lenLeftovers > 0 {
				atomic.AddInt64(&input.processFileDiscardedBytes, int64(lenLeftovers))
				runner.LogError(fmt.Errorf("Trailing data, possible corruption: %d bytes left in stream at EOF: %s", lenLeftovers, s3Key))
			}
			if err != nil && err != io.EOF {
				runner.LogError(fmt.Errorf("Error reading %s: %s", s3Key, err))
				atomic.AddInt64(&input.processFileFailures, 1)
				continue
			}
			duration = time.Now().UTC().Sub(startTime).Seconds()
			runner.LogMessage(fmt.Sprintf("Successfully fetched %s in %.2fs ", s3Key, duration))
		case <-input.stop:
			for range input.listChan {
				// Drain the channel without processing the files.
				// Technically the S3Iterator can still add one back on to the
				// channel but this ensures there is room so it won't block.
			}
			ok = false
		}
	}

	wg.Done()
}
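
The stop branch above drains listChan so the lister goroutine can never block on a full channel during shutdown. A compact, self-contained illustration of that stop-and-drain pattern (the names here are made up):

// Consumer that keeps draining its queue after a stop signal so the producer
// can finish sending and close the channel without blocking.
package main

import "fmt"

func consume(work <-chan string, stop <-chan struct{}) {
	ok := true
	for ok {
		select {
		case item, open := <-work:
			if !open {
				// Producer closed the channel: normal, clean shutdown.
				return
			}
			fmt.Println("processing", item)
		case <-stop:
			// Early shutdown: discard whatever is still queued so the
			// producer never blocks on a full channel.
			for range work {
			}
			ok = false
		}
	}
}

func main() {
	work := make(chan string, 2)
	stop := make(chan struct{})
	go func() {
		work <- "a"
		work <- "b"
		close(stop) // ask the consumer to stop early
		close(work)
	}()
	consume(work, stop)
}
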
Example #6
func (input *S3OffsetInput) fetcher(runner pipeline.InputRunner, wg *sync.WaitGroup, workerId uint32) {
	var (
		loc       MessageLocation
		startTime time.Time
		duration  float64
		headers   map[string][]string
		record    []byte
		err       error
	)

	headers = map[string][]string{
		"Range": {""},
	}

	fetcherName := fmt.Sprintf("S3Reader%d", workerId)
	deliverer := runner.NewDeliverer(fetcherName)
	defer deliverer.Done()
	splitterRunner := runner.NewSplitterRunner(fetcherName)

	ok := true
	for ok {
		select {
		case loc, ok = <-input.offsetChan:
			if !ok {
				// Channel is closed => we're shutting down, exit cleanly.
				runner.LogMessage("Fetcher all done! shutting down.")
				break
			}

			startTime = time.Now().UTC()
			// Read one message from the given location
			headers["Range"][0] = fmt.Sprintf("bytes=%d-%d", loc.Offset, loc.Offset+loc.Length-1)
			atomic.AddInt64(&input.processMessageCount, 1)
			atomic.AddInt64(&input.processMessageBytes, int64(loc.Length))
			for attempt := uint32(1); attempt <= input.S3Retries; attempt++ {
				record, err = getClientRecord(input.bucket, &loc, headers)
				if err != nil {
					runner.LogError(fmt.Errorf("Error #%d fetching %s @ %d+%d: %s", attempt, loc.Key, loc.Offset, loc.Length, err))
				} else {
					break
				}
			}
			if err != nil {
				atomic.AddInt64(&input.processMessageFailures, 1)
				continue
			}
			splitterRunner.DeliverRecord(record, deliverer)
			duration = time.Now().UTC().Sub(startTime).Seconds()
			runner.LogMessage(fmt.Sprintf("Successfully fetched %s in %.2fs ", loc.Key, duration))

		case <-input.stop:
			runner.LogMessage("Stopping fetcher...")
			for range input.offsetChan {
				// Drain the channel without processing anything.
			}
			ok = false
		}
	}

	wg.Done()
}
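
getClientRecord is not shown in this listing, but the Range header built above is the standard HTTP byte-range form, inclusive at both ends. Here is a sketch of the same ranged read using plain net/http against a placeholder URL; the function name and parameter types are illustrative only.

// Fetch a single byte range with an HTTP Range request.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// fetchRange reads length bytes starting at offset from url.
func fetchRange(url string, offset, length uint32) ([]byte, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	// Range is inclusive on both ends, hence the -1.
	req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", offset, offset+length-1))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusPartialContent {
		return nil, fmt.Errorf("expected 206 Partial Content, got %s", resp.Status)
	}
	return ioutil.ReadAll(resp.Body)
}

func main() {
	record, err := fetchRange("https://example.com/data.bin", 1024, 256) // placeholder URL
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("read %d bytes\n", len(record))
}
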
Example #7
func (input *S3OffsetInput) Run(runner pipeline.InputRunner, helper pipeline.PluginHelper) error {
	// List offset metadata index files
	// For each index D >= start and <= end
	//   Read index D
	//   Write offsets for any desired clients to offsetChan
	// Meanwhile, for each item in offsetChan
	//   Go fetch that record, inject resulting message into pipeline.

	var (
		wg          sync.WaitGroup
		i           uint32
		emptySchema Schema
	)

	if input.metaFileName != "" {
		wg.Add(1)
		go func() {
			defer wg.Done()
			reader, err := os.Open(input.metaFileName)
			if err != nil {
				runner.LogError(fmt.Errorf("Error opening metadata file '%s': %s", input.metaFileName, err))
				// Close the channel anyway so the fetcher pool can exit.
				close(input.offsetChan)
				return
			}
			defer reader.Close()
			err = input.parseMessageLocations(reader, input.metaFileName)
			if err != nil {
				runner.LogError(fmt.Errorf("Error reading metadata: %s", err))
			}
			// All done with metadata, close the channel
			runner.LogMessage("All done with metadata. Closing channel")
			close(input.offsetChan)
		}()
	} else if input.metaBucket != nil {
		wg.Add(1)
		go func() {
			runner.LogMessage("Starting S3 list")
		iteratorLoop:
			for r := range S3Iterator(input.metaBucket, input.S3MetaBucketPrefix, emptySchema) {
				select {
				case <-input.stop:
					runner.LogMessage("Stopping S3 list")
					break iteratorLoop
				default:
				}
				if r.Err != nil {
					runner.LogError(fmt.Errorf("Error getting S3 list: %s", r.Err))
				} else {
					base := path.Base(r.Key.Key)[0:8]
					// Check if r is in the desired date range.
					if base >= input.StartDate && base <= input.EndDate {
						err := input.grep(r)
						if err != nil {
							runner.LogMessage(fmt.Sprintf("Error reading index: %s", err))
						}
					}
				}
			}
			// All done listing, close the channel
			runner.LogMessage("All done listing. Closing channel")
			close(input.offsetChan)
			wg.Done()
		}()
	} else {
		runner.LogMessage("Nothing to do, no metadata available. Closing channel")
		close(input.offsetChan)
		// Note: no wg.Done() here; this branch never called wg.Add().
	}

	// Run a pool of concurrent readers.
	for i = 0; i < input.S3WorkerCount; i++ {
		wg.Add(1)
		go input.fetcher(runner, &wg, i)
	}
	wg.Wait()

	return nil
}
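
The date filter above works because the index file names are assumed to begin with a YYYYMMDD stamp, so lexicographic string comparison against StartDate and EndDate is also chronological comparison. A small sketch with made-up key names:

// Filter S3-style keys by an 8-character date prefix on the base name.
package main

import (
	"fmt"
	"path"
)

func inRange(key, start, end string) bool {
	base := path.Base(key)
	if len(base) < 8 {
		return false // guard: name too short to carry a date stamp
	}
	stamp := base[:8]
	return stamp >= start && stamp <= end
}

func main() {
	keys := []string{
		"meta/idx/20150114_index.json",
		"meta/idx/20150120_index.json",
		"meta/idx/20150201_index.json",
	}
	for _, k := range keys {
		fmt.Println(k, inRange(k, "20150115", "20150131"))
	}
}
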