Esempio n. 1
0
// List sends every object listed under output.Bucket / output.Prefix to ch
// and closes ch when the listing is finished.
//
// Objects are requested in pages of at most 1000 keys. Within each page the
// objects are sent in the order given by rand.Perm, so the order of each
// listing batch is randomized; pages themselves are still fetched in listing
// order. A listing error is not returned to the caller: List panics with it.
func List(ss3 s3.Interface, output *StepLocation, ch chan s3.ListedObject) {
	// Close on every exit path (including the panic below) so goroutines
	// ranging over ch are always released.
	defer close(ch)

	var marker string
	for {
		r, err := ss3.List(s3.ListRequest{MaxKeys: 1000, Bucket: output.Bucket, Prefix: output.Prefix, Marker: marker})
		if err != nil {
			panic(err)
		}

		for _, i := range rand.Perm(len(r.Contents)) {
			ch <- s3.ListedObject{
				ListBucketResultContents: r.Contents[i],
				Bucket:                   output.Bucket,
			}
		}

		// A truncated page without contents offers no key to continue from;
		// stop instead of indexing into an empty slice.
		if !r.IsTruncated || len(r.Contents) == 0 {
			return
		}

		// The next page starts after the last key of this one (in listing
		// order, not in shuffled order).
		marker = r.Contents[len(r.Contents)-1].Key
	}
}
Esempio n. 2
0
// LoadLines3 enables transactional processing of files: it lists the objects
// under output and feeds their lines to per-file handlers obtained from proc,
// using `threads` worker goroutines.
//
// For each listed object:
//   - proc.ForFile(url, size) supplies the line handler; a nil handler skips
//     the object.
//   - The object is fetched (and gunzipped when its key ends in ".gz"), each
//     line is parsed with ParseLine and passed to the handler.
//   - If the fetch or the scan fails, proc.Failure(url, size, err) is called;
//     a non-nil handler returned from it triggers another attempt on the same
//     object, a nil one abandons the object.
//   - If the whole object is scanned without error, proc.Success(url) is
//     called.
//
// LoadLines3 returns once the listing is complete and all workers are done.
func LoadLines3(ss3 s3.Interface, output *StepLocation, threads int, proc FileProcessor) {
	var wg sync.WaitGroup
	ch := make(chan s3.ListedObject)
	for i := 0; i < threads; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for o := range ch {
				fn := o.Object().Url()
				p := proc.ForFile(fn, o.Size)
				// Each iteration is one attempt; the loop ends when p
				// becomes nil (success, or Failure declined a retry).
				for p != nil {
					r, err := ss3.Get(s3.GetRequest{Object: o.Object()})
					if err != nil {
						p = proc.Failure(fn, o.Size, err)
						continue
					}

					// Keep a handle on the raw body: closing a gzip reader
					// does not close the reader it wraps.
					body := r
					gzipped := strings.HasSuffix(o.Key, ".gz")
					if gzipped {
						r, err = gzip.NewReader(body)
						if err != nil {
							body.Close()
						}
						// NOTE(review): check is defined elsewhere; presumably
						// it aborts on a non-nil error — confirm.
						check(err)
					}

					scanner := bufio.NewScanner(r)
					for scanner.Scan() {
						kv := ParseLine(scanner.Text())
						p(&kv)
					}
					err = scanner.Err()

					// Release the readers before reporting the outcome so
					// neither path leaks the underlying body.
					r.Close()
					if gzipped {
						body.Close()
					}

					if err != nil {
						p = proc.Failure(fn, o.Size, err)
						continue
					}
					proc.Success(fn)
					p = nil
				}
			}
		}()
	}
	// List closes ch when the listing ends, which lets the workers finish.
	List(ss3, output, ch)
	wg.Wait()
}
Esempio n. 3
0
// LoadLines2 lists the objects under output, reads those accepted by decider
// with `threads` worker goroutines, and passes every parsed line to f together
// with the URL of the object it came from.
//
// decider is called with each object's URL; objects for which it returns false
// are not fetched. Objects whose key ends in ".gz" are gunzipped. Lines are
// parsed with ParseLine. All calls to f are made from one dedicated goroutine,
// so f itself is never invoked concurrently. Fetch and gzip errors go through
// check (defined elsewhere; presumably it aborts on a non-nil error — confirm),
// and a scan error panics.
//
// LoadLines2 returns after the listing is complete, all workers are done, and
// f has been called for every line they produced.
func LoadLines2(ss3 s3.Interface, output *StepLocation, threads int, decider UrlDeciderFunc, f func(string, *KeyValue)) {
	var wg, wg2 sync.WaitGroup
	ch2 := make(chan *FileKeyValue)
	ch := make(chan s3.ListedObject)

	// Single consumer that serializes the calls to f.
	wg2.Add(1)
	go func() {
		defer wg2.Done()
		for fkv := range ch2 {
			f(fkv.Filename, fkv.Item)
		}
	}()

	// load handles one object. It is a separate function so the deferred
	// Close runs as soon as that object is finished, rather than piling up
	// until the worker goroutine exits.
	load := func(o s3.ListedObject) {
		fn := o.Object().Url()
		if !decider(fn) {
			return
		}

		body, err := ss3.Get(s3.GetRequest{Object: o.Object()})
		check(err)
		defer body.Close()

		r := body
		if strings.HasSuffix(o.Key, ".gz") {
			r, err = gzip.NewReader(body)
			check(err)
		}

		scanner := bufio.NewScanner(r)
		for scanner.Scan() {
			kv := ParseLine(scanner.Text())
			ch2 <- &FileKeyValue{
				Filename: fn,
				Item:     &kv,
			}
		}
		if err := scanner.Err(); err != nil {
			panic(err)
		}
	}

	for i := 0; i < threads; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for o := range ch {
				load(o)
			}
		}()
	}

	// List closes ch when the listing ends, which lets the workers finish;
	// only then is it safe to close ch2 and wait for the consumer to drain.
	List(ss3, output, ch)
	wg.Wait()
	close(ch2)
	wg2.Wait()
}