// randomize order of each listing batch func List(ss3 s3.Interface, output *StepLocation, ch chan s3.ListedObject) { var marker string for { r, err := ss3.List(s3.ListRequest{MaxKeys: 1000, Bucket: output.Bucket, Prefix: output.Prefix, Marker: marker}) if err != nil { panic(err) } p := rand.Perm(len(r.Contents)) for i := 0; i < len(r.Contents); i++ { v := r.Contents[p[i]] ch <- s3.ListedObject{ ListBucketResultContents: v, Bucket: output.Bucket, } } if r.IsTruncated { marker = r.Contents[len(r.Contents)-1].Key } else { break } } close(ch) }
// enables transactional processing of files func LoadLines3(ss3 s3.Interface, output *StepLocation, threads int, proc FileProcessor) { var wg sync.WaitGroup ch := make(chan s3.ListedObject) for i := 0; i < threads; i++ { wg.Add(1) go func() { for o := range ch { fn := o.Object().Url() p := proc.ForFile(fn, o.Size) for p != nil { r, err := ss3.Get(s3.GetRequest{Object: o.Object()}) if err != nil { if p = proc.Failure(fn, o.Size, err); p != nil { continue } else { break } } if strings.HasSuffix(o.Key, ".gz") { r, err = gzip.NewReader(r) check(err) } scanner := bufio.NewScanner(r) for scanner.Scan() { kv := ParseLine(scanner.Text()) p(&kv) } if err := scanner.Err(); err != nil { r.Close() if p = proc.Failure(fn, o.Size, err); p != nil { continue } else { break } } else { proc.Success(fn) p = nil } r.Close() } } wg.Done() }() } List(ss3, output, ch) wg.Wait() }
func LoadLines2(ss3 s3.Interface, output *StepLocation, threads int, decider UrlDeciderFunc, f func(string, *KeyValue)) { var wg, wg2 sync.WaitGroup ch2 := make(chan *FileKeyValue) ch := make(chan s3.ListedObject) wg2.Add(1) go func() { for fkv := range ch2 { f(fkv.Filename, fkv.Item) } wg2.Done() }() for i := 0; i < threads; i++ { wg.Add(1) go func() { for o := range ch { fn := o.Object().Url() if decider(fn) { r, err := ss3.Get(s3.GetRequest{Object: o.Object()}) check(err) defer r.Close() if strings.HasSuffix(o.Key, ".gz") { r, err = gzip.NewReader(r) check(err) } scanner := bufio.NewScanner(r) for scanner.Scan() { kv := ParseLine(scanner.Text()) ch2 <- &FileKeyValue{ Filename: fn, Item: &kv, } } if err := scanner.Err(); err != nil { panic(err) } } } wg.Done() }() } List(ss3, output, ch) wg.Wait() close(ch2) wg2.Wait() }