func executeItemQuery(con *Context, q *datastore.Query, limit int, cursorStr string) ([]Item, string, error) {
	if cursor, err := datastore.DecodeCursor(cursorStr); err == nil {
		q = q.Start(cursor)
	}
	is := make([]Item, 0, limit)
	var err error
	t := q.Run(con.C)
	for {
		var i Item
		_, err = t.Next(&i)
		if err == datastore.Done {
			break
		}
		// check the error before appending so we don't keep a zero-value item
		if err != nil {
			con.Log.Errorf("Error fetching next item: %v", err)
			return nil, "", err
		}
		is = append(is, i)
	}
	var cursor datastore.Cursor
	if cursor, err = t.Cursor(); err == nil {
		return is, cursor.String(), nil
	}
	return nil, "", err
}
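// A usage sketch (hypothetical handler, not part of the code above). Note the
// limit must also be applied to the query itself or the iterator runs to the
// end of the kind; the cursor string round-trips via the request so the client
// can ask for the next page. Context (with its C and Log fields) and Item are
// the types the surrounding code assumes.
func listItems(con *Context, w http.ResponseWriter, r *http.Request) {
	const pageSize = 20
	q := datastore.NewQuery("Item").Limit(pageSize)
	items, next, err := executeItemQuery(con, q, pageSize, r.FormValue("cursor"))
	if err != nil {
		http.Error(w, "query failed", http.StatusInternalServerError)
		return
	}
	// return the items plus the cursor the client should send back for the next page
	json.NewEncoder(w).Encode(struct {
		Items []Item `json:"items"`
		Next  string `json:"next"`
	}{items, next})
}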
// createQuery builds a range query over keys. It handles half-open
// [start,end) intervals, unbounded [start,nil) intervals and
// single-key [start,start] lookups. The returned query is sorted
// by __key__ and limited to BatchSize.
func createQuery(start, end *datastore.Key, cur datastore.Cursor) *datastore.Query {
	q := datastore.NewQuery(start.Kind())
	if start.Equal(end) {
		q = q.Filter("__key__ =", start)
	} else {
		q = q.Filter("__key__ >=", start)
		if end != nil {
			q = q.Filter("__key__ <", end)
		}
	}
	if cur.String() != "" {
		q = q.Start(cur)
	}
	q = q.Order("__key__").Limit(BatchSize)
	return q
}
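// A sketch of the three interval forms; a zero datastore.Cursor stringifies
// to "" so no Start offset is applied (the function name is illustrative):
func exampleRanges(first, last *datastore.Key) {
	single := createQuery(first, first, datastore.Cursor{}) // __key__ = first
	half := createQuery(first, last, datastore.Cursor{})    // first <= __key__ < last
	open := createQuery(first, nil, datastore.Cursor{})     // __key__ >= first, unbounded
	_, _, _ = single, half, open
}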
func process(c context.Context, processor Processor, start string) error {
	// use the full 10 minutes allowed (assuming a front-end instance type)
	var cancel context.CancelFunc
	c, cancel = context.WithTimeout(c, 10*time.Minute)
	defer cancel()

	// get the query to iterate and the entity slot to load (nil for keys-only)
	q, e := processor.Start(c)

	var cursor *datastore.Cursor
	if start != "" {
		newCursor, err := datastore.DecodeCursor(start)
		if err != nil {
			log.Errorf(c, "get start cursor error %s", err.Error())
			return err
		}
		cursor = &newCursor
	}

	// signal a timeout after 5 minutes
	timeout := make(chan bool, 1)
	timer := time.AfterFunc(5*time.Minute, func() {
		timeout <- true
	})
	defer timer.Stop()

	// TODO: error handling to retry
Loop:
	for {
		// check if we've timed out or whether to keep going
		select {
		case <-timeout:
			break Loop
		default:
		}

		processed := 0
		if cursor != nil {
			q = q.Start(*cursor)
		}
		it := q.Run(c)
		for {
			key, err := it.Next(e)
			if err == datastore.Done {
				break
			}
			if err != nil {
				log.Errorf(c, "get key error %s", err.Error())
				return err
			}
			processor.Process(c, key)
			processed++
		}

		// did we process any?
		if processed > 0 {
			newCursor, err := it.Cursor()
			if err != nil {
				log.Errorf(c, "get next cursor error %s", err.Error())
				return err
			}
			cursor = &newCursor
		} else {
			// otherwise we're finished
			cursor = nil
			break
		}
	}

	// let the processor write any aggregation entries / tasks etc...
	processor.Complete(c)

	// if we didn't complete everything then continue from the cursor
	if cursor != nil {
		return processFunc.Call(c, processor, cursor.String())
	}

	return nil
}
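// A Processor implementation sketch. The interface isn't shown above, so its
// shape is inferred from how process uses it: Start returns the query plus the
// entity slot for it.Next (nil for keys-only), Process is called per key and
// Complete once at the end. Exported fields matter because the delayed
// continuation via processFunc.Call gob-encodes the processor.
type touchProcessor struct {
	Count int
}

func (p *touchProcessor) Start(c context.Context) (*datastore.Query, interface{}) {
	// keys-only saves read cost; return nil as the entity slot to match
	return datastore.NewQuery("photo").KeysOnly(), nil
}

func (p *touchProcessor) Process(c context.Context, key *datastore.Key) {
	p.Count++ // e.g. queue a follow-up task per key
}

func (p *touchProcessor) Complete(c context.Context) {
	log.Infof(c, "processed %d entities", p.Count)
}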
func (it *iterator) iterate(c context.Context, mapper *mapper) (bool, error) {
	taskTimeout := time.After(mapper.config.TaskTimeout)
	taskRunning := true

	// if the query defines the specific namespaces to process
	// then we can just process that list directly
	if it.Query.selection == selected {
		for _, namespace := range it.Query.namespaces {
			if err := it.process(c, mapper, namespace); err != nil {
				return false, err
			}
		}
		return true, nil
	}

	q := it.createQuery(c)

	var cursor *datastore.Cursor
	if it.Cursor != "" {
		newCursor, err := datastore.DecodeCursor(it.Cursor)
		if err != nil {
			log.Errorf(c, "get start cursor error %s", err.Error())
			return false, err
		}
		cursor = &newCursor
	}

	// main task loop to repeat datastore query with cursor
	for taskRunning {
		// if cursor is set, start the query at that point
		if cursor != nil {
			q = q.Start(*cursor)
		}

		// limit how long the cursor can run before we requery
		cursorTimeout := time.After(mapper.config.CursorTimeout)

		// datastore cursor context needs to run for the max allowed
		cc, cancel := context.WithTimeout(c, 60*time.Second)
		defer cancel() // the task loop is bounded by taskTimeout, so a few deferred cancels are fine
		t := q.Run(cc)

		// item loop to iterate cursor
	cursorLoop:
		for {
			key, err := t.Next(nil)
			if err == datastore.Done {
				// we reached the end
				return true, nil
			}
			if err != nil {
				log.Errorf(c, "error %s", err.Error())
				return false, err
			}

			namespace := key.StringID()
			if err := it.process(c, mapper, namespace); err != nil {
				return false, err
			}

			select {
			case <-taskTimeout:
				// clearing the flag breaks us out of the task loop but also lets us update the
				// cursor first when we break from the inner cursorLoop
				taskRunning = false
				break cursorLoop
			default:
				select {
				case <-cursorTimeout:
					// this forces a new cursor and query so we don't suffer from datastore timeouts
					break cursorLoop
				default:
					// no timeout so carry on with the current cursor
					continue cursorLoop
				}
			}
		}

		// we need the cursor for where we are up to, whether we are requerying
		// within this task or scheduling a new continuation slice
		newCursor, err := t.Cursor()
		if err != nil {
			log.Errorf(c, "get next cursor error %s", err.Error())
			return false, err
		}
		cursor = &newCursor
		it.Cursor = cursor.String()
	}

	return false, nil
}
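// The two timeouts above come from the mapper configuration, which isn't shown
// here; an illustrative shape (field names match their use above, the example
// values are assumptions):
type mapperConfig struct {
	TaskTimeout   time.Duration // e.g. 9 * time.Minute: budget for one task before a continuation is scheduled
	CursorTimeout time.Duration // e.g. 50 * time.Second: how long to follow one cursor before requerying
}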
func processPhotos(c context.Context, processor PhotoProcessor) error {
	// use the full 10 minutes allowed (assuming a front-end instance type)
	var cancel context.CancelFunc
	c, cancel = context.WithTimeout(c, 10*time.Minute)
	defer cancel()

	r := processor.Start(c)
	log.Debugf(c, "processPhotos from %s to %s cursor %s", r.From.Format(dateFormat), r.To.Format(dateFormat), r.Start)

	// TODO: describe pros & cons of different querying + continuation strategies
	q := datastore.NewQuery("photo")
	q = q.Filter("taken >=", r.From)
	q = q.Filter("taken <", r.To)
	q = q.Order("taken")

	// I use keys-only because it saves on cost - entities come from memcache if possible
	q = q.KeysOnly()

	var cursor *datastore.Cursor
	if r.Start != "" {
		newCursor, err := datastore.DecodeCursor(r.Start)
		if err != nil {
			log.Errorf(c, "get start cursor error %s", err.Error())
			return err
		}
		cursor = &newCursor
	}

	// only one entity is loaded at a time
	p := new(Photo)

	timeout := make(chan bool, 1)
	timer := time.AfterFunc(r.Timeout, func() {
		timeout <- true
	})
	defer timer.Stop()

Loop:
	for {
		// check if we've timed out or whether to keep going
		select {
		case <-timeout:
			break Loop
		default:
		}

		processed := 0
		q = q.Limit(r.Size)
		if cursor != nil {
			q = q.Start(*cursor)
		}
		it := q.Run(c)
		for {
			// if we weren't using keys-only we would load the entity here with
			// key, err := it.Next(p)
			key, err := it.Next(nil)
			if err == datastore.Done {
				break
			}
			if err != nil {
				log.Errorf(c, "get key error %s", err.Error())
				return err
			}

			// load the actual entity from memcache / datastore
			err = nds.Get(c, key, p)
			if err != nil {
				log.Errorf(c, "get photo error %s", err.Error())
				return err
			}

			// call the processor with the entity
			p.ID = key.IntID()
			processor.Process(c, p)
			processed++
		}

		// did we process a full batch? if so, there may be more
		if processed == r.Size {
			newCursor, err := it.Cursor()
			if err != nil {
				log.Errorf(c, "get next cursor error %s", err.Error())
				return err
			}
			cursor = &newCursor
		} else {
			// otherwise we're finished
			cursor = nil
			break
		}
	}

	// let the processor write any aggregation entries / tasks etc...
	processor.Complete(c)

	// if we didn't complete everything then continue from the cursor
	if cursor != nil {
		r.Start = cursor.String()
		return processPhotosFunc.Call(c, processor)
	}

	return nil
}
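// A PhotoProcessor sketch. The interface and its range value aren't defined
// above, so the shape here (From/To/Start/Timeout/Size fields, the hypothetical
// Range name) is inferred from usage. Start returns a pointer the processor
// keeps, so the r.Start update above survives into the delayed continuation,
// and the exported fields gob-encode through processPhotosFunc.Call.
type exposureCounter struct {
	R     *Range
	Total int
}

func (e *exposureCounter) Start(c context.Context) *Range {
	if e.R == nil {
		e.R = &Range{
			From:    time.Date(2015, 1, 1, 0, 0, 0, 0, time.UTC),
			To:      time.Date(2016, 1, 1, 0, 0, 0, 0, time.UTC),
			Size:    100,             // batch size per cursor run
			Timeout: 5 * time.Minute, // headroom within the 10 minute request limit
		}
	}
	return e.R
}

func (e *exposureCounter) Process(c context.Context, p *Photo) {
	e.Total++
}

func (e *exposureCounter) Complete(c context.Context) {
	log.Infof(c, "counted %d photos so far", e.Total)
}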
func (s *shard) iterate(c context.Context, mapper *mapper) (bool, error) {
	// switch namespace
	c, err := appengine.Namespace(c, s.Namespace)
	if err != nil {
		return false, err
	}

	taskTimeout := time.After(mapper.config.TaskTimeout)
	taskRunning := true

	jobOutput, useJobOutput := s.job.JobSpec.(JobOutput)
	if useJobOutput && s.job.Bucket != "" {
		w, err := s.createOutputFile(c)
		if err != nil {
			return false, err
		}
		defer w.Close()
		jobOutput.Output(w)
	}

	q := datastore.NewQuery(s.Query.kind)
	for _, f := range s.Query.filter {
		q = q.Filter(f.FieldName+" "+f.Op.String(), f.Value)
	}

	var cursor *datastore.Cursor
	if s.Cursor != "" {
		newCursor, err := datastore.DecodeCursor(s.Cursor)
		if err != nil {
			log.Errorf(c, "get start cursor error %s", err.Error())
			return false, err
		}
		cursor = &newCursor
	}

	// what we'll load into if doing full entity loads (i.e. not KeysOnly)
	var entity interface{}

	// is full loading implemented?
	jobEntity, useJobEntity := s.job.JobSpec.(JobEntity)
	if useJobEntity {
		entity = jobEntity.Make()
	} else {
		q = q.KeysOnly()
	}

	// main task loop to repeat datastore query with cursor
	for taskRunning {
		// if cursor is set, start the query at that point
		if cursor != nil {
			q = q.Start(*cursor)
		}

		// limit how long the cursor can run before we requery
		cursorTimeout := time.After(mapper.config.CursorTimeout)

		// datastore cursor context needs to run for the max allowed
		cc, cancel := context.WithTimeout(c, 60*time.Second)
		defer cancel() // the task loop is bounded by taskTimeout, so a few deferred cancels are fine
		it := q.Run(cc)

		// item loop to iterate cursor
	cursorLoop:
		for {
			key, err := it.Next(entity)
			if err == datastore.Done {
				// we reached the end
				return true, nil
			}

			// TODO: option to fail or continue on individual errors
			// or add error handling logic to job to give it a chance (?)
			if err != nil {
				log.Errorf(c, "key %v error %v", key, err)
				// return false, err
				continue cursorLoop
			}

			if err := s.job.JobSpec.Next(c, s.Counters, key); err != nil {
				// TODO: instead of failing the entire slice, try to figure
				// out if it's possible to continue from this point or maybe
				// the last cursor position to avoid re-processing entities.
				// NOTE: this would need to truncate any output file being
				// written so entries weren't doubled up, but it may be possible.
				return false, err
			}
			s.Count++

			select {
			case <-taskTimeout:
				// clearing the flag breaks us out of the task loop but also lets us update the
				// cursor first when we break from the inner cursorLoop
				taskRunning = false
				break cursorLoop
			default:
				select {
				case <-cursorTimeout:
					// this forces a new cursor and query so we don't suffer from datastore timeouts
					break cursorLoop
				default:
					// no timeout so carry on with the current cursor
					continue cursorLoop
				}
			}
		}

		// we need the cursor for where we are up to, whether we are requerying
		// within this task or scheduling a new continuation slice
		newCursor, err := it.Cursor()
		if err != nil {
			log.Errorf(c, "get next cursor error %s", err.Error())
			return false, err
		}
		cursor = &newCursor
		s.Cursor = cursor.String()
	}

	return false, nil
}
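// A job sketch against the interfaces the shard consumes. JobEntity, JobOutput
// and JobSpec come from the code above: Make supplies the slot each entity is
// loaded into (otherwise the query runs keys-only) and Next is invoked per
// result. The job body and the Counters.Increment call are illustrative
// assumptions, not the library's actual API.
type countPhotos struct{}

func (j *countPhotos) Make() interface{} {
	// one reused entity slot; it.Next overwrites it on every iteration
	return new(Photo)
}

func (j *countPhotos) Next(c context.Context, counters Counters, key *datastore.Key) error {
	counters.Increment("photos", 1)
	return nil
}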
func (p *MemcacheQueryPlugin) Start(cur datastore.Cursor) {
	p.buf.WriteString(fmt.Sprintf(":!s=%s", cur.String()))
}