func (s *migrationSourceWs) Do(op *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil
		defer s.container.StorageStop()
	}

	// Serialize the container's idmap so the sink can shift ownership on its end.
	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	sources, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []string{}
	if fsErr == nil {
		/* A bit of a special case here: doing lxc launch
		 * host2:c1/snap1 host1:container we're sending a snapshot, but
		 * it ends up as the container on the other end. So, we want to
		 * send it as the main container (i.e. ignore its IsSnapshot()).
		 */
		if len(sources) > 1 {
			for _, snap := range sources {
				if !snap.IsSnapshot() {
					continue
				}

				name := shared.ExtractSnapshotName(snap.Name())
				snapshots = append(snapshots, name)
			}
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:        &myType,
		Criu:      criuType,
		Idmap:     idmaps,
		Snapshots: snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	// TODO: actually fall back on rsync.
	if *header.Fs != myType {
		err := fmt.Errorf("mismatched storage types not supported yet")
		s.sendControl(err)
		return err
	}

	if s.live {
		if header.Criu == nil {
			err := fmt.Errorf("Got no CRIU socket type for live migration")
			s.sendControl(err)
			return err
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			err := fmt.Errorf("Formats other than criu rsync not understood")
			s.sendControl(err)
			return err
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			s.sendControl(err)
			return err
		}
		defer os.RemoveAll(checkpointDir)

		opts := lxc.CheckpointOptions{Stop: true, Directory: checkpointDir, Verbose: true}
		err = s.container.Checkpoint(opts)

		if err2 := CollectCRIULogFile(s.container, checkpointDir, "migration", "dump"); err2 != nil {
			shared.Debugf("Error collecting checkpoint log file %s", err2)
		}

		if err != nil {
			log := GetCRIULogErrors(checkpointDir, "dump")
			err = fmt.Errorf("checkpoint failed:\n%s", log)
			s.sendControl(err)
			return err
		}

		/*
		 * We do this serially right now, but there's really no reason
		 * we have to; since we have separate websockets, we could do
		 * it in parallel if we wanted to. However, assuming we're
		 * network bound, there's really no reason to do these in
		 * parallel. In the future, when we're using p.haul's protocol,
		 * it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	for _, source := range sources {
		shared.Debugf("sending fs object %s", source.Name())
		if err := source.Send(s.fsConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	// TODO: should we add some config here about automatically restarting
	// the container on migration failure? What about the failures above?
	if !*msg.Success {
		return fmt.Errorf("%s", *msg.Message)
	}

	return nil
}
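// The function above is the source half of a symmetric handshake: it sends a
// MigrationHeader, then reads the sink's answer and fails on a filesystem
// type mismatch (the TODO above notes the missing rsync fallback). Below is a
// minimal sketch of the sink half of that exchange; the method name and the
// assumption that the sink shares the container/send/recv members used by the
// source are hypothetical, and the real sink elsewhere in this file does
// considerably more validation.
func (c *migrationSink) negotiateHeader() (*MigrationHeader, error) {
	// The source always sends its header first, even when it is about to
	// report a storage error on the control channel.
	header := MigrationHeader{}
	if err := c.recv(&header); err != nil {
		return nil, err
	}

	// Answer with the storage type we support; the CRIU field is echoed
	// back unchanged since this sketch doesn't negotiate live migration.
	myType := c.container.Storage().MigrationType()
	resp := MigrationHeader{Fs: &myType, Criu: header.Criu}
	if err := c.send(&resp); err != nil {
		return nil, err
	}

	return &header, nil
}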
func (s *migrationSourceWs) Do(op *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil

		err := s.container.StorageStart()
		if err != nil {
			return err
		}
		defer s.container.StorageStop()
	}

	// Serialize the container's idmap so the sink can shift ownership on its end.
	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	driver, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []string{}
	if fsErr == nil {
		fullSnaps := driver.Snapshots()
		for _, snap := range fullSnaps {
			snapshots = append(snapshots, shared.ExtractSnapshotName(snap.Name()))
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:        &myType,
		Criu:      criuType,
		Idmap:     idmaps,
		Snapshots: snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if *header.Fs != myType {
		// The sink doesn't speak our storage type; fall back to rsync,
		// which works on any backend.
		myType = MigrationFSType_RSYNC
		header.Fs = &myType

		driver, _ = rsyncMigrationSource(s.container)
	}

	defer driver.Cleanup()

	if err := driver.SendWhileRunning(s.fsConn); err != nil {
		s.sendControl(err)
		return err
	}

	if s.live {
		if header.Criu == nil {
			err := fmt.Errorf("Got no CRIU socket type for live migration")
			s.sendControl(err)
			return err
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			err := fmt.Errorf("Formats other than criu rsync not understood")
			s.sendControl(err)
			return err
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			s.sendControl(err)
			return err
		}
		defer os.RemoveAll(checkpointDir)

		opts := lxc.CheckpointOptions{Stop: true, Directory: checkpointDir, Verbose: true}
		err = s.container.Checkpoint(opts)

		if err2 := CollectCRIULogFile(s.container, checkpointDir, "migration", "dump"); err2 != nil {
			shared.Debugf("Error collecting checkpoint log file %s", err2)
		}

		if err != nil {
			log, err2 := GetCRIULogErrors(checkpointDir, "dump")

			/* couldn't find the CRIU log file, which means we
			 * didn't even get that far; give back the liblxc
			 * error.
			 */
			if err2 != nil {
				log = err.Error()
			}

			err = fmt.Errorf("checkpoint failed:\n%s", log)
			s.sendControl(err)
			return err
		}

		/*
		 * We do this serially right now, but there's really no reason
		 * we have to; since we have separate websockets, we could do
		 * it in parallel if we wanted to. However, assuming we're
		 * network bound, there's really no reason to do these in
		 * parallel. In the future, when we're using p.haul's protocol,
		 * it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn); err != nil {
			s.sendControl(err)
			return err
		}

		if err := driver.SendAfterCheckpoint(s.fsConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	// TODO: should we add some config here about automatically restarting
	// the container on migration failure? What about the failures above?
	if !*msg.Success {
		return fmt.Errorf("%s", *msg.Message)
	}

	return nil
}
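// The version above replaces the list of fs objects with a single driver
// returned by MigrationSource, used through exactly four calls: Snapshots,
// SendWhileRunning, SendAfterCheckpoint, and Cleanup. Below is a sketch of
// the interface those call sites imply; the element type (container) and the
// connection type (*websocket.Conn) are assumptions based on how the values
// are used above, not a definitive copy of the real interface, and
// rsyncMigrationSource would be the type-agnostic fallback implementation.
type migrationSourceDriverSketch interface {
	// Snapshots lists the container's snapshots so their names can be
	// advertised in the MigrationHeader.
	Snapshots() []container

	// SendWhileRunning transfers everything that can be copied while the
	// container is still running (the bulk of the filesystem).
	SendWhileRunning(conn *websocket.Conn) error

	// SendAfterCheckpoint transfers the delta that accumulated between
	// the initial copy and the CRIU dump, keeping downtime short.
	SendAfterCheckpoint(conn *websocket.Conn) error

	// Cleanup releases temporary state (snapshots, mounts) on both
	// success and failure paths.
	Cleanup()
}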
func (s *migrationSourceWs) Do(migrateOp *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil

		err := s.container.StorageStart()
		if err != nil {
			return err
		}
		defer s.container.StorageStop()
	}

	// Serialize the container's idmap so the sink can shift ownership on its end.
	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	driver, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []*Snapshot{}
	snapshotNames := []string{}
	if fsErr == nil {
		fullSnaps := driver.Snapshots()
		for _, snap := range fullSnaps {
			snapshots = append(snapshots, snapshotToProtobuf(snap))
			snapshotNames = append(snapshotNames, shared.ExtractSnapshotName(snap.Name()))
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:            &myType,
		Criu:          criuType,
		Idmap:         idmaps,
		SnapshotNames: snapshotNames,
		Snapshots:     snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if *header.Fs != myType {
		// The sink doesn't speak our storage type; fall back to rsync,
		// which works on any backend.
		myType = MigrationFSType_RSYNC
		header.Fs = &myType

		driver, _ = rsyncMigrationSource(s.container)
	}

	// All failure paths need to do a few things to correctly handle errors
	// before returning. Unfortunately, handling errors is not well-suited
	// to defer, as the code depends on the status of driver and the error
	// value. The error value is especially tricky due to the common case
	// of creating a new err variable (intentional or not) due to scoping
	// and use of ":=". Capturing err in a closure for use in defer would
	// be fragile, which defeats the purpose of using defer. An abort
	// function reduces the odds of mishandling errors without introducing
	// the fragility of closing over err.
	abort := func(err error) error {
		driver.Cleanup()
		s.sendControl(err)
		return err
	}

	if err := driver.SendWhileRunning(s.fsConn, migrateOp); err != nil {
		return abort(err)
	}

	restoreSuccess := make(chan bool, 1)
	dumpSuccess := make(chan error, 1)
	if s.live {
		if header.Criu == nil {
			return abort(fmt.Errorf("Got no CRIU socket type for live migration"))
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			return abort(fmt.Errorf("Formats other than criu rsync not understood"))
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			return abort(err)
		}

		if lxc.VersionAtLeast(2, 0, 4) {
			/* What happens below is slightly convoluted. Due to various
			 * complications with networking, there's no easy way for criu
			 * to exit and leave the container in a frozen state for us to
			 * somehow resume later.
			 *
			 * Instead, we use what criu calls an "action-script", which is
			 * basically a callback that lets us know when the dump is
			 * done. (Unfortunately, we can't pass arguments, just an
			 * executable path, so we write a custom action script with the
			 * real command we want to run.)
			 *
			 * This script then hangs until the migration operation either
			 * finishes successfully or fails, and exits 1 or 0, which
			 * causes criu to either leave the container running or kill it
			 * as we asked.
			 */
			dumpDone := make(chan bool, 1)
			actionScriptOpSecret, err := shared.RandomCryptoString()
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			actionScriptOp, err := operationCreate(
				operationClassWebsocket,
				nil,
				nil,
				func(op *operation) error {
					result := <-restoreSuccess
					if !result {
						return fmt.Errorf("restore failed, failing CRIU")
					}

					return nil
				},
				nil,
				func(op *operation, r *http.Request, w http.ResponseWriter) error {
					secret := r.FormValue("secret")
					if secret == "" {
						return fmt.Errorf("missing secret")
					}

					if secret != actionScriptOpSecret {
						return os.ErrPermission
					}

					c, err := shared.WebsocketUpgrader.Upgrade(w, r, nil)
					if err != nil {
						return err
					}

					dumpDone <- true

					closeMsg := websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
					return c.WriteMessage(websocket.CloseMessage, closeMsg)
				},
			)
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			if err := writeActionScript(checkpointDir, actionScriptOp.url, actionScriptOpSecret); err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			_, err = actionScriptOp.Run()
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			go func() {
				dumpSuccess <- s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, true)
				os.RemoveAll(checkpointDir)
			}()

			select {
			/* the checkpoint failed, let's just abort */
			case err = <-dumpSuccess:
				return abort(err)
			/* the dump finished, let's continue on to the restore */
			case <-dumpDone:
				shared.LogDebugf("Dump finished, continuing with restore...")
			}
		} else {
			defer os.RemoveAll(checkpointDir)

			if err := s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, false); err != nil {
				return abort(err)
			}

			// The dump already completed synchronously; feed the
			// channel so the receive at the end of the live path
			// doesn't block forever on old liblxc.
			dumpSuccess <- nil
		}

		/*
		 * We do this serially right now, but there's really no reason
		 * we have to; since we have separate websockets, we could do
		 * it in parallel if we wanted to. However, assuming we're
		 * network bound, there's really no reason to do these in
		 * parallel. In the future, when we're using p.haul's protocol,
		 * it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn, nil); err != nil {
			return abort(err)
		}

		if err := driver.SendAfterCheckpoint(s.fsConn); err != nil {
			return abort(err)
		}
	}

	driver.Cleanup()

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	if s.live {
		restoreSuccess <- *msg.Success
		err := <-dumpSuccess
		if err != nil {
			shared.LogErrorf("dump failed after successful restore?: %q", err)
		}
	}

	if !*msg.Success {
		return fmt.Errorf("%s", *msg.Message)
	}

	return nil
}
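// The comment in the version above describes the action-script contract with
// criu but not the script itself. Below is a sketch of what writeActionScript
// might generate, under stated assumptions: criu exporting the firing hook via
// the CRTOOLS_SCRIPT_ACTION environment variable is criu's documented
// behavior, while the "lxd migratedumpsuccess" subcommand, the action.sh file
// name, and this function's signature are hypothetical stand-ins for however
// the real helper blocks on the operation websocket. Assumes fmt, os, and
// path/filepath are imported.
func writeActionScriptSketch(checkpointDir string, opURL string, secret string) error {
	// On the post-dump hook, hand control to a helper that connects to the
	// action-script operation's websocket with the secret and exits 0 only
	// once the restore has succeeded; criu then kills or revives the
	// container based on our exit code.
	script := fmt.Sprintf(`#!/bin/sh -e
if [ "$CRTOOLS_SCRIPT_ACTION" = "post-dump" ]; then
	exec lxd migratedumpsuccess %s %s
fi
`, opURL, secret)

	f, err := os.Create(filepath.Join(checkpointDir, "action.sh"))
	if err != nil {
		return err
	}
	defer f.Close()

	// criu executes the script directly, so it must be executable.
	if err := f.Chmod(0500); err != nil {
		return err
	}

	_, err = f.WriteString(script)
	return err
}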