Beispiel #1
0
func (s *migrationSourceWs) Do(op *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil
		defer s.container.StorageStop()
	}

	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	sources, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []string{}
	if fsErr == nil {
		/* A bit of a special case here: doing lxc launch
		 * host2:c1/snap1 host1:container we're sending a snapshot, but
		 * it ends up as the container on the other end. So, we want to
		 * send it as the main container (i.e. ignore its IsSnapshot()).
		 */
		if len(sources) > 1 {
			for _, snap := range sources {
				if !snap.IsSnapshot() {
					continue
				}
				name := shared.ExtractSnapshotName(snap.Name())
				snapshots = append(snapshots, name)
			}
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:        &myType,
		Criu:      criuType,
		Idmap:     idmaps,
		Snapshots: snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	// TODO: actually fall back on rsync.
	if *header.Fs != myType {
		err := fmt.Errorf("mismatched storage types not supported yet")
		s.sendControl(err)
		return err
	}

	if s.live {
		if header.Criu == nil {
			err := fmt.Errorf("Got no CRIU socket type for live migration")
			s.sendControl(err)
			return err
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			err := fmt.Errorf("Formats other than criu rsync not understood")
			s.sendControl(err)
			return err
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			s.sendControl(err)
			return err
		}
		defer os.RemoveAll(checkpointDir)

		opts := lxc.CheckpointOptions{Stop: true, Directory: checkpointDir, Verbose: true}
		err = s.container.Checkpoint(opts)

		if err2 := CollectCRIULogFile(s.container, checkpointDir, "migration", "dump"); err2 != nil {
			shared.Debugf("Error collecting checkpoint log file %s", err)
		}

		if err != nil {
			log := GetCRIULogErrors(checkpointDir, "dump")

			err = fmt.Errorf("checkpoint failed:\n%s", log)
			s.sendControl(err)
			return err
		}

		/*
		 * We do the serially right now, but there's really no reason for us
		 * to; since we have separate websockets, we can do it in parallel if
		 * we wanted to. However, assuming we're network bound, there's really
		 * no reason to do these in parallel. In the future when we're using
		 * p.haul's protocol, it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	for _, source := range sources {
		shared.Debugf("sending fs object %s", source.Name())
		if err := source.Send(s.fsConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	// TODO: should we add some config here about automatically restarting
	// the container migrate failure? What about the failures above?
	if !*msg.Success {
		return fmt.Errorf(*msg.Message)
	}

	return nil
}
Beispiel #2
0
func (s *migrationSourceWs) Do(op *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil

		err := s.container.StorageStart()
		if err != nil {
			return err
		}

		defer s.container.StorageStop()
	}

	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	driver, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []string{}
	if fsErr == nil {
		fullSnaps := driver.Snapshots()
		for _, snap := range fullSnaps {
			snapshots = append(snapshots, shared.ExtractSnapshotName(snap.Name()))
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:        &myType,
		Criu:      criuType,
		Idmap:     idmaps,
		Snapshots: snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if *header.Fs != myType {
		myType = MigrationFSType_RSYNC
		header.Fs = &myType

		driver, _ = rsyncMigrationSource(s.container)
	}

	defer driver.Cleanup()

	if err := driver.SendWhileRunning(s.fsConn); err != nil {
		s.sendControl(err)
		return err
	}

	if s.live {
		if header.Criu == nil {
			err := fmt.Errorf("Got no CRIU socket type for live migration")
			s.sendControl(err)
			return err
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			err := fmt.Errorf("Formats other than criu rsync not understood")
			s.sendControl(err)
			return err
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			s.sendControl(err)
			return err
		}
		defer os.RemoveAll(checkpointDir)

		opts := lxc.CheckpointOptions{Stop: true, Directory: checkpointDir, Verbose: true}
		err = s.container.Checkpoint(opts)

		if err2 := CollectCRIULogFile(s.container, checkpointDir, "migration", "dump"); err2 != nil {
			shared.Debugf("Error collecting checkpoint log file %s", err)
		}

		if err != nil {
			log, err2 := GetCRIULogErrors(checkpointDir, "dump")

			/* couldn't find the CRIU log file which means we
			 * didn't even get that far; give back the liblxc
			 * error. */
			if err2 != nil {
				log = err.Error()
			}

			err = fmt.Errorf("checkpoint failed:\n%s", log)
			s.sendControl(err)
			return err
		}

		/*
		 * We do the serially right now, but there's really no reason for us
		 * to; since we have separate websockets, we can do it in parallel if
		 * we wanted to. However, assuming we're network bound, there's really
		 * no reason to do these in parallel. In the future when we're using
		 * p.haul's protocol, it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn); err != nil {
			s.sendControl(err)
			return err
		}

		if err := driver.SendAfterCheckpoint(s.fsConn); err != nil {
			s.sendControl(err)
			return err
		}
	}

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	// TODO: should we add some config here about automatically restarting
	// the container migrate failure? What about the failures above?
	if !*msg.Success {
		return fmt.Errorf(*msg.Message)
	}

	return nil
}
Beispiel #3
0
func (s *migrationSourceWs) Do(migrateOp *operation) error {
	<-s.allConnected

	criuType := CRIUType_CRIU_RSYNC.Enum()
	if !s.live {
		criuType = nil

		err := s.container.StorageStart()
		if err != nil {
			return err
		}

		defer s.container.StorageStop()
	}

	idmaps := make([]*IDMapType, 0)

	idmapset := s.container.IdmapSet()
	if idmapset != nil {
		for _, ctnIdmap := range idmapset.Idmap {
			idmap := IDMapType{
				Isuid:    proto.Bool(ctnIdmap.Isuid),
				Isgid:    proto.Bool(ctnIdmap.Isgid),
				Hostid:   proto.Int(ctnIdmap.Hostid),
				Nsid:     proto.Int(ctnIdmap.Nsid),
				Maprange: proto.Int(ctnIdmap.Maprange),
			}

			idmaps = append(idmaps, &idmap)
		}
	}

	driver, fsErr := s.container.Storage().MigrationSource(s.container)
	/* the protocol says we have to send a header no matter what, so let's
	 * do that, but then immediately send an error.
	 */
	snapshots := []*Snapshot{}
	snapshotNames := []string{}
	if fsErr == nil {
		fullSnaps := driver.Snapshots()
		for _, snap := range fullSnaps {
			snapshots = append(snapshots, snapshotToProtobuf(snap))
			snapshotNames = append(snapshotNames, shared.ExtractSnapshotName(snap.Name()))
		}
	}

	myType := s.container.Storage().MigrationType()
	header := MigrationHeader{
		Fs:            &myType,
		Criu:          criuType,
		Idmap:         idmaps,
		SnapshotNames: snapshotNames,
		Snapshots:     snapshots,
	}

	if err := s.send(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if fsErr != nil {
		s.sendControl(fsErr)
		return fsErr
	}

	if err := s.recv(&header); err != nil {
		s.sendControl(err)
		return err
	}

	if *header.Fs != myType {
		myType = MigrationFSType_RSYNC
		header.Fs = &myType

		driver, _ = rsyncMigrationSource(s.container)
	}

	// All failure paths need to do a few things to correctly handle errors before returning.
	// Unfortunately, handling errors is not well-suited to defer as the code depends on the
	// status of driver and the error value.  The error value is especially tricky due to the
	// common case of creating a new err variable (intentional or not) due to scoping and use
	// of ":=".  Capturing err in a closure for use in defer would be fragile, which defeats
	// the purpose of using defer.  An abort function reduces the odds of mishandling errors
	// without introducing the fragility of closing on err.
	abort := func(err error) error {
		driver.Cleanup()
		s.sendControl(err)
		return err
	}

	if err := driver.SendWhileRunning(s.fsConn, migrateOp); err != nil {
		return abort(err)
	}

	restoreSuccess := make(chan bool, 1)
	dumpSuccess := make(chan error, 1)
	if s.live {
		if header.Criu == nil {
			return abort(fmt.Errorf("Got no CRIU socket type for live migration"))
		} else if *header.Criu != CRIUType_CRIU_RSYNC {
			return abort(fmt.Errorf("Formats other than criu rsync not understood"))
		}

		checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_")
		if err != nil {
			return abort(err)
		}

		if lxc.VersionAtLeast(2, 0, 4) {
			/* What happens below is slightly convoluted. Due to various
			 * complications with networking, there's no easy way for criu
			 * to exit and leave the container in a frozen state for us to
			 * somehow resume later.
			 *
			 * Instead, we use what criu calls an "action-script", which is
			 * basically a callback that lets us know when the dump is
			 * done. (Unfortunately, we can't pass arguments, just an
			 * executable path, so we write a custom action script with the
			 * real command we want to run.)
			 *
			 * This script then hangs until the migration operation either
			 * finishes successfully or fails, and exits 1 or 0, which
			 * causes criu to either leave the container running or kill it
			 * as we asked.
			 */
			dumpDone := make(chan bool, 1)
			actionScriptOpSecret, err := shared.RandomCryptoString()
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			actionScriptOp, err := operationCreate(
				operationClassWebsocket,
				nil,
				nil,
				func(op *operation) error {
					result := <-restoreSuccess
					if !result {
						return fmt.Errorf("restore failed, failing CRIU")
					}
					return nil
				},
				nil,
				func(op *operation, r *http.Request, w http.ResponseWriter) error {
					secret := r.FormValue("secret")
					if secret == "" {
						return fmt.Errorf("missing secret")
					}

					if secret != actionScriptOpSecret {
						return os.ErrPermission
					}

					c, err := shared.WebsocketUpgrader.Upgrade(w, r, nil)
					if err != nil {
						return err
					}

					dumpDone <- true

					closeMsg := websocket.FormatCloseMessage(websocket.CloseNormalClosure, "")
					return c.WriteMessage(websocket.CloseMessage, closeMsg)
				},
			)
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			if err := writeActionScript(checkpointDir, actionScriptOp.url, actionScriptOpSecret); err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			_, err = actionScriptOp.Run()
			if err != nil {
				os.RemoveAll(checkpointDir)
				return abort(err)
			}

			go func() {
				dumpSuccess <- s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, true)
				os.RemoveAll(checkpointDir)
			}()

			select {
			/* the checkpoint failed, let's just abort */
			case err = <-dumpSuccess:
				return abort(err)
			/* the dump finished, let's continue on to the restore */
			case <-dumpDone:
				shared.LogDebugf("Dump finished, continuing with restore...")
			}
		} else {
			defer os.RemoveAll(checkpointDir)
			if err := s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, false); err != nil {
				return abort(err)
			}
		}

		/*
		 * We do the serially right now, but there's really no reason for us
		 * to; since we have separate websockets, we can do it in parallel if
		 * we wanted to. However, assuming we're network bound, there's really
		 * no reason to do these in parallel. In the future when we're using
		 * p.haul's protocol, it will make sense to do these in parallel.
		 */
		if err := RsyncSend(shared.AddSlash(checkpointDir), s.criuConn, nil); err != nil {
			return abort(err)
		}

		if err := driver.SendAfterCheckpoint(s.fsConn); err != nil {
			return abort(err)
		}
	}

	driver.Cleanup()

	msg := MigrationControl{}
	if err := s.recv(&msg); err != nil {
		s.disconnect()
		return err
	}

	if s.live {
		restoreSuccess <- *msg.Success
		err := <-dumpSuccess
		if err != nil {
			shared.LogErrorf("dump failed after successful restore?: %q", err)
		}
	}

	if !*msg.Success {
		return fmt.Errorf(*msg.Message)
	}

	return nil
}