Example no. 1
// operateTunnel periodically sends status requests (traffic stats updates)
// to the Psiphon API, and monitors the tunnel for failures:
//
// 1. Overall tunnel failure: the tunnel sends a signal to the ClosedSignal
// channel on keep-alive failure and other transport I/O errors. In case
// of such a failure, the tunnel is marked as failed.
//
// 2. Tunnel port forward failures: the tunnel connection may stay up but
// the client may still fail to establish port forwards due to server load
// and other conditions. After a threshold number of such failures, the
// overall tunnel is marked as failed.
//
// TODO: currently, any connect (dial), read, or write error associated with
// a port forward is counted as a failure. It may be important to differentiate
// between failures due to Psiphon server conditions and failures due to the
// origin/target server (in the latter case, the tunnel is healthy). Here are
// some typical error messages to consider matching against (or ignoring):
//
// - "ssh: rejected: administratively prohibited (open failed)"
//   (this error message is reported in both actual and false cases: when a server
//    is overloaded and has no free ephemeral ports; and when the user mistypes
//    a domain in a browser address bar and name resolution fails)
// - "ssh: rejected: connect failed (Connection timed out)"
// - "write tcp ... broken pipe"
// - "read tcp ... connection reset by peer"
// - "ssh: unexpected packet in response to channel open: <nil>"
//
// Update: the above is superseded by SSH keep alives with timeouts. When a keep
// alive times out, the tunnel is marked as failed. Keep alives are triggered
// periodically, and also immediately in the case of a port forward failure (so
// as to immediately detect a situation such as a device waking up and trying
// to use a dead tunnel). By default, port forward threshold counting does not
// cause a tunnel to be marked as failed, with the conservative assumption that
// a server which responds to an SSH keep alive is fully functional.
//
func (tunnel *Tunnel) operateTunnel(config *Config, tunnelOwner TunnelOwner) {
	defer tunnel.operateWaitGroup.Done()

	// The next status request and ssh keep alive times are picked at random,
	// from a range, to make the resulting traffic less fingerprintable,
	// especially when the tunnel is otherwise idle.
	// Note: not using Tickers since these are not fixed time periods.

	nextStatusRequestPeriod := func() time.Duration {
		return MakeRandomPeriod(
			PSIPHON_API_STATUS_REQUEST_PERIOD_MIN,
			PSIPHON_API_STATUS_REQUEST_PERIOD_MAX)
	}
	nextSshKeepAlivePeriod := func() time.Duration {
		return MakeRandomPeriod(
			TUNNEL_SSH_KEEP_ALIVE_PERIOD_MIN,
			TUNNEL_SSH_KEEP_ALIVE_PERIOD_MAX)
	}

	// TODO: don't initialize if !config.EmitBytesTransferred
	noticeBytesTransferredTicker := time.NewTicker(1 * time.Second)
	if !config.EmitBytesTransferred {
		noticeBytesTransferredTicker.Stop()
	} else {
		defer noticeBytesTransferredTicker.Stop()
	}

	statsTimer := time.NewTimer(nextStatusRequestPeriod())
	defer statsTimer.Stop()

	sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod())
	defer sshKeepAliveTimer.Stop()

	var err error
	for err == nil {
		select {
		case <-noticeBytesTransferredTicker.C:
			sent, received := transferstats.GetBytesTransferredForServer(
				tunnel.serverEntry.IpAddress)
			// Only emit notice when tunnel is not idle.
			if sent > 0 || received > 0 {
				NoticeBytesTransferred(sent, received)
			}

		case <-statsTimer.C:
			sendStats(tunnel)
			statsTimer.Reset(nextStatusRequestPeriod())

		case <-sshKeepAliveTimer.C:
			err = sendSshKeepAlive(tunnel.sshClient, tunnel.conn)
			sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())

		case failures := <-tunnel.portForwardFailures:
			// Note: no mutex on portForwardFailureTotal; only referenced here
			tunnel.portForwardFailureTotal += failures
			NoticeInfo("port forward failures for %s: %d",
				tunnel.serverEntry.IpAddress, tunnel.portForwardFailureTotal)
			if config.PortForwardFailureThreshold > 0 &&
				tunnel.portForwardFailureTotal > config.PortForwardFailureThreshold {
				err = errors.New("tunnel exceeded port forward failure threshold")
			} else {
				// Try an SSH keep alive to check the state of the SSH connection
				// Some port forward failures are due to intermittent conditions
				// on the server, so we don't abort the connection until the threshold
				// is hit. But if we can't make a simple round trip request to the
				// server, we'll immediately abort.
				err = sendSshKeepAlive(tunnel.sshClient, tunnel.conn)
				sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())
			}

		case <-tunnel.shutdownOperateBroadcast:
			// Attempt to send any remaining stats
			sendStats(tunnel)
			NoticeInfo("shutdown operate tunnel")
			return
		}
	}

	if err != nil {
		NoticeAlert("operate tunnel error for %s: %s", tunnel.serverEntry.IpAddress, err)
		tunnelOwner.SignalTunnelFailure(tunnel)
	}
}
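
Both versions rely on a MakeRandomPeriod helper that is not shown above. A minimal sketch of what it might look like, assuming it simply draws a duration uniformly from [min, max] using math/rand (the actual Psiphon implementation may differ, for example in its source of randomness):

// MakeRandomPeriod returns a random duration in [min, max], used to jitter
// timers so that periodic traffic is less fingerprintable. Sketch only:
// assumes max >= min; a production version might prefer a cryptographically
// secure random source.
func MakeRandomPeriod(min, max time.Duration) time.Duration {
	return min + time.Duration(rand.Int63n(int64(max-min)+1))
}

Jitter matters here because both timers drive observable network traffic; fixed periods would give an idle tunnel a regular, recognizable rhythm.
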
// operateTunnel monitors the health of the tunnel and performs
// periodic work.
//
// BytesTransferred and TotalBytesTransferred notices are emitted
// for live reporting and diagnostics reporting, respectively.
//
// Status requests are sent to the Psiphon API to report bytes
// transferred.
//
// Periodic SSH keep alive packets are sent to ensure the underlying
// TCP connection isn't terminated by NAT or other network
// interference, and to test whether it has been terminated while the
// device has been asleep. When a keep alive times out, the tunnel is
// considered failed.
//
// An immediate SSH keep alive "probe" is sent to test the tunnel and
// server responsiveness when a port forward failure is detected: a
// failed dial or failed read/write. This keep alive has a shorter
// timeout.
//
// Note that port forward failures may be due to non-failure conditions.
// For example, when the user inputs an invalid domain name and
// resolution is done by the ssh server; or when trying to connect to a
// non-white-listed port. The error message in these cases is not
// distinguishable from a true server error (a common error message,
// "ssh: rejected: administratively prohibited (open failed)", may be
// returned for these cases but also if the server has run out of
// ephemeral ports, for example).
//
// SSH keep alives are not sent when the tunnel has been recently
// active (not only does tunnel activity obviate the necessity of a keep
// alive, testing has shown that keep alives may time out for "busy"
// tunnels, especially over the meek protocol and in other high latency
// conditions).
//
// "Recently active" is defined has having received payload bytes. Sent
// bytes are not considered as testing has shown bytes may appear to
// send when certain NAT devices have interfered with the tunnel, while
// no bytes are received. In a pathological case, with DNS implemented
// as tunneled UDP, a browser may wait excessively for a domain name to
// resolve, while no new port forward is attempted which would otherwise
// result in a tunnel failure detection.
//
// TODO: change "recently active" to include having received any
// SSH protocol messages from the server, not just user payload?
//
func (tunnel *Tunnel) operateTunnel(config *Config, tunnelOwner TunnelOwner) {
	defer tunnel.operateWaitGroup.Done()

	lastBytesReceivedTime := time.Now()

	lastTotalBytesTransferredTime := time.Now()
	totalSent := int64(0)
	totalReceived := int64(0)

	// Always emit a final NoticeTotalBytesTransferred
	defer func() {
		NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived)
	}()

	noticeBytesTransferredTicker := time.NewTicker(1 * time.Second)
	defer noticeBytesTransferredTicker.Stop()

	// The next status request and ssh keep alive times are picked at random,
	// from a range, to make the resulting traffic less fingerprintable.
	// Note: not using Tickers since these are not fixed time periods.
	nextStatusRequestPeriod := func() time.Duration {
		return MakeRandomPeriod(
			PSIPHON_API_STATUS_REQUEST_PERIOD_MIN,
			PSIPHON_API_STATUS_REQUEST_PERIOD_MAX)
	}

	statsTimer := time.NewTimer(nextStatusRequestPeriod())
	defer statsTimer.Stop()

	nextSshKeepAlivePeriod := func() time.Duration {
		return MakeRandomPeriod(
			TUNNEL_SSH_KEEP_ALIVE_PERIOD_MIN,
			TUNNEL_SSH_KEEP_ALIVE_PERIOD_MAX)
	}

	// TODO: don't initialize timer when config.DisablePeriodicSshKeepAlive is set
	sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod())
	if config.DisablePeriodicSshKeepAlive {
		sshKeepAliveTimer.Stop()
	} else {
		defer sshKeepAliveTimer.Stop()
	}

	// Perform network requests in separate goroutines so as not to block
	// other operations.
	// Note: defers run LIFO, so the signal channels (closed by the later
	// defers below) are closed before Wait() runs.
	requestsWaitGroup := new(sync.WaitGroup)
	defer requestsWaitGroup.Wait()

	requestsWaitGroup.Add(1)
	signalStatusRequest := make(chan struct{})
	defer close(signalStatusRequest)
	go func() {
		defer requestsWaitGroup.Done()
		for range signalStatusRequest {
			sendStats(tunnel)
		}
	}()

	requestsWaitGroup.Add(1)
	signalSshKeepAlive := make(chan time.Duration)
	sshKeepAliveError := make(chan error, 1)
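	// The error channel is buffered (size 1) and written with non-blocking
	// sends: only the first keep alive failure is retained, since one error
	// is enough to fail the tunnel.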
	defer close(signalSshKeepAlive)
	go func() {
		defer requestsWaitGroup.Done()
		for timeout := range signalSshKeepAlive {
			err := sendSshKeepAlive(tunnel.sshClient, tunnel.conn, timeout)
			if err != nil {
				select {
				case sshKeepAliveError <- err:
				default:
				}
			}
		}
	}()

	var err error
	for err == nil {
		select {
		case <-noticeBytesTransferredTicker.C:
			sent, received := transferstats.GetBytesTransferredForServer(
				tunnel.serverEntry.IpAddress)

			if received > 0 {
				lastBytesReceivedTime = time.Now()
			}

			totalSent += sent
			totalReceived += received

			if lastTotalBytesTransferredTime.Add(TOTAL_BYTES_TRANSFERRED_NOTICE_PERIOD).Before(time.Now()) {
				NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived)
				lastTotalBytesTransferredTime = time.Now()
			}

			// Only emit the frequent BytesTransferred notice when tunnel is not idle.
			if config.EmitBytesTransferred && (sent > 0 || received > 0) {
				NoticeBytesTransferred(tunnel.serverEntry.IpAddress, sent, received)
			}

		case <-statsTimer.C:
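			// Non-blocking send: if a status request is already in
			// flight, skip this round rather than queueing behind it.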
			select {
			case signalStatusRequest <- struct{}{}:
			default:
			}
			statsTimer.Reset(nextStatusRequestPeriod())

		case <-sshKeepAliveTimer.C:
			if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PERIODIC_INACTIVE_PERIOD).Before(time.Now()) {
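				// Non-blocking send: if a keep alive is already in
				// flight, don't queue another.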
				select {
				case signalSshKeepAlive <- TUNNEL_SSH_KEEP_ALIVE_PERIODIC_TIMEOUT:
				default:
				}
			}
			sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())

		case <-tunnel.signalPortForwardFailure:
			// Note: no mutex on totalPortForwardFailures; only referenced here
			tunnel.totalPortForwardFailures++
			NoticeInfo("port forward failures for %s: %d",
				tunnel.serverEntry.IpAddress, tunnel.totalPortForwardFailures)

			if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PROBE_INACTIVE_PERIOD).Before(time.Now()) {
				select {
				case signalSshKeepAlive <- TUNNEL_SSH_KEEP_ALIVE_PROBE_TIMEOUT:
				default:
				}
			}
			if !config.DisablePeriodicSshKeepAlive {
				sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod())
			}

		case err = <-sshKeepAliveError:

		case <-tunnel.shutdownOperateBroadcast:
			// Attempt to send any remaining stats
			sendStats(tunnel)
			NoticeInfo("shutdown operate tunnel")
			return
		}
	}

	if err != nil {
		NoticeAlert("operate tunnel error for %s: %s", tunnel.serverEntry.IpAddress, err)
		tunnelOwner.SignalTunnelFailure(tunnel)
	}
}
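
The sendSshKeepAlive helper called above (with a timeout parameter in the second version) is also not shown. A hedged sketch of one way to implement it, assuming golang.org/x/crypto/ssh plus the errors, net, and time packages: send a keepalive@openssh.com global request and race it against a timer, tearing down the connection on timeout so the caller sees a hard failure.

// sendSshKeepAlive performs a round trip request to the server and fails
// if no response arrives within timeout. Sketch only, matching the second
// version's signature: the actual Psiphon implementation may differ (for
// example, it may pad the request payload).
func sendSshKeepAlive(
	sshClient *ssh.Client, conn net.Conn, timeout time.Duration) error {

	// Buffered so that neither the timer callback nor the request
	// goroutine blocks; the loser of the race is simply discarded.
	errChannel := make(chan error, 2)

	afterFunc := time.AfterFunc(timeout, func() {
		errChannel <- errors.New("ssh keep alive timeout")
	})
	defer afterFunc.Stop()

	go func() {
		// keepalive@openssh.com is a standard no-op global request;
		// wantReply=true forces the server to respond.
		_, _, err := sshClient.SendRequest("keepalive@openssh.com", true, nil)
		errChannel <- err
	}()

	err := <-errChannel
	if err != nil {
		// Interrupt a blocked SendRequest, if any, by closing the
		// underlying connection; the tunnel is considered failed anyway.
		sshClient.Close()
		conn.Close()
	}

	return err
}

Closing the connection on timeout converts a hung request into a prompt, detectable failure, which is what the failure-signaling path in operateTunnel expects.
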