// operateTunnel monitors the health of the tunnel and performs // periodic work. // // BytesTransferred and TotalBytesTransferred notices are emitted // for live reporting and diagnostics reporting, respectively. // // Status requests are sent to the Psiphon API to report bytes // transferred. // // Periodic SSH keep alive packets are sent to ensure the underlying // TCP connection isn't terminated by NAT, or other network // interference -- or test if it has been terminated while the device // has been asleep. When a keep alive times out, the tunnel is // considered failed. // // An immediate SSH keep alive "probe" is sent to test the tunnel and // server responsiveness when a port forward failure is detected: a // failed dial or failed read/write. This keep alive has a shorter // timeout. // // Note that port foward failures may be due to non-failure conditions. // For example, when the user inputs an invalid domain name and // resolution is done by the ssh server; or trying to connect to a // non-white-listed port; and the error message in these cases is not // distinguishable from a a true server error (a common error message, // "ssh: rejected: administratively prohibited (open failed)", may be // returned for these cases but also if the server has run out of // ephemeral ports, for example). // // SSH keep alives are not sent when the tunnel has been recently // active (not only does tunnel activity obviate the necessity of a keep // alive, testing has shown that keep alives may time out for "busy" // tunnels, especially over meek protocol and other high latency // conditions). // // "Recently active" is defined has having received payload bytes. Sent // bytes are not considered as testing has shown bytes may appear to // send when certain NAT devices have interfered with the tunnel, while // no bytes are received. In a pathological case, with DNS implemented // as tunneled UDP, a browser may wait excessively for a domain name to // resolve, while no new port forward is attempted which would otherwise // result in a tunnel failure detection. // // TODO: change "recently active" to include having received any // SSH protocol messages from the server, not just user payload? // func (tunnel *Tunnel) operateTunnel(tunnelOwner TunnelOwner) { defer tunnel.operateWaitGroup.Done() lastBytesReceivedTime := time.Now() lastTotalBytesTransferedTime := time.Now() totalSent := int64(0) totalReceived := int64(0) noticeBytesTransferredTicker := time.NewTicker(1 * time.Second) defer noticeBytesTransferredTicker.Stop() // The next status request and ssh keep alive times are picked at random, // from a range, to make the resulting traffic less fingerprintable, // Note: not using Tickers since these are not fixed time periods. nextStatusRequestPeriod := func() time.Duration { return MakeRandomPeriod( PSIPHON_API_STATUS_REQUEST_PERIOD_MIN, PSIPHON_API_STATUS_REQUEST_PERIOD_MAX) } statsTimer := time.NewTimer(nextStatusRequestPeriod()) defer statsTimer.Stop() // Schedule an immediate status request to deliver any unreported // tunnel stats. // Note: this may not be effective when there's an outstanding // asynchronous untunneled final status request is holding the // tunnel stats records. It may also conflict with other // tunnel candidates which attempt to send an immediate request // before being discarded. For now, we mitigate this with a short, // random delay. unreported := CountUnreportedTunnelStats() if unreported > 0 { NoticeInfo("Unreported tunnel stats: %d", unreported) statsTimer.Reset(MakeRandomPeriod( PSIPHON_API_STATUS_REQUEST_SHORT_PERIOD_MIN, PSIPHON_API_STATUS_REQUEST_SHORT_PERIOD_MAX)) } nextSshKeepAlivePeriod := func() time.Duration { return MakeRandomPeriod( TUNNEL_SSH_KEEP_ALIVE_PERIOD_MIN, TUNNEL_SSH_KEEP_ALIVE_PERIOD_MAX) } // TODO: don't initialize timer when config.DisablePeriodicSshKeepAlive is set sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod()) if tunnel.config.DisablePeriodicSshKeepAlive { sshKeepAliveTimer.Stop() } else { defer sshKeepAliveTimer.Stop() } // Perform network requests in separate goroutines so as not to block // other operations. requestsWaitGroup := new(sync.WaitGroup) requestsWaitGroup.Add(1) signalStatusRequest := make(chan struct{}) go func() { defer requestsWaitGroup.Done() for _ = range signalStatusRequest { sendStats(tunnel) } }() requestsWaitGroup.Add(1) signalSshKeepAlive := make(chan time.Duration) sshKeepAliveError := make(chan error, 1) go func() { defer requestsWaitGroup.Done() for timeout := range signalSshKeepAlive { err := sendSshKeepAlive(tunnel.sshClient, tunnel.conn, timeout) if err != nil { select { case sshKeepAliveError <- err: default: } } } }() shutdown := false var err error for !shutdown && err == nil { select { case <-noticeBytesTransferredTicker.C: sent, received := transferstats.ReportRecentBytesTransferredForServer( tunnel.serverEntry.IpAddress) if received > 0 { lastBytesReceivedTime = time.Now() } totalSent += sent totalReceived += received if lastTotalBytesTransferedTime.Add(TOTAL_BYTES_TRANSFERRED_NOTICE_PERIOD).Before(time.Now()) { NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived) lastTotalBytesTransferedTime = time.Now() } // Only emit the frequent BytesTransferred notice when tunnel is not idle. if tunnel.config.EmitBytesTransferred && (sent > 0 || received > 0) { NoticeBytesTransferred(tunnel.serverEntry.IpAddress, sent, received) } case <-statsTimer.C: select { case signalStatusRequest <- *new(struct{}): default: } statsTimer.Reset(nextStatusRequestPeriod()) case <-sshKeepAliveTimer.C: if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PERIODIC_INACTIVE_PERIOD).Before(time.Now()) { select { case signalSshKeepAlive <- time.Duration(*tunnel.config.TunnelSshKeepAlivePeriodicTimeoutSeconds) * time.Second: default: } } sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) case <-tunnel.signalPortForwardFailure: // Note: no mutex on portForwardFailureTotal; only referenced here tunnel.totalPortForwardFailures++ NoticeInfo("port forward failures for %s: %d", tunnel.serverEntry.IpAddress, tunnel.totalPortForwardFailures) if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PROBE_INACTIVE_PERIOD).Before(time.Now()) { select { case signalSshKeepAlive <- time.Duration(*tunnel.config.TunnelSshKeepAliveProbeTimeoutSeconds) * time.Second: default: } } if !tunnel.config.DisablePeriodicSshKeepAlive { sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) } case err = <-sshKeepAliveError: case <-tunnel.shutdownOperateBroadcast: shutdown = true } } close(signalSshKeepAlive) close(signalStatusRequest) requestsWaitGroup.Wait() // Capture bytes transferred since the last noticeBytesTransferredTicker tick sent, received := transferstats.ReportRecentBytesTransferredForServer(tunnel.serverEntry.IpAddress) totalSent += sent totalReceived += received // Always emit a final NoticeTotalBytesTransferred NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived) // The stats for this tunnel will be reported via the next successful // status request. // Note: Since client clocks are unreliable, we use the server's reported // timestamp in the handshake response as the tunnel start time. This time // will be slightly earlier than the actual tunnel activation time, as the // client has to receive and parse the response and activate the tunnel. if !tunnel.IsDiscarded() { err := RecordTunnelStats( tunnel.serverContext.sessionId, tunnel.serverContext.tunnelNumber, tunnel.serverEntry.IpAddress, tunnel.serverContext.serverHandshakeTimestamp, fmt.Sprintf("%d", time.Now().Sub(tunnel.startTime)), totalSent, totalReceived) if err != nil { NoticeAlert("RecordTunnelStats failed: %s", ContextError(err)) } } // Final status request notes: // // It's highly desirable to send a final status request in order to report // domain bytes transferred stats as well as to report tunnel stats as // soon as possible. For this reason, we attempt untunneled requests when // the tunneled request isn't possible or has failed. // // In an orderly shutdown (err == nil), the Controller is stopping and // everything must be wrapped up quickly. Also, we still have a working // tunnel. So we first attempt a tunneled status request (with a short // timeout) and then attempt, synchronously -- otherwise the Contoller's // runWaitGroup.Wait() will return while a request is still in progress // -- untunneled requests (also with short timeouts). Note that in this // case the untunneled request will opt out of untunneledPendingConns so // that it's not inadvertently canceled by the Controller shutdown // sequence (see doUntunneledStatusRequest). // // If the tunnel has failed, the Controller may continue working. We want // to re-establish as soon as possible (so don't want to block on status // requests, even for a second). We may have a long time to attempt // untunneled requests in the background. And there is no tunnel through // which to attempt tunneled requests. So we spawn a goroutine to run the // untunneled requests, which are allowed a longer timeout. These requests // will be interrupted by the Controller's untunneledPendingConns.CloseAll() // in the case of a shutdown. if err == nil { NoticeInfo("shutdown operate tunnel") if !sendStats(tunnel) { sendUntunneledStats(tunnel, true) } } else { NoticeAlert("operate tunnel error for %s: %s", tunnel.serverEntry.IpAddress, err) go sendUntunneledStats(tunnel, false) tunnelOwner.SignalTunnelFailure(tunnel) } }
// operateTunnel monitors the health of the tunnel and performs // periodic work. // // BytesTransferred and TotalBytesTransferred notices are emitted // for live reporting and diagnostics reporting, respectively. // // Status requests are sent to the Psiphon API to report bytes // transferred. // // Periodic SSH keep alive packets are sent to ensure the underlying // TCP connection isn't terminated by NAT, or other network // interference -- or test if it has been terminated while the device // has been asleep. When a keep alive times out, the tunnel is // considered failed. // // An immediate SSH keep alive "probe" is sent to test the tunnel and // server responsiveness when a port forward failure is detected: a // failed dial or failed read/write. This keep alive has a shorter // timeout. // // Note that port foward failures may be due to non-failure conditions. // For example, when the user inputs an invalid domain name and // resolution is done by the ssh server; or trying to connect to a // non-white-listed port; and the error message in these cases is not // distinguishable from a a true server error (a common error message, // "ssh: rejected: administratively prohibited (open failed)", may be // returned for these cases but also if the server has run out of // ephemeral ports, for example). // // SSH keep alives are not sent when the tunnel has been recently // active (not only does tunnel activity obviate the necessity of a keep // alive, testing has shown that keep alives may time out for "busy" // tunnels, especially over meek protocol and other high latency // conditions). // // "Recently active" is defined has having received payload bytes. Sent // bytes are not considered as testing has shown bytes may appear to // send when certain NAT devices have interfered with the tunnel, while // no bytes are received. In a pathological case, with DNS implemented // as tunneled UDP, a browser may wait excessively for a domain name to // resolve, while no new port forward is attempted which would otherwise // result in a tunnel failure detection. // // TODO: change "recently active" to include having received any // SSH protocol messages from the server, not just user payload? // func (tunnel *Tunnel) operateTunnel(tunnelOwner TunnelOwner) { defer tunnel.operateWaitGroup.Done() lastBytesReceivedTime := monotime.Now() lastTotalBytesTransferedTime := monotime.Now() totalSent := int64(0) totalReceived := int64(0) noticeBytesTransferredTicker := time.NewTicker(1 * time.Second) defer noticeBytesTransferredTicker.Stop() // The next status request and ssh keep alive times are picked at random, // from a range, to make the resulting traffic less fingerprintable, // Note: not using Tickers since these are not fixed time periods. nextStatusRequestPeriod := func() time.Duration { return makeRandomPeriod( PSIPHON_API_STATUS_REQUEST_PERIOD_MIN, PSIPHON_API_STATUS_REQUEST_PERIOD_MAX) } statsTimer := time.NewTimer(nextStatusRequestPeriod()) defer statsTimer.Stop() // Schedule an immediate status request to deliver any unreported // persistent stats. // Note: this may not be effective when there's an outstanding // asynchronous untunneled final status request is holding the // persistent stats records. It may also conflict with other // tunnel candidates which attempt to send an immediate request // before being discarded. For now, we mitigate this with a short, // random delay. unreported := CountUnreportedPersistentStats() if unreported > 0 { NoticeInfo("Unreported persistent stats: %d", unreported) statsTimer.Reset(makeRandomPeriod( PSIPHON_API_STATUS_REQUEST_SHORT_PERIOD_MIN, PSIPHON_API_STATUS_REQUEST_SHORT_PERIOD_MAX)) } nextSshKeepAlivePeriod := func() time.Duration { return makeRandomPeriod( TUNNEL_SSH_KEEP_ALIVE_PERIOD_MIN, TUNNEL_SSH_KEEP_ALIVE_PERIOD_MAX) } // TODO: don't initialize timer when config.DisablePeriodicSshKeepAlive is set sshKeepAliveTimer := time.NewTimer(nextSshKeepAlivePeriod()) if tunnel.config.DisablePeriodicSshKeepAlive { sshKeepAliveTimer.Stop() } else { defer sshKeepAliveTimer.Stop() } // Perform network requests in separate goroutines so as not to block // other operations. requestsWaitGroup := new(sync.WaitGroup) requestsWaitGroup.Add(1) signalStatusRequest := make(chan struct{}) go func() { defer requestsWaitGroup.Done() for _ = range signalStatusRequest { sendStats(tunnel) } }() requestsWaitGroup.Add(1) signalSshKeepAlive := make(chan time.Duration) sshKeepAliveError := make(chan error, 1) go func() { defer requestsWaitGroup.Done() for timeout := range signalSshKeepAlive { err := sendSshKeepAlive(tunnel.sshClient, tunnel.conn, timeout) if err != nil { select { case sshKeepAliveError <- err: default: } } } }() requestsWaitGroup.Add(1) signalStopClientVerificationRequests := make(chan struct{}) go func() { defer requestsWaitGroup.Done() clientVerificationRequestSuccess := true clientVerificationPayload := "" failCount := 0 for { // TODO: use reflect.SelectCase? if clientVerificationRequestSuccess == true { failCount = 0 select { case clientVerificationPayload = <-tunnel.newClientVerificationPayload: case <-signalStopClientVerificationRequests: return } } else { // If sendClientVerification failed to send the payload we // will retry after a delay. Will use a new payload instead // if that arrives in the meantime. // If failures count is more than PSIPHON_API_CLIENT_VERIFICATION_REQUEST_MAX_RETRIES // stop retrying for this tunnel. failCount += 1 if failCount > PSIPHON_API_CLIENT_VERIFICATION_REQUEST_MAX_RETRIES { return } timeout := time.After(PSIPHON_API_CLIENT_VERIFICATION_REQUEST_RETRY_PERIOD) select { case <-timeout: case clientVerificationPayload = <-tunnel.newClientVerificationPayload: case <-signalStopClientVerificationRequests: return } } clientVerificationRequestSuccess = sendClientVerification(tunnel, clientVerificationPayload) } }() shutdown := false var err error for !shutdown && err == nil { select { case <-noticeBytesTransferredTicker.C: sent, received := transferstats.ReportRecentBytesTransferredForServer( tunnel.serverEntry.IpAddress) if received > 0 { lastBytesReceivedTime = monotime.Now() } totalSent += sent totalReceived += received if lastTotalBytesTransferedTime.Add(TOTAL_BYTES_TRANSFERRED_NOTICE_PERIOD).Before(monotime.Now()) { NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived) lastTotalBytesTransferedTime = monotime.Now() } // Only emit the frequent BytesTransferred notice when tunnel is not idle. if tunnel.config.EmitBytesTransferred && (sent > 0 || received > 0) { NoticeBytesTransferred(tunnel.serverEntry.IpAddress, sent, received) } case <-statsTimer.C: select { case signalStatusRequest <- *new(struct{}): default: } statsTimer.Reset(nextStatusRequestPeriod()) case <-sshKeepAliveTimer.C: if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PERIODIC_INACTIVE_PERIOD).Before(monotime.Now()) { select { case signalSshKeepAlive <- time.Duration(*tunnel.config.TunnelSshKeepAlivePeriodicTimeoutSeconds) * time.Second: default: } } sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) case <-tunnel.signalPortForwardFailure: // Note: no mutex on portForwardFailureTotal; only referenced here tunnel.totalPortForwardFailures++ NoticeInfo("port forward failures for %s: %d", tunnel.serverEntry.IpAddress, tunnel.totalPortForwardFailures) if lastBytesReceivedTime.Add(TUNNEL_SSH_KEEP_ALIVE_PROBE_INACTIVE_PERIOD).Before(monotime.Now()) { select { case signalSshKeepAlive <- time.Duration(*tunnel.config.TunnelSshKeepAliveProbeTimeoutSeconds) * time.Second: default: } } if !tunnel.config.DisablePeriodicSshKeepAlive { sshKeepAliveTimer.Reset(nextSshKeepAlivePeriod()) } case err = <-sshKeepAliveError: case serverRequest := <-tunnel.sshServerRequests: if serverRequest != nil { err := HandleServerRequest(tunnelOwner, tunnel, serverRequest.Type, serverRequest.Payload) if err == nil { serverRequest.Reply(true, nil) } else { NoticeAlert("HandleServerRequest for %s failed: %s", serverRequest.Type, err) serverRequest.Reply(false, nil) } } case <-tunnel.shutdownOperateBroadcast: shutdown = true } } close(signalSshKeepAlive) close(signalStatusRequest) close(signalStopClientVerificationRequests) requestsWaitGroup.Wait() // Capture bytes transferred since the last noticeBytesTransferredTicker tick sent, received := transferstats.ReportRecentBytesTransferredForServer(tunnel.serverEntry.IpAddress) totalSent += sent totalReceived += received // Always emit a final NoticeTotalBytesTransferred NoticeTotalBytesTransferred(tunnel.serverEntry.IpAddress, totalSent, totalReceived) // Tunnel does not have a serverContext when DisableApi is set. if tunnel.serverContext != nil && !tunnel.IsDiscarded() { // The stats for this tunnel will be reported via the next successful // status request. // Since client clocks are unreliable, we report the server's timestamp from // the handshake response as the absolute tunnel start time. This time // will be slightly earlier than the actual tunnel activation time, as the // client has to receive and parse the response and activate the tunnel. tunnelStartTime := tunnel.serverContext.serverHandshakeTimestamp // For the tunnel duration calculation, we use the local clock. The start time // is tunnel.establishedTime as recorded when the tunnel was established. For the // end time, we do not use the current time as we may now be long past the // actual termination time of the tunnel. For example, the host or device may // have resumed after a long sleep (it's not clear that the monotonic clock service // used to measure elapsed time will or will not stop during device sleep). Instead, // we use the last data received time as the estimated tunnel end time. // // One potential issue with using the last received time is receiving data // after an extended sleep because the device sleep occured with data still in // the OS socket read buffer. This is not expected to happen on Android, as the // OS will wake a process when it has TCP data available to read. (For this reason, // the actual long sleep issue is only with an idle tunnel; in this case the client // is responsible for sending SSH keep alives but a device sleep will delay the // golang SSH keep alive timer.) // // Idle tunnels will only read data when a SSH keep alive is sent. As a result, // the last-received-time scheme can undercount tunnel durations by up to // TUNNEL_SSH_KEEP_ALIVE_PERIOD_MAX for idle tunnels. tunnelDuration := tunnel.conn.GetLastActivityMonotime().Sub(tunnel.establishedTime) err := RecordTunnelStat( tunnel.serverContext.sessionId, tunnel.serverContext.tunnelNumber, tunnel.serverEntry.IpAddress, fmt.Sprintf("%d", tunnel.establishDuration), tunnelStartTime, fmt.Sprintf("%d", tunnelDuration), totalSent, totalReceived) if err != nil { NoticeAlert("RecordTunnelStats failed: %s", common.ContextError(err)) } } // Final status request notes: // // It's highly desirable to send a final status request in order to report // domain bytes transferred stats as well as to report tunnel stats as // soon as possible. For this reason, we attempt untunneled requests when // the tunneled request isn't possible or has failed. // // In an orderly shutdown (err == nil), the Controller is stopping and // everything must be wrapped up quickly. Also, we still have a working // tunnel. So we first attempt a tunneled status request (with a short // timeout) and then attempt, synchronously -- otherwise the Contoller's // runWaitGroup.Wait() will return while a request is still in progress // -- untunneled requests (also with short timeouts). Note that in this // case the untunneled request will opt out of untunneledPendingConns so // that it's not inadvertently canceled by the Controller shutdown // sequence (see doUntunneledStatusRequest). // // If the tunnel has failed, the Controller may continue working. We want // to re-establish as soon as possible (so don't want to block on status // requests, even for a second). We may have a long time to attempt // untunneled requests in the background. And there is no tunnel through // which to attempt tunneled requests. So we spawn a goroutine to run the // untunneled requests, which are allowed a longer timeout. These requests // will be interrupted by the Controller's untunneledPendingConns.CloseAll() // in the case of a shutdown. if err == nil { NoticeInfo("shutdown operate tunnel") if !sendStats(tunnel) { sendUntunneledStats(tunnel, true) } } else { NoticeAlert("operate tunnel error for %s: %s", tunnel.serverEntry.IpAddress, err) go sendUntunneledStats(tunnel, false) tunnelOwner.SignalTunnelFailure(tunnel) } }