From 7bac4b15b09c2f8f17ea9e9f29eb77b909fdc181 Mon Sep 17 00:00:00 2001 From: Nuno Diegues Date: Thu, 27 Jan 2022 22:12:25 +0000 Subject: [PATCH] TUN-5719: Re-attempt connection to edge with QUIC despite network error when there is no fallback We have made 2 changes in the past that caused an unexpected edge case: 1. when faced with a QUIC "no network activity" error, give up re-attempts and fall back 2. when a protocol is chosen explicitly, rather than using auto (the default), do not fall back The reasoning for 1. was to fall back quickly in situations where the user may not have chosen QUIC, and simply got it because we auto-chose it (with the TXT DNS record), but the user's environment does not allow egress via UDP. The reasoning for 2. was to avoid falling back if the user explicitly chooses a protocol. E.g., if the user chooses QUIC, she may want to do UDP proxying, so if we fall back to the HTTP2 protocol, that will be unexpected since it does not support UDP (and the same applies to HTTP2 falling back to h2mux and TCP proxying). This PR fixes the edge case that happens when both those changes 1. and 2. are put together: when faced with a QUIC "no network activity" error, we should only try to fall back if there is a possible fallback. Otherwise, we should exhaust the retries as normal. 
--- origin/tunnel.go | 11 +++++--- origin/tunnel_test.go | 58 ++++++++++++++++++++++++++++++------------- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/origin/tunnel.go b/origin/tunnel.go index 7f8ff02c..bcfea720 100644 --- a/origin/tunnel.go +++ b/origin/tunnel.go @@ -185,12 +185,11 @@ func ServeTunnelLoop( case <-gracefulShutdownC: return nil case <-protocolFallback.BackoffTimer(): - var idleTimeoutError *quic.IdleTimeoutError if !selectNextProtocol( connLog.Logger(), protocolFallback, config.ProtocolSelector, - errors.As(err, &idleTimeoutError), + err, ) { return err } @@ -223,9 +222,13 @@ func selectNextProtocol( connLog *zerolog.Logger, protocolBackoff *protocolFallback, selector connection.ProtocolSelector, - isNetworkActivityTimeout bool, + cause error, ) bool { - if protocolBackoff.ReachedMaxRetries() || isNetworkActivityTimeout { + var idleTimeoutError *quic.IdleTimeoutError + isNetworkActivityTimeout := errors.As(cause, &idleTimeoutError) + _, hasFallback := selector.Fallback() + + if protocolBackoff.ReachedMaxRetries() || (hasFallback && isNetworkActivityTimeout) { fallback, hasFallback := selector.Fallback() if !hasFallback { return false diff --git a/origin/tunnel_test.go b/origin/tunnel_test.go index 58108240..870a5049 100644 --- a/origin/tunnel_test.go +++ b/origin/tunnel_test.go @@ -4,6 +4,7 @@ import ( "testing" "time" + "github.com/lucas-clemente/quic-go" "github.com/rs/zerolog" "github.com/stretchr/testify/assert" @@ -53,7 +54,7 @@ func TestWaitForBackoffFallback(t *testing.T) { initProtocol := protocolSelector.Current() assert.Equal(t, connection.HTTP2, initProtocol) - protocolFallback := &protocolFallback{ + protoFallback := &protocolFallback{ backoff, initProtocol, false, @@ -61,40 +62,63 @@ func TestWaitForBackoffFallback(t *testing.T) { // Retry #0 and #1. 
At retry #2, we switch protocol, so the fallback loop has one more retry than this for i := 0; i < int(maxRetries-1); i++ { - protocolFallback.BackoffTimer() // simulate retry - ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false) + protoFallback.BackoffTimer() // simulate retry + ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil) assert.True(t, ok) - assert.Equal(t, initProtocol, protocolFallback.protocol) + assert.Equal(t, initProtocol, protoFallback.protocol) } // Retry fallback protocol for i := 0; i < int(maxRetries); i++ { - protocolFallback.BackoffTimer() // simulate retry - ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false) + protoFallback.BackoffTimer() // simulate retry + ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil) assert.True(t, ok) fallback, ok := protocolSelector.Fallback() assert.True(t, ok) - assert.Equal(t, fallback, protocolFallback.protocol) + assert.Equal(t, fallback, protoFallback.protocol) } currentGlobalProtocol := protocolSelector.Current() assert.Equal(t, initProtocol, currentGlobalProtocol) // No protocol to fallback, return error - protocolFallback.BackoffTimer() // simulate retry - ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false) + protoFallback.BackoffTimer() // simulate retry + ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil) assert.False(t, ok) - protocolFallback.reset() - protocolFallback.BackoffTimer() // simulate retry - ok = selectNextProtocol(&log, protocolFallback, protocolSelector, false) + protoFallback.reset() + protoFallback.BackoffTimer() // simulate retry + ok = selectNextProtocol(&log, protoFallback, protocolSelector, nil) assert.True(t, ok) - assert.Equal(t, initProtocol, protocolFallback.protocol) + assert.Equal(t, initProtocol, protoFallback.protocol) - protocolFallback.reset() - protocolFallback.BackoffTimer() // simulate retry - ok = selectNextProtocol(&log, protocolFallback, 
protocolSelector, true) + protoFallback.reset() + protoFallback.BackoffTimer() // simulate retry + ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{}) // Check that we get a true after the first try itself when this flag is true. This allows us to immediately - // switch protocols. + // switch protocols when there is a fallback. assert.True(t, ok) + + // But if there is no fallback available, then we exhaust the retries despite the type of error. + // The reason why there's no fallback available is because we pick a specific protocol instead of letting it be auto. + protocolSelector, err = connection.NewProtocolSelector( + "quic", + warpRoutingEnabled, + namedTunnel, + mockFetcher.fetch(), + resolveTTL, + &log, + ) + assert.NoError(t, err) + protoFallback = &protocolFallback{backoff, protocolSelector.Current(), false} + for i := 0; i < int(maxRetries-1); i++ { + protoFallback.BackoffTimer() // simulate retry + ok := selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{}) + assert.True(t, ok) + assert.Equal(t, connection.QUIC, protoFallback.protocol) + } + // And finally it fails as it should, with no fallback. + protoFallback.BackoffTimer() + ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{}) + assert.False(t, ok) }