TUN-5719: Re-attempt connection to edge with QUIC despite network error when there is no fallback

We have made 2 changes in the past that, taken together, caused an unexpected edge case: 1. when faced with a QUIC "no network activity" error, give up re-attempting and fall back; 2. when a protocol is chosen explicitly, rather than using auto (the default), do not fall back. The reasoning for 1. was to fall back quickly in situations where the user may not have chosen QUIC and simply got it because we auto-chose it (via the TXT DNS record), but the user's environment does not allow egress via UDP. The reasoning for 2. was to avoid falling back if the user explicitly chooses a protocol. E.g., if the user chooses QUIC, she may want to do UDP proxying, so falling back to the HTTP2 protocol would be unexpected since it does not support UDP (and the same applies to HTTP2 falling back to h2mux and TCP proxying). This PR fixes the edge case that happens when changes 1. and 2. are put together: when faced with a QUIC "no network activity" error, we should only try to fall back if a fallback is available. Otherwise, we should exhaust the retries as normal.
2022-01-27 22:12:25 +00:00 · 2022-01-27 22:12:25 +00:00 · 7bac4b15b0
parent 8a5343d0a5
commit 7bac4b15b0
2 changed files with 48 additions and 21 deletions
--- a/origin/tunnel.go
+++ b/origin/tunnel.go
@ -185,12 +185,11 @@ func ServeTunnelLoop(
 		case <-gracefulShutdownC:
 			return nil
 		case <-protocolFallback.BackoffTimer():
-			var idleTimeoutError *quic.IdleTimeoutError
 			if !selectNextProtocol(
 				connLog.Logger(),
 				protocolFallback,
 				config.ProtocolSelector,
-				errors.As(err, &idleTimeoutError),
+				err,
 			) {
 				return err
 			}
@ -223,9 +222,13 @@ func selectNextProtocol(
 	connLog *zerolog.Logger,
 	protocolBackoff *protocolFallback,
 	selector connection.ProtocolSelector,
-	isNetworkActivityTimeout bool,
+	cause error,
 ) bool {
-	if protocolBackoff.ReachedMaxRetries() || isNetworkActivityTimeout {
+	var idleTimeoutError *quic.IdleTimeoutError
+	isNetworkActivityTimeout := errors.As(cause, &idleTimeoutError)
+	_, hasFallback := selector.Fallback()
+
+	if protocolBackoff.ReachedMaxRetries() || (hasFallback && isNetworkActivityTimeout) {
 		fallback, hasFallback := selector.Fallback()
 		if !hasFallback {
 			return false
--- a/origin/tunnel_test.go
+++ b/origin/tunnel_test.go
@ -4,6 +4,7 @@ import (
 	"testing"
 	"time"

+	"github.com/lucas-clemente/quic-go"
 	"github.com/rs/zerolog"
 	"github.com/stretchr/testify/assert"

@ -53,7 +54,7 @@ func TestWaitForBackoffFallback(t *testing.T) {
 	initProtocol := protocolSelector.Current()
 	assert.Equal(t, connection.HTTP2, initProtocol)

-	protocolFallback := &protocolFallback{
+	protoFallback := &protocolFallback{
 		backoff,
 		initProtocol,
 		false,
@ -61,40 +62,63 @@ func TestWaitForBackoffFallback(t *testing.T) {

 	// Retry #0 and #1. At retry #2, we switch protocol, so the fallback loop has one more retry than this
 	for i := 0; i < int(maxRetries-1); i++ {
-		protocolFallback.BackoffTimer() // simulate retry
-		ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
+		protoFallback.BackoffTimer() // simulate retry
+		ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
 		assert.True(t, ok)
-		assert.Equal(t, initProtocol, protocolFallback.protocol)
+		assert.Equal(t, initProtocol, protoFallback.protocol)
 	}

 	// Retry fallback protocol
 	for i := 0; i < int(maxRetries); i++ {
-		protocolFallback.BackoffTimer() // simulate retry
-		ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
+		protoFallback.BackoffTimer() // simulate retry
+		ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
 		assert.True(t, ok)
 		fallback, ok := protocolSelector.Fallback()
 		assert.True(t, ok)
-		assert.Equal(t, fallback, protocolFallback.protocol)
+		assert.Equal(t, fallback, protoFallback.protocol)
 	}

 	currentGlobalProtocol := protocolSelector.Current()
 	assert.Equal(t, initProtocol, currentGlobalProtocol)

 	// No protocol to fallback, return error
-	protocolFallback.BackoffTimer() // simulate retry
-	ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
+	protoFallback.BackoffTimer() // simulate retry
+	ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
 	assert.False(t, ok)

-	protocolFallback.reset()
-	protocolFallback.BackoffTimer() // simulate retry
-	ok = selectNextProtocol(&log, protocolFallback, protocolSelector, false)
+	protoFallback.reset()
+	protoFallback.BackoffTimer() // simulate retry
+	ok = selectNextProtocol(&log, protoFallback, protocolSelector, nil)
 	assert.True(t, ok)
-	assert.Equal(t, initProtocol, protocolFallback.protocol)
+	assert.Equal(t, initProtocol, protoFallback.protocol)

-	protocolFallback.reset()
-	protocolFallback.BackoffTimer() // simulate retry
-	ok = selectNextProtocol(&log, protocolFallback, protocolSelector, true)
+	protoFallback.reset()
+	protoFallback.BackoffTimer() // simulate retry
+	ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
 	// Check that we get a true after the first try itself when this flag is true. This allows us to immediately
-	// switch protocols.
+	// switch protocols when there is a fallback.
 	assert.True(t, ok)
+
+	// But if there is no fallback available, then we exhaust the retries despite the type of error.
+	// The reason why there's no fallback available is because we pick a specific protocol instead of letting it be auto.
+	protocolSelector, err = connection.NewProtocolSelector(
+		"quic",
+		warpRoutingEnabled,
+		namedTunnel,
+		mockFetcher.fetch(),
+		resolveTTL,
+		&log,
+	)
+	assert.NoError(t, err)
+	protoFallback = &protocolFallback{backoff, protocolSelector.Current(), false}
+	for i := 0; i < int(maxRetries-1); i++ {
+		protoFallback.BackoffTimer() // simulate retry
+		ok := selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
+		assert.True(t, ok)
+		assert.Equal(t, connection.QUIC, protoFallback.protocol)
+	}
+	// And finally it fails as it should, with no fallback.
+	protoFallback.BackoffTimer()
+	ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
+	assert.False(t, ok)
 }