TUN-5719: Re-attempt connection to edge with QUIC despite network error when there is no fallback

We have made 2 changes in the past that caused an unexpected edge case:
 1. when faced with QUIC "no network activity", give up re-attempts and fall back
 2. when a protocol is chosen explicitly, rather than using auto (the default), do not fall back

The reasoning for 1. was to fall back quickly in situations where the user may not
have chosen QUIC, and simply got it because we auto-chose it (with the TXT DNS record),
but the user's environment does not allow egress via UDP.

The reasoning for 2. was to avoid falling back if the user explicitly chooses a
protocol. E.g., if the user chooses QUIC, she may want to do UDP proxying, so if
we fall back to the HTTP2 protocol, that will be unexpected since it does not support
UDP (and the same applies to HTTP2 falling back to h2mux and TCP proxying).

This PR fixes the edge case that happens when both those changes 1. and 2. are
put together: when faced with a QUIC "no network activity" error, we should only
try to fall back if there is a possible fallback. Otherwise, we should exhaust the
retries as normal.
This commit is contained in:
Nuno Diegues 2022-01-27 22:12:25 +00:00
parent 8a5343d0a5
commit 7bac4b15b0
2 changed files with 48 additions and 21 deletions

View File

@ -185,12 +185,11 @@ func ServeTunnelLoop(
case <-gracefulShutdownC:
return nil
case <-protocolFallback.BackoffTimer():
var idleTimeoutError *quic.IdleTimeoutError
if !selectNextProtocol(
connLog.Logger(),
protocolFallback,
config.ProtocolSelector,
errors.As(err, &idleTimeoutError),
err,
) {
return err
}
@ -223,9 +222,13 @@ func selectNextProtocol(
connLog *zerolog.Logger,
protocolBackoff *protocolFallback,
selector connection.ProtocolSelector,
isNetworkActivityTimeout bool,
cause error,
) bool {
if protocolBackoff.ReachedMaxRetries() || isNetworkActivityTimeout {
var idleTimeoutError *quic.IdleTimeoutError
isNetworkActivityTimeout := errors.As(cause, &idleTimeoutError)
_, hasFallback := selector.Fallback()
if protocolBackoff.ReachedMaxRetries() || (hasFallback && isNetworkActivityTimeout) {
fallback, hasFallback := selector.Fallback()
if !hasFallback {
return false

View File

@ -4,6 +4,7 @@ import (
"testing"
"time"
"github.com/lucas-clemente/quic-go"
"github.com/rs/zerolog"
"github.com/stretchr/testify/assert"
@ -53,7 +54,7 @@ func TestWaitForBackoffFallback(t *testing.T) {
initProtocol := protocolSelector.Current()
assert.Equal(t, connection.HTTP2, initProtocol)
protocolFallback := &protocolFallback{
protoFallback := &protocolFallback{
backoff,
initProtocol,
false,
@ -61,40 +62,63 @@ func TestWaitForBackoffFallback(t *testing.T) {
// Retry #0 and #1. At retry #2, we switch protocol, so the fallback loop has one more retry than this
for i := 0; i < int(maxRetries-1); i++ {
protocolFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
protoFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
assert.True(t, ok)
assert.Equal(t, initProtocol, protocolFallback.protocol)
assert.Equal(t, initProtocol, protoFallback.protocol)
}
// Retry fallback protocol
for i := 0; i < int(maxRetries); i++ {
protocolFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
protoFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
assert.True(t, ok)
fallback, ok := protocolSelector.Fallback()
assert.True(t, ok)
assert.Equal(t, fallback, protocolFallback.protocol)
assert.Equal(t, fallback, protoFallback.protocol)
}
currentGlobalProtocol := protocolSelector.Current()
assert.Equal(t, initProtocol, currentGlobalProtocol)
// No protocol to fallback, return error
protocolFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protocolFallback, protocolSelector, false)
protoFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protoFallback, protocolSelector, nil)
assert.False(t, ok)
protocolFallback.reset()
protocolFallback.BackoffTimer() // simulate retry
ok = selectNextProtocol(&log, protocolFallback, protocolSelector, false)
protoFallback.reset()
protoFallback.BackoffTimer() // simulate retry
ok = selectNextProtocol(&log, protoFallback, protocolSelector, nil)
assert.True(t, ok)
assert.Equal(t, initProtocol, protocolFallback.protocol)
assert.Equal(t, initProtocol, protoFallback.protocol)
protocolFallback.reset()
protocolFallback.BackoffTimer() // simulate retry
ok = selectNextProtocol(&log, protocolFallback, protocolSelector, true)
protoFallback.reset()
protoFallback.BackoffTimer() // simulate retry
ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
// Check that we get true on the very first try when the error is a QUIC idle timeout. This allows us to immediately
// switch protocols.
// switch protocols when there is a fallback.
assert.True(t, ok)
// But if there is no fallback available, then we exhaust the retries despite the type of error.
// The reason why there's no fallback available is because we pick a specific protocol instead of letting it be auto.
protocolSelector, err = connection.NewProtocolSelector(
"quic",
warpRoutingEnabled,
namedTunnel,
mockFetcher.fetch(),
resolveTTL,
&log,
)
assert.NoError(t, err)
protoFallback = &protocolFallback{backoff, protocolSelector.Current(), false}
for i := 0; i < int(maxRetries-1); i++ {
protoFallback.BackoffTimer() // simulate retry
ok := selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
assert.True(t, ok)
assert.Equal(t, connection.QUIC, protoFallback.protocol)
}
// And finally it fails as it should, with no fallback.
protoFallback.BackoffTimer()
ok = selectNextProtocol(&log, protoFallback, protocolSelector, &quic.IdleTimeoutError{})
assert.False(t, ok)
}