TUN-6999: cloudflared should attempt other edge addresses before falling back on protocol

This PR does two things:
It changes how we fallback to a lower protocol: The current state
is to try connecting with a protocol. If it fails, fall back to a
lower protocol. And try connecting with that and so on. With this PR,
if we fail to connect with a protocol, we will try to connect to other
edge addresses first. Only if we fail to connect to those will we
fall back to a lower protocol.
It fixes a behaviour where if we fail to connect to an edge addr,
we keep re-trying the same address over and over again.
This PR now switches between edge addresses on subsequent connecton attempts.
Note that through these switches, it still respects the backoff time.
(We are connecting to a different edge, but this helps to not bombard an edge
address with connect requests if a particular edge addresses stops working).
This commit is contained in:
Sudarsan Reddy 2022-12-14 11:43:52 +00:00
parent e517242194
commit 99b3736cc7
4 changed files with 101 additions and 86 deletions

View File

@ -612,6 +612,12 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
Value: 5, Value: 5,
Hidden: true, Hidden: true,
}), }),
altsrc.NewIntFlag(&cli.IntFlag{
Name: "max-edge-addr-retries",
Usage: "Maximum number of times to retry on edge addrs before falling back to a lower protocol",
Value: 8,
Hidden: true,
}),
// Note TUN-3758 , we use Int because UInt is not supported with altsrc // Note TUN-3758 , we use Int because UInt is not supported with altsrc
altsrc.NewIntFlag(&cli.IntFlag{ altsrc.NewIntFlag(&cli.IntFlag{
Name: "retries", Name: "retries",

View File

@ -385,6 +385,7 @@ func prepareTunnelConfig(
EdgeTLSConfigs: edgeTLSConfigs, EdgeTLSConfigs: edgeTLSConfigs,
NeedPQ: needPQ, NeedPQ: needPQ,
PQKexIdx: pqKexIdx, PQKexIdx: pqKexIdx,
MaxEdgeAddrRetries: uint8(c.Int("max-edge-addr-retries")),
} }
packetConfig, err := newPacketConfig(c, log) packetConfig, err := newPacketConfig(c, log)
if err != nil { if err != nil {

View File

@ -14,7 +14,6 @@ import (
"github.com/cloudflare/cloudflared/connection" "github.com/cloudflare/cloudflared/connection"
"github.com/cloudflare/cloudflared/edgediscovery" "github.com/cloudflare/cloudflared/edgediscovery"
"github.com/cloudflare/cloudflared/edgediscovery/allregions"
"github.com/cloudflare/cloudflared/h2mux" "github.com/cloudflare/cloudflared/h2mux"
"github.com/cloudflare/cloudflared/orchestration" "github.com/cloudflare/cloudflared/orchestration"
"github.com/cloudflare/cloudflared/retry" "github.com/cloudflare/cloudflared/retry"
@ -94,14 +93,7 @@ func NewSupervisor(config *TunnelConfig, orchestrator *orchestration.Orchestrato
tracker := tunnelstate.NewConnTracker(config.Log) tracker := tunnelstate.NewConnTracker(config.Log)
log := NewConnAwareLogger(config.Log, tracker, config.Observer) log := NewConnAwareLogger(config.Log, tracker, config.Observer)
var edgeAddrHandler EdgeAddrHandler edgeAddrHandler := NewIPAddrFallback(config.MaxEdgeAddrRetries)
if isStaticEdge { // static edge addresses
edgeAddrHandler = &IPAddrFallback{}
} else if config.EdgeIPVersion == allregions.IPv6Only || config.EdgeIPVersion == allregions.Auto {
edgeAddrHandler = &IPAddrFallback{}
} else { // IPv4Only
edgeAddrHandler = &DefaultAddrFallback{}
}
edgeTunnelServer := EdgeTunnelServer{ edgeTunnelServer := EdgeTunnelServer{
config: config, config: config,

View File

@ -59,6 +59,7 @@ type TunnelConfig struct {
Observer *connection.Observer Observer *connection.Observer
ReportedVersion string ReportedVersion string
Retries uint Retries uint
MaxEdgeAddrRetries uint8
RunFromTerminal bool RunFromTerminal bool
NeedPQ bool NeedPQ bool
@ -133,6 +134,24 @@ func StartTunnelDaemon(
return s.Run(ctx, connectedSignal) return s.Run(ctx, connectedSignal)
} }
type ConnectivityError struct {
reachedMaxRetries bool
}
func NewConnectivityError(hasReachedMaxRetries bool) *ConnectivityError {
return &ConnectivityError{
reachedMaxRetries: hasReachedMaxRetries,
}
}
func (e *ConnectivityError) Error() string {
return fmt.Sprintf("connectivity error - reached max retries: %t", e.HasReachedMaxRetries())
}
func (e *ConnectivityError) HasReachedMaxRetries() bool {
return e.reachedMaxRetries
}
// EdgeAddrHandler provides a mechanism switch between behaviors in ServeTunnel // EdgeAddrHandler provides a mechanism switch between behaviors in ServeTunnel
// for handling the errors when attempting to make edge connections. // for handling the errors when attempting to make edge connections.
type EdgeAddrHandler interface { type EdgeAddrHandler interface {
@ -140,56 +159,47 @@ type EdgeAddrHandler interface {
// the edge address should be replaced with a new one. Also, will return if the // the edge address should be replaced with a new one. Also, will return if the
// error should be recognized as a connectivity error, or otherwise, a general // error should be recognized as a connectivity error, or otherwise, a general
// application error. // application error.
ShouldGetNewAddress(err error) (needsNewAddress bool, isConnectivityError bool) ShouldGetNewAddress(connIndex uint8, err error) (needsNewAddress bool, connectivityError error)
} }
// DefaultAddrFallback will always return false for isConnectivityError since this func NewIPAddrFallback(maxRetries uint8) *ipAddrFallback {
// handler is a way to provide the legacy behavior in the new edge discovery algorithm. return &ipAddrFallback{
type DefaultAddrFallback struct { retriesByConnIndex: make(map[uint8]uint8),
edgeErrors int maxRetries: maxRetries,
}
func (f DefaultAddrFallback) ShouldGetNewAddress(err error) (needsNewAddress bool, isConnectivityError bool) {
switch err.(type) {
case nil: // maintain current IP address
// DupConnRegisterTunnelError should indicate to get a new address immediately
case connection.DupConnRegisterTunnelError:
return true, false
// Try the next address if it was a quic.IdleTimeoutError
case *quic.IdleTimeoutError,
edgediscovery.DialError,
*connection.EdgeQuicDialError:
// Wait for two failures before falling back to a new address
f.edgeErrors++
if f.edgeErrors >= 2 {
f.edgeErrors = 0
return true, false
} }
default: // maintain current IP address
}
return false, false
} }
// IPAddrFallback will have more conditions to fall back to a new address for certain // ipAddrFallback will have more conditions to fall back to a new address for certain
// edge connection errors. This means that this handler will return true for isConnectivityError // edge connection errors. This means that this handler will return true for isConnectivityError
// for more cases like duplicate connection register and edge quic dial errors. // for more cases like duplicate connection register and edge quic dial errors.
type IPAddrFallback struct{} type ipAddrFallback struct {
m sync.Mutex
retriesByConnIndex map[uint8]uint8
maxRetries uint8
}
func (f IPAddrFallback) ShouldGetNewAddress(err error) (needsNewAddress bool, isConnectivityError bool) { func (f *ipAddrFallback) ShouldGetNewAddress(connIndex uint8, err error) (needsNewAddress bool, connectivityError error) {
f.m.Lock()
defer f.m.Unlock()
switch err.(type) { switch err.(type) {
case nil: // maintain current IP address case nil: // maintain current IP address
// Try the next address if it was a quic.IdleTimeoutError // Try the next address if it was a quic.IdleTimeoutError
// DupConnRegisterTunnelError needs to also receive a new ip address // DupConnRegisterTunnelError needs to also receive a new ip address
case connection.DupConnRegisterTunnelError, case connection.DupConnRegisterTunnelError,
*quic.IdleTimeoutError: *quic.IdleTimeoutError:
return true, false return true, nil
// Network problems should be retried with new address immediately and report // Network problems should be retried with new address immediately and report
// as connectivity error // as connectivity error
case edgediscovery.DialError, *connection.EdgeQuicDialError: case edgediscovery.DialError, *connection.EdgeQuicDialError:
return true, true if f.retriesByConnIndex[connIndex] >= f.maxRetries {
f.retriesByConnIndex[connIndex] = 0
return true, NewConnectivityError(true)
}
f.retriesByConnIndex[connIndex]++
return true, NewConnectivityError(false)
default: // maintain current IP address default: // maintain current IP address
} }
return false, false return false, nil
} }
type EdgeTunnelServer struct { type EdgeTunnelServer struct {
@ -238,11 +248,12 @@ func (e *EdgeTunnelServer) Serve(ctx context.Context, connIndex uint8, protocolF
Uint8(connection.LogFieldConnIndex, connIndex). Uint8(connection.LogFieldConnIndex, connIndex).
Logger() Logger()
connLog := e.connAwareLogger.ReplaceLogger(&logger) connLog := e.connAwareLogger.ReplaceLogger(&logger)
// Each connection to keep its own copy of protocol, because individual connections might fallback // Each connection to keep its own copy of protocol, because individual connections might fallback
// to another protocol when a particular metal doesn't support new protocol // to another protocol when a particular metal doesn't support new protocol
// Each connection can also have it's own IP version because individual connections might fallback // Each connection can also have it's own IP version because individual connections might fallback
// to another IP version. // to another IP version.
err, recoverable := e.serveTunnel( err, shouldFallbackProtocol := e.serveTunnel(
ctx, ctx,
connLog, connLog,
addr, addr,
@ -252,26 +263,30 @@ func (e *EdgeTunnelServer) Serve(ctx context.Context, connIndex uint8, protocolF
protocolFallback.protocol, protocolFallback.protocol,
) )
// If the connection is recoverable, we want to maintain the same IP // Check if the connection error was from an IP issue with the host or
// but backoff a reconnect with some duration. // establishing a connection to the edge and if so, rotate the IP address.
if recoverable { shouldRotateEdgeIP, cErr := e.edgeAddrHandler.ShouldGetNewAddress(connIndex, err)
if shouldRotateEdgeIP {
// rotate IP, but forcing internal state to assign a new IP to connection index.
if _, err := e.edgeAddrs.GetDifferentAddr(int(connIndex), true); err != nil {
return err
}
// In addition, if it is a connectivity error, and we have exhausted the configurable maximum edge IPs to rotate,
// then just fallback protocol on next iteration run.
connectivityErr, ok := cErr.(*ConnectivityError)
if ok {
shouldFallbackProtocol = connectivityErr.HasReachedMaxRetries()
}
}
// set connection has re-connecting and log the next retrying backoff
duration, ok := protocolFallback.GetMaxBackoffDuration(ctx) duration, ok := protocolFallback.GetMaxBackoffDuration(ctx)
if !ok { if !ok {
return err return err
} }
e.config.Observer.SendReconnect(connIndex) e.config.Observer.SendReconnect(connIndex)
connLog.Logger().Info().Msgf("Retrying connection in up to %s", duration) connLog.Logger().Info().Msgf("Retrying connection in up to %s", duration)
}
// Check if the connection error was from an IP issue with the host or
// establishing a connection to the edge and if so, rotate the IP address.
yes, hasConnectivityError := e.edgeAddrHandler.ShouldGetNewAddress(err)
if yes {
if _, err := e.edgeAddrs.GetDifferentAddr(int(connIndex), hasConnectivityError); err != nil {
return err
}
}
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -279,7 +294,8 @@ func (e *EdgeTunnelServer) Serve(ctx context.Context, connIndex uint8, protocolF
case <-e.gracefulShutdownC: case <-e.gracefulShutdownC:
return nil return nil
case <-protocolFallback.BackoffTimer(): case <-protocolFallback.BackoffTimer():
if !recoverable { // should we fallback protocol? If not, just return. Otherwise, set new protocol for next method call.
if !shouldFallbackProtocol {
return err return err
} }
@ -426,7 +442,7 @@ func (e *EdgeTunnelServer) serveTunnel(
} }
return err.Cause, !err.Permanent return err.Cause, !err.Permanent
case *connection.EdgeQuicDialError: case *connection.EdgeQuicDialError:
return err, true return err, false
case ReconnectSignal: case ReconnectSignal:
connLog.Logger().Info(). connLog.Logger().Info().
IPAddr(connection.LogFieldIPAddress, addr.UDP.IP). IPAddr(connection.LogFieldIPAddress, addr.UDP.IP).