TUN-5368: Log connection issues with LogLevel that depends on tunnel state

Connections from cloudflared to the Cloudflare edge are long-lived and may
break over time. That is expected for many reasons, ranging from network
conditions to operations within the Cloudflare edge. Hence, logging those
events as Error is too strong and leads users to believe something is
failing when it is actually expected.

With this change, we wrap logging about connection issues so that it is
aware of the tunnel state:
 - if the tunnel has no active connections, we log as Error
 - otherwise, we log as Warning (see the sketch below)
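
A minimal sketch of that rule, for illustration only (connectionIssueEvent
and activeConns are hypothetical names, not identifiers introduced by this
change):

package main

import (
	"errors"
	"os"

	"github.com/rs/zerolog"
)

// connectionIssueEvent picks the log level from the tunnel state instead of
// always returning Error.
func connectionIssueEvent(log *zerolog.Logger, activeConns uint) *zerolog.Event {
	if activeConns == 0 {
		return log.Error() // no connections left: the tunnel is effectively down
	}
	return log.Warn() // other connections still serve traffic: expected churn
}

func main() {
	log := zerolog.New(os.Stderr)
	// Another connection is still active, so a broken connection is only a warning.
	connectionIssueEvent(&log, 1).Err(errors.New("connection reset")).Msg("Connection terminated")
	// No connections are active, so the same failure is logged as an error.
	connectionIssueEvent(&log, 0).Err(errors.New("connection reset")).Msg("Connection terminated")
}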
Nuno Diegues 2021-11-08 15:43:36 +00:00
parent 6bcc9a76e9
commit 1ee540a166
6 changed files with 148 additions and 60 deletions

View File

@@ -4,46 +4,32 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
-	"sync"
 
 	conn "github.com/cloudflare/cloudflared/connection"
+	"github.com/cloudflare/cloudflared/tunnelstate"
 	"github.com/rs/zerolog"
 )
 
 // ReadyServer serves HTTP 200 if the tunnel can serve traffic. Intended for k8s readiness checks.
 type ReadyServer struct {
-	sync.RWMutex
-	isConnected map[int]bool
-	log         *zerolog.Logger
+	tracker *tunnelstate.ConnTracker
 }
 
 // NewReadyServer initializes a ReadyServer and starts listening for dis/connection events.
 func NewReadyServer(log *zerolog.Logger) *ReadyServer {
 	return &ReadyServer{
-		isConnected: make(map[int]bool, 0),
-		log:         log,
+		tracker: tunnelstate.NewConnTracker(log),
 	}
 }
 
 func (rs *ReadyServer) OnTunnelEvent(c conn.Event) {
-	switch c.EventType {
-	case conn.Connected:
-		rs.Lock()
-		rs.isConnected[int(c.Index)] = true
-		rs.Unlock()
-	case conn.Disconnected, conn.Reconnecting, conn.RegisteringTunnel, conn.Unregistering:
-		rs.Lock()
-		rs.isConnected[int(c.Index)] = false
-		rs.Unlock()
-	default:
-		rs.log.Error().Msgf("Unknown connection event case %v", c)
-	}
+	rs.tracker.OnTunnelEvent(c)
 }
 
 type body struct {
 	Status           int `json:"status"`
-	ReadyConnections int `json:"readyConnections"`
+	ReadyConnections uint `json:"readyConnections"`
 }
 
 // ServeHTTP responds with HTTP 200 if the tunnel is connected to the edge.
@@ -63,15 +49,11 @@ func (rs *ReadyServer) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 
 // This is the bulk of the logic for ServeHTTP, broken into its own pure function
 // to make unit testing easy.
-func (rs *ReadyServer) makeResponse() (statusCode, readyConnections int) {
-	statusCode = http.StatusServiceUnavailable
-	rs.RLock()
-	defer rs.RUnlock()
-	for _, connected := range rs.isConnected {
-		if connected {
-			statusCode = http.StatusOK
-			readyConnections++
-		}
+func (rs *ReadyServer) makeResponse() (statusCode int, readyConnections uint) {
+	readyConnections = rs.tracker.CountActiveConns()
+	if readyConnections > 0 {
+		return http.StatusOK, readyConnections
+	} else {
+		return http.StatusServiceUnavailable, readyConnections
 	}
-	return statusCode, readyConnections
 }

View File

@@ -7,6 +7,8 @@ import (
 	"github.com/rs/zerolog"
 	"github.com/stretchr/testify/assert"
 
+	"github.com/cloudflare/cloudflared/tunnelstate"
+
 	"github.com/cloudflare/cloudflared/connection"
 )
@@ -18,7 +20,7 @@ func TestReadyServer_makeResponse(t *testing.T) {
 		name                 string
 		fields               fields
 		wantOK               bool
-		wantReadyConnections int
+		wantReadyConnections uint
 	}{
 		{
 			name: "One connection online => HTTP 200",
@@ -49,7 +51,7 @@ func TestReadyServer_makeResponse(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			rs := &ReadyServer{
-				isConnected: tt.fields.isConnected,
+				tracker: tunnelstate.MockedConnTracker(tt.fields.isConnected),
 			}
 			gotStatusCode, gotReadyConnections := rs.makeResponse()
 			if tt.wantOK && gotStatusCode != http.StatusOK {

View File

@@ -0,0 +1,42 @@
+package origin
+
+import (
+	"github.com/rs/zerolog"
+
+	"github.com/cloudflare/cloudflared/connection"
+	"github.com/cloudflare/cloudflared/tunnelstate"
+)
+
+type ConnAwareLogger struct {
+	tracker *tunnelstate.ConnTracker
+	logger  *zerolog.Logger
+}
+
+func NewConnAwareLogger(logger *zerolog.Logger, observer *connection.Observer) *ConnAwareLogger {
+	connAwareLogger := &ConnAwareLogger{
+		tracker: tunnelstate.NewConnTracker(logger),
+		logger:  logger,
+	}
+
+	observer.RegisterSink(connAwareLogger.tracker)
+
+	return connAwareLogger
+}
+
+func (c *ConnAwareLogger) ReplaceLogger(logger *zerolog.Logger) *ConnAwareLogger {
+	return &ConnAwareLogger{
+		tracker: c.tracker,
+		logger:  logger,
+	}
+}
+
+func (c *ConnAwareLogger) ConnAwareLogger() *zerolog.Event {
+	if c.tracker.CountActiveConns() == 0 {
+		return c.logger.Error()
+	}
+	return c.logger.Warn()
+}
+
+func (c *ConnAwareLogger) Logger() *zerolog.Logger {
+	return c.logger
+}
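
For orientation, a hedged usage sketch of the new type (reportConnectionLoss and the literal "connIndex" field name are hypothetical; the real call sites below use connection.LogFieldConnIndex): connection failures go through ConnAwareLogger(), which picks Error or Warn from the tunnel state, while unrelated messages keep a fixed level via Logger().

package origin

import "errors"

// reportConnectionLoss is illustrative only and not part of this commit.
func reportConnectionLoss(log *ConnAwareLogger, connIndex uint8) {
	// Error when no connections remain active, Warn otherwise.
	log.ConnAwareLogger().Err(errors.New("connection reset")).Uint8("connIndex", connIndex).Msg("Connection terminated")
	// Messages unrelated to connection health keep their fixed level.
	log.Logger().Info().Msg("Retrying connection")
}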

View File

@@ -46,7 +46,7 @@ type Supervisor struct {
 	nextConnectedIndex  int
 	nextConnectedSignal chan struct{}
 
-	log          *zerolog.Logger
+	log          *ConnAwareLogger
 	logTransport *zerolog.Logger
 
 	reconnectCredentialManager *reconnectCredentialManager
@@ -91,7 +91,7 @@ func NewSupervisor(config *TunnelConfig, reconnectCh chan ReconnectSignal, grace
 		edgeIPs:                    edgeIPs,
 		tunnelErrors:               make(chan tunnelError),
 		tunnelsConnecting:          map[int]chan struct{}{},
-		log:                        config.Log,
+		log:                        NewConnAwareLogger(config.Log, config.Observer),
 		logTransport:               config.LogTransport,
 		reconnectCredentialManager: newReconnectCredentialManager(connection.MetricsNamespace, connection.TunnelSubsystem, config.HAConnections),
 		useReconnectToken:          useReconnectToken,
@@ -123,7 +123,7 @@ func (s *Supervisor) Run(
 	if timer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate); err == nil {
 		refreshAuthBackoffTimer = timer
 	} else {
-		s.log.Err(err).
+		s.log.Logger().Err(err).
 			Dur("refreshAuthRetryDuration", refreshAuthRetryDuration).
 			Msgf("supervisor: initial refreshAuth failed, retrying in %v", refreshAuthRetryDuration)
 		refreshAuthBackoffTimer = time.After(refreshAuthRetryDuration)
@@ -145,7 +145,7 @@ func (s *Supervisor) Run(
 		case tunnelError := <-s.tunnelErrors:
 			tunnelsActive--
 			if tunnelError.err != nil && !shuttingDown {
-				s.log.Err(tunnelError.err).Int(connection.LogFieldConnIndex, tunnelError.index).Msg("Connection terminated")
+				s.log.ConnAwareLogger().Err(tunnelError.err).Int(connection.LogFieldConnIndex, tunnelError.index).Msg("Connection terminated")
 				tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
 				s.waitForNextTunnel(tunnelError.index)
 
@@ -170,7 +170,7 @@ func (s *Supervisor) Run(
 		case <-refreshAuthBackoffTimer:
 			newTimer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate)
 			if err != nil {
-				s.log.Err(err).Msg("supervisor: Authentication failed")
+				s.log.Logger().Err(err).Msg("supervisor: Authentication failed")
 				// Permanent failure. Leave the `select` without setting the
 				// channel to be non-null, so we'll never hit this case of the `select` again.
 				continue
@@ -195,7 +195,7 @@ func (s *Supervisor) initialize(
 ) error {
 	availableAddrs := s.edgeIPs.AvailableAddrs()
 	if s.config.HAConnections > availableAddrs {
-		s.log.Info().Msgf("You requested %d HA connections but I can give you at most %d.", s.config.HAConnections, availableAddrs)
+		s.log.Logger().Info().Msgf("You requested %d HA connections but I can give you at most %d.", s.config.HAConnections, availableAddrs)
 		s.config.HAConnections = availableAddrs
 	}
 
@@ -244,6 +244,7 @@ func (s *Supervisor) startFirstTunnel(
 		s.reconnectCredentialManager,
 		s.config,
 		addr,
+		s.log,
 		firstConnIndex,
 		connectedSignal,
 		s.cloudflaredUUID,
@@ -277,6 +278,7 @@ func (s *Supervisor) startFirstTunnel(
 		s.reconnectCredentialManager,
 		s.config,
 		addr,
+		s.log,
 		firstConnIndex,
 		connectedSignal,
 		s.cloudflaredUUID,
@@ -310,6 +312,7 @@ func (s *Supervisor) startTunnel(
 		s.reconnectCredentialManager,
 		s.config,
 		addr,
+		s.log,
 		uint8(index),
 		connectedSignal,
 		s.cloudflaredUUID,
@@ -373,7 +376,7 @@ func (s *Supervisor) authenticate(ctx context.Context, numPreviousAttempts int)
 	if err != nil {
 		return nil, err
 	}
-	rpcClient := connection.NewTunnelServerClient(ctx, stream, s.log)
+	rpcClient := connection.NewTunnelServerClient(ctx, stream, s.log.Logger())
 	defer rpcClient.Close()
 
 	const arbitraryConnectionID = uint8(0)

View File

@@ -131,6 +131,7 @@ func ServeTunnelLoop(
 	credentialManager *reconnectCredentialManager,
 	config *TunnelConfig,
 	addr *allregions.EdgeAddr,
+	connAwareLogger *ConnAwareLogger,
 	connIndex uint8,
 	connectedSignal *signal.Signal,
 	cloudflaredUUID uuid.UUID,
@@ -140,7 +141,8 @@ func ServeTunnelLoop(
 	haConnections.Inc()
 	defer haConnections.Dec()
 
-	connLog := config.Log.With().Uint8(connection.LogFieldConnIndex, connIndex).Logger()
+	logger := config.Log.With().Uint8(connection.LogFieldConnIndex, connIndex).Logger()
+	connLog := connAwareLogger.ReplaceLogger(&logger)
 
 	protocolFallback := &protocolFallback{
 		retry.BackoffHandler{MaxRetries: config.Retries},
@@ -160,7 +162,7 @@ func ServeTunnelLoop(
 	for {
 		err, recoverable := ServeTunnel(
 			ctx,
-			&connLog,
+			connLog,
 			credentialManager,
 			config,
 			addr,
@@ -182,7 +184,7 @@ func ServeTunnelLoop(
 			if !ok {
 				return err
 			}
-			connLog.Info().Msgf("Retrying connection in up to %s seconds", duration)
+			connLog.Logger().Info().Msgf("Retrying connection in up to %s seconds", duration)
 
 			select {
 			case <-ctx.Done():
@@ -192,7 +194,7 @@ func ServeTunnelLoop(
 			case <-protocolFallback.BackoffTimer():
 				var idleTimeoutError *quic.IdleTimeoutError
 				if !selectNextProtocol(
-					&connLog,
+					connLog.Logger(),
 					protocolFallback,
 					config.ProtocolSelector,
 					errors.As(err, &idleTimeoutError),
@@ -255,7 +257,7 @@ func selectNextProtocol(
 // on error returns a flag indicating if error can be retried
 func ServeTunnel(
 	ctx context.Context,
-	connLog *zerolog.Logger,
+	connLog *ConnAwareLogger,
 	credentialManager *reconnectCredentialManager,
 	config *TunnelConfig,
 	addr *allregions.EdgeAddr,
@@ -299,29 +301,29 @@ func ServeTunnel(
 	if err != nil {
 		switch err := err.(type) {
 		case connection.DupConnRegisterTunnelError:
-			connLog.Err(err).Msg("Unable to establish connection.")
+			connLog.ConnAwareLogger().Err(err).Msg("Unable to establish connection.")
 			// don't retry this connection anymore, let supervisor pick a new address
 			return err, false
 		case connection.ServerRegisterTunnelError:
-			connLog.Err(err).Msg("Register tunnel error from server side")
+			connLog.ConnAwareLogger().Err(err).Msg("Register tunnel error from server side")
 			// Don't send registration error return from server to Sentry. They are
 			// logged on server side
 			if incidents := config.IncidentLookup.ActiveIncidents(); len(incidents) > 0 {
-				connLog.Error().Msg(activeIncidentsMsg(incidents))
+				connLog.ConnAwareLogger().Msg(activeIncidentsMsg(incidents))
 			}
 			return err.Cause, !err.Permanent
 		case ReconnectSignal:
-			connLog.Info().
+			connLog.Logger().Info().
 				Uint8(connection.LogFieldConnIndex, connIndex).
 				Msgf("Restarting connection due to reconnect signal in %s", err.Delay)
 			err.DelayBeforeReconnect()
 			return err, true
 		default:
 			if err == context.Canceled {
-				connLog.Debug().Err(err).Msgf("Serve tunnel error")
+				connLog.Logger().Debug().Err(err).Msgf("Serve tunnel error")
 				return err, false
 			}
-			connLog.Err(err).Msgf("Serve tunnel error")
+			connLog.ConnAwareLogger().Err(err).Msgf("Serve tunnel error")
 			_, permanent := err.(unrecoverableError)
 			return err, !permanent
 		}
@@ -331,7 +333,7 @@ func ServeTunnel(
 
 func serveTunnel(
 	ctx context.Context,
-	connLog *zerolog.Logger,
+	connLog *ConnAwareLogger,
 	credentialManager *reconnectCredentialManager,
 	config *TunnelConfig,
 	addr *allregions.EdgeAddr,
@@ -364,6 +366,7 @@ func serveTunnel(
 		return ServeQUIC(ctx,
 			addr.UDP,
 			config,
+			connLog,
 			connOptions,
 			controlStream,
 			connIndex,
@@ -373,7 +376,7 @@ func serveTunnel(
 	case connection.HTTP2, connection.HTTP2Warp:
 		edgeConn, err := edgediscovery.DialEdge(ctx, dialTimeout, config.EdgeTLSConfigs[protocol], addr.TCP)
 		if err != nil {
-			connLog.Err(err).Msg("Unable to establish connection with Cloudflare edge")
+			connLog.ConnAwareLogger().Err(err).Msg("Unable to establish connection with Cloudflare edge")
 			return err, true
 		}
 
@@ -395,7 +398,7 @@ func serveTunnel(
 	default:
 		edgeConn, err := edgediscovery.DialEdge(ctx, dialTimeout, config.EdgeTLSConfigs[protocol], addr.TCP)
 		if err != nil {
-			connLog.Err(err).Msg("Unable to establish connection with Cloudflare edge")
+			connLog.ConnAwareLogger().Err(err).Msg("Unable to establish connection with Cloudflare edge")
 			return err, true
 		}
 
@@ -427,7 +430,7 @@ func (r unrecoverableError) Error() string {
 
 func ServeH2mux(
 	ctx context.Context,
-	connLog *zerolog.Logger,
+	connLog *ConnAwareLogger,
 	credentialManager *reconnectCredentialManager,
 	config *TunnelConfig,
 	edgeConn net.Conn,
@@ -437,7 +440,7 @@ func ServeH2mux(
 	reconnectCh chan ReconnectSignal,
 	gracefulShutdownC <-chan struct{},
 ) error {
-	connLog.Debug().Msgf("Connecting via h2mux")
+	connLog.Logger().Debug().Msgf("Connecting via h2mux")
 	// Returns error from parsing the origin URL or handshake errors
 	handler, err, recoverable := connection.NewH2muxConnection(
 		config.ConnectionConfig,
@@ -474,7 +477,7 @@ func ServeH2mux(
 
 func ServeHTTP2(
 	ctx context.Context,
-	connLog *zerolog.Logger,
+	connLog *ConnAwareLogger,
 	config *TunnelConfig,
 	tlsServerConn net.Conn,
 	connOptions *tunnelpogs.ConnectionOptions,
@@ -483,7 +486,7 @@ func ServeHTTP2(
 	gracefulShutdownC <-chan struct{},
 	reconnectCh chan ReconnectSignal,
 ) error {
-	connLog.Debug().Msgf("Connecting via http2")
+	connLog.Logger().Debug().Msgf("Connecting via http2")
 	h2conn := connection.NewHTTP2Connection(
 		tlsServerConn,
 		config.ConnectionConfig,
@@ -515,6 +518,7 @@ func ServeQUIC(
 	ctx context.Context,
 	edgeAddr *net.UDPAddr,
 	config *TunnelConfig,
+	connLogger *ConnAwareLogger,
 	connOptions *tunnelpogs.ConnectionOptions,
 	controlStreamHandler connection.ControlStreamHandler,
 	connIndex uint8,
@@ -528,7 +532,7 @@ func ServeQUIC(
 		MaxIncomingStreams:    connection.MaxConcurrentStreams,
 		MaxIncomingUniStreams: connection.MaxConcurrentStreams,
 		KeepAlive:             true,
-		Tracer:                quicpogs.NewClientTracer(config.Log, connIndex),
+		Tracer:                quicpogs.NewClientTracer(connLogger.Logger(), connIndex),
 	}
 	for {
 		select {
@@ -545,7 +549,7 @@ func ServeQUIC(
 			controlStreamHandler,
 			config.Observer)
 		if err != nil {
-			config.Log.Error().Msgf("Failed to create new quic connection, err: %v", err)
+			connLogger.ConnAwareLogger().Err(err).Msgf("Failed to create new quic connection")
 			return err, true
 		}
 
@@ -553,7 +557,7 @@ func ServeQUIC(
 		errGroup.Go(func() error {
 			err := quicConn.Serve(ctx)
 			if err != nil {
-				config.Log.Error().Msgf("Failed to serve quic connection, err: %v", err)
+				connLogger.ConnAwareLogger().Err(err).Msg("Failed to serve quic connection")
 			}
 			return fmt.Errorf("Connection with edge closed")
 		})

View File

@@ -0,0 +1,55 @@
+package tunnelstate
+
+import (
+	"sync"
+
+	"github.com/rs/zerolog"
+
+	"github.com/cloudflare/cloudflared/connection"
+)
+
+type ConnTracker struct {
+	sync.RWMutex
+	isConnected map[int]bool
+	log         *zerolog.Logger
+}
+
+func NewConnTracker(log *zerolog.Logger) *ConnTracker {
+	return &ConnTracker{
+		isConnected: make(map[int]bool, 0),
+		log:         log,
+	}
+}
+
+func MockedConnTracker(mocked map[int]bool) *ConnTracker {
+	return &ConnTracker{
+		isConnected: mocked,
+	}
+}
+
+func (ct *ConnTracker) OnTunnelEvent(c connection.Event) {
+	switch c.EventType {
+	case connection.Connected:
+		ct.Lock()
+		ct.isConnected[int(c.Index)] = true
+		ct.Unlock()
+	case connection.Disconnected, connection.Reconnecting, connection.RegisteringTunnel, connection.Unregistering:
+		ct.Lock()
+		ct.isConnected[int(c.Index)] = false
+		ct.Unlock()
+	default:
+		ct.log.Error().Msgf("Unknown connection event case %v", c)
+	}
+}
+
+func (ct *ConnTracker) CountActiveConns() uint {
+	ct.RLock()
+	defer ct.RUnlock()
+	active := uint(0)
+	for _, connected := range ct.isConnected {
+		if connected {
+			active++
+		}
+	}
+	return active
+}
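
An illustrative test sketch (not part of the commit) of the tracker's accounting, using the Event fields already referenced above: a connection counts as active after a Connected event and stops counting after Disconnected.

package tunnelstate

import (
	"os"
	"testing"

	"github.com/rs/zerolog"

	"github.com/cloudflare/cloudflared/connection"
)

func TestCountActiveConns(t *testing.T) {
	log := zerolog.New(os.Stderr)
	tracker := NewConnTracker(&log)

	// Two connections come up.
	tracker.OnTunnelEvent(connection.Event{Index: 0, EventType: connection.Connected})
	tracker.OnTunnelEvent(connection.Event{Index: 1, EventType: connection.Connected})
	if got := tracker.CountActiveConns(); got != 2 {
		t.Fatalf("expected 2 active connections, got %d", got)
	}

	// One of them drops.
	tracker.OnTunnelEvent(connection.Event{Index: 1, EventType: connection.Disconnected})
	if got := tracker.CountActiveConns(); got != 1 {
		t.Fatalf("expected 1 active connection, got %d", got)
	}
}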