TUN-3902: Add jitter to backoffhandler

Jitter is important because without it, every cloudflared in the world would try to
reconnect at t=1, 2, 4, etc. seconds, which could overwhelm the backend. If each
cloudflared instead waits a random duration of up to 2, then up to 4, then up
to 8 seconds, and so on, the retries get spread out evenly across time.

On average, wait times stay the same (e.g. instead of waiting for
exactly 1 second, cloudflared will wait between 0 and 2 seconds).

This is the "Full Jitter" algorithm from https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
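
For illustration, here is a minimal, self-contained sketch of the Full Jitter idea described above. It is not taken from the cloudflared codebase; fullJitterDelay and its parameters are hypothetical names used only for this example.

// Sketch of "Full Jitter": instead of sleeping exactly base * 2^attempt,
// sleep a uniformly random duration in [0, base * 2^attempt).
// The average wait is half the cap, so it matches plain exponential
// backoff on average, but concurrent clients no longer retry in lockstep.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// fullJitterDelay is a hypothetical helper, not part of cloudflared.
func fullJitterDelay(base time.Duration, attempt uint) time.Duration {
	maxDelay := base * (1 << attempt) // exponential cap: base * 2^attempt
	return time.Duration(rand.Int63n(int64(maxDelay)))
}

func main() {
	rand.Seed(time.Now().UnixNano()) // seed once, as this commit does in cloudflared's main()
	for attempt := uint(1); attempt <= 4; attempt++ {
		delay := fullJitterDelay(time.Second, attempt)
		fmt.Printf("retry %d: cap %s, sleeping %s\n", attempt, time.Second*(1<<attempt), delay)
	}
}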
Adam Chalmers 2021-02-10 10:42:09 -06:00
parent 3004703074
commit a278753bbf
7 changed files with 49 additions and 38 deletions


@@ -194,6 +194,7 @@ stretch: &stretch
 builddeps:
 - *pinned_go_fips
 - build-essential
+- gotest-to-teamcity
 post-cache:
 - export GOOS=linux
 - export GOARCH=amd64
@@ -201,7 +202,7 @@ stretch: &stretch
 # cd to a non-module directory: https://github.com/golang/go/issues/24250
 - (cd / && go get github.com/BurntSushi/go-sumtype)
 - export PATH="$HOME/go/bin:$PATH"
-- make test
+- make test | gotest-to-teamcity
 update-homebrew:
 builddeps:
 - openssh-client


@@ -2,6 +2,7 @@ package main
 import (
 	"fmt"
+	"math/rand"
 	"strings"
 	"time"
@@ -43,6 +44,7 @@ var (
 )
 func main() {
+	rand.Seed(time.Now().UnixNano())
 	metrics.RegisterBuildInfo(BuildTime, Version)
 	raven.SetRelease(Version)


@@ -2,6 +2,7 @@ package origin
 import (
 	"context"
+	"math/rand"
 	"time"
 )
@@ -30,7 +31,7 @@ type BackoffHandler struct {
 	resetDeadline time.Time
 }
-func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration, bool) {
+func (b BackoffHandler) GetMaxBackoffDuration(ctx context.Context) (time.Duration, bool) {
 	// Follows the same logic as Backoff, but without mutating the receiver.
 	// This select has to happen first to reflect the actual behaviour of the Backoff function.
 	select {
@@ -45,7 +46,8 @@ func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration,
 	if b.retries >= b.MaxRetries && !b.RetryForever {
 		return time.Duration(0), false
 	}
-	return time.Duration(b.GetBaseTime() * 1 << b.retries), true
+	maxTimeToWait := b.GetBaseTime() * 1 << (b.retries + 1)
+	return maxTimeToWait, true
 }
 // BackoffTimer returns a channel that sends the current time when the exponential backoff timeout expires.
@@ -62,7 +64,9 @@ func (b *BackoffHandler) BackoffTimer() <-chan time.Time {
 	} else {
 		b.retries++
 	}
-	return timeAfter(time.Duration(b.GetBaseTime() * 1 << (b.retries - 1)))
+	maxTimeToWait := time.Duration(b.GetBaseTime() * 1 << (b.retries))
+	timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
+	return timeAfter(timeToWait)
 }
 // Backoff is used to wait according to exponential backoff. Returns false if the
@@ -83,7 +87,9 @@ func (b *BackoffHandler) Backoff(ctx context.Context) bool {
 // Sets a grace period within which the the backoff timer is maintained. After the grace
 // period expires, the number of retries & backoff duration is reset.
 func (b *BackoffHandler) SetGracePeriod() {
-	b.resetDeadline = timeNow().Add(time.Duration(b.GetBaseTime() * 2 << b.retries))
+	maxTimeToWait := b.GetBaseTime() * 2 << (b.retries + 1)
+	timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
+	b.resetDeadline = timeNow().Add(timeToWait)
 }
 func (b BackoffHandler) GetBaseTime() time.Duration {


@@ -40,7 +40,7 @@ func TestBackoffCancel(t *testing.T) {
 	if backoff.Backoff(ctx) {
 		t.Fatalf("backoff allowed after cancel")
 	}
-	if _, ok := backoff.GetBackoffDuration(ctx); ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 		t.Fatalf("backoff allowed after cancel")
 	}
 }
@@ -69,24 +69,24 @@ func TestBackoffGracePeriod(t *testing.T) {
 	}
 }
-func TestGetBackoffDurationRetries(t *testing.T) {
+func TestGetMaxBackoffDurationRetries(t *testing.T) {
 	// make backoff return immediately
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3}
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed immediately")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed after 1 retry")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed after 2 retry")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 		t.Fatalf("backoff allowed after 3 (max) retries")
 	}
 	if backoff.Backoff(ctx) {
@@ -94,25 +94,25 @@ func TestGetBackoffDurationRetries(t *testing.T) {
 	}
 }
-func TestGetBackoffDuration(t *testing.T) {
+func TestGetMaxBackoffDuration(t *testing.T) {
 	// make backoff return immediately
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
-		t.Fatalf("backoff didn't return 1 second on first retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
+		t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
-		t.Fatalf("backoff didn't return 2 seconds on second retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
+		t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
-		t.Fatalf("backoff didn't return 4 seconds on third retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
+		t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); ok || duration != 0 {
-		t.Fatalf("backoff didn't return 0 seconds on fourth retry (exceeding limit)")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); ok || duration != 0 {
+		t.Fatalf("backoff (%s) didn't return 0 seconds on fourth retry (exceeding limit)", duration)
 	}
 }
@@ -121,27 +121,27 @@ func TestBackoffRetryForever(t *testing.T) {
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3, RetryForever: true}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
-		t.Fatalf("backoff didn't return 1 second on first retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
+		t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
-		t.Fatalf("backoff didn't return 2 seconds on second retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
+		t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
-		t.Fatalf("backoff didn't return 4 seconds on third retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
+		t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
 	}
 	if !backoff.Backoff(ctx) {
 		t.Fatalf("backoff refused on fourth retry despire RetryForever")
 	}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
 		t.Fatalf("backoff returned %v instead of 8 seconds on fourth retry", duration)
 	}
 	if !backoff.Backoff(ctx) {
 		t.Fatalf("backoff refused on fifth retry despire RetryForever")
 	}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
 		t.Fatalf("backoff returned %v instead of 8 seconds on fifth retry", duration)
 	}
 }


@@ -108,7 +108,7 @@ func (cm *reconnectCredentialManager) RefreshAuth(
 	authOutcome, err := authenticate(ctx, backoff.Retries())
 	if err != nil {
 		cm.authFail.WithLabelValues(err.Error()).Inc()
-		if _, ok := backoff.GetBackoffDuration(ctx); ok {
+		if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 			return backoff.BackoffTimer(), nil
 		}
 		return nil, err


@@ -8,6 +8,7 @@ import (
 	"time"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	tunnelpogs "github.com/cloudflare/cloudflared/tunnelrpc/pogs"
 )
@@ -28,13 +29,14 @@ func TestRefreshAuthBackoff(t *testing.T) {
 	// authentication failures should consume the backoff
 	for i := uint(0); i < backoff.MaxRetries; i++ {
 		retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
-		assert.NoError(t, err)
-		assert.NotNil(t, retryChan)
-		assert.Equal(t, (1<<i)*time.Second, wait)
+		require.NoError(t, err)
+		require.NotNil(t, retryChan)
+		require.Greater(t, wait.Seconds(), 0.0)
+		require.Less(t, wait.Seconds(), float64((1<<(i+1))*time.Second))
 	}
 	retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
-	assert.Error(t, err)
-	assert.Nil(t, retryChan)
+	require.Error(t, err)
+	require.Nil(t, retryChan)
 	// now we actually make contact with the remote server
 	_, _ = rcm.RefreshAuth(context.Background(), backoff, func(ctx context.Context, n int) (tunnelpogs.AuthOutcome, error) {
@@ -47,8 +49,8 @@ func TestRefreshAuthBackoff(t *testing.T) {
 		expectedGracePeriod := time.Duration(time.Second * 2 << backoff.MaxRetries)
 		return time.Now().Add(expectedGracePeriod * 2)
 	}
-	_, ok := backoff.GetBackoffDuration(context.Background())
-	assert.True(t, ok)
+	_, ok := backoff.GetMaxBackoffDuration(context.Background())
+	require.True(t, ok)
 }
 func TestRefreshAuthSuccess(t *testing.T) {


@@ -174,11 +174,11 @@ func ServeTunnelLoop(
 	config.Observer.SendReconnect(connIndex)
-	duration, ok := protocolFallback.GetBackoffDuration(ctx)
+	duration, ok := protocolFallback.GetMaxBackoffDuration(ctx)
 	if !ok {
 		return err
 	}
-	connLog.Info().Msgf("Retrying connection in %s seconds", duration)
+	connLog.Info().Msgf("Retrying connection in up to %s seconds", duration)
 	select {
 	case <-ctx.Done():