TUN-3902: Add jitter to backoffhandler

Jitter is important to avoid every cloudflared in the world trying to
reconnect at t=1, 2, 4, etc. That could overwhelm the backend. But
if each cloudflared randomly waits for up to 2, then up to 4, then up
to 8, etc., the retries get spread out evenly across time.

On average, wait times should be the same (e.g. instead of waiting for
exactly 1 second, cloudflared will wait between 0 and 2 seconds).

This is the "Full Jitter" algorithm from https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
Adam Chalmers 2021-02-10 10:42:09 -06:00
parent 3004703074
commit a278753bbf
7 changed files with 49 additions and 38 deletions
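
For context, here is a minimal, self-contained Go sketch of the "Full Jitter" idea described above. It is illustrative only: fullJitter is a hypothetical helper, not part of cloudflared, but it mirrors the maxTimeToWait / rand.Int63n pattern used in the diff below.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// fullJitter picks a random wait in [0, base * 2^(retries+1)). The expected
// wait is base * 2^retries, the same as plain exponential backoff, but the
// actual waits are spread out instead of clustering at fixed instants.
func fullJitter(base time.Duration, retries uint) time.Duration {
	maxWait := base * (1 << (retries + 1))
	return time.Duration(rand.Int63n(maxWait.Nanoseconds()))
}

func main() {
	rand.Seed(time.Now().UnixNano())
	for retry := uint(0); retry < 4; retry++ {
		fmt.Printf("retry %d: wait %s (max %s)\n",
			retry, fullJitter(time.Second, retry), time.Second*(1<<(retry+1)))
	}
}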

View File

@@ -194,6 +194,7 @@ stretch: &stretch
builddeps:
- *pinned_go_fips
- build-essential
- gotest-to-teamcity
post-cache:
- export GOOS=linux
- export GOARCH=amd64
@@ -201,7 +202,7 @@ stretch: &stretch
# cd to a non-module directory: https://github.com/golang/go/issues/24250
- (cd / && go get github.com/BurntSushi/go-sumtype)
- export PATH="$HOME/go/bin:$PATH"
- make test
- make test | gotest-to-teamcity
update-homebrew:
builddeps:
- openssh-client

View File

@@ -2,6 +2,7 @@ package main
import (
"fmt"
"math/rand"
"strings"
"time"
@@ -43,6 +44,7 @@ var (
)
func main() {
rand.Seed(time.Now().UnixNano())
metrics.RegisterBuildInfo(BuildTime, Version)
raven.SetRelease(Version)

View File

@@ -2,6 +2,7 @@ package origin
import (
"context"
"math/rand"
"time"
)
@@ -30,7 +31,7 @@ type BackoffHandler struct {
resetDeadline time.Time
}
func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration, bool) {
func (b BackoffHandler) GetMaxBackoffDuration(ctx context.Context) (time.Duration, bool) {
// Follows the same logic as Backoff, but without mutating the receiver.
// This select has to happen first to reflect the actual behaviour of the Backoff function.
select {
@@ -45,7 +46,8 @@ func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration,
if b.retries >= b.MaxRetries && !b.RetryForever {
return time.Duration(0), false
}
return time.Duration(b.GetBaseTime() * 1 << b.retries), true
maxTimeToWait := b.GetBaseTime() * 1 << (b.retries + 1)
return maxTimeToWait, true
}
// BackoffTimer returns a channel that sends the current time when the exponential backoff timeout expires.
@@ -62,7 +64,9 @@ func (b *BackoffHandler) BackoffTimer() <-chan time.Time {
} else {
b.retries++
}
return timeAfter(time.Duration(b.GetBaseTime() * 1 << (b.retries - 1)))
maxTimeToWait := time.Duration(b.GetBaseTime() * 1 << (b.retries))
timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
return timeAfter(timeToWait)
}
// Backoff is used to wait according to exponential backoff. Returns false if the
@@ -83,7 +87,9 @@ func (b *BackoffHandler) Backoff(ctx context.Context) bool {
// Sets a grace period within which the backoff timer is maintained. After the grace
// period expires, the number of retries & backoff duration is reset.
func (b *BackoffHandler) SetGracePeriod() {
b.resetDeadline = timeNow().Add(time.Duration(b.GetBaseTime() * 2 << b.retries))
maxTimeToWait := b.GetBaseTime() * 2 << (b.retries + 1)
timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
b.resetDeadline = timeNow().Add(timeToWait)
}
func (b BackoffHandler) GetBaseTime() time.Duration {
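
Since GetBackoffDuration is renamed to GetMaxBackoffDuration and now reports an upper bound rather than the exact wait, callers should treat the value as a ceiling (for logging, say) and let Backoff pick the jittered sleep, as ServeTunnelLoop does at the end of this commit. A rough usage sketch under that assumption; the import path and the connect stub are illustrative, while the BackoffHandler fields and methods are the ones shown in the diff above.

package main

import (
	"context"
	"errors"
	"log"

	// assumed import path for the origin package changed in this diff
	"github.com/cloudflare/cloudflared/origin"
)

// connect stands in for the real dial and always fails in this sketch.
func connect() error { return errors.New("dial failed") }

func main() {
	ctx := context.Background()
	backoff := origin.BackoffHandler{MaxRetries: 3}
	for {
		if connect() == nil {
			return
		}
		// GetMaxBackoffDuration is only a ceiling; Backoff sleeps for a
		// jittered duration somewhere below it.
		if maxWait, ok := backoff.GetMaxBackoffDuration(ctx); ok {
			log.Printf("retrying in up to %s", maxWait)
			backoff.Backoff(ctx)
		} else {
			log.Print("retries exhausted")
			return
		}
	}
}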

View File

@@ -40,7 +40,7 @@ func TestBackoffCancel(t *testing.T) {
if backoff.Backoff(ctx) {
t.Fatalf("backoff allowed after cancel")
}
if _, ok := backoff.GetBackoffDuration(ctx); ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
t.Fatalf("backoff allowed after cancel")
}
}
@@ -69,24 +69,24 @@ func TestBackoffGracePeriod(t *testing.T) {
}
}
func TestGetBackoffDurationRetries(t *testing.T) {
func TestGetMaxBackoffDurationRetries(t *testing.T) {
// make backoff return immediately
timeAfter = immediateTimeAfter
ctx := context.Background()
backoff := BackoffHandler{MaxRetries: 3}
if _, ok := backoff.GetBackoffDuration(ctx); !ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
t.Fatalf("backoff failed immediately")
}
backoff.Backoff(ctx) // noop
if _, ok := backoff.GetBackoffDuration(ctx); !ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
t.Fatalf("backoff failed after 1 retry")
}
backoff.Backoff(ctx) // noop
if _, ok := backoff.GetBackoffDuration(ctx); !ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
t.Fatalf("backoff failed after 2 retry")
}
backoff.Backoff(ctx) // noop
if _, ok := backoff.GetBackoffDuration(ctx); ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
t.Fatalf("backoff allowed after 3 (max) retries")
}
if backoff.Backoff(ctx) {
@@ -94,25 +94,25 @@ func TestGetBackoffDurationRetries(t *testing.T) {
}
}
func TestGetBackoffDuration(t *testing.T) {
func TestGetMaxBackoffDuration(t *testing.T) {
// make backoff return immediately
timeAfter = immediateTimeAfter
ctx := context.Background()
backoff := BackoffHandler{MaxRetries: 3}
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
t.Fatalf("backoff didn't return 1 second on first retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
}
backoff.Backoff(ctx) // noop
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
t.Fatalf("backoff didn't return 2 seconds on second retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
}
backoff.Backoff(ctx) // noop
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
t.Fatalf("backoff didn't return 4 seconds on third retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
}
backoff.Backoff(ctx) // noop
if duration, ok := backoff.GetBackoffDuration(ctx); ok || duration != 0 {
t.Fatalf("backoff didn't return 0 seconds on fourth retry (exceeding limit)")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); ok || duration != 0 {
t.Fatalf("backoff (%s) didn't return 0 seconds on fourth retry (exceeding limit)", duration)
}
}
@@ -121,27 +121,27 @@ func TestBackoffRetryForever(t *testing.T) {
timeAfter = immediateTimeAfter
ctx := context.Background()
backoff := BackoffHandler{MaxRetries: 3, RetryForever: true}
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
t.Fatalf("backoff didn't return 1 second on first retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
}
backoff.Backoff(ctx) // noop
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
t.Fatalf("backoff didn't return 2 seconds on second retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
}
backoff.Backoff(ctx) // noop
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
t.Fatalf("backoff didn't return 4 seconds on third retry")
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
}
if !backoff.Backoff(ctx) {
t.Fatalf("backoff refused on fourth retry despire RetryForever")
}
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
t.Fatalf("backoff returned %v instead of 8 seconds on fourth retry", duration)
}
if !backoff.Backoff(ctx) {
t.Fatalf("backoff refused on fifth retry despire RetryForever")
}
if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
t.Fatalf("backoff returned %v instead of 8 seconds on fifth retry", duration)
}
}

View File

@@ -108,7 +108,7 @@ func (cm *reconnectCredentialManager) RefreshAuth(
authOutcome, err := authenticate(ctx, backoff.Retries())
if err != nil {
cm.authFail.WithLabelValues(err.Error()).Inc()
if _, ok := backoff.GetBackoffDuration(ctx); ok {
if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
return backoff.BackoffTimer(), nil
}
return nil, err

View File

@@ -8,6 +8,7 @@ import (
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
tunnelpogs "github.com/cloudflare/cloudflared/tunnelrpc/pogs"
)
@@ -28,13 +29,14 @@ func TestRefreshAuthBackoff(t *testing.T) {
// authentication failures should consume the backoff
for i := uint(0); i < backoff.MaxRetries; i++ {
retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
assert.NoError(t, err)
assert.NotNil(t, retryChan)
assert.Equal(t, (1<<i)*time.Second, wait)
require.NoError(t, err)
require.NotNil(t, retryChan)
require.Greater(t, wait.Seconds(), 0.0)
require.Less(t, wait, (1<<(i+1))*time.Second)
}
retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
assert.Error(t, err)
assert.Nil(t, retryChan)
require.Error(t, err)
require.Nil(t, retryChan)
// now we actually make contact with the remote server
_, _ = rcm.RefreshAuth(context.Background(), backoff, func(ctx context.Context, n int) (tunnelpogs.AuthOutcome, error) {
@@ -47,8 +49,8 @@ func TestRefreshAuthBackoff(t *testing.T) {
expectedGracePeriod := time.Duration(time.Second * 2 << backoff.MaxRetries)
return time.Now().Add(expectedGracePeriod * 2)
}
_, ok := backoff.GetBackoffDuration(context.Background())
assert.True(t, ok)
_, ok := backoff.GetMaxBackoffDuration(context.Background())
require.True(t, ok)
}
func TestRefreshAuthSuccess(t *testing.T) {

View File

@@ -174,11 +174,11 @@ func ServeTunnelLoop(
config.Observer.SendReconnect(connIndex)
duration, ok := protocolFallback.GetBackoffDuration(ctx)
duration, ok := protocolFallback.GetMaxBackoffDuration(ctx)
if !ok {
return err
}
connLog.Info().Msgf("Retrying connection in %s seconds", duration)
connLog.Info().Msgf("Retrying connection in up to %s seconds", duration)
select {
case <-ctx.Done():