TUN-3902: Add jitter to backoffhandler
Jitter is important to avoid every cloudflared in the world trying to reconnect at t=1, 2, 4, etc. That could overwhelm the backend. But if each cloudflared instead waits for a random duration of up to 2 seconds, then up to 4, then up to 8, and so on, the retries get spread out evenly across time. On average, wait times stay the same (e.g. instead of waiting for exactly 1 second, cloudflared waits between 0 and 2 seconds). This is the "Full Jitter" algorithm from https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/
parent 3004703074
commit a278753bbf
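To make the scheme concrete, here is a minimal, self-contained Go sketch of the Full Jitter idea, not the cloudflared code itself (the helper name fullJitterBackoff and the 1-second base are illustrative): on retry n it picks a uniformly random wait in [0, base * 2^(n+1)), so the average wait matches the old deterministic schedule while spreading clients out over time.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// fullJitterBackoff picks a uniformly random wait in [0, base * 2^(retries+1)).
// With base = 1s the cap doubles each retry: up to 2s, 4s, 8s, ...
func fullJitterBackoff(base time.Duration, retries uint) time.Duration {
	maxWait := base * (1 << (retries + 1))
	return time.Duration(rand.Int63n(maxWait.Nanoseconds()))
}

func main() {
	// Seed once per process, as the commit does in cloudflared's main().
	rand.Seed(time.Now().UnixNano())
	for retries := uint(0); retries < 4; retries++ {
		wait := fullJitterBackoff(time.Second, retries)
		fmt.Printf("retry %d: wait %v (cap %v)\n", retries, wait, time.Second*(1<<(retries+1)))
	}
}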
@@ -194,6 +194,7 @@ stretch: &stretch
   builddeps:
     - *pinned_go_fips
     - build-essential
+    - gotest-to-teamcity
   post-cache:
     - export GOOS=linux
     - export GOARCH=amd64
@@ -201,7 +202,7 @@ stretch: &stretch
     # cd to a non-module directory: https://github.com/golang/go/issues/24250
     - (cd / && go get github.com/BurntSushi/go-sumtype)
     - export PATH="$HOME/go/bin:$PATH"
-    - make test
+    - make test | gotest-to-teamcity
 update-homebrew:
   builddeps:
     - openssh-client

@@ -2,6 +2,7 @@ package main

 import (
 	"fmt"
+	"math/rand"
 	"strings"
 	"time"
@@ -43,6 +44,7 @@ var (
 )

 func main() {
+	rand.Seed(time.Now().UnixNano())
 	metrics.RegisterBuildInfo(BuildTime, Version)
 	raven.SetRelease(Version)

@@ -2,6 +2,7 @@ package origin

 import (
 	"context"
+	"math/rand"
 	"time"
 )
@@ -30,7 +31,7 @@ type BackoffHandler struct {
 	resetDeadline time.Time
 }

-func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration, bool) {
+func (b BackoffHandler) GetMaxBackoffDuration(ctx context.Context) (time.Duration, bool) {
 	// Follows the same logic as Backoff, but without mutating the receiver.
 	// This select has to happen first to reflect the actual behaviour of the Backoff function.
 	select {
@@ -45,7 +46,8 @@ func (b BackoffHandler) GetBackoffDuration(ctx context.Context) (time.Duration,
 	if b.retries >= b.MaxRetries && !b.RetryForever {
 		return time.Duration(0), false
 	}
-	return time.Duration(b.GetBaseTime() * 1 << b.retries), true
+	maxTimeToWait := b.GetBaseTime() * 1 << (b.retries + 1)
+	return maxTimeToWait, true
 }

 // BackoffTimer returns a channel that sends the current time when the exponential backoff timeout expires.
@@ -62,7 +64,9 @@ func (b *BackoffHandler) BackoffTimer() <-chan time.Time {
 	} else {
 		b.retries++
 	}
-	return timeAfter(time.Duration(b.GetBaseTime() * 1 << (b.retries - 1)))
+	maxTimeToWait := time.Duration(b.GetBaseTime() * 1 << (b.retries))
+	timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
+	return timeAfter(timeToWait)
 }

 // Backoff is used to wait according to exponential backoff. Returns false if the
@@ -83,7 +87,9 @@ func (b *BackoffHandler) Backoff(ctx context.Context) bool {
 // Sets a grace period within which the the backoff timer is maintained. After the grace
 // period expires, the number of retries & backoff duration is reset.
 func (b *BackoffHandler) SetGracePeriod() {
-	b.resetDeadline = timeNow().Add(time.Duration(b.GetBaseTime() * 2 << b.retries))
+	maxTimeToWait := b.GetBaseTime() * 2 << (b.retries + 1)
+	timeToWait := time.Duration(rand.Int63n(maxTimeToWait.Nanoseconds()))
+	b.resetDeadline = timeNow().Add(timeToWait)
 }

 func (b BackoffHandler) GetBaseTime() time.Duration {

@@ -40,7 +40,7 @@ func TestBackoffCancel(t *testing.T) {
 	if backoff.Backoff(ctx) {
 		t.Fatalf("backoff allowed after cancel")
 	}
-	if _, ok := backoff.GetBackoffDuration(ctx); ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 		t.Fatalf("backoff allowed after cancel")
 	}
 }
@@ -69,24 +69,24 @@ func TestBackoffGracePeriod(t *testing.T) {
 	}
 }

-func TestGetBackoffDurationRetries(t *testing.T) {
+func TestGetMaxBackoffDurationRetries(t *testing.T) {
 	// make backoff return immediately
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3}
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed immediately")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed after 1 retry")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); !ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); !ok {
 		t.Fatalf("backoff failed after 2 retry")
 	}
 	backoff.Backoff(ctx) // noop
-	if _, ok := backoff.GetBackoffDuration(ctx); ok {
+	if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 		t.Fatalf("backoff allowed after 3 (max) retries")
 	}
 	if backoff.Backoff(ctx) {
@@ -94,25 +94,25 @@ func TestGetBackoffDurationRetries(t *testing.T) {
 	}
 }

-func TestGetBackoffDuration(t *testing.T) {
+func TestGetMaxBackoffDuration(t *testing.T) {
 	// make backoff return immediately
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
-		t.Fatalf("backoff didn't return 1 second on first retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
+		t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
-		t.Fatalf("backoff didn't return 2 seconds on second retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
+		t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
-		t.Fatalf("backoff didn't return 4 seconds on third retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
+		t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); ok || duration != 0 {
-		t.Fatalf("backoff didn't return 0 seconds on fourth retry (exceeding limit)")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); ok || duration != 0 {
+		t.Fatalf("backoff (%s) didn't return 0 seconds on fourth retry (exceeding limit)", duration)
 	}
 }

@@ -121,27 +121,27 @@ func TestBackoffRetryForever(t *testing.T) {
 	timeAfter = immediateTimeAfter
 	ctx := context.Background()
 	backoff := BackoffHandler{MaxRetries: 3, RetryForever: true}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second {
-		t.Fatalf("backoff didn't return 1 second on first retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*2 {
+		t.Fatalf("backoff (%s) didn't return < 2 seconds on first retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*2 {
-		t.Fatalf("backoff didn't return 2 seconds on second retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*4 {
+		t.Fatalf("backoff (%s) didn't return < 4 seconds on second retry", duration)
 	}
 	backoff.Backoff(ctx) // noop
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*4 {
-		t.Fatalf("backoff didn't return 4 seconds on third retry")
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*8 {
+		t.Fatalf("backoff (%s) didn't return < 8 seconds on third retry", duration)
 	}
 	if !backoff.Backoff(ctx) {
 		t.Fatalf("backoff refused on fourth retry despire RetryForever")
 	}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
 		t.Fatalf("backoff returned %v instead of 8 seconds on fourth retry", duration)
 	}
 	if !backoff.Backoff(ctx) {
 		t.Fatalf("backoff refused on fifth retry despire RetryForever")
 	}
-	if duration, ok := backoff.GetBackoffDuration(ctx); !ok || duration != time.Second*8 {
+	if duration, ok := backoff.GetMaxBackoffDuration(ctx); !ok || duration > time.Second*16 {
 		t.Fatalf("backoff returned %v instead of 8 seconds on fifth retry", duration)
 	}
 }

@@ -108,7 +108,7 @@ func (cm *reconnectCredentialManager) RefreshAuth(
 	authOutcome, err := authenticate(ctx, backoff.Retries())
 	if err != nil {
 		cm.authFail.WithLabelValues(err.Error()).Inc()
-		if _, ok := backoff.GetBackoffDuration(ctx); ok {
+		if _, ok := backoff.GetMaxBackoffDuration(ctx); ok {
 			return backoff.BackoffTimer(), nil
 		}
 		return nil, err

@@ -8,6 +8,7 @@ import (
 	"time"

 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"

 	tunnelpogs "github.com/cloudflare/cloudflared/tunnelrpc/pogs"
 )
@@ -28,13 +29,14 @@ func TestRefreshAuthBackoff(t *testing.T) {
 	// authentication failures should consume the backoff
 	for i := uint(0); i < backoff.MaxRetries; i++ {
 		retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
-		assert.NoError(t, err)
-		assert.NotNil(t, retryChan)
-		assert.Equal(t, (1<<i)*time.Second, wait)
+		require.NoError(t, err)
+		require.NotNil(t, retryChan)
+		require.Greater(t, wait.Seconds(), 0.0)
+		require.Less(t, wait.Seconds(), float64((1<<(i+1))*time.Second))
 	}
 	retryChan, err := rcm.RefreshAuth(context.Background(), backoff, auth)
-	assert.Error(t, err)
-	assert.Nil(t, retryChan)
+	require.Error(t, err)
+	require.Nil(t, retryChan)

 	// now we actually make contact with the remote server
 	_, _ = rcm.RefreshAuth(context.Background(), backoff, func(ctx context.Context, n int) (tunnelpogs.AuthOutcome, error) {
@@ -47,8 +49,8 @@ func TestRefreshAuthBackoff(t *testing.T) {
 		expectedGracePeriod := time.Duration(time.Second * 2 << backoff.MaxRetries)
 		return time.Now().Add(expectedGracePeriod * 2)
 	}
-	_, ok := backoff.GetBackoffDuration(context.Background())
-	assert.True(t, ok)
+	_, ok := backoff.GetMaxBackoffDuration(context.Background())
+	require.True(t, ok)
 }

 func TestRefreshAuthSuccess(t *testing.T) {

@@ -174,11 +174,11 @@ func ServeTunnelLoop(

 		config.Observer.SendReconnect(connIndex)

-		duration, ok := protocolFallback.GetBackoffDuration(ctx)
+		duration, ok := protocolFallback.GetMaxBackoffDuration(ctx)
 		if !ok {
 			return err
 		}
-		connLog.Info().Msgf("Retrying connection in %s seconds", duration)
+		connLog.Info().Msgf("Retrying connection in up to %s seconds", duration)

 		select {
 		case <-ctx.Done():