2018-05-01 23:45:06 +00:00
|
|
|
package origin
|
|
|
|
|
|
|
|
import (
|
2019-01-10 20:55:44 +00:00
|
|
|
"context"
|
2019-12-04 17:22:08 +00:00
|
|
|
"errors"
|
2018-05-01 23:45:06 +00:00
|
|
|
"net"
|
|
|
|
"time"
|
2019-02-01 20:11:12 +00:00
|
|
|
|
2019-11-21 17:03:13 +00:00
|
|
|
"github.com/google/uuid"
|
2019-06-17 21:18:47 +00:00
|
|
|
|
2019-03-18 23:14:47 +00:00
|
|
|
"github.com/cloudflare/cloudflared/connection"
|
2020-02-06 00:55:26 +00:00
|
|
|
"github.com/cloudflare/cloudflared/edgediscovery"
|
2019-12-04 17:22:08 +00:00
|
|
|
"github.com/cloudflare/cloudflared/h2mux"
|
2020-04-29 20:51:32 +00:00
|
|
|
"github.com/cloudflare/cloudflared/logger"
|
2019-03-04 19:48:56 +00:00
|
|
|
"github.com/cloudflare/cloudflared/signal"
|
2019-12-04 17:22:08 +00:00
|
|
|
tunnelpogs "github.com/cloudflare/cloudflared/tunnelrpc/pogs"
|
2018-05-01 23:45:06 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// SRV and TXT record resolution TTL
	ResolveTTL = time.Hour
	// Waiting time before retrying a failed tunnel connection
	tunnelRetryDuration = time.Second * 10
	// Interval between registering new tunnels
	registrationInterval = time.Second

	// subsystemRefreshAuth is the metrics subsystem label for reconnect-token refresh activity.
	subsystemRefreshAuth = "refresh_auth"
	// Maximum exponent for 'Authenticate' exponential backoff
	refreshAuthMaxBackoff = 10
	// Waiting time before retrying a failed 'Authenticate' connection
	refreshAuthRetryDuration = time.Second * 10
	// Maximum time to make an Authenticate RPC
	authTokenTimeout = time.Second * 30
)
|
|
|
|
|
|
|
|
var (
	// errEventDigestUnset indicates an operation needed the event digest before it
	// was stored — presumably set during tunnel registration; confirm at call sites.
	errEventDigestUnset = errors.New("event digest unset")
)
|
|
|
|
|
2020-02-06 00:55:26 +00:00
|
|
|
// Supervisor manages non-declarative tunnels. Establishes TCP connections with the edge, and
// reconnects them if they disconnect.
type Supervisor struct {
	// cloudflaredUUID identifies this cloudflared instance; passed through to tunnel registration.
	cloudflaredUUID uuid.UUID
	config          *TunnelConfig
	// edgeIPs is the pool of edge addresses tunnels dial; also used for arbitrary RPCs.
	edgeIPs *edgediscovery.Edge
	// tunnelErrors receives the terminal result of every tunnel goroutine
	// (startFirstTunnel / startTunnel report here via defer).
	tunnelErrors chan tunnelError
	// tunnelsConnecting maps a tunnel's HA index to the channel signalled when it connects.
	tunnelsConnecting map[int]chan struct{}
	// nextConnectedIndex and nextConnectedSignal are used to wait for all
	// currently-connecting tunnels to finish connecting so we can reset backoff timer
	nextConnectedIndex  int
	nextConnectedSignal chan struct{}

	logger logger.Service

	// reconnectCredentialManager holds credentials used by the RefreshAuth flow.
	reconnectCredentialManager *reconnectCredentialManager
	// useReconnectToken mirrors config.ClassicTunnel.UseReconnectToken
	// (false when ClassicTunnel is nil); gates the RefreshAuth loop in Run.
	useReconnectToken bool
}
|
|
|
|
|
|
|
|
// tunnelError is the terminal report a tunnel goroutine sends on Supervisor.tunnelErrors.
type tunnelError struct {
	// index is the HA connection slot this tunnel occupied.
	index int
	// addr is the edge address the tunnel dialed, if one was selected before failure.
	addr *net.TCPAddr
	// err may be nil (e.g. on context cancellation); Run only retries when it is non-nil.
	err error
}
|
|
|
|
|
2020-06-25 18:25:39 +00:00
|
|
|
func NewSupervisor(config *TunnelConfig, cloudflaredUUID uuid.UUID) (*Supervisor, error) {
|
2019-12-24 05:11:00 +00:00
|
|
|
var (
|
2020-02-06 00:55:26 +00:00
|
|
|
edgeIPs *edgediscovery.Edge
|
2019-12-24 05:11:00 +00:00
|
|
|
err error
|
|
|
|
)
|
|
|
|
if len(config.EdgeAddrs) > 0 {
|
2020-10-14 13:42:00 +00:00
|
|
|
edgeIPs, err = edgediscovery.StaticEdge(config.Logger, config.EdgeAddrs)
|
2019-12-24 05:11:00 +00:00
|
|
|
} else {
|
2020-10-14 13:42:00 +00:00
|
|
|
edgeIPs, err = edgediscovery.ResolveEdge(config.Logger)
|
2019-12-24 05:11:00 +00:00
|
|
|
}
|
2019-12-13 23:05:21 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2020-06-25 18:25:39 +00:00
|
|
|
|
2020-10-08 10:12:26 +00:00
|
|
|
useReconnectToken := false
|
|
|
|
if config.ClassicTunnel != nil {
|
|
|
|
useReconnectToken = config.ClassicTunnel.UseReconnectToken
|
|
|
|
}
|
|
|
|
|
2018-05-01 23:45:06 +00:00
|
|
|
return &Supervisor{
|
2020-08-18 10:14:14 +00:00
|
|
|
cloudflaredUUID: cloudflaredUUID,
|
|
|
|
config: config,
|
|
|
|
edgeIPs: edgeIPs,
|
|
|
|
tunnelErrors: make(chan tunnelError),
|
|
|
|
tunnelsConnecting: map[int]chan struct{}{},
|
2020-10-14 13:42:00 +00:00
|
|
|
logger: config.Logger,
|
2020-10-08 10:12:26 +00:00
|
|
|
reconnectCredentialManager: newReconnectCredentialManager(connection.MetricsNamespace, connection.TunnelSubsystem, config.HAConnections),
|
|
|
|
useReconnectToken: useReconnectToken,
|
2019-12-13 23:05:21 +00:00
|
|
|
}, nil
|
2018-05-01 23:45:06 +00:00
|
|
|
}
|
|
|
|
|
2020-04-30 05:02:08 +00:00
|
|
|
// Run starts and supervises the HA tunnel connections until ctx is cancelled.
// Failed tunnels are restarted after a shared exponential backoff; when
// reconnect tokens are in use, the auth token is refreshed on its own timer.
func (s *Supervisor) Run(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) error {
	if err := s.initialize(ctx, connectedSignal, reconnectCh); err != nil {
		return err
	}
	var tunnelsWaiting []int
	tunnelsActive := s.config.HAConnections

	backoff := BackoffHandler{MaxRetries: s.config.Retries, BaseTime: tunnelRetryDuration, RetryForever: true}
	var backoffTimer <-chan time.Time

	refreshAuthBackoff := &BackoffHandler{MaxRetries: refreshAuthMaxBackoff, BaseTime: refreshAuthRetryDuration, RetryForever: true}
	var refreshAuthBackoffTimer <-chan time.Time

	if s.useReconnectToken {
		if timer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate); err == nil {
			refreshAuthBackoffTimer = timer
		} else {
			// Initial refresh failed; retry on a fixed delay rather than the backoff timer.
			s.logger.Errorf("supervisor: initial refreshAuth failed, retrying in %v: %s", refreshAuthRetryDuration, err)
			refreshAuthBackoffTimer = time.After(refreshAuthRetryDuration)
		}
	}

	for {
		select {
		// Context cancelled
		case <-ctx.Done():
			// Drain every outstanding tunnel goroutine so none blocks forever
			// sending on s.tunnelErrors after we return.
			for tunnelsActive > 0 {
				<-s.tunnelErrors
				tunnelsActive--
			}
			return nil
		// startTunnel returned with error
		// (note that this may also be caused by context cancellation)
		case tunnelError := <-s.tunnelErrors:
			tunnelsActive--
			if tunnelError.err != nil {
				s.logger.Infof("supervisor: Tunnel disconnected due to error: %s", tunnelError.err)
				// Queue this index for restart once the backoff timer fires.
				tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
				s.waitForNextTunnel(tunnelError.index)

				if backoffTimer == nil {
					backoffTimer = backoff.BackoffTimer()
				}

				// Previously we'd mark the edge address as bad here, but now we'll just silently use
				// another.
			}
		// Backoff was set and its timer expired
		case <-backoffTimer:
			backoffTimer = nil
			// Restart every tunnel that failed since the timer was armed.
			for _, index := range tunnelsWaiting {
				go s.startTunnel(ctx, index, s.newConnectedTunnelSignal(index), reconnectCh)
			}
			tunnelsActive += len(tunnelsWaiting)
			tunnelsWaiting = nil
		// Time to call Authenticate
		case <-refreshAuthBackoffTimer:
			newTimer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate)
			if err != nil {
				s.logger.Errorf("supervisor: Authentication failed: %s", err)
				// Permanent failure. Leave the `select` without setting the
				// channel to be non-null, so we'll never hit this case of the `select` again.
				continue
			}
			refreshAuthBackoffTimer = newTimer
		// Tunnel successfully connected
		case <-s.nextConnectedSignal:
			if !s.waitForNextTunnel(s.nextConnectedIndex) && len(tunnelsWaiting) == 0 {
				// No more tunnels outstanding, clear backoff timer
				backoff.SetGracePeriod()
			}
		}
	}
}
|
|
|
|
|
2019-12-06 21:32:15 +00:00
|
|
|
// Returns nil if initialization succeeded, else the initialization error.
|
2020-04-30 05:02:08 +00:00
|
|
|
func (s *Supervisor) initialize(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) error {
|
2019-12-13 23:05:21 +00:00
|
|
|
availableAddrs := int(s.edgeIPs.AvailableAddrs())
|
|
|
|
if s.config.HAConnections > availableAddrs {
|
2020-10-14 13:42:00 +00:00
|
|
|
s.logger.Infof("You requested %d HA connections but I can give you at most %d.", s.config.HAConnections, availableAddrs)
|
2019-12-13 23:05:21 +00:00
|
|
|
s.config.HAConnections = availableAddrs
|
|
|
|
}
|
|
|
|
|
2020-03-19 15:38:28 +00:00
|
|
|
go s.startFirstTunnel(ctx, connectedSignal, reconnectCh)
|
2018-05-01 23:45:06 +00:00
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
<-s.tunnelErrors
|
2019-12-04 17:22:08 +00:00
|
|
|
return ctx.Err()
|
2018-05-01 23:45:06 +00:00
|
|
|
case tunnelError := <-s.tunnelErrors:
|
|
|
|
return tunnelError.err
|
2019-03-04 19:48:56 +00:00
|
|
|
case <-connectedSignal.Wait():
|
2018-05-01 23:45:06 +00:00
|
|
|
}
|
|
|
|
// At least one successful connection, so start the rest
|
|
|
|
for i := 1; i < s.config.HAConnections; i++ {
|
2019-03-04 19:48:56 +00:00
|
|
|
ch := signal.New(make(chan struct{}))
|
2020-03-19 15:38:28 +00:00
|
|
|
go s.startTunnel(ctx, i, ch, reconnectCh)
|
2018-05-01 23:45:06 +00:00
|
|
|
time.Sleep(registrationInterval)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// startFirstTunnel starts the first tunnel connection. The resulting error will be sent on
// s.tunnelErrors. It will send a signal via connectedSignal if registration succeeds.
func (s *Supervisor) startFirstTunnel(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) {
	var (
		addr *net.TCPAddr
		err  error
	)
	const firstConnIndex = 0
	// Always report the terminal outcome to Run, even on early return.
	defer func() {
		s.tunnelErrors <- tunnelError{index: firstConnIndex, addr: addr, err: err}
	}()

	addr, err = s.edgeIPs.GetAddr(firstConnIndex)
	if err != nil {
		return
	}

	err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, firstConnIndex, connectedSignal, s.cloudflaredUUID, reconnectCh)
	// If the first tunnel disconnects, keep restarting it.
	edgeErrors := 0
	for s.unusedIPs() {
		if ctx.Err() != nil {
			return
		}
		switch err.(type) {
		case nil:
			return
		// try the next address if it was a dialError(network problem) or
		// dupConnRegisterTunnelError
		case edgediscovery.DialError, connection.DupConnRegisterTunnelError:
			edgeErrors++
		default:
			return
		}
		// Only rotate to a different edge address after repeated edge-side failures;
		// a single failure retries the same address.
		if edgeErrors >= 2 {
			addr, err = s.edgeIPs.GetDifferentAddr(firstConnIndex)
			if err != nil {
				return
			}
		}
		err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, firstConnIndex, connectedSignal, s.cloudflaredUUID, reconnectCh)
	}
}
|
|
|
|
|
|
|
|
// startTunnel starts a new tunnel connection. The resulting error will be sent on
|
|
|
|
// s.tunnelErrors.
|
2020-04-30 05:02:08 +00:00
|
|
|
func (s *Supervisor) startTunnel(ctx context.Context, index int, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) {
|
2019-12-13 23:05:21 +00:00
|
|
|
var (
|
|
|
|
addr *net.TCPAddr
|
|
|
|
err error
|
|
|
|
)
|
|
|
|
defer func() {
|
|
|
|
s.tunnelErrors <- tunnelError{index: index, addr: addr, err: err}
|
|
|
|
}()
|
|
|
|
|
2020-04-27 19:25:37 +00:00
|
|
|
addr, err = s.edgeIPs.GetDifferentAddr(index)
|
2019-12-13 23:05:21 +00:00
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
2020-10-08 10:12:26 +00:00
|
|
|
err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, uint8(index), connectedSignal, s.cloudflaredUUID, reconnectCh)
|
2018-05-01 23:45:06 +00:00
|
|
|
}
|
|
|
|
|
2019-03-04 19:48:56 +00:00
|
|
|
func (s *Supervisor) newConnectedTunnelSignal(index int) *signal.Signal {
|
|
|
|
sig := make(chan struct{})
|
|
|
|
s.tunnelsConnecting[index] = sig
|
|
|
|
s.nextConnectedSignal = sig
|
2018-05-01 23:45:06 +00:00
|
|
|
s.nextConnectedIndex = index
|
2019-03-04 19:48:56 +00:00
|
|
|
return signal.New(sig)
|
2018-05-01 23:45:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Supervisor) waitForNextTunnel(index int) bool {
|
|
|
|
delete(s.tunnelsConnecting, index)
|
|
|
|
s.nextConnectedSignal = nil
|
|
|
|
for k, v := range s.tunnelsConnecting {
|
|
|
|
s.nextConnectedIndex = k
|
|
|
|
s.nextConnectedSignal = v
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2019-12-13 23:05:21 +00:00
|
|
|
// unusedIPs reports whether the edge has more known addresses than we have HA
// connections, i.e. whether another address is available to fail over to.
func (s *Supervisor) unusedIPs() bool {
	return s.edgeIPs.AvailableAddrs() > s.config.HAConnections
}
|
|
|
|
|
2019-12-04 17:22:08 +00:00
|
|
|
// authenticate dials an arbitrary edge address, performs an h2mux handshake,
// and makes an Authenticate RPC for the classic tunnel, returning the outcome.
// numPreviousAttempts is forwarded to the edge via the registration options.
func (s *Supervisor) authenticate(ctx context.Context, numPreviousAttempts int) (tunnelpogs.AuthOutcome, error) {
	arbitraryEdgeIP, err := s.edgeIPs.GetAddrForRPC()
	if err != nil {
		return nil, err
	}

	edgeConn, err := edgediscovery.DialEdge(ctx, dialTimeout, s.config.EdgeTLSConfigs[connection.H2mux], arbitraryEdgeIP)
	if err != nil {
		return nil, err
	}
	defer edgeConn.Close()

	handler := h2mux.MuxedStreamFunc(func(*h2mux.MuxedStream) error {
		// This callback is invoked by h2mux when the edge initiates a stream.
		return nil // noop
	})
	muxerConfig := s.config.MuxerConfig.H2MuxerConfig(handler, s.logger)
	muxer, err := h2mux.Handshake(edgeConn, edgeConn, *muxerConfig, h2mux.ActiveStreams)
	if err != nil {
		return nil, err
	}
	go muxer.Serve(ctx)
	defer func() {
		// If we don't wait for the muxer shutdown here, edgeConn.Close() runs before the muxer connections are done,
		// and the user sees log noise: "error writing data", "connection closed unexpectedly"
		<-muxer.Shutdown()
	}()

	stream, err := muxer.OpenRPCStream(ctx)
	if err != nil {
		return nil, err
	}
	rpcClient := connection.NewTunnelServerClient(ctx, stream, s.logger)
	defer rpcClient.Close()

	// The connection ID is irrelevant for a one-off Authenticate RPC.
	const arbitraryConnectionID = uint8(0)
	registrationOptions := s.config.RegistrationOptions(arbitraryConnectionID, edgeConn.LocalAddr().String(), s.cloudflaredUUID)
	registrationOptions.NumPreviousAttempts = uint8(numPreviousAttempts)
	return rpcClient.Authenticate(ctx, s.config.ClassicTunnel, registrationOptions)
}
|