Release 2017.12.1
commit e0ae598112 (parent d40eb85da6)
@@ -7,6 +7,7 @@ import (
    "io/ioutil"
    "math/rand"
    "net"
    "net/http"
    "os"
    "os/signal"
    "path/filepath"
@@ -203,6 +204,35 @@ WARNING:
            Value:  4,
            Hidden: true,
        }),
        altsrc.NewDurationFlag(&cli.DurationFlag{
            Name:  "proxy-connect-timeout",
            Usage: "HTTP proxy timeout for establishing a new connection",
            Value: time.Second * 30,
        }),
        altsrc.NewDurationFlag(&cli.DurationFlag{
            Name:  "proxy-tls-timeout",
            Usage: "HTTP proxy timeout for completing a TLS handshake",
            Value: time.Second * 10,
        }),
        altsrc.NewDurationFlag(&cli.DurationFlag{
            Name:  "proxy-tcp-keepalive",
            Usage: "HTTP proxy TCP keepalive duration",
            Value: time.Second * 30,
        }),
        altsrc.NewBoolFlag(&cli.BoolFlag{
            Name:  "proxy-no-happy-eyeballs",
            Usage: "HTTP proxy should disable \"happy eyeballs\" for IPv4/v6 fallback",
        }),
        altsrc.NewIntFlag(&cli.IntFlag{
            Name:  "proxy-keepalive-connections",
            Usage: "HTTP proxy maximum keepalive connection pool size",
            Value: 100,
        }),
        altsrc.NewDurationFlag(&cli.DurationFlag{
            Name:  "proxy-keepalive-timeout",
            Usage: "HTTP proxy timeout for closing an idle connection",
            Value: time.Second * 90,
        }),
    }
    app.Action = func(c *cli.Context) error {
        raven.CapturePanic(func() { startServer(c) }, nil)
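Note: the altsrc wrappers exist so each of these flags can be populated from a config file as well as the command line. A minimal sketch of that wiring, assuming urfave/cli with its altsrc subpackage; the "config" flag name below is illustrative and not from this commit:

    flags := []cli.Flag{
        altsrc.NewDurationFlag(&cli.DurationFlag{
            Name:  "proxy-connect-timeout",
            Value: time.Second * 30,
        }),
    }
    app := cli.NewApp()
    app.Flags = append(flags, &cli.StringFlag{Name: "config"})
    // Before runs ahead of Action and fills the wrapped flags from the YAML file.
    app.Before = altsrc.InitInputSourceWithContext(flags, altsrc.NewYamlSourceFromFlagFunc("config"))
    app.Action = func(c *cli.Context) error {
        _ = c.Duration("proxy-connect-timeout") // same accessor regardless of source
        return nil
    }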
@@ -348,6 +378,18 @@ If you don't have a certificate signed by Cloudflare, run the command:
        log.WithError(err).Fatalf("Cannot read %s to load origin certificate", originCertPath)
    }
    tunnelMetrics := origin.NewTunnelMetrics()
    httpTransport := &http.Transport{
        Proxy: http.ProxyFromEnvironment,
        DialContext: (&net.Dialer{
            Timeout:   c.Duration("proxy-connect-timeout"),
            KeepAlive: c.Duration("proxy-tcp-keepalive"),
            DualStack: !c.Bool("proxy-no-happy-eyeballs"),
        }).DialContext,
        MaxIdleConns:          c.Int("proxy-keepalive-connections"),
        IdleConnTimeout:       c.Duration("proxy-keepalive-timeout"),
        TLSHandshakeTimeout:   c.Duration("proxy-tls-timeout"),
        ExpectContinueTimeout: 1 * time.Second,
    }
    tunnelConfig := &origin.TunnelConfig{
        EdgeAddrs: c.StringSlice("edge"),
        OriginUrl: url,
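Note: DualStack on net.Dialer enables RFC 6555 "Happy Eyeballs" fallback between IPv4 and IPv6, which is why the negative flag proxy-no-happy-eyeballs is inverted here. The finished http.Transport is safe for concurrent use and pools idle connections up to MaxIdleConns. A small illustrative use of such a transport (placeholder URL, error handling elided):

    req, _ := http.NewRequest("GET", "http://origin.internal.example", nil) // placeholder URL
    resp, err := httpTransport.RoundTrip(req) // one transaction over the pooled, keep-alive transport
    if err == nil {
        defer resp.Body.Close()
    }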
@@ -362,6 +404,7 @@ If you don't have a certificate signed by Cloudflare, run the command:
        LBPool:            c.String("lb-pool"),
        Tags:              tags,
        HAConnections:     c.Int("ha-connections"),
        HTTPTransport:     httpTransport,
        Metrics:           tunnelMetrics,
        MetricsUpdateFreq: c.Duration("metrics-update-freq"),
        ProtocolLogger:    protoLogger,
@@ -5,27 +5,51 @@ import (
    "net/http"
    _ "net/http/pprof"
    "runtime"
    "sync"
    "time"

    "golang.org/x/net/context"
    "golang.org/x/net/trace"

    log "github.com/Sirupsen/logrus"
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func ServeMetrics(l net.Listener, shutdownC <-chan struct{}) error {
const (
    shutdownTimeout = time.Second * 15
    startupTime     = time.Millisecond * 500
)

func ServeMetrics(l net.Listener, shutdownC <-chan struct{}) (err error) {
    var wg sync.WaitGroup
    // Metrics port is privileged, so no need for further access control
    trace.AuthRequest = func(*http.Request) (bool, bool) { return true, true }
    // TODO: parameterize ReadTimeout and WriteTimeout. The maximum time we can
    // profile CPU usage depends on WriteTimeout
    server := &http.Server{
        ReadTimeout:  5 * time.Second,
        WriteTimeout: 5 * time.Second,
        ReadTimeout:  10 * time.Second,
        WriteTimeout: 10 * time.Second,
    }
    go func() {
        <-shutdownC
        server.Shutdown(context.Background())
    }()

    http.Handle("/metrics", promhttp.Handler())

    wg.Add(1)
    go func() {
        defer wg.Done()
        err = server.Serve(l)
    }()
    log.WithField("addr", l.Addr()).Info("Starting metrics server")
    err := server.Serve(l)
    // server.Serve will hang if server.Shutdown is called before the server is
    // fully started up. So add artificial delay.
    time.Sleep(startupTime)

    <-shutdownC
    ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
    server.Shutdown(ctx)
    cancel()

    wg.Wait()
    if err == http.ErrServerClosed {
        log.Info("Metrics server stopped")
        return nil
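Note: the rewritten ServeMetrics is the standard net/http graceful-shutdown pattern: Serve runs in a goroutine, Shutdown is driven by the shutdown channel, and http.ErrServerClosed (what Serve returns after Shutdown) is treated as a clean exit. A condensed, self-contained sketch of the same pattern using the standard library's context package:

    func serveUntil(l net.Listener, stop <-chan struct{}) error {
        server := &http.Server{}
        var wg sync.WaitGroup
        var err error
        wg.Add(1)
        go func() {
            defer wg.Done()
            err = server.Serve(l) // returns http.ErrServerClosed after Shutdown
        }()
        <-stop
        ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
        defer cancel()
        server.Shutdown(ctx) // stop accepting, drain in-flight requests
        wg.Wait()            // err is safe to read only after the goroutine exits
        if err == http.ErrServerClosed {
            return nil
        }
        return err
    }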
@@ -10,6 +10,7 @@ import (
)

type TunnelMetrics struct {
    haConnections     prometheus.Gauge
    totalRequests     prometheus.Counter
    requestsPerTunnel *prometheus.CounterVec
    // concurrentRequestsLock is a mutex for concurrentRequests and maxConcurrentRequests
@@ -41,6 +42,13 @@ type TunnelMetrics struct {

// Metrics that can be collected without asking the edge
func NewTunnelMetrics() *TunnelMetrics {
    haConnections := prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "ha_connections",
            Help: "Number of active ha connections",
        })
    prometheus.MustRegister(haConnections)

    totalRequests := prometheus.NewCounter(
        prometheus.CounterOpts{
            Name: "total_requests",
@@ -173,6 +181,7 @@ func NewTunnelMetrics() *TunnelMetrics {
    prometheus.MustRegister(serverLocations)

    return &TunnelMetrics{
        haConnections:               haConnections,
        totalRequests:               totalRequests,
        requestsPerTunnel:           requestsPerTunnel,
        concurrentRequestsPerTunnel: concurrentRequestsPerTunnel,
@@ -196,6 +205,14 @@ func NewTunnelMetrics() *TunnelMetrics {
    }
}

func (t *TunnelMetrics) incrementHaConnections() {
    t.haConnections.Inc()
}

func (t *TunnelMetrics) decrementHaConnections() {
    t.haConnections.Dec()
}

func (t *TunnelMetrics) updateTunnelFlowControlMetrics(metrics *h2mux.FlowControlMetrics) {
    t.receiveWindowSizeAve.Set(float64(metrics.AverageReceiveWindowSize))
    t.sendWindowSizeAve.Set(float64(metrics.AverageSendWindowSize))
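Note: requestsPerTunnel above is a prometheus.CounterVec, i.e. one counter per label set rather than a single counter. A hedged sketch of how such a metric is typically registered and incremented; the metric and label names here are assumptions, not read from this diff:

    requestsPerTunnel := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "requests_per_tunnel",
            Help: "Requests proxied through each tunnel",
        },
        []string{"connection_id"},
    )
    prometheus.MustRegister(requestsPerTunnel)
    requestsPerTunnel.WithLabelValues("0").Inc() // one counter per connection ID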
@@ -11,7 +11,7 @@ import (

const (
    // Waiting time before retrying a failed tunnel connection
    tunnelRetryDuration = time.Minute
    tunnelRetryDuration = time.Second * 10
    // Limit on the exponential backoff time period. (2^5 = 32 minutes)
    tunnelRetryLimit = 5
    // SRV record resolution TTL
@@ -21,10 +21,14 @@ const (
type Supervisor struct {
    config  *TunnelConfig
    edgeIPs []*net.TCPAddr
    // nextUnusedEdgeIP is the index of the next addr in edgeIPs to try
    nextUnusedEdgeIP  int
    lastResolve       time.Time
    resolverC         chan resolveResult
    tunnelErrors      chan tunnelError
    tunnelsConnecting map[int]chan struct{}
    // nextConnectedIndex and nextConnectedSignal are used to wait for all
    // currently-connecting tunnels to finish connecting so we can reset backoff timer
    nextConnectedIndex  int
    nextConnectedSignal chan struct{}
}
@@ -53,7 +57,7 @@ func (s *Supervisor) Run(ctx context.Context, connectedSignal chan struct{}) err
    }
    tunnelsActive := s.config.HAConnections
    tunnelsWaiting := []int{}
    backoff := BackoffHandler{MaxRetries: tunnelRetryLimit, BaseTime: tunnelRetryDuration, RetryForever: true}
    backoff := BackoffHandler{MaxRetries: s.config.Retries, BaseTime: tunnelRetryDuration, RetryForever: true}
    var backoffTimer <-chan time.Time
    for tunnelsActive > 0 {
        select {
@@ -72,11 +76,18 @@ func (s *Supervisor) Run(ctx context.Context, connectedSignal chan struct{}) err
            log.WithError(tunnelError.err).Warn("Tunnel disconnected due to error")
            tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
            s.waitForNextTunnel(tunnelError.index)
            if backoffTimer != nil {
            if backoffTimer == nil {
                backoffTimer = backoff.BackoffTimer()
            }
            // If the error is a dial error, the problem is likely to be network-related;
            // try another addr before refreshing, since we are likely to get back the
            // same IPs in the same order. Same problem with duplicate connection error.
            if s.unusedIPs() {
                s.replaceEdgeIP(tunnelError.index)
            } else {
                s.refreshEdgeIPs()
            }
        }
        // Backoff was set and its timer expired
        case <-backoffTimer:
            backoffTimer = nil
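Note: the != nil to == nil flip above is the whole bug fix; the timer must be armed only when it is not already running. The surrounding pattern relies on a Go idiom worth spelling out: receiving from a nil channel blocks forever, so a select case on a nil timer channel is effectively disabled. A self-contained sketch, with errC and base standing in for the supervisor's tunnelErrors channel and BackoffHandler:

    func superviseSketch(errC <-chan error, base time.Duration) {
        var backoffTimer <-chan time.Time // nil: the case below can never fire
        for {
            select {
            case <-errC:
                if backoffTimer == nil { // arm at most one timer per backoff period
                    backoffTimer = time.After(base)
                }
            case <-backoffTimer:
                backoffTimer = nil // period elapsed; allow retries again
            }
        }
    }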
@@ -109,15 +120,23 @@ func (s *Supervisor) Run(ctx context.Context, connectedSignal chan struct{}) err
func (s *Supervisor) initialize(ctx context.Context, connectedSignal chan struct{}) error {
    edgeIPs, err := ResolveEdgeIPs(s.config.EdgeAddrs)
    if err != nil {
        log.Infof("ResolveEdgeIPs err")
        return err
    }
    s.edgeIPs = edgeIPs
    if s.config.HAConnections > len(edgeIPs) {
        log.Warnf("You requested %d HA connections but I can give you at most %d.", s.config.HAConnections, len(edgeIPs))
        s.config.HAConnections = len(edgeIPs)
    }
    s.lastResolve = time.Now()
    go s.startTunnel(ctx, 0, connectedSignal)
    // check for entitlement and version-too-old errors before attempting to register more tunnels
    s.nextUnusedEdgeIP = s.config.HAConnections
    go s.startFirstTunnel(ctx, connectedSignal)
    select {
    case <-ctx.Done():
        <-s.tunnelErrors
        return nil
        // Error can't be nil. A nil error signals that initialization succeeded
        return fmt.Errorf("Context was canceled")
    case tunnelError := <-s.tunnelErrors:
        return tunnelError.err
    case <-connectedSignal:
@@ -125,10 +144,41 @@ func (s *Supervisor) initialize(ctx context.Context, connectedSignal chan struct
    // At least one successful connection, so start the rest
    for i := 1; i < s.config.HAConnections; i++ {
        go s.startTunnel(ctx, i, make(chan struct{}))
        // TODO: Add artificial delay between HA connections to make sure all origins
        // are registered in LB pool. Temporary fix until we fix LB
        time.Sleep(time.Millisecond * 500)
    }
    return nil
}

// startFirstTunnel starts the first tunnel connection. The resulting error will be sent on
// s.tunnelErrors. It will send a signal via connectedSignal if registration succeeds
func (s *Supervisor) startFirstTunnel(ctx context.Context, connectedSignal chan struct{}) {
    err := ServeTunnelLoop(ctx, s.config, s.getEdgeIP(0), 0, connectedSignal)
    defer func() {
        s.tunnelErrors <- tunnelError{index: 0, err: err}
    }()

    for s.unusedIPs() {
        select {
        case <-ctx.Done():
            return
        default:
        }
        switch err.(type) {
        case nil:
            return
        // try the next address if it was a dialError (network problem) or
        // dupConnRegisterTunnelError
        case dialError, dupConnRegisterTunnelError:
            s.replaceEdgeIP(0)
        default:
            return
        }
        err = ServeTunnelLoop(ctx, s.config, s.getEdgeIP(0), 0, connectedSignal)
    }
}

// startTunnel starts a new tunnel connection. The resulting error will be sent on
// s.tunnelErrors.
func (s *Supervisor) startTunnel(ctx context.Context, index int, connectedSignal chan struct{}) {
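Note: startFirstTunnel dispatches on concrete error types to decide whether trying the next edge address can help. A self-contained sketch of that dispatch; dialError here is a stand-in for the package's unexported type:

    type dialError struct{ cause error }

    func (e dialError) Error() string { return e.cause.Error() }

    func shouldTryNextAddr(err error) bool {
        switch err.(type) {
        case nil:
            return false // success; nothing to retry
        case dialError:
            return true // network-level failure; another IP may work
        default:
            return false // e.g. auth failures won't improve with a new addr
        }
    }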
@@ -172,3 +222,12 @@ func (s *Supervisor) refreshEdgeIPs() {
        s.resolverC <- resolveResult{edgeIPs: edgeIPs, err: err}
    }()
}

func (s *Supervisor) unusedIPs() bool {
    return s.nextUnusedEdgeIP < len(s.edgeIPs)
}

func (s *Supervisor) replaceEdgeIP(badIPIndex int) {
    s.edgeIPs[badIPIndex] = s.edgeIPs[s.nextUnusedEdgeIP]
    s.nextUnusedEdgeIP++
}
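Note: a worked example of the swap scheme above, assuming six resolved addresses and HAConnections == 4 (initialize sets nextUnusedEdgeIP to HAConnections):

    // edgeIPs:           [A B C D E F]   nextUnusedEdgeIP: 4
    // replaceEdgeIP(1):  [A E C D E F]   nextUnusedEdgeIP: 5
    // replaceEdgeIP(3):  [A E C F E F]   nextUnusedEdgeIP: 6
    // unusedIPs() is now false, so the supervisor falls back to refreshEdgeIPs().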
@@ -10,6 +10,7 @@ import (
    "runtime"
    "strconv"
    "strings"
    "sync"
    "time"

    "golang.org/x/net/context"
@@ -47,6 +48,7 @@ type TunnelConfig struct {
    LBPool            string
    Tags              []tunnelpogs.Tag
    HAConnections     int
    HTTPTransport     http.RoundTripper
    Metrics           *TunnelMetrics
    MetricsUpdateFreq time.Duration
    ProtocolLogger    *log.Logger
@@ -98,6 +100,7 @@ func StartTunnelDaemon(config *TunnelConfig, shutdownC <-chan struct{}, connecte
        <-shutdownC
        cancel()
    }()
    // If a user specified negative HAConnections, we will treat it as requesting 1 connection
    if config.HAConnections > 1 {
        return NewSupervisor(config).Run(ctx, connectedSignal)
    } else {
@@ -110,6 +113,8 @@ func StartTunnelDaemon(config *TunnelConfig, shutdownC <-chan struct{}, connecte
}

func ServeTunnelLoop(ctx context.Context, config *TunnelConfig, addr *net.TCPAddr, connectionID uint8, connectedSignal chan struct{}) error {
    config.Metrics.incrementHaConnections()
    defer config.Metrics.decrementHaConnections()
    backoff := BackoffHandler{MaxRetries: config.Retries}
    // Used to close connectedSignal no more than once
    connectedFuse := h2mux.NewBooleanFuse()
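Note: the "fuse" comment matters because closing a channel twice panics, and several connection attempts may all want to signal readiness. h2mux.BooleanFuse's internals aren't shown in this diff; a minimal equivalent one-shot guard can be built on sync.Once:

    type onceSignal struct {
        once sync.Once
        c    chan struct{}
    }

    func newOnceSignal() *onceSignal {
        return &onceSignal{c: make(chan struct{})}
    }

    func (o *onceSignal) fire() {
        o.once.Do(func() { close(o.c) }) // safe to call any number of times
    }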
@@ -141,6 +146,8 @@ func ServeTunnel(
    connectedFuse *h2mux.BooleanFuse,
    backoff *BackoffHandler,
) (err error, recoverable bool) {
    var wg sync.WaitGroup
    wg.Add(2)
    // Treat panics as recoverable errors
    defer func() {
        if r := recover(); r != nil {
@@ -159,6 +166,7 @@ func ServeTunnel(
    switch err.(type) {
    case dialError:
        errLog.Error("Unable to dial edge")
        return err, false
    case h2mux.MuxerHandshakeError:
        errLog.Error("Handshake failed with edge server")
    default:
@@ -178,14 +186,16 @@ func ServeTunnel(
        serveCancel()
    }
    registerErrC <- err
    wg.Done()
}()
updateMetricsTickC := time.Tick(config.MetricsUpdateFreq)
go func() {
    defer wg.Done()
    for {
        select {
        case <-serveCtx.Done():
            handler.muxer.Shutdown()
            break
            return
        case <-updateMetricsTickC:
            handler.UpdateMetrics()
        }
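Note: the break-to-return change above fixes a classic Go pitfall: inside a select, break terminates only the select statement, so the surrounding for loop kept spinning after shutdown. In miniature:

    func leaky(done <-chan struct{}) {
        for {
            select {
            case <-done:
                break // leaves only the select; the for loop runs again
            }
        }
    }

    func fixed(done <-chan struct{}) {
        for {
            select {
            case <-done:
                return // actually exits the goroutine
            }
        }
    }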
@@ -195,6 +205,7 @@ func ServeTunnel(
    err = handler.muxer.Serve()
    serveCancel()
    registerErr := <-registerErrC
    wg.Wait()
    if err != nil {
        log.WithError(err).Error("Tunnel error")
        return err, true
@@ -204,7 +215,7 @@ func ServeTunnel(
    if e, ok := registerErr.(printableRegisterTunnelError); ok {
        log.Error(e)
        if e.permanent {
            return nil, false
            return e, false
        }
        return e.cause, true
    } else if e, ok := registerErr.(dupConnRegisterTunnelError); ok {
@@ -282,10 +293,6 @@ func RegisterTunnel(ctx context.Context, muxer *h2mux.Muxer, config *TunnelConfi
    }

    log.Infof("Registered at %s", registration.Url)

    for _, logLine := range registration.LogLines {
        log.Infof(logLine)
    }
    return nil
}
@@ -348,7 +355,7 @@ func H1ResponseToH2Response(h1 *http.Response) (h2 []h2mux.Header) {
type TunnelHandler struct {
    originUrl  string
    muxer      *h2mux.Muxer
    httpClient *http.Client
    httpClient http.RoundTripper
    tags       []tunnelpogs.Tag
    metrics    *TunnelMetrics
    // connectionID is only used by metrics, and prometheus requires labels to be string
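Note: widening httpClient from *http.Client to an interface is what makes the DefaultTransport fallback in the next hunk possible. For reference, the interface from the standard library's net/http is just:

    type RoundTripper interface {
        // RoundTrip executes a single HTTP transaction.
        RoundTrip(*http.Request) (*http.Response, error)
    }

*http.Transport implements it, while *http.Client (which layers redirects, cookies, and timeouts on top) does not, so the field's consumers must switch from Do to RoundTrip, as the final hunk does.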
@@ -365,24 +372,27 @@ func NewTunnelHandler(ctx context.Context, config *TunnelConfig, addr string, co
    }
    h := &TunnelHandler{
        originUrl:    url,
        httpClient:   &http.Client{Timeout: time.Minute},
        httpClient:   config.HTTPTransport,
        tags:         config.Tags,
        metrics:      config.Metrics,
        connectionID: uint8ToString(connectionID),
    }
    if h.httpClient == nil {
        h.httpClient = http.DefaultTransport
    }
    // Inherit from parent context so we can cancel (Ctrl-C) while dialing
    dialCtx, dialCancel := context.WithTimeout(ctx, dialTimeout)
    // TUN-92: enforce a timeout on dial and handshake (as tls.Dial does not support one)
    plaintextEdgeConn, err := dialer.DialContext(dialCtx, "tcp", addr)
    dialCancel()
    if err != nil {
        return nil, "", dialError{cause: err}
        return nil, "", dialError{cause: errors.Wrap(err, "DialContext error")}
    }
    edgeConn := tls.Client(plaintextEdgeConn, config.TlsConfig)
    edgeConn.SetDeadline(time.Now().Add(dialTimeout))
    err = edgeConn.Handshake()
    if err != nil {
        return nil, "", dialError{cause: err}
        return nil, "", dialError{cause: errors.Wrap(err, "Handshake with edge error")}
    }
    // clear the deadline on the conn; h2mux has its own timeouts
    edgeConn.SetDeadline(time.Time{})
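Note: the errors.Wrap calls (presumably github.com/pkg/errors, the common choice in 2017) annotate a failure with context while keeping the original error recoverable:

    wrapped := errors.Wrap(err, "DialContext error")
    // wrapped.Error() == "DialContext error: " + err.Error()
    _ = errors.Cause(wrapped) // the original err is still reachable
    // Wrap returns nil when err is nil, so it is safe on the success path.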
@@ -419,7 +429,7 @@ func (h *TunnelHandler) ServeStream(stream *h2mux.MuxedStream) error {
        log.WithError(err).Error("invalid request received")
    }
    h.AppendTagHeaders(req)
    response, err := h.httpClient.Do(req)
    response, err := h.httpClient.RoundTrip(req)
    if err != nil {
        log.WithError(err).Error("HTTP request error")
        stream.WriteHeaders([]h2mux.Header{{Name: ":status", Value: "502"}})
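Note: switching from Do to RoundTrip matches the new http.RoundTripper field, and the semantic difference suits a proxy: RoundTrip performs exactly one HTTP transaction, with no redirect following or cookie handling, so the origin's responses (including 3xx) pass through to the edge untouched. In outline:

    // http.Client.Do would follow this redirect; RoundTrip returns it verbatim.
    resp, err := h.httpClient.RoundTrip(req)
    if err == nil && resp.StatusCode == http.StatusMovedPermanently {
        // relay the 301 to the edge unchanged
    }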