TUN-8724: Add CLI command for diagnostic procedure

## Summary Adds a new `diag` CLI subcommand under the `tunnel` command. Its purpose is to automatically collect several data points (logs, metrics, network information, system information, tunnel state, and runtime information) and write them to a single zip file. Closes TUN-8724
2024-12-13 10:07:56 -08:00 · 2024-12-13 10:07:56 -08:00 · 1859d742a8
parent 8ed19222b9
commit 1859d742a8
3 changed files with 186 additions and 49 deletions
--- a/cmd/cloudflared/tunnel/cmd.go
+++ b/cmd/cloudflared/tunnel/cmd.go
@ -236,6 +236,7 @@ func Commands() []*cli.Command {
 		buildDeleteCommand(),
 		buildCleanupCommand(),
 		buildTokenCommand(),
+		buildDiagCommand(),
 		// for compatibility, allow following as tunnel subcommands
 		proxydns.Command(true),
 		cliutil.RemovedCommand("db-connect"),
--- a/cmd/cloudflared/tunnel/subcommands.go
+++ b/cmd/cloudflared/tunnel/subcommands.go
@ -28,6 +28,8 @@ import (
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/updater"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/connection"
+	"github.com/cloudflare/cloudflared/diagnostic"
+	"github.com/cloudflare/cloudflared/metrics"
 )

 const (
@ -38,6 +40,14 @@ const (
 	CredContentsFlag        = "credentials-contents"
 	TunnelTokenFlag         = "token"
 	overwriteDNSFlagName    = "overwrite-dns"
+	noDiagLogsFlagName      = "no-diag-logs"
+	noDiagMetricsFlagName   = "no-diag-metrics"
+	noDiagSystemFlagName    = "no-diag-system"
+	noDiagRuntimeFlagName   = "no-diag-runtime"
+	noDiagNetworkFlagName   = "no-diag-network"
+	diagContainerIDFlagName = "diag-container-id"
+	diagPodFlagName         = "diag-pod-id"
+	metricsFlagName         = "metrics"

 	LogFieldTunnelID = "tunnelID"
 )
@ -179,6 +189,46 @@ var (
 		Usage:   "Source address and the interface name to send/receive ICMPv6 messages. If not provided cloudflared will dial a local address to determine the source IP or fallback to ::.",
 		EnvVars: []string{"TUNNEL_ICMPV6_SRC"},
 	}
+	metricsFlag = &cli.StringFlag{
+		Name:  metricsFlagName,
+		Usage: "The metrics server address i.e.: 127.0.0.1:12345. If your instance is running in a Docker/Kubernetes environment you need to setup port forwarding for your application.",
+		Value: "",
+	}
+	diagContainerFlag = &cli.StringFlag{
+		Name:  diagContainerIDFlagName,
+		Usage: "Container ID or Name to collect logs from",
+		Value: "",
+	}
+	diagPodFlag = &cli.StringFlag{
+		Name:  diagPodFlagName,
+		Usage: "Kubernetes POD to collect logs from",
+		Value: "",
+	}
+	noDiagLogsFlag = &cli.BoolFlag{
+		Name:  noDiagLogsFlagName,
+		Usage: "Log collection will not be performed",
+		Value: false,
+	}
+	noDiagMetricsFlag = &cli.BoolFlag{
+		Name:  noDiagMetricsFlagName,
+		Usage: "Metric collection will not be performed",
+		Value: false,
+	}
+	noDiagSystemFlag = &cli.BoolFlag{
+		Name:  noDiagSystemFlagName,
+		Usage: "System information collection will not be performed",
+		Value: false,
+	}
+	noDiagRuntimeFlag = &cli.BoolFlag{
+		Name:  noDiagRuntimeFlagName,
+		Usage: "Runtime information collection will not be performed",
+		Value: false,
+	}
+	noDiagNetworkFlag = &cli.BoolFlag{
+		Name:  noDiagNetworkFlagName,
+		Usage: "Network diagnostics won't be performed",
+		Value: false,
+	}
 )

 func buildCreateCommand() *cli.Command {
@ -375,7 +425,6 @@ func formatAndPrintTunnelList(tunnels []*cfapi.Tunnel, showRecentlyDisconnected
 }

 func fmtConnections(connections []cfapi.Connection, showRecentlyDisconnected bool) string {
-
 	// Count connections per colo
 	numConnsPerColo := make(map[string]uint, len(connections))
 	for _, connection := range connections {
@ -897,8 +946,10 @@ func lbRouteFromArg(c *cli.Context) (cfapi.HostnameRoute, error) {
 	return cfapi.NewLBRoute(lbName, lbPool), nil
 }

-var nameRegex = regexp.MustCompile("^[_a-zA-Z0-9][-_.a-zA-Z0-9]*$")
-var hostNameRegex = regexp.MustCompile("^[*_a-zA-Z0-9][-_.a-zA-Z0-9]*$")
+var (
+	nameRegex     = regexp.MustCompile("^[_a-zA-Z0-9][-_.a-zA-Z0-9]*$")
+	hostNameRegex = regexp.MustCompile("^[*_a-zA-Z0-9][-_.a-zA-Z0-9]*$")
+)

 func validateName(s string, allowWildcardSubdomain bool) bool {
 	if allowWildcardSubdomain {
@ -986,3 +1037,78 @@ SUBCOMMAND OPTIONS:
 `
 	return fmt.Sprintf(template, parentFlagsHelp)
 }
+
+// buildDiagCommand assembles the `diag` subcommand of `cloudflared tunnel`.
+//
+// The command dispatches to diagCommand (wrapped by cliutil.ConfiguredAction)
+// and accepts the metrics address flag, the container/pod selection flags and
+// the no-diag-* toggles that disable individual collection steps.
+func buildDiagCommand() *cli.Command {
+	diagFlags := []cli.Flag{
+		metricsFlag,
+		diagContainerFlag,
+		diagPodFlag,
+		noDiagLogsFlag,
+		noDiagMetricsFlag,
+		noDiagSystemFlag,
+		noDiagRuntimeFlag,
+		noDiagNetworkFlag,
+	}
+
+	cmd := new(cli.Command)
+	cmd.Name = "diag"
+	cmd.Action = cliutil.ConfiguredAction(diagCommand)
+	cmd.Usage = "Creates a diagnostic report from a local cloudflared instance"
+	cmd.UsageText = "cloudflared tunnel [tunnel command options] diag [subcommand options]"
+	cmd.Description = "cloudflared tunnel diag will create a diagnostic report of a local cloudflared instance. The diagnostic procedure collects: logs, metrics, system information, traceroute to Cloudflare Edge, and runtime information. Since there may be multiple instances of cloudflared running the --metrics option may be provided to target a specific instance."
+	cmd.Flags = diagFlags
+	cmd.CustomHelpTemplate = commandHelpTemplate()
+
+	return cmd
+}
+
+// diagCommand is the action behind `cloudflared tunnel diag`.
+//
+// It builds diagnostic.Options from the command-line flags, runs
+// diagnostic.RunDiagnostic and translates the outcome into log messages:
+//   - no metrics server found: a warning is logged;
+//   - several metrics servers found: every candidate instance is listed so
+//     the user can pick one with --metrics;
+//   - invalid log configuration: a hint about --diag-container-id,
+//     --diag-pod-id and --no-diag-logs is logged before the final status.
+//
+// The only error returned is a failure to create the subcommand context; once
+// the diagnostic has been attempted the function always returns nil and
+// reports problems through the logger instead.
+func diagCommand(ctx *cli.Context) error {
+	sctx, err := newSubcommandContext(ctx)
+	if err != nil {
+		return err
+	}
+	log := sctx.log
+	options := diagnostic.Options{
+		KnownAddresses: metrics.GetMetricsKnownAddresses(metrics.Runtime),
+		Address:        sctx.c.String(metricsFlagName),
+		ContainerID:    sctx.c.String(diagContainerIDFlagName),
+		PodID:          sctx.c.String(diagPodFlagName),
+		Toggles: diagnostic.Toggles{
+			NoDiagLogs:    sctx.c.Bool(noDiagLogsFlagName),
+			NoDiagMetrics: sctx.c.Bool(noDiagMetricsFlagName),
+			NoDiagSystem:  sctx.c.Bool(noDiagSystemFlagName),
+			NoDiagRuntime: sctx.c.Bool(noDiagRuntimeFlagName),
+			NoDiagNetwork: sctx.c.Bool(noDiagNetworkFlagName),
+		},
+	}
+
+	// Without an explicit address the known addresses are used, so remind
+	// the user about port forwarding for containerized instances.
+	if options.Address == "" {
+		log.Info().Msg("If your instance is running in a Docker/Kubernetes environment you need to setup port forwarding for your application.")
+	}
+
+	states, err := diagnostic.RunDiagnostic(log, options)
+
+	switch {
+	case errors.Is(err, diagnostic.ErrMetricsServerNotFound):
+		log.Warn().Msg("No instances found")
+		return nil
+	case errors.Is(err, diagnostic.ErrMultipleMetricsServerFound):
+		if states != nil {
+			log.Info().Msg("Found multiple instances running:")
+			for _, state := range states {
+				log.Info().Msgf("Instance: tunnel-id=%s connector-id=%s metrics-address=%s", state.TunnelID, state.ConnectorID, state.URL.String())
+			}
+			log.Info().Msg("To select one instance use the option --metrics")
+		}
+		return nil
+	case errors.Is(err, diagnostic.ErrLogConfigurationIsInvalid):
+		// Not terminal: fall through to the final status message below.
+		log.Info().Msg("Couldn't extract logs from the instance. If the instance is running in a containerized environment use the option --diag-container-id or --diag-pod-id. If there is no logging configuration use --no-diag-logs.")
+	}
+
+	if err != nil {
+		// Attach the underlying error so the cause (e.g. an invalid
+		// --metrics address) is visible instead of being silently dropped.
+		log.Warn().Err(err).Msg("Diagnostic completed with one or more errors")
+		return nil
+	}
+
+	log.Info().Msg("Diagnostic completed")
+
+	return nil
+}
--- a/diagnostic/diagnostic.go
+++ b/diagnostic/diagnostic.go
@ -9,6 +9,7 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
+	"strings"
 	"sync"
 	"time"

@ -162,17 +163,7 @@ func collectNetworkResultRoutine(
 	}

 	hops, raw, err := collector.Collect(ctx, network.NewTraceOptions(hopsNo, timeout, hostname, useIPv4))
-	if err != nil {
-		if raw == "" {
-			// An error happened and there is no raw output
-			results <- networkCollectionResult{name, nil, "", err}
-		} else {
-			// An error happened and there is raw output then write to file
-			results <- networkCollectionResult{name, nil, raw, nil}
-		}
-	} else {
-		results <- networkCollectionResult{name, hops, raw, nil}
-	}
+	results <- networkCollectionResult{name, hops, raw, err}
 }

 func gatherNetworkInformation(ctx context.Context) map[string]networkCollectionResult {
@ -209,10 +200,6 @@ func gatherNetworkInformation(ctx context.Context) map[string]networkCollectionR

 	for range len(hostAndIPversionPairs) {
 		result := <-results
-		if result.err != nil {
-			continue
-		}
-
 		resultMap[result.name] = result
 	}

@ -249,22 +236,30 @@ func rawNetworkInformationWriter(resultMap map[string]networkCollectionResult) (

 	defer networkDumpHandle.Close()

+	var exitErr error
+
 	for k, v := range resultMap {
+		if v.err != nil {
+			if exitErr == nil {
+				exitErr = v.err
+			}
+
+			_, err := networkDumpHandle.WriteString(k + "\nno content\n")
+			if err != nil {
+				return networkDumpHandle.Name(), fmt.Errorf("error writing 'no content' to raw network file: %w", err)
+			}
+		} else {
 			_, err := networkDumpHandle.WriteString(k + "\n" + v.raw + "\n")
 			if err != nil {
-			return "", fmt.Errorf("error writing raw network information: %w", err)
+				return networkDumpHandle.Name(), fmt.Errorf("error writing raw network information: %w", err)
+			}
 		}
 	}

-	return networkDumpHandle.Name(), nil
+	return networkDumpHandle.Name(), exitErr
 }

 func jsonNetworkInformationWriter(resultMap map[string]networkCollectionResult) (string, error) {
-	jsonMap := make(map[string][]*network.Hop, len(resultMap))
-	for k, v := range resultMap {
-		jsonMap[k] = v.info
-	}
-
 	networkDumpHandle, err := os.Create(filepath.Join(os.TempDir(), networkBaseName))
 	if err != nil {
 		return "", ErrCreatingTemporaryFile
@ -274,12 +269,23 @@ func jsonNetworkInformationWriter(resultMap map[string]networkCollectionResult)

 	encoder := newFormattedEncoder(networkDumpHandle)

-	err = encoder.Encode(jsonMap)
-	if err != nil {
-		return "", fmt.Errorf("error encoding network information results: %w", err)
+	var exitErr error
+
+	jsonMap := make(map[string][]*network.Hop, len(resultMap))
+	for k, v := range resultMap {
+		jsonMap[k] = v.info
+
+		if exitErr == nil && v.err != nil {
+			exitErr = v.err
+		}
 	}

-	return networkDumpHandle.Name(), nil
+	err = encoder.Encode(jsonMap)
+	if err != nil {
+		return networkDumpHandle.Name(), fmt.Errorf("error encoding network information results: %w", err)
+	}
+
+	return networkDumpHandle.Name(), exitErr
 }

 func collectFromEndpointAdapter(collect collectToWriterFunc, fileName string) collectFunc {
@ -292,7 +298,7 @@ func collectFromEndpointAdapter(collect collectToWriterFunc, fileName string) co

 		err = collect(ctx, dumpHandle)
 		if err != nil {
-			return "", fmt.Errorf("error running collector: %w", err)
+			return dumpHandle.Name(), fmt.Errorf("error running collector: %w", err)
 		}

 		return dumpHandle.Name(), nil
@ -316,10 +322,13 @@ func tunnelStateCollectEndpointAdapter(client HTTPClient, tunnel *TunnelState, f
 		encoder := newFormattedEncoder(writer)

 		err := encoder.Encode(tunnel)
-
+		if err != nil {
 			return fmt.Errorf("error encoding tunnel state: %w", err)
 		}

+		return nil
+	}
+
 	return collectFromEndpointAdapter(endpointFunc, fileName)
 }

@ -337,15 +346,14 @@ func resolveInstanceBaseURL(
 	addresses []string,
 ) (*url.URL, *TunnelState, []*AddressableTunnelState, error) {
 	if metricsServerAddress != "" {
+		if !strings.HasPrefix(metricsServerAddress, "http://") {
+			metricsServerAddress = "http://" + metricsServerAddress
+		}
 		url, err := url.Parse(metricsServerAddress)
 		if err != nil {
 			return nil, nil, nil, fmt.Errorf("provided address is not valid: %w", err)
 		}

-		if url.Scheme == "" {
-			url.Scheme = "http://"
-		}
-
 		return url, nil, nil, nil
 	}

@ -526,9 +534,15 @@ func RunDiagnostic(
 	jobsReport := runJobs(ctx, jobs, log)
 	paths := make([]string, 0)

+	var gerr error
+
 	for _, v := range jobsReport {
 		paths = append(paths, v.path)

+		if gerr == nil && v.Err != nil {
+			gerr = v.Err
+		}
+
 		defer func() {
 			if !errors.Is(v.Err, ErrCreatingTemporaryFile) {
 				os.Remove(v.path)
@ -538,14 +552,10 @@ func RunDiagnostic(

 	zipfile, err := CreateDiagnosticZipFile(zipName, paths)
 	if err != nil {
-		if zipfile != "" {
-			os.Remove(zipfile)
-		}
-
 		return nil, err
 	}

 	log.Info().Msgf("Diagnostic file written: %v", zipfile)

-	return nil, nil
+	return nil, gerr
 }