diff --git a/.teamcity/install-cloudflare-go.sh b/.teamcity/install-cloudflare-go.sh
index 9677ed7e..96d7056b 100755
--- a/.teamcity/install-cloudflare-go.sh
+++ b/.teamcity/install-cloudflare-go.sh
@@ -3,6 +3,6 @@
 cd /tmp
 git clone -q https://github.com/cloudflare/go
 cd go/src
-# https://github.com/cloudflare/go/tree/f4334cdc0c3f22a3bfdd7e66f387e3ffc65a5c38 is version go1.22.5-devel-cf
-git checkout -q f4334cdc0c3f22a3bfdd7e66f387e3ffc65a5c38
+# https://github.com/cloudflare/go/tree/af19da5605ca11f85776ef7af3384a02a315a52b is version go1.22.5-devel-cf
+git checkout -q af19da5605ca11f85776ef7af3384a02a315a52b
 ./make.bash
diff --git a/.teamcity/windows/install-cloudflare-go.ps1 b/.teamcity/windows/install-cloudflare-go.ps1
index 6ff957b9..a7ed1e11 100644
--- a/.teamcity/windows/install-cloudflare-go.ps1
+++ b/.teamcity/windows/install-cloudflare-go.ps1
@@ -9,8 +9,8 @@ Set-Location "$Env:Temp"
 git clone -q https://github.com/cloudflare/go
 Write-Output "Building go..."
 cd go/src
-# https://github.com/cloudflare/go/tree/f4334cdc0c3f22a3bfdd7e66f387e3ffc65a5c38 is version go1.22.5-devel-cf
-git checkout -q f4334cdc0c3f22a3bfdd7e66f387e3ffc65a5c38
+# https://github.com/cloudflare/go/tree/af19da5605ca11f85776ef7af3384a02a315a52b is version go1.22.5-devel-cf
+git checkout -q af19da5605ca11f85776ef7af3384a02a315a52b
 & ./make.bat
 
 Write-Output "Installed"
diff --git a/CHANGES.md b/CHANGES.md
index 2389511c..b3574850 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+## 2025.1.1
+### New Features
+- This release introduces the use of new Post Quantum curves and the ability to use Post Quantum curves when running tunnels with the QUIC protocol. This applies to both non-FIPS and FIPS builds.
+
 ## 2024.12.2
 ### New Features
 - This release introduces the ability to collect troubleshooting information from one instance of cloudflared running on the local machine. The command can be executed as `cloudflared tunnel diag`.
diff --git a/Dockerfile b/Dockerfile
index 66674c2a..a90910a0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 # use a builder image for building cloudflare
 ARG TARGET_GOOS
 ARG TARGET_GOARCH
-FROM golang:1.22.5 as builder
+FROM golang:1.22.10 as builder
 ENV GO111MODULE=on \
   CGO_ENABLED=0 \
   TARGET_GOOS=${TARGET_GOOS} \
@@ -22,7 +22,7 @@ RUN .teamcity/install-cloudflare-go.sh
 RUN PATH="/tmp/go/bin:$PATH" make cloudflared
 
 # use a distroless base image with glibc
-FROM gcr.io/distroless/base-debian11:nonroot
+FROM gcr.io/distroless/base-debian12:nonroot
 
 LABEL org.opencontainers.image.source="https://github.com/cloudflare/cloudflared"
 
diff --git a/Dockerfile.amd64 b/Dockerfile.amd64
index c375d801..20c48bd4 100644
--- a/Dockerfile.amd64
+++ b/Dockerfile.amd64
@@ -1,5 +1,5 @@
 # use a builder image for building cloudflare
-FROM golang:1.22.5 as builder
+FROM golang:1.22.10 as builder
 ENV GO111MODULE=on \
   CGO_ENABLED=0 \
   # the CONTAINER_BUILD envvar is used set github.com/cloudflare/cloudflared/metrics.Runtime=virtual
@@ -17,7 +17,7 @@ RUN .teamcity/install-cloudflare-go.sh
 RUN GOOS=linux GOARCH=amd64 PATH="/tmp/go/bin:$PATH" make cloudflared
 
 # use a distroless base image with glibc
-FROM gcr.io/distroless/base-debian11:nonroot
+FROM gcr.io/distroless/base-debian12:nonroot
 
 LABEL org.opencontainers.image.source="https://github.com/cloudflare/cloudflared"
 
diff --git a/Dockerfile.arm64 b/Dockerfile.arm64
index b617d7ae..f4b25a6e 100644
--- a/Dockerfile.arm64
+++ b/Dockerfile.arm64
@@ -1,5 +1,5 @@
 # use a builder image for building cloudflare
-FROM golang:1.22.5 as builder
+FROM golang:1.22.10 as builder
 ENV GO111MODULE=on \
   CGO_ENABLED=0 \
   # the CONTAINER_BUILD envvar is used set github.com/cloudflare/cloudflared/metrics.Runtime=virtual
@@ -17,7 +17,7 @@ RUN .teamcity/install-cloudflare-go.sh
 RUN GOOS=linux GOARCH=arm64 PATH="/tmp/go/bin:$PATH" make cloudflared
 
 # use a distroless base image with glibc
-FROM gcr.io/distroless/base-debian11:nonroot-arm64
+FROM gcr.io/distroless/base-debian12:nonroot-arm64
 
 LABEL org.opencontainers.image.source="https://github.com/cloudflare/cloudflared"
 
diff --git a/Makefile b/Makefile
index 70063f3c..bfdf2ff9 100644
--- a/Makefile
+++ b/Makefile
@@ -133,11 +133,9 @@ clean:
 cloudflared:
 ifeq ($(FIPS), true)
 	$(info Building cloudflared with go-fips)
-	cp -f fips/fips.go.linux-amd64 cmd/cloudflared/fips.go
 endif
 	GOOS=$(TARGET_OS) GOARCH=$(TARGET_ARCH) $(ARM_COMMAND) go build -mod=vendor $(GO_BUILD_TAGS) $(LDFLAGS) $(IMPORT_PATH)/cmd/cloudflared
 ifeq ($(FIPS), true)
-	rm -f cmd/cloudflared/fips.go
 	./check-fips.sh cloudflared
 endif
 
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index dcf4bc89..7fc164ab 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,31 @@
+2025.2.1
+- 2025-02-26 TUN-9016: update base-debian to v12
+- 2025-02-25 TUN-8960: Connect to FED API GW based on the OriginCert's endpoint
+- 2025-02-25 TUN-9007: modify logic to resolve region when the tunnel token has an endpoint field
+- 2025-02-13 SDLC-3762: Remove backstage.io/source-location from catalog-info.yaml
+- 2025-02-06 TUN-8914: Create a flags module to group all cloudflared cli flags
+
+2025.2.0
+- 2025-02-03 TUN-8914: Add a new configuration to locally override the max-active-flows
+- 2025-02-03 Bump x/crypto to 0.31.0
+
+2025.1.1
+- 2025-01-30 TUN-8858: update go to 1.22.10 and include quic-go FIPS changes
+- 2025-01-30 TUN-8855: fix lint issues
+- 2025-01-30 TUN-8855: Update PQ curve preferences
+- 2025-01-30 TUN-8857: remove restriction for using FIPS and PQ
+- 2025-01-30 TUN-8894: report FIPS+PQ error to Sentry when dialling to the edge
+- 2025-01-22 TUN-8904: Rename Connect Response Flow Rate Limited metadata
+- 2025-01-21 AUTH-6633 Fix cloudflared access login + warp as auth
+- 2025-01-20 TUN-8861: Add session limiter to UDP session manager
+- 2025-01-20 TUN-8861: Rename Session Limiter to Flow Limiter
+- 2025-01-17 TUN-8900: Add import of Apple Developer Certificate Authority to macOS Pipeline
+- 2025-01-17 TUN-8871: Accept login flag to authenticate with Fedramp environment
+- 2025-01-16 TUN-8866: Add linter to cloudflared repository
+- 2025-01-14 TUN-8861: Add session limiter to TCP session manager
+- 2025-01-13 TUN-8861: Add configuration for active sessions limiter
+- 2025-01-09 TUN-8848: Don't treat connection shutdown as an error condition when RPC server is done
+
 2025.1.0
 - 2025-01-06 TUN-8842: Add Ubuntu Noble and 'any' debian distributions to release script
 - 2025-01-06 TUN-8807: Add support_datagram_v3 to remote feature rollout
diff --git a/build-packages-fips.sh b/build-packages-fips.sh
index 0ec3b3c9..e1b6e791 100755
--- a/build-packages-fips.sh
+++ b/build-packages-fips.sh
@@ -17,7 +17,7 @@ make cloudflared-deb
 mv cloudflared-fips\_$VERSION\_$arch.deb $ARTIFACT_DIR/cloudflared-fips-linux-$arch.deb
 
 # rpm packages invert the - and _ and use x86_64 instead of amd64.
-RPMVERSION=$(echo $VERSION|sed -r 's/-/_/g')
+RPMVERSION=$(echo $VERSION | sed -r 's/-/_/g')
 RPMARCH="x86_64"
 make cloudflared-rpm
 mv cloudflared-fips-$RPMVERSION-1.$RPMARCH.rpm $ARTIFACT_DIR/cloudflared-fips-linux-$RPMARCH.rpm
diff --git a/catalog-info.yaml b/catalog-info.yaml
index e47d2d38..26f3381f 100644
--- a/catalog-info.yaml
+++ b/catalog-info.yaml
@@ -4,7 +4,6 @@ metadata:
   name: cloudflared
   description: Client for Cloudflare Tunnels
   annotations:
-    backstage.io/source-location: url:https://bitbucket.cfdata.org/projects/TUN/repos/cloudflared/browse
     cloudflare.com/software-excellence-opt-in: "true"
     cloudflare.com/jira-project-key: "TUN"
     cloudflare.com/jira-project-component: "Cloudflare Tunnel"
diff --git a/cfsetup.yaml b/cfsetup.yaml
index 7c7ac750..3c972679 100644
--- a/cfsetup.yaml
+++ b/cfsetup.yaml
@@ -1,4 +1,4 @@
-pinned_go: &pinned_go go-boring=1.22.5-1
+pinned_go: &pinned_go go-boring=1.22.10-1
 
 build_dir: &build_dir /cfsetup_build
 default-flavor: bookworm
@@ -16,7 +16,7 @@ bullseye: &bullseye
       - golangci-lint
     pre-cache: &build_pre_cache
       - export GOCACHE=/cfsetup_build/.cache/go-build
-      - go install golang.org/x/tools/cmd/goimports@latest
+      - go install golang.org/x/tools/cmd/goimports@v0.30.0
     post-cache:
       # Linting
       - make lint
diff --git a/cmd/cloudflared/access/carrier.go b/cmd/cloudflared/access/carrier.go
index 12cc8ac7..11190de7 100644
--- a/cmd/cloudflared/access/carrier.go
+++ b/cmd/cloudflared/access/carrier.go
@@ -104,7 +104,7 @@ func ssh(c *cli.Context) error {
 		case 3:
 			options.OriginURL = fmt.Sprintf("https://%s:%s", parts[2], parts[1])
 			options.TLSClientConfig = &tls.Config{
-				InsecureSkipVerify: true,
+				InsecureSkipVerify: true, // #nosec G402
 				ServerName:         parts[0],
 			}
 			log.Warn().Msgf("Using insecure SSL connection because SNI overridden to %s", parts[0])
@@ -141,6 +141,5 @@ func ssh(c *cli.Context) error {
 		logger := log.With().Str("host", url.Host).Logger()
 		s = stream.NewDebugStream(s, &logger, maxMessages)
 	}
-	carrier.StartClient(wsConn, s, options)
-	return nil
+	return carrier.StartClient(wsConn, s, options)
 }
diff --git a/cmd/cloudflared/access/cmd.go b/cmd/cloudflared/access/cmd.go
index d1490ef7..cb11aede 100644
--- a/cmd/cloudflared/access/cmd.go
+++ b/cmd/cloudflared/access/cmd.go
@@ -19,6 +19,7 @@ import (
 
 	"github.com/cloudflare/cloudflared/carrier"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/logger"
 	"github.com/cloudflare/cloudflared/sshgen"
 	"github.com/cloudflare/cloudflared/token"
@@ -172,15 +173,15 @@ func Commands() []*cli.Command {
 							EnvVars: []string{"TUNNEL_SERVICE_TOKEN_SECRET"},
 						},
 						&cli.StringFlag{
-							Name:  logger.LogFileFlag,
+							Name:  cfdflags.LogFile,
 							Usage: "Save application log to this file for reporting issues.",
 						},
 						&cli.StringFlag{
-							Name:  logger.LogSSHDirectoryFlag,
+							Name:  cfdflags.LogDirectory,
 							Usage: "Save application log to this directory for reporting issues.",
 						},
 						&cli.StringFlag{
-							Name:    logger.LogSSHLevelFlag,
+							Name:    cfdflags.LogLevelSSH,
 							Aliases: []string{"loglevel"}, //added to match the tunnel side
 							Usage:   "Application logging level {debug, info, warn, error, fatal}. ",
 						},
@@ -342,7 +343,7 @@ func run(cmd string, args ...string) error {
 		return err
 	}
 	go func() {
-		io.Copy(os.Stderr, stderr)
+		_, _ = io.Copy(os.Stderr, stderr)
 	}()
 
 	stdout, err := c.StdoutPipe()
@@ -350,7 +351,7 @@ func run(cmd string, args ...string) error {
 		return err
 	}
 	go func() {
-		io.Copy(os.Stdout, stdout)
+		_, _ = io.Copy(os.Stdout, stdout)
 	}()
 	return c.Run()
 }
@@ -531,7 +532,7 @@ func isFileThere(candidate string) bool {
 }
 
 // verifyTokenAtEdge checks for a token on disk, or generates a new one.
-// Then makes a request to to the origin with the token to ensure it is valid.
+// Then makes a request to the origin with the token to ensure it is valid.
 // Returns nil if token is valid.
 func verifyTokenAtEdge(appUrl *url.URL, appInfo *token.AppInfo, c *cli.Context, log *zerolog.Logger) error {
 	headers := parseRequestHeaders(c.StringSlice(sshHeaderFlag))
diff --git a/cmd/cloudflared/cliutil/logger.go b/cmd/cloudflared/cliutil/logger.go
index 01435213..6fbcf352 100644
--- a/cmd/cloudflared/cliutil/logger.go
+++ b/cmd/cloudflared/cliutil/logger.go
@@ -4,7 +4,7 @@ import (
 	"github.com/urfave/cli/v2"
 	"github.com/urfave/cli/v2/altsrc"
 
-	"github.com/cloudflare/cloudflared/logger"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 )
 
 var (
@@ -15,14 +15,14 @@ var (
 func ConfigureLoggingFlags(shouldHide bool) []cli.Flag {
 	return []cli.Flag{
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    logger.LogLevelFlag,
+			Name:    cfdflags.LogLevel,
 			Value:   "info",
 			Usage:   "Application logging level {debug, info, warn, error, fatal}. " + debugLevelWarning,
 			EnvVars: []string{"TUNNEL_LOGLEVEL"},
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    logger.LogTransportLevelFlag,
+			Name:    cfdflags.TransportLogLevel,
 			Aliases: []string{"proto-loglevel"}, // This flag used to be called proto-loglevel
 			Value:   "info",
 			Usage:   "Transport logging level(previously called protocol logging level) {debug, info, warn, error, fatal}",
@@ -30,19 +30,19 @@ func ConfigureLoggingFlags(shouldHide bool) []cli.Flag {
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    logger.LogFileFlag,
+			Name:    cfdflags.LogFile,
 			Usage:   "Save application log to this file for reporting issues.",
 			EnvVars: []string{"TUNNEL_LOGFILE"},
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    logger.LogDirectoryFlag,
+			Name:    cfdflags.LogDirectory,
 			Usage:   "Save application log to this directory for reporting issues.",
 			EnvVars: []string{"TUNNEL_LOGDIRECTORY"},
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "trace-output",
+			Name:    cfdflags.TraceOutput,
 			Usage:   "Name of trace output file, generated when cloudflared stops.",
 			EnvVars: []string{"TUNNEL_TRACE_OUTPUT"},
 			Hidden:  shouldHide,
diff --git a/cmd/cloudflared/flags/flags.go b/cmd/cloudflared/flags/flags.go
new file mode 100644
index 00000000..7c919f05
--- /dev/null
+++ b/cmd/cloudflared/flags/flags.go
@@ -0,0 +1,155 @@
+package flags
+
+const (
+	// HaConnections specifies how many connections to make to the edge
+	HaConnections = "ha-connections"
+
+	// SshPort is the port on localhost the cloudflared ssh server will run on
+	SshPort = "local-ssh-port"
+
+	// SshIdleTimeout defines the duration a SSH session can remain idle before being closed
+	SshIdleTimeout = "ssh-idle-timeout"
+
+	// SshMaxTimeout defines the max duration a SSH session can remain open for
+	SshMaxTimeout = "ssh-max-timeout"
+
+	// SshLogUploaderBucketName is the bucket name to use for the SSH log uploader
+	SshLogUploaderBucketName = "bucket-name"
+
+	// SshLogUploaderRegionName is the AWS region name to use for the SSH log uploader
+	SshLogUploaderRegionName = "region-name"
+
+	// SshLogUploaderSecretID is the Secret id of SSH log uploader
+	SshLogUploaderSecretID = "secret-id"
+
+	// SshLogUploaderAccessKeyID is the Access key id of SSH log uploader
+	SshLogUploaderAccessKeyID = "access-key-id"
+
+	// SshLogUploaderSessionTokenID is the Session token of SSH log uploader
+	SshLogUploaderSessionTokenID = "session-token"
+
+	// SshLogUploaderS3URL is the S3 URL of SSH log uploader (e.g. don't use AWS s3 and use google storage bucket instead)
+	SshLogUploaderS3URL = "s3-url-host"
+
+	// HostKeyPath is the path of the dir to save SSH host keys to
+	HostKeyPath = "host-key-path"
+
+	// RpcTimeout is how long to wait for a Capnp RPC request to the edge
+	RpcTimeout = "rpc-timeout"
+
+	// WriteStreamTimeout sets if we should have a timeout when writing data to a stream towards the destination (edge/origin).
+	WriteStreamTimeout = "write-stream-timeout"
+
+	// QuicDisablePathMTUDiscovery sets if QUIC should not perform PMTU discovery and use a smaller (safe) packet size.
+	// Packets will then be at most 1252 (IPv4) / 1232 (IPv6) bytes in size.
+	// Note that this may result in packet drops for UDP proxying, since we expect being able to send at least 1280 bytes of inner packets.
+	QuicDisablePathMTUDiscovery = "quic-disable-pmtu-discovery"
+
+	// QuicConnLevelFlowControlLimit controls the max flow control limit allocated for a QUIC connection. This controls how much data the
+	// receiver is willing to buffer. Once the limit is reached, the sender will send a DATA_BLOCKED frame to indicate it has more data to write,
+	// but it's blocked by flow control
+	QuicConnLevelFlowControlLimit = "quic-connection-level-flow-control-limit"
+
+	// QuicStreamLevelFlowControlLimit is similar to QuicConnLevelFlowControlLimit but for each QUIC stream. When the sender is blocked,
+	// it will send a STREAM_DATA_BLOCKED frame
+	QuicStreamLevelFlowControlLimit = "quic-stream-level-flow-control-limit"
+
+	// Ui is to enable launching cloudflared in interactive UI mode
+	Ui = "ui"
+
+	// ConnectorLabel is the command line flag to give a meaningful label to a specific connector
+	ConnectorLabel = "label"
+
+	// MaxActiveFlows is the command line flag to set the maximum number of flows that cloudflared can be processing at the same time
+	MaxActiveFlows = "max-active-flows"
+
+	// Tag is the command line flag to set custom tags used to identify this tunnel via added HTTP request headers to the origin
+	Tag = "tag"
+
+	// Protocol is the command line flag to set the protocol to use to connect to the Cloudflare Edge
+	Protocol = "protocol"
+
+	// PostQuantum is the command line flag to force the connection to Cloudflare Edge to use Post Quantum cryptography
+	PostQuantum = "post-quantum"
+
+	// Features is the command line flag to opt into various features that are still being developed or tested
+	Features = "features"
+
+	// EdgeIpVersion is the command line flag to set the Cloudflare Edge IP address version to connect with
+	EdgeIpVersion = "edge-ip-version"
+
+	// EdgeBindAddress is the command line flag to bind to IP address for outgoing connections to Cloudflare Edge
+	EdgeBindAddress = "edge-bind-address"
+
+	// Force is the command line flag to specify if you wish to force an action
+	Force = "force"
+
+	// Edge is the command line flag to set the address of the Cloudflare tunnel server. Only works in Cloudflare's internal testing environment
+	Edge = "edge"
+
+	// Region is the command line flag to set the Cloudflare Edge region to connect to
+	Region = "region"
+
+	// IsAutoUpdated is the command line flag to signal the new process that cloudflared has been autoupdated
+	IsAutoUpdated = "is-autoupdated"
+
+	// LBPool is the command line flag to set the name of the load balancing pool to add this origin to
+	LBPool = "lb-pool"
+
+	// Retries is the command line flag to set the maximum number of retries for connection/protocol errors
+	Retries = "retries"
+
+	// MaxEdgeAddrRetries is the command line flag to set the maximum number of times to retry on edge addrs before falling back to a lower protocol
+	MaxEdgeAddrRetries = "max-edge-addr-retries"
+
+	// GracePeriod is the command line flag to set the maximum amount of time that cloudflared waits to shut down if it is still serving requests
+	GracePeriod = "grace-period"
+
+	// ICMPV4Src is the command line flag to set the source address and the interface name to send/receive ICMPv4 messages
+	ICMPV4Src = "icmpv4-src"
+
+	// ICMPV6Src is the command line flag to set the source address and the interface name to send/receive ICMPv6 messages
+	ICMPV6Src = "icmpv6-src"
+
+	// ProxyDns is the command line flag to run DNS server over HTTPS
+	ProxyDns = "proxy-dns"
+
+	// Name is the command line flag to set the name of the tunnel
+	Name = "name"
+
+	// AutoUpdateFreq is the command line flag for setting the frequency that cloudflared checks for updates
+	AutoUpdateFreq = "autoupdate-freq"
+
+	// NoAutoUpdate is the command line flag to disable cloudflared from checking for updates
+	NoAutoUpdate = "no-autoupdate"
+
+	// LogLevel is the command line flag for the cloudflared logging level
+	LogLevel = "loglevel"
+
+	// LogLevelSSH is the command line flag for the cloudflared ssh logging level
+	LogLevelSSH = "log-level"
+
+	// TransportLogLevel is the command line flag for the transport logging level
+	TransportLogLevel = "transport-loglevel"
+
+	// LogFile is the command line flag to define the file where application logs will be stored
+	LogFile = "logfile"
+
+	// LogDirectory is the command line flag to define the directory where application logs will be stored.
+	LogDirectory = "log-directory"
+
+	// TraceOutput is the command line flag to set the name of trace output file
+	TraceOutput = "trace-output"
+
+	// OriginCert is the command line flag to define the path for the origin certificate used by cloudflared
+	OriginCert = "origincert"
+
+	// Metrics is the command line flag to define the address of the metrics server
+	Metrics = "metrics"
+
+	// MetricsUpdateFreq is the command line flag to define how frequently tunnel metrics are updated
+	MetricsUpdateFreq = "metrics-update-freq"
+
+	// ApiURL is the command line flag used to define the base URL of the API
+	ApiURL = "api-url"
+)
diff --git a/cmd/cloudflared/macos_service.go b/cmd/cloudflared/macos_service.go
index 48d066e0..43d36568 100644
--- a/cmd/cloudflared/macos_service.go
+++ b/cmd/cloudflared/macos_service.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 
+	homedir "github.com/mitchellh/go-homedir"
 	"github.com/pkg/errors"
 	"github.com/urfave/cli/v2"
 
@@ -17,7 +18,7 @@ const (
 	launchdIdentifier = "com.cloudflare.cloudflared"
 )
 
-func runApp(app *cli.App, graceShutdownC chan struct{}) {
+func runApp(app *cli.App, _ chan struct{}) {
 	app.Commands = append(app.Commands, &cli.Command{
 		Name:  "service",
 		Usage: "Manages the cloudflared launch agent",
@@ -207,3 +208,15 @@ func uninstallLaunchd(c *cli.Context) error {
 	}
 	return err
 }
+
+func userHomeDir() (string, error) {
+	// This returns the home dir of the executing user using an OS-specific method
+	// for discovering the home dir. It's not recommended to call this function
+	// when the user has root permission, as $HOME depends on which options the user
+	// passes to sudo.
+	homeDir, err := homedir.Dir()
+	if err != nil {
+		return "", errors.Wrap(err, "Cannot determine home directory for the user")
+	}
+	return homeDir, nil
+}
diff --git a/cmd/cloudflared/main.go b/cmd/cloudflared/main.go
index b0b93cf8..d54a768a 100644
--- a/cmd/cloudflared/main.go
+++ b/cmd/cloudflared/main.go
@@ -2,19 +2,17 @@ package main
 
 import (
 	"fmt"
-	"math/rand"
 	"os"
 	"strings"
 	"time"
 
 	"github.com/getsentry/sentry-go"
-	homedir "github.com/mitchellh/go-homedir"
-	"github.com/pkg/errors"
 	"github.com/urfave/cli/v2"
 	"go.uber.org/automaxprocs/maxprocs"
 
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/access"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/proxydns"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/tail"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/tunnel"
@@ -52,10 +50,8 @@ var (
 func main() {
 	// FIXME: TUN-8148: Disable QUIC_GO ECN due to bugs in proper detection if supported
 	os.Setenv("QUIC_GO_DISABLE_ECN", "1")
-
-	rand.Seed(time.Now().UnixNano())
 	metrics.RegisterBuildInfo(BuildType, BuildTime, Version)
-	maxprocs.Set()
+	_, _ = maxprocs.Set()
 	bInfo := cliutil.GetBuildInfo(BuildType, Version)
 
 	// Graceful shutdown channel used by the app. When closed, app must terminate gracefully.
@@ -110,7 +106,7 @@ func commands(version func(c *cli.Context)) []*cli.Command {
 					Usage: "specify if you wish to update to the latest beta version",
 				},
 				&cli.BoolFlag{
-					Name:   "force",
+					Name:   cfdflags.Force,
 					Usage:  "specify if you wish to force an upgrade to the latest version regardless of the current version",
 					Hidden: true,
 				},
@@ -184,18 +180,6 @@ func action(graceShutdownC chan struct{}) cli.ActionFunc {
 	})
 }
 
-func userHomeDir() (string, error) {
-	// This returns the home dir of the executing user using OS-specific method
-	// for discovering the home dir. It's not recommended to call this function
-	// when the user has root permission as $HOME depends on what options the user
-	// use with sudo.
-	homeDir, err := homedir.Dir()
-	if err != nil {
-		return "", errors.Wrap(err, "Cannot determine home directory for the user")
-	}
-	return homeDir, nil
-}
-
 // In order to keep the amount of noise sent to Sentry low, typical network errors can be filtered out here by a substring match.
 func captureError(err error) {
 	errorMessage := err.Error()
diff --git a/cmd/cloudflared/tail/cmd.go b/cmd/cloudflared/tail/cmd.go
index 7d444024..d7f5a429 100644
--- a/cmd/cloudflared/tail/cmd.go
+++ b/cmd/cloudflared/tail/cmd.go
@@ -18,14 +18,12 @@ import (
 	"nhooyr.io/websocket"
 
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/credentials"
-	"github.com/cloudflare/cloudflared/logger"
 	"github.com/cloudflare/cloudflared/management"
 )
 
-var (
-	buildInfo *cliutil.BuildInfo
-)
+var buildInfo *cliutil.BuildInfo
 
 func Init(bi *cliutil.BuildInfo) {
 	buildInfo = bi
@@ -56,7 +54,7 @@ func managementTokenCommand(c *cli.Context) error {
 	if err != nil {
 		return err
 	}
-	var tokenResponse = struct {
+	tokenResponse := struct {
 		Token string `json:"token"`
 	}{Token: token}
 
@@ -119,13 +117,13 @@ func buildTailCommand(subcommands []*cli.Command) *cli.Command {
 				Value:  "",
 			},
 			&cli.StringFlag{
-				Name:    logger.LogLevelFlag,
+				Name:    cfdflags.LogLevel,
 				Value:   "info",
 				Usage:   "Application logging level {debug, info, warn, error, fatal}",
 				EnvVars: []string{"TUNNEL_LOGLEVEL"},
 			},
 			&cli.StringFlag{
-				Name:    credentials.OriginCertFlag,
+				Name:    cfdflags.OriginCert,
 				Usage:   "Path to the certificate generated for your origin when you run cloudflared login.",
 				EnvVars: []string{"TUNNEL_ORIGIN_CERT"},
 				Value:   credentials.FindDefaultOriginCertPath(),
@@ -169,7 +167,7 @@ func handleValidationError(resp *http.Response, log *zerolog.Logger) {
 // logger will be created to emit only against the os.Stderr as to not obstruct with normal output from
 // management requests
 func createLogger(c *cli.Context) *zerolog.Logger {
-	level, levelErr := zerolog.ParseLevel(c.String(logger.LogLevelFlag))
+	level, levelErr := zerolog.ParseLevel(c.String(cfdflags.LogLevel))
 	if levelErr != nil {
 		level = zerolog.InfoLevel
 	}
@@ -183,9 +181,10 @@ func createLogger(c *cli.Context) *zerolog.Logger {
 // parseFilters will attempt to parse provided filters to send to with the EventStartStreaming
 func parseFilters(c *cli.Context) (*management.StreamingFilters, error) {
 	var level *management.LogLevel
-	var events []management.LogEventType
 	var sample float64
 
+	events := make([]management.LogEventType, 0)
+
 	argLevel := c.String("level")
 	argEvents := c.StringSlice("event")
 	argSample := c.Float64("sample")
@@ -225,12 +224,12 @@ func parseFilters(c *cli.Context) (*management.StreamingFilters, error) {
 
 // getManagementToken will make a call to the Cloudflare API to acquire a management token for the requested tunnel.
 func getManagementToken(c *cli.Context, log *zerolog.Logger) (string, error) {
-	userCreds, err := credentials.Read(c.String(credentials.OriginCertFlag), log)
+	userCreds, err := credentials.Read(c.String(cfdflags.OriginCert), log)
 	if err != nil {
 		return "", err
 	}
 
-	client, err := userCreds.Client(c.String("api-url"), buildInfo.UserAgent(), log)
+	client, err := userCreds.Client(c.String(cfdflags.ApiURL), buildInfo.UserAgent(), log)
 	if err != nil {
 		return "", err
 	}
@@ -331,6 +330,7 @@ func Run(c *cli.Context) error {
 		header["cf-trace-id"] = []string{trace}
 	}
 	ctx := c.Context
+	// nolint: bodyclose
 	conn, resp, err := websocket.Dial(ctx, u.String(), &websocket.DialOptions{
 		HTTPHeader: header,
 	})
diff --git a/cmd/cloudflared/tunnel/cmd.go b/cmd/cloudflared/tunnel/cmd.go
index c8d565f2..535c8bea 100644
--- a/cmd/cloudflared/tunnel/cmd.go
+++ b/cmd/cloudflared/tunnel/cmd.go
@@ -16,7 +16,7 @@ import (
 	"github.com/facebookgo/grace/gracenet"
 	"github.com/getsentry/sentry-go"
 	"github.com/google/uuid"
-	homedir "github.com/mitchellh/go-homedir"
+	"github.com/mitchellh/go-homedir"
 	"github.com/pkg/errors"
 	"github.com/rs/zerolog"
 	"github.com/urfave/cli/v2"
@@ -24,6 +24,7 @@ import (
 
 	"github.com/cloudflare/cloudflared/cfapi"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/proxydns"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/updater"
 	"github.com/cloudflare/cloudflared/config"
@@ -47,61 +48,6 @@ import (
 const (
 	sentryDSN = "https://56a9c9fa5c364ab28f34b14f35ea0f1b:3e8827f6f9f740738eb11138f7bebb68@sentry.io/189878"
 
-	// ha-Connections specifies how many connections to make to the edge
-	haConnectionsFlag = "ha-connections"
-
-	// sshPortFlag is the port on localhost the cloudflared ssh server will run on
-	sshPortFlag = "local-ssh-port"
-
-	// sshIdleTimeoutFlag defines the duration a SSH session can remain idle before being closed
-	sshIdleTimeoutFlag = "ssh-idle-timeout"
-
-	// sshMaxTimeoutFlag defines the max duration a SSH session can remain open for
-	sshMaxTimeoutFlag = "ssh-max-timeout"
-
-	// bucketNameFlag is the bucket name to use for the SSH log uploader
-	bucketNameFlag = "bucket-name"
-
-	// regionNameFlag is the AWS region name to use for the SSH log uploader
-	regionNameFlag = "region-name"
-
-	// secretIDFlag is the Secret id of SSH log uploader
-	secretIDFlag = "secret-id"
-
-	// accessKeyIDFlag is the Access key id of SSH log uploader
-	accessKeyIDFlag = "access-key-id"
-
-	// sessionTokenIDFlag is the Session token of SSH log uploader
-	sessionTokenIDFlag = "session-token"
-
-	// s3URLFlag is the S3 URL of SSH log uploader (e.g. don't use AWS s3 and use google storage bucket instead)
-	s3URLFlag = "s3-url-host"
-
-	// hostKeyPath is the path of the dir to save SSH host keys too
-	hostKeyPath = "host-key-path"
-
-	// rpcTimeout is how long to wait for a Capnp RPC request to the edge
-	rpcTimeout = "rpc-timeout"
-
-	// writeStreamTimeout sets if we should have a timeout when writing data to a stream towards the destination (edge/origin).
-	writeStreamTimeout = "write-stream-timeout"
-
-	// quicDisablePathMTUDiscovery sets if QUIC should not perform PTMU discovery and use a smaller (safe) packet size.
-	// Packets will then be at most 1252 (IPv4) / 1232 (IPv6) bytes in size.
-	// Note that this may result in packet drops for UDP proxying, since we expect being able to send at least 1280 bytes of inner packets.
-	quicDisablePathMTUDiscovery = "quic-disable-pmtu-discovery"
-
-	// quicConnLevelFlowControlLimit controls the max flow control limit allocated for a QUIC connection. This controls how much data is the
-	// receiver willing to buffer. Once the limit is reached, the sender will send a DATA_BLOCKED frame to indicate it has more data to write,
-	// but it's blocked by flow control
-	quicConnLevelFlowControlLimit = "quic-connection-level-flow-control-limit"
-	// quicStreamLevelFlowControlLimit is similar to quicConnLevelFlowControlLimit but for each QUIC stream. When the sender is blocked,
-	// it will send a STREAM_DATA_BLOCKED frame
-	quicStreamLevelFlowControlLimit = "quic-stream-level-flow-control-limit"
-
-	// uiFlag is to enable launching cloudflared in interactive UI mode
-	uiFlag = "ui"
-
 	LogFieldCommand             = "command"
 	LogFieldExpandedPath        = "expandedPath"
 	LogFieldPIDPathname         = "pidPathname"
@@ -116,7 +62,6 @@ Eg. cloudflared tunnel --url localhost:8080/.
 Please note that Quick Tunnels are meant to be ephemeral and should only be used for testing purposes.
 For production usage, we recommend creating Named Tunnels. (https://developers.cloudflare.com/cloudflare-one/connections/connect-apps/install-and-setup/tunnel-guide/)
 `
-	connectorLabelFlag = "label"
 )
 
 var (
@@ -126,14 +71,14 @@ var (
 	routeFailMsg = fmt.Sprintf("failed to provision routing, please create it manually via Cloudflare dashboard or UI; "+
 		"most likely you already have a conflicting record there. You can also rerun this command with --%s to overwrite "+
 		"any existing DNS records for this hostname.", overwriteDNSFlag)
-	deprecatedClassicTunnelErr = fmt.Errorf("Classic tunnels have been deprecated, please use Named Tunnels. (https://developers.cloudflare.com/cloudflare-one/connections/connect-apps/install-and-setup/tunnel-guide/)")
+	errDeprecatedClassicTunnel = errors.New("Classic tunnels have been deprecated, please use Named Tunnels. (https://developers.cloudflare.com/cloudflare-one/connections/connect-apps/install-and-setup/tunnel-guide/)")
 	// TODO: TUN-8756 the list below denotes the flags that do not possess any kind of sensitive information
 	// however this approach is not maintainble in the long-term.
 	nonSecretFlagsList = []string{
 		"config",
-		"autoupdate-freq",
-		"no-autoupdate",
-		"metrics",
+		cfdflags.AutoUpdateFreq,
+		cfdflags.NoAutoUpdate,
+		cfdflags.Metrics,
 		"pidfile",
 		"url",
 		"hello-world",
@@ -166,54 +111,55 @@ var (
 		"bastion",
 		"proxy-address",
 		"proxy-port",
-		"loglevel",
-		"transport-loglevel",
-		"logfile",
-		"log-directory",
-		"trace-output",
-		"proxy-dns",
+		cfdflags.LogLevel,
+		cfdflags.TransportLogLevel,
+		cfdflags.LogFile,
+		cfdflags.LogDirectory,
+		cfdflags.TraceOutput,
+		cfdflags.ProxyDns,
 		"proxy-dns-port",
 		"proxy-dns-address",
 		"proxy-dns-upstream",
 		"proxy-dns-max-upstream-conns",
 		"proxy-dns-bootstrap",
-		"is-autoupdated",
-		"edge",
-		"region",
-		"edge-ip-version",
-		"edge-bind-address",
+		cfdflags.IsAutoUpdated,
+		cfdflags.Edge,
+		cfdflags.Region,
+		cfdflags.EdgeIpVersion,
+		cfdflags.EdgeBindAddress,
 		"cacert",
 		"hostname",
 		"id",
-		"lb-pool",
-		"api-url",
-		"metrics-update-freq",
-		"tag",
+		cfdflags.LBPool,
+		cfdflags.ApiURL,
+		cfdflags.MetricsUpdateFreq,
+		cfdflags.Tag,
 		"heartbeat-interval",
 		"heartbeat-count",
-		"max-edge-addr-retries",
-		"retries",
+		cfdflags.MaxEdgeAddrRetries,
+		cfdflags.Retries,
 		"ha-connections",
 		"rpc-timeout",
 		"write-stream-timeout",
 		"quic-disable-pmtu-discovery",
 		"quic-connection-level-flow-control-limit",
 		"quic-stream-level-flow-control-limit",
-		"label",
-		"grace-period",
+		cfdflags.ConnectorLabel,
+		cfdflags.GracePeriod,
 		"compression-quality",
 		"use-reconnect-token",
 		"dial-edge-timeout",
 		"stdin-control",
-		"name",
-		"ui",
+		cfdflags.Name,
+		cfdflags.Ui,
 		"quick-service",
 		"max-fetch-size",
-		"post-quantum",
+		cfdflags.PostQuantum,
 		"management-diagnostics",
-		"protocol",
+		cfdflags.Protocol,
 		"overwrite-dns",
 		"help",
+		cfdflags.MaxActiveFlows,
 	}
 )
 
@@ -298,7 +244,7 @@ func TunnelCommand(c *cli.Context) error {
 	// --name required
 	// --url or --hello-world required
 	// --hostname optional
-	if name := c.String("name"); name != "" {
+	if name := c.String(cfdflags.Name); name != "" {
 		hostname, err := validation.ValidateHostname(c.String("hostname"))
 		if err != nil {
 			return errors.Wrap(err, "Invalid hostname provided")
@@ -315,7 +261,7 @@ func TunnelCommand(c *cli.Context) error {
 	// A unauthenticated named tunnel hosted on <random>.<quick-tunnels-service>.com
 	// We don't support running proxy-dns and a quick tunnel at the same time as the same process
 	shouldRunQuickTunnel := c.IsSet("url") || c.IsSet(ingress.HelloWorldFlag)
-	if !c.IsSet("proxy-dns") && c.String("quick-service") != "" && shouldRunQuickTunnel {
+	if !c.IsSet(cfdflags.ProxyDns) && c.String("quick-service") != "" && shouldRunQuickTunnel {
 		return RunQuickTunnel(sc)
 	}
 
@@ -326,10 +272,10 @@ func TunnelCommand(c *cli.Context) error {
 
 	// Classic tunnel usage is no longer supported
 	if c.String("hostname") != "" {
-		return deprecatedClassicTunnelErr
+		return errDeprecatedClassicTunnel
 	}
 
-	if c.IsSet("proxy-dns") {
+	if c.IsSet(cfdflags.ProxyDns) {
 		if shouldRunQuickTunnel {
 			return fmt.Errorf("running a quick tunnel with `proxy-dns` is not supported")
 		}
@@ -376,7 +322,7 @@ func runAdhocNamedTunnel(sc *subcommandContext, name, credentialsOutputPath stri
 
 func routeFromFlag(c *cli.Context) (route cfapi.HostnameRoute, ok bool) {
 	if hostname := c.String("hostname"); hostname != "" {
-		if lbPool := c.String("lb-pool"); lbPool != "" {
+		if lbPool := c.String(cfdflags.LBPool); lbPool != "" {
 			return cfapi.NewLBRoute(hostname, lbPool), true
 		}
 		return cfapi.NewDNSRoute(hostname, c.Bool(overwriteDNSFlagName)), true
@@ -406,7 +352,7 @@ func StartServer(
 		log.Info().Msg(config.ErrNoConfigFile.Error())
 	}
 
-	if c.IsSet("trace-output") {
+	if c.IsSet(cfdflags.TraceOutput) {
 		tmpTraceFile, err := os.CreateTemp("", "trace")
 		if err != nil {
 			log.Err(err).Msg("Failed to create new temporary file to save trace output")
@@ -418,7 +364,7 @@ func StartServer(
 			if err := tmpTraceFile.Close(); err != nil {
 				traceLog.Err(err).Msg("Failed to close temporary trace output file")
 			}
-			traceOutputFilepath := c.String("trace-output")
+			traceOutputFilepath := c.String(cfdflags.TraceOutput)
 			if err := os.Rename(tmpTraceFile.Name(), traceOutputFilepath); err != nil {
 				traceLog.
 					Err(err).
@@ -448,7 +394,7 @@ func StartServer(
 
 	go waitForSignal(graceShutdownC, log)
 
-	if c.IsSet("proxy-dns") {
+	if c.IsSet(cfdflags.ProxyDns) {
 		dnsReadySignal := make(chan struct{})
 		wg.Add(1)
 		go func() {
@@ -470,7 +416,7 @@ func StartServer(
 	go func() {
 		defer wg.Done()
 		autoupdater := updater.NewAutoUpdater(
-			c.Bool("no-autoupdate"), c.Duration("autoupdate-freq"), &listeners, log,
+			c.Bool(cfdflags.NoAutoUpdate), c.Duration(cfdflags.AutoUpdateFreq), &listeners, log,
 		)
 		errC <- autoupdater.Run(ctx)
 	}()
@@ -526,7 +472,7 @@ func StartServer(
 		c.Bool("management-diagnostics"),
 		serviceIP,
 		clientID,
-		c.String(connectorLabelFlag),
+		c.String(cfdflags.ConnectorLabel),
 		logger.ManagementLogger.Log,
 		logger.ManagementLogger,
 	)
@@ -578,7 +524,7 @@ func StartServer(
 		errC <- metrics.ServeMetrics(metricsListener, ctx, metricsConfig, log)
 	}()
 
-	reconnectCh := make(chan supervisor.ReconnectSignal, c.Int(haConnectionsFlag))
+	reconnectCh := make(chan supervisor.ReconnectSignal, c.Int(cfdflags.HaConnections))
 	if c.IsSet("stdin-control") {
 		log.Info().Msg("Enabling control through stdin")
 		go stdinControl(reconnectCh, log)
@@ -615,8 +561,10 @@ func waitToShutdown(wg *sync.WaitGroup,
 		log.Debug().Msg("Graceful shutdown signalled")
 		if gracePeriod > 0 {
 			// wait for either grace period or service termination
+			ticker := time.NewTicker(gracePeriod)
+			defer ticker.Stop()
 			select {
-			case <-time.Tick(gracePeriod):
+			case <-ticker.C:
 			case <-errC:
 			}
 		}
@@ -644,7 +592,7 @@ func waitToShutdown(wg *sync.WaitGroup,
 
 func notifySystemd(waitForSignal *signal.Signal) {
 	<-waitForSignal.Wait()
-	daemon.SdNotify(false, "READY=1")
+	_, _ = daemon.SdNotify(false, "READY=1")
 }
 
 func writePidFile(waitForSignal *signal.Signal, pidPathname string, log *zerolog.Logger) {
@@ -696,31 +644,31 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 	flags = append(flags, []cli.Flag{
 		credentialsFileFlag,
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:   "is-autoupdated",
+			Name:   cfdflags.IsAutoUpdated,
 			Usage:  "Signal the new process that Cloudflare Tunnel connector has been autoupdated",
 			Value:  false,
 			Hidden: true,
 		}),
 		altsrc.NewStringSliceFlag(&cli.StringSliceFlag{
-			Name:    "edge",
+			Name:    cfdflags.Edge,
 			Usage:   "Address of the Cloudflare tunnel server. Only works in Cloudflare's internal testing environment.",
 			EnvVars: []string{"TUNNEL_EDGE"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "region",
+			Name:    cfdflags.Region,
 			Usage:   "Cloudflare Edge region to connect to. Omit or set to empty to connect to the global region.",
 			EnvVars: []string{"TUNNEL_REGION"},
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "edge-ip-version",
+			Name:    cfdflags.EdgeIpVersion,
 			Usage:   "Cloudflare Edge IP address version to connect with. {4, 6, auto}",
 			EnvVars: []string{"TUNNEL_EDGE_IP_VERSION"},
 			Value:   "4",
 			Hidden:  false,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "edge-bind-address",
+			Name:    cfdflags.EdgeBindAddress,
 			Usage:   "Bind to IP address for outgoing connections to Cloudflare Edge.",
 			EnvVars: []string{"TUNNEL_EDGE_BIND_ADDRESS"},
 			Hidden:  false,
@@ -744,7 +692,7 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "lb-pool",
+			Name:    cfdflags.LBPool,
 			Usage:   "The name of a (new/existing) load balancing pool to add this origin to.",
 			EnvVars: []string{"TUNNEL_LB_POOL"},
 			Hidden:  shouldHide,
@@ -768,21 +716,21 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "api-url",
+			Name:    cfdflags.ApiURL,
 			Usage:   "Base URL for Cloudflare API v4",
 			EnvVars: []string{"TUNNEL_API_URL"},
 			Value:   "https://api.cloudflare.com/client/v4",
 			Hidden:  true,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:    "metrics-update-freq",
+			Name:    cfdflags.MetricsUpdateFreq,
 			Usage:   "Frequency to update tunnel metrics",
 			Value:   time.Second * 5,
 			EnvVars: []string{"TUNNEL_METRICS_UPDATE_FREQ"},
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringSliceFlag(&cli.StringSliceFlag{
-			Name:    "tag",
+			Name:    cfdflags.Tag,
 			Usage:   "Custom tags used to identify this tunnel via added HTTP request headers to the origin, in format `KEY=VALUE`. Multiple tags may be specified.",
 			EnvVars: []string{"TUNNEL_TAG"},
 			Hidden:  true,
@@ -801,64 +749,64 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 			Hidden: true,
 		}),
 		altsrc.NewIntFlag(&cli.IntFlag{
-			Name:   "max-edge-addr-retries",
+			Name:   cfdflags.MaxEdgeAddrRetries,
 			Usage:  "Maximum number of times to retry on edge addrs before falling back to a lower protocol",
 			Value:  8,
 			Hidden: true,
 		}),
 		// Note TUN-3758 , we use Int because UInt is not supported with altsrc
 		altsrc.NewIntFlag(&cli.IntFlag{
-			Name:    "retries",
+			Name:    cfdflags.Retries,
 			Value:   5,
 			Usage:   "Maximum number of retries for connection/protocol errors.",
 			EnvVars: []string{"TUNNEL_RETRIES"},
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewIntFlag(&cli.IntFlag{
-			Name:   haConnectionsFlag,
+			Name:   cfdflags.HaConnections,
 			Value:  4,
 			Hidden: true,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:   rpcTimeout,
+			Name:   cfdflags.RpcTimeout,
 			Value:  5 * time.Second,
 			Hidden: true,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:    writeStreamTimeout,
+			Name:    cfdflags.WriteStreamTimeout,
 			EnvVars: []string{"TUNNEL_STREAM_WRITE_TIMEOUT"},
 			Usage:   "Use this option to add a stream write timeout for connections when writing towards the origin or edge. Default is 0 which disables the write timeout.",
 			Value:   0 * time.Second,
 			Hidden:  true,
 		}),
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:    quicDisablePathMTUDiscovery,
+			Name:    cfdflags.QuicDisablePathMTUDiscovery,
 			EnvVars: []string{"TUNNEL_DISABLE_QUIC_PMTU"},
 			Usage:   "Use this option to disable PTMU discovery for QUIC connections. This will result in lower packet sizes. Not however, that this may cause instability for UDP proxying.",
 			Value:   false,
 			Hidden:  true,
 		}),
 		altsrc.NewIntFlag(&cli.IntFlag{
-			Name:    quicConnLevelFlowControlLimit,
+			Name:    cfdflags.QuicConnLevelFlowControlLimit,
 			EnvVars: []string{"TUNNEL_QUIC_CONN_LEVEL_FLOW_CONTROL_LIMIT"},
 			Usage:   "Use this option to change the connection-level flow control limit for QUIC transport.",
 			Value:   30 * (1 << 20), // 30 MB
 			Hidden:  true,
 		}),
 		altsrc.NewIntFlag(&cli.IntFlag{
-			Name:    quicStreamLevelFlowControlLimit,
+			Name:    cfdflags.QuicStreamLevelFlowControlLimit,
 			EnvVars: []string{"TUNNEL_QUIC_STREAM_LEVEL_FLOW_CONTROL_LIMIT"},
 			Usage:   "Use this option to change the connection-level flow control limit for QUIC transport.",
 			Value:   6 * (1 << 20), // 6 MB
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:  connectorLabelFlag,
+			Name:  cfdflags.ConnectorLabel,
 			Usage: "Use this option to give a meaningful label to a specific connector. When a tunnel starts up, a connector id unique to the tunnel is generated. This is a uuid. To make it easier to identify a connector, we will use the hostname of the machine the tunnel is running on along with the connector ID. This option exists if one wants to have more control over what their individual connectors are called.",
 			Value: "",
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:    "grace-period",
+			Name:    cfdflags.GracePeriod,
 			Usage:   "When cloudflared receives SIGINT/SIGTERM it will stop accepting new requests, wait for in-progress requests to terminate, then shutdown. Waiting for in-progress requests will timeout after this grace period, or when a second SIGTERM/SIGINT is received.",
 			Value:   time.Second * 30,
 			EnvVars: []string{"TUNNEL_GRACE_PERIOD"},
@@ -894,14 +842,14 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 			Value:   false,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    "name",
+			Name:    cfdflags.Name,
 			Aliases: []string{"n"},
 			EnvVars: []string{"TUNNEL_NAME"},
 			Usage:   "Stable name to identify the tunnel. Using this flag will create, route and run a tunnel. For production usage, execute each command separately",
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:   uiFlag,
+			Name:   cfdflags.Ui,
 			Usage:  "(depreciated) Launch tunnel UI. Tunnel logs are scrollable via 'j', 'k', or arrow keys.",
 			Value:  false,
 			Hidden: true,
@@ -919,11 +867,10 @@ func tunnelFlags(shouldHide bool) []cli.Flag {
 			Hidden:  true,
 		}),
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:    "post-quantum",
+			Name:    cfdflags.PostQuantum,
 			Usage:   "When given creates an experimental post-quantum secure tunnel",
 			Aliases: []string{"pq"},
 			EnvVars: []string{"TUNNEL_POST_QUANTUM"},
-			Hidden:  FipsEnabled,
 		}),
 		altsrc.NewBoolFlag(&cli.BoolFlag{
 			Name:    "management-diagnostics",
@@ -948,27 +895,27 @@ func configureCloudflaredFlags(shouldHide bool) []cli.Flag {
 			Hidden: shouldHide,
 		},
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    credentials.OriginCertFlag,
+			Name:    cfdflags.OriginCert,
 			Usage:   "Path to the certificate generated for your origin when you run cloudflared login.",
 			EnvVars: []string{"TUNNEL_ORIGIN_CERT"},
 			Value:   credentials.FindDefaultOriginCertPath(),
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:   "autoupdate-freq",
+			Name:   cfdflags.AutoUpdateFreq,
 			Usage:  fmt.Sprintf("Autoupdate frequency. Default is %v.", updater.DefaultCheckUpdateFreq),
 			Value:  updater.DefaultCheckUpdateFreq,
 			Hidden: shouldHide,
 		}),
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:    "no-autoupdate",
+			Name:    cfdflags.NoAutoUpdate,
 			Usage:   "Disable periodic check for updates, restarting the server with the new version.",
 			EnvVars: []string{"NO_AUTOUPDATE"},
 			Value:   false,
 			Hidden:  shouldHide,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:  "metrics",
+			Name:  cfdflags.Metrics,
 			Value: metrics.GetMetricsDefaultAddress(metrics.Runtime),
 			Usage: fmt.Sprintf(
 				`Listen address for metrics reporting. If no address is passed cloudflared will try to bind to %v.
@@ -1132,62 +1079,62 @@ func legacyTunnelFlag(msg string) string {
 func sshFlags(shouldHide bool) []cli.Flag {
 	return []cli.Flag{
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    sshPortFlag,
+			Name:    cfdflags.SshPort,
 			Usage:   "Localhost port that cloudflared SSH server will run on",
 			Value:   "2222",
 			EnvVars: []string{"LOCAL_SSH_PORT"},
 			Hidden:  true,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:    sshIdleTimeoutFlag,
+			Name:    cfdflags.SshIdleTimeout,
 			Usage:   "Connection timeout after no activity",
 			EnvVars: []string{"SSH_IDLE_TIMEOUT"},
 			Hidden:  true,
 		}),
 		altsrc.NewDurationFlag(&cli.DurationFlag{
-			Name:    sshMaxTimeoutFlag,
+			Name:    cfdflags.SshMaxTimeout,
 			Usage:   "Absolute connection timeout",
 			EnvVars: []string{"SSH_MAX_TIMEOUT"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    bucketNameFlag,
+			Name:    cfdflags.SshLogUploaderBucketName,
 			Usage:   "Bucket name of where to upload SSH logs",
 			EnvVars: []string{"BUCKET_ID"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    regionNameFlag,
+			Name:    cfdflags.SshLogUploaderRegionName,
 			Usage:   "Region name of where to upload SSH logs",
 			EnvVars: []string{"REGION_ID"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    secretIDFlag,
+			Name:    cfdflags.SshLogUploaderSecretID,
 			Usage:   "Secret ID of where to upload SSH logs",
 			EnvVars: []string{"SECRET_ID"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    accessKeyIDFlag,
+			Name:    cfdflags.SshLogUploaderAccessKeyID,
 			Usage:   "Access Key ID of where to upload SSH logs",
 			EnvVars: []string{"ACCESS_CLIENT_ID"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    sessionTokenIDFlag,
+			Name:    cfdflags.SshLogUploaderSessionTokenID,
 			Usage:   "Session Token to use in the configuration of SSH logs uploading",
 			EnvVars: []string{"SESSION_TOKEN_ID"},
 			Hidden:  true,
 		}),
 		altsrc.NewStringFlag(&cli.StringFlag{
-			Name:    s3URLFlag,
+			Name:    cfdflags.SshLogUploaderS3URL,
 			Usage:   "S3 url of where to upload SSH logs",
 			EnvVars: []string{"S3_URL"},
 			Hidden:  true,
 		}),
 		altsrc.NewPathFlag(&cli.PathFlag{
-			Name:    hostKeyPath,
+			Name:    cfdflags.HostKeyPath,
 			Usage:   "Absolute path of directory to save SSH host keys in",
 			EnvVars: []string{"HOST_KEY_PATH"},
 			Hidden:  true,
@@ -1227,7 +1174,7 @@ func sshFlags(shouldHide bool) []cli.Flag {
 func configureProxyDNSFlags(shouldHide bool) []cli.Flag {
 	return []cli.Flag{
 		altsrc.NewBoolFlag(&cli.BoolFlag{
-			Name:    "proxy-dns",
+			Name:    cfdflags.ProxyDns,
 			Usage:   "Run a DNS over HTTPS proxy server.",
 			EnvVars: []string{"TUNNEL_DNS"},
 			Hidden:  shouldHide,
@@ -1325,7 +1272,7 @@ func nonSecretCliFlags(log *zerolog.Logger, cli *cli.Context, flagInclusionList
 		}
 
 		switch flag {
-		case logger.LogDirectoryFlag, logger.LogFileFlag:
+		case cfdflags.LogDirectory, cfdflags.LogFile:
 			{
 				absolute, err := filepath.Abs(value)
 				if err != nil {
diff --git a/cmd/cloudflared/tunnel/configuration.go b/cmd/cloudflared/tunnel/configuration.go
index c5983273..fc21c7ec 100644
--- a/cmd/cloudflared/tunnel/configuration.go
+++ b/cmd/cloudflared/tunnel/configuration.go
@@ -18,6 +18,7 @@ import (
 	"golang.org/x/term"
 
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	"github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/connection"
 	"github.com/cloudflare/cloudflared/edgediscovery"
@@ -33,26 +34,27 @@ import (
 const (
 	secretValue       = "*****"
 	icmpFunnelTimeout = time.Second * 10
+	fedRampRegion     = "fed" // region name used to connect to FedRAMP servers
 )
 
 var (
-	developerPortal = "https://developers.cloudflare.com/cloudflare-one/connections/connect-apps/install-and-setup"
-	serviceUrl      = developerPortal + "/tunnel-guide/local/as-a-service/"
-	argumentsUrl    = developerPortal + "/tunnel-guide/local/local-management/arguments/"
-
 	secretFlags = [2]*altsrc.StringFlag{credentialsContentsFlag, tunnelTokenFlag}
 
-	configFlags = []string{"autoupdate-freq", "no-autoupdate", "retries", "protocol", "loglevel", "transport-loglevel", "origincert", "metrics", "metrics-update-freq", "edge-ip-version", "edge-bind-address"}
-)
-
-func generateRandomClientID(log *zerolog.Logger) (string, error) {
-	u, err := uuid.NewRandom()
-	if err != nil {
-		log.Error().Msgf("couldn't create UUID for client ID %s", err)
-		return "", err
+	configFlags = []string{
+		flags.AutoUpdateFreq,
+		flags.NoAutoUpdate,
+		flags.Retries,
+		flags.Protocol,
+		flags.LogLevel,
+		flags.TransportLogLevel,
+		flags.OriginCert,
+		flags.Metrics,
+		flags.MetricsUpdateFreq,
+		flags.EdgeIpVersion,
+		flags.EdgeBindAddress,
+		flags.MaxActiveFlows,
 	}
-	return u.String(), nil
-}
+)
 
 func logClientOptions(c *cli.Context, log *zerolog.Logger) {
 	flags := make(map[string]interface{})
@@ -109,8 +111,8 @@ func isSecretEnvVar(key string) bool {
 }
 
 func dnsProxyStandAlone(c *cli.Context, namedTunnel *connection.TunnelProperties) bool {
-	return c.IsSet("proxy-dns") &&
-		!(c.IsSet("name") || // adhoc-named tunnel
+	return c.IsSet(flags.ProxyDns) &&
+		!(c.IsSet(flags.Name) || // adhoc-named tunnel
 			c.IsSet(ingress.HelloWorldFlag) || // quick or named tunnel
 			namedTunnel != nil) // named tunnel
 }
@@ -128,18 +130,15 @@ func prepareTunnelConfig(
 		return nil, nil, errors.Wrap(err, "can't generate connector UUID")
 	}
 	log.Info().Msgf("Generated Connector ID: %s", clientID)
-	tags, err := NewTagSliceFromCLI(c.StringSlice("tag"))
+	tags, err := NewTagSliceFromCLI(c.StringSlice(flags.Tag))
 	if err != nil {
 		log.Err(err).Msg("Tag parse failure")
 		return nil, nil, errors.Wrap(err, "Tag parse failure")
 	}
 	tags = append(tags, pogs.Tag{Name: "ID", Value: clientID.String()})
 
-	transportProtocol := c.String("protocol")
-
-	if c.Bool("post-quantum") && FipsEnabled {
-		return nil, nil, fmt.Errorf("post-quantum not supported in FIPS mode")
-	}
+	transportProtocol := c.String(flags.Protocol)
+	isPostQuantumEnforced := c.Bool(flags.PostQuantum)
 
 	featureSelector, err := features.NewFeatureSelector(ctx, namedTunnel.Credentials.AccountTag, c.StringSlice("features"), c.Bool("post-quantum"), log)
 	if err != nil {
@@ -153,11 +152,6 @@ func prepareTunnelConfig(
 			return nil, nil, fmt.Errorf("post-quantum is only supported with the quic transport")
 		}
 		transportProtocol = connection.QUIC.String()
-
-		log.Info().Msgf(
-			"Using hybrid post-quantum key agreement %s",
-			supervisor.PQKexName,
-		)
 	}
 
 	namedTunnel.Client = pogs.ClientInfo{
@@ -172,7 +166,7 @@ func prepareTunnelConfig(
 		return nil, nil, err
 	}
 
-	protocolSelector, err := connection.NewProtocolSelector(transportProtocol, namedTunnel.Credentials.AccountTag, c.IsSet(TunnelTokenFlag), c.Bool("post-quantum"), edgediscovery.ProtocolPercentage, connection.ResolveTTL, log)
+	protocolSelector, err := connection.NewProtocolSelector(transportProtocol, namedTunnel.Credentials.AccountTag, c.IsSet(TunnelTokenFlag), isPostQuantumEnforced, edgediscovery.ProtocolPercentage, connection.ResolveTTL, log)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -198,11 +192,11 @@ func prepareTunnelConfig(
 	if err != nil {
 		return nil, nil, err
 	}
-	edgeIPVersion, err := parseConfigIPVersion(c.String("edge-ip-version"))
+	edgeIPVersion, err := parseConfigIPVersion(c.String(flags.EdgeIpVersion))
 	if err != nil {
 		return nil, nil, err
 	}
-	edgeBindAddr, err := parseConfigBindAddress(c.String("edge-bind-address"))
+	edgeBindAddr, err := parseConfigBindAddress(c.String(flags.EdgeBindAddress))
 	if err != nil {
 		return nil, nil, err
 	}
@@ -215,36 +209,50 @@ func prepareTunnelConfig(
 		log.Warn().Str("edgeIPVersion", edgeIPVersion.String()).Err(err).Msg("Overriding edge-ip-version")
 	}
 
+	region := c.String(flags.Region)
+	endpoint := namedTunnel.Credentials.Endpoint
+	var resolvedRegion string
+	// Set resolvedRegion to either the region passed as an argument
+	// or the endpoint stored in the credentials. The two are mutually
+	// exclusive: supplying both is rejected with an error below.
+	if region != "" && endpoint != "" {
+		return nil, nil, fmt.Errorf("region provided with a token that has an endpoint")
+	} else if region != "" {
+		resolvedRegion = region
+	} else if endpoint != "" {
+		resolvedRegion = endpoint
+	}
+
 	tunnelConfig := &supervisor.TunnelConfig{
 		GracePeriod:     gracePeriod,
-		ReplaceExisting: c.Bool("force"),
+		ReplaceExisting: c.Bool(flags.Force),
 		OSArch:          info.OSArch(),
 		ClientID:        clientID.String(),
-		EdgeAddrs:       c.StringSlice("edge"),
-		Region:          c.String("region"),
+		EdgeAddrs:       c.StringSlice(flags.Edge),
+		Region:          resolvedRegion,
 		EdgeIPVersion:   edgeIPVersion,
 		EdgeBindAddr:    edgeBindAddr,
-		HAConnections:   c.Int(haConnectionsFlag),
-		IsAutoupdated:   c.Bool("is-autoupdated"),
-		LBPool:          c.String("lb-pool"),
+		HAConnections:   c.Int(flags.HaConnections),
+		IsAutoupdated:   c.Bool(flags.IsAutoUpdated),
+		LBPool:          c.String(flags.LBPool),
 		Tags:            tags,
 		Log:             log,
 		LogTransport:    logTransport,
 		Observer:        observer,
 		ReportedVersion: info.Version(),
 		// Note TUN-3758 , we use Int because UInt is not supported with altsrc
-		Retries:                             uint(c.Int("retries")),
+		Retries:                             uint(c.Int(flags.Retries)), // nolint: gosec
 		RunFromTerminal:                     isRunningFromTerminal(),
 		NamedTunnel:                         namedTunnel,
 		ProtocolSelector:                    protocolSelector,
 		EdgeTLSConfigs:                      edgeTLSConfigs,
 		FeatureSelector:                     featureSelector,
-		MaxEdgeAddrRetries:                  uint8(c.Int("max-edge-addr-retries")),
-		RPCTimeout:                          c.Duration(rpcTimeout),
-		WriteStreamTimeout:                  c.Duration(writeStreamTimeout),
-		DisableQUICPathMTUDiscovery:         c.Bool(quicDisablePathMTUDiscovery),
-		QUICConnectionLevelFlowControlLimit: c.Uint64(quicConnLevelFlowControlLimit),
-		QUICStreamLevelFlowControlLimit:     c.Uint64(quicStreamLevelFlowControlLimit),
+		MaxEdgeAddrRetries:                  uint8(c.Int(flags.MaxEdgeAddrRetries)), // nolint: gosec
+		RPCTimeout:                          c.Duration(flags.RpcTimeout),
+		WriteStreamTimeout:                  c.Duration(flags.WriteStreamTimeout),
+		DisableQUICPathMTUDiscovery:         c.Bool(flags.QuicDisablePathMTUDiscovery),
+		QUICConnectionLevelFlowControlLimit: c.Uint64(flags.QuicConnLevelFlowControlLimit),
+		QUICStreamLevelFlowControlLimit:     c.Uint64(flags.QuicStreamLevelFlowControlLimit),
 	}
 	icmpRouter, err := newICMPRouter(c, log)
 	if err != nil {
@@ -256,7 +264,7 @@ func prepareTunnelConfig(
 		Ingress:            &ingressRules,
 		WarpRouting:        ingress.NewWarpRoutingConfig(&cfg.WarpRouting),
 		ConfigurationFlags: parseConfigFlags(c),
-		WriteTimeout:       c.Duration(writeStreamTimeout),
+		WriteTimeout:       tunnelConfig.WriteStreamTimeout,
 	}
 	return tunnelConfig, orchestratorConfig, nil
 }
@@ -274,9 +282,9 @@ func parseConfigFlags(c *cli.Context) map[string]string {
 }
 
 func gracePeriod(c *cli.Context) (time.Duration, error) {
-	period := c.Duration("grace-period")
+	period := c.Duration(flags.GracePeriod)
 	if period > connection.MaxGracePeriod {
-		return time.Duration(0), fmt.Errorf("grace-period must be equal or less than %v", connection.MaxGracePeriod)
+		return time.Duration(0), fmt.Errorf("%s must be equal or less than %v", flags.GracePeriod, connection.MaxGracePeriod)
 	}
 	return period, nil
 }
@@ -359,14 +367,14 @@ func newICMPRouter(c *cli.Context, logger *zerolog.Logger) (ingress.ICMPRouterSe
 }
 
 func determineICMPSources(c *cli.Context, logger *zerolog.Logger) (netip.Addr, netip.Addr, error) {
-	ipv4Src, err := determineICMPv4Src(c.String("icmpv4-src"), logger)
+	ipv4Src, err := determineICMPv4Src(c.String(flags.ICMPV4Src), logger)
 	if err != nil {
 		return netip.Addr{}, netip.Addr{}, errors.Wrap(err, "failed to determine IPv4 source address for ICMP proxy")
 	}
 
 	logger.Info().Msgf("ICMP proxy will use %s as source for IPv4", ipv4Src)
 
-	ipv6Src, zone, err := determineICMPv6Src(c.String("icmpv6-src"), logger, ipv4Src)
+	ipv6Src, zone, err := determineICMPv6Src(c.String(flags.ICMPV6Src), logger, ipv4Src)
 	if err != nil {
 		return netip.Addr{}, netip.Addr{}, errors.Wrap(err, "failed to determine IPv6 source address for ICMP proxy")
 	}
diff --git a/cmd/cloudflared/tunnel/credential_finder.go b/cmd/cloudflared/tunnel/credential_finder.go
index 92e05495..c50ff457 100644
--- a/cmd/cloudflared/tunnel/credential_finder.go
+++ b/cmd/cloudflared/tunnel/credential_finder.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"path/filepath"
 
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/credentials"
 
@@ -57,7 +58,7 @@ func newSearchByID(id uuid.UUID, c *cli.Context, log *zerolog.Logger, fs fileSys
 }
 
 func (s searchByID) Path() (string, error) {
-	originCertPath := s.c.String(credentials.OriginCertFlag)
+	originCertPath := s.c.String(cfdflags.OriginCert)
 	originCertLog := s.log.With().
 		Str("originCertPath", originCertPath).
 		Logger()
diff --git a/cmd/cloudflared/tunnel/fips.go b/cmd/cloudflared/tunnel/fips.go
deleted file mode 100644
index 03ae6d26..00000000
--- a/cmd/cloudflared/tunnel/fips.go
+++ /dev/null
@@ -1,3 +0,0 @@
-package tunnel
-
-var FipsEnabled bool
diff --git a/cmd/cloudflared/tunnel/login.go b/cmd/cloudflared/tunnel/login.go
index 632e622a..a5cf7813 100644
--- a/cmd/cloudflared/tunnel/login.go
+++ b/cmd/cloudflared/tunnel/login.go
@@ -67,7 +67,7 @@ func login(c *cli.Context) error {
 
 	path, ok, err := checkForExistingCert()
 	if ok {
-		fmt.Fprintf(os.Stdout, "You have an existing certificate at %s which login would overwrite.\nIf this is intentional, please move or delete that file then run this command again.\n", path)
+		log.Error().Err(err).Msgf("You have an existing certificate at %s which login would overwrite.\nIf this is intentional, please move or delete that file then run this command again.\n", path)
 		return nil
 	} else if err != nil {
 		return err
@@ -78,7 +78,8 @@ func login(c *cli.Context) error {
 		callbackStoreURL = c.String(callbackURLParamName)
 	)
 
-	if c.Bool(fedRAMPParamName) {
+	isFEDRamp := c.Bool(fedRAMPParamName)
+	if isFEDRamp {
 		baseloginURL = fedBaseLoginURL
 		callbackStoreURL = fedCallbackStoreURL
 	}
@@ -99,7 +100,23 @@ func login(c *cli.Context) error {
 		log,
 	)
 	if err != nil {
-		fmt.Fprintf(os.Stderr, "Failed to write the certificate due to the following error:\n%v\n\nYour browser will download the certificate instead. You will have to manually\ncopy it to the following path:\n\n%s\n", err, path)
+		log.Error().Err(err).Msgf("Failed to write the certificate.\n\nYour browser will download the certificate instead. You will have to manually\ncopy it to the following path:\n\n%s\n", path)
+		return err
+	}
+
+	cert, err := credentials.DecodeOriginCert(resourceData)
+	if err != nil {
+		log.Error().Err(err).Msg("failed to decode origin certificate")
+		return err
+	}
+
+	if isFEDRamp {
+		cert.Endpoint = credentials.FedEndpoint
+	}
+
+	resourceData, err = cert.EncodeOriginCert()
+	if err != nil {
+		log.Error().Err(err).Msg("failed to encode origin certificate")
 		return err
 	}
 
@@ -107,7 +124,7 @@ func login(c *cli.Context) error {
 		return errors.Wrap(err, fmt.Sprintf("error writing cert to %s", path))
 	}
 
-	fmt.Fprintf(os.Stdout, "You have successfully logged in.\nIf you wish to copy your credentials to a server, they have been saved to:\n%s\n", path)
+	log.Info().Msgf("You have successfully logged in.\nIf you wish to copy your credentials to a server, they have been saved to:\n%s\n", path)
 	return nil
 }
 
diff --git a/cmd/cloudflared/tunnel/quick_tunnel.go b/cmd/cloudflared/tunnel/quick_tunnel.go
index ee438450..e5e87da6 100644
--- a/cmd/cloudflared/tunnel/quick_tunnel.go
+++ b/cmd/cloudflared/tunnel/quick_tunnel.go
@@ -11,6 +11,7 @@ import (
 	"github.com/google/uuid"
 	"github.com/pkg/errors"
 
+	"github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/connection"
 )
 
@@ -82,13 +83,13 @@ func RunQuickTunnel(sc *subcommandContext) error {
 		sc.log.Info().Msg(line)
 	}
 
-	if !sc.c.IsSet("protocol") {
-		sc.c.Set("protocol", "quic")
+	if !sc.c.IsSet(flags.Protocol) {
+		_ = sc.c.Set(flags.Protocol, "quic")
 	}
 
 	// Override the number of connections used. Quick tunnels shouldn't be used for production usage,
 	// so, use a single connection instead.
-	sc.c.Set(haConnectionsFlag, "1")
+	_ = sc.c.Set(flags.HaConnections, "1")
 	return StartServer(
 		sc.c,
 		buildInfo,
diff --git a/cmd/cloudflared/tunnel/subcommand_context.go b/cmd/cloudflared/tunnel/subcommand_context.go
index 83332b51..553cb83b 100644
--- a/cmd/cloudflared/tunnel/subcommand_context.go
+++ b/cmd/cloudflared/tunnel/subcommand_context.go
@@ -14,17 +14,20 @@ import (
 	"github.com/urfave/cli/v2"
 
 	"github.com/cloudflare/cloudflared/cfapi"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/connection"
 	"github.com/cloudflare/cloudflared/credentials"
 	"github.com/cloudflare/cloudflared/logger"
 )
 
-type errInvalidJSONCredential struct {
+const fedRampBaseApiURL = "https://api.fed.cloudflare.com/client/v4"
+
+type invalidJSONCredentialError struct {
 	err  error
 	path string
 }
 
-func (e errInvalidJSONCredential) Error() string {
+func (e invalidJSONCredentialError) Error() string {
 	return "Invalid JSON when parsing tunnel credentials file"
 }
 
@@ -64,7 +67,16 @@ func (sc *subcommandContext) client() (cfapi.Client, error) {
 	if err != nil {
 		return nil, err
 	}
-	sc.tunnelstoreClient, err = cred.Client(sc.c.String("api-url"), buildInfo.UserAgent(), sc.log)
+
+	var apiURL string
+	if cred.IsFEDEndpoint() {
+		sc.log.Info().Str("api-url", fedRampBaseApiURL).Msg("using fedramp base api")
+		apiURL = fedRampBaseApiURL
+	} else {
+		apiURL = sc.c.String(cfdflags.ApiURL)
+	}
+
+	sc.tunnelstoreClient, err = cred.Client(apiURL, buildInfo.UserAgent(), sc.log)
 	if err != nil {
 		return nil, err
 	}
@@ -73,7 +85,7 @@ func (sc *subcommandContext) client() (cfapi.Client, error) {
 
 func (sc *subcommandContext) credential() (*credentials.User, error) {
 	if sc.userCredential == nil {
-		uc, err := credentials.Read(sc.c.String(credentials.OriginCertFlag), sc.log)
+		uc, err := credentials.Read(sc.c.String(cfdflags.OriginCert), sc.log)
 		if err != nil {
 			return nil, err
 		}
@@ -100,7 +112,7 @@ func (sc *subcommandContext) readTunnelCredentials(credFinder CredFinder) (conne
 				"You may have accidentally used the filepath to cert.pem, which is generated by `cloudflared tunnel " +
 				"login`.")
 		}
-		return connection.Credentials{}, errInvalidJSONCredential{path: filePath, err: err}
+		return connection.Credentials{}, invalidJSONCredentialError{path: filePath, err: err}
 	}
 	return credentials, nil
 }
@@ -122,7 +134,7 @@ func (sc *subcommandContext) create(name string, credentialsFilePath string, sec
 		if err != nil {
 			return nil, errors.Wrap(err, "Couldn't decode tunnel secret from base64")
 		}
-		tunnelSecret = []byte(decodedSecret)
+		tunnelSecret = decodedSecret
 		if len(tunnelSecret) < 32 {
 			return nil, errors.New("Decoded tunnel secret must be at least 32 bytes long")
 		}
@@ -160,7 +172,7 @@ func (sc *subcommandContext) create(name string, credentialsFilePath string, sec
 			errorLines = append(errorLines, fmt.Sprintf("Cloudflared tried to delete the tunnel for you, but encountered an error. You should use `cloudflared tunnel delete %v` to delete the tunnel yourself, because the tunnel can't be run without the tunnelfile.", tunnel.ID))
 			errorLines = append(errorLines, fmt.Sprintf("The delete tunnel error is: %v", deleteErr))
 		} else {
-			errorLines = append(errorLines, fmt.Sprintf("The tunnel was deleted, because the tunnel can't be run without the credentials file"))
+			errorLines = append(errorLines, "The tunnel was deleted, because the tunnel can't be run without the credentials file")
 		}
 		errorMsg := strings.Join(errorLines, "\n")
 		return nil, errors.New(errorMsg)
@@ -189,7 +201,7 @@ func (sc *subcommandContext) list(filter *cfapi.TunnelFilter) ([]*cfapi.Tunnel,
 }
 
 func (sc *subcommandContext) delete(tunnelIDs []uuid.UUID) error {
-	forceFlagSet := sc.c.Bool("force")
+	forceFlagSet := sc.c.Bool(cfdflags.Force)
 
 	client, err := sc.client()
 	if err != nil {
@@ -229,7 +241,7 @@ func (sc *subcommandContext) findCredentials(tunnelID uuid.UUID) (connection.Cre
 	var err error
 	if credentialsContents := sc.c.String(CredContentsFlag); credentialsContents != "" {
 		if err = json.Unmarshal([]byte(credentialsContents), &credentials); err != nil {
-			err = errInvalidJSONCredential{path: "TUNNEL_CRED_CONTENTS", err: err}
+			err = invalidJSONCredentialError{path: "TUNNEL_CRED_CONTENTS", err: err}
 		}
 	} else {
 		credFinder := sc.credentialFinder(tunnelID)
@@ -245,7 +257,7 @@ func (sc *subcommandContext) findCredentials(tunnelID uuid.UUID) (connection.Cre
 func (sc *subcommandContext) run(tunnelID uuid.UUID) error {
 	credentials, err := sc.findCredentials(tunnelID)
 	if err != nil {
-		if e, ok := err.(errInvalidJSONCredential); ok {
+		if e, ok := err.(invalidJSONCredentialError); ok {
 			sc.log.Error().Msgf("The credentials file at %s contained invalid JSON. This is probably caused by passing the wrong filepath. Reminder: the credentials file is a .json file created via `cloudflared tunnel create`.", e.path)
 			sc.log.Error().Msgf("Invalid JSON when parsing credentials file: %s", e.err.Error())
 		}
diff --git a/cmd/cloudflared/tunnel/subcommands.go b/cmd/cloudflared/tunnel/subcommands.go
index ee26e2ae..a73cec9a 100644
--- a/cmd/cloudflared/tunnel/subcommands.go
+++ b/cmd/cloudflared/tunnel/subcommands.go
@@ -16,19 +16,21 @@ import (
 	"time"
 
 	"github.com/google/uuid"
-	homedir "github.com/mitchellh/go-homedir"
+	"github.com/mitchellh/go-homedir"
 	"github.com/pkg/errors"
 	"github.com/urfave/cli/v2"
 	"github.com/urfave/cli/v2/altsrc"
 	"golang.org/x/net/idna"
-	yaml "gopkg.in/yaml.v3"
+	"gopkg.in/yaml.v3"
 
 	"github.com/cloudflare/cloudflared/cfapi"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	"github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/updater"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/connection"
 	"github.com/cloudflare/cloudflared/diagnostic"
+	"github.com/cloudflare/cloudflared/fips"
 	"github.com/cloudflare/cloudflared/metrics"
 )
 
@@ -47,7 +49,6 @@ const (
 	noDiagNetworkFlagName   = "no-diag-network"
 	diagContainerIDFlagName = "diag-container-id"
 	diagPodFlagName         = "diag-pod-id"
-	metricsFlagName         = "metrics"
 
 	LogFieldTunnelID = "tunnelID"
 )
@@ -59,7 +60,7 @@ var (
 		Usage:   "Include deleted tunnels in the list",
 	}
 	listNameFlag = &cli.StringFlag{
-		Name:    "name",
+		Name:    flags.Name,
 		Aliases: []string{"n"},
 		Usage:   "List tunnels with the given `NAME`",
 	}
@@ -107,7 +108,7 @@ var (
 		EnvVars: []string{"TUNNEL_LIST_INVERT_SORT"},
 	}
 	featuresFlag = altsrc.NewStringSliceFlag(&cli.StringSliceFlag{
-		Name:    "features",
+		Name:    flags.Features,
 		Aliases: []string{"F"},
 		Usage:   "Opt into various features that are still being developed or tested.",
 	})
@@ -129,14 +130,14 @@ var (
 		EnvVars: []string{"TUNNEL_TOKEN"},
 	})
 	forceDeleteFlag = &cli.BoolFlag{
-		Name:    "force",
+		Name:    flags.Force,
 		Aliases: []string{"f"},
 		Usage: "Deletes a tunnel even if tunnel is connected and it has dependencies associated to it. (eg. IP routes)." +
 			" It is not possible to delete tunnels that have connections or non-deleted dependencies, without this flag.",
 		EnvVars: []string{"TUNNEL_RUN_FORCE_OVERWRITE"},
 	}
 	selectProtocolFlag = altsrc.NewStringFlag(&cli.StringFlag{
-		Name:    "protocol",
+		Name:    flags.Protocol,
 		Value:   connection.AutoSelectFlag,
 		Aliases: []string{"p"},
 		Usage:   fmt.Sprintf("Protocol implementation to connect with Cloudflare's edge network. %s", connection.AvailableProtocolFlagMessage),
@@ -144,11 +145,11 @@ var (
 		Hidden:  true,
 	})
 	postQuantumFlag = altsrc.NewBoolFlag(&cli.BoolFlag{
-		Name:    "post-quantum",
+		Name:    flags.PostQuantum,
 		Usage:   "When given creates an experimental post-quantum secure tunnel",
 		Aliases: []string{"pq"},
 		EnvVars: []string{"TUNNEL_POST_QUANTUM"},
-		Hidden:  FipsEnabled,
+		Hidden:  fips.IsFipsEnabled(),
 	})
 	sortInfoByFlag = &cli.StringFlag{
 		Name:    "sort-by",
@@ -180,17 +181,17 @@ var (
 		EnvVars: []string{"TUNNEL_CREATE_SECRET"},
 	}
 	icmpv4SrcFlag = &cli.StringFlag{
-		Name:    "icmpv4-src",
+		Name:    flags.ICMPV4Src,
 		Usage:   "Source address to send/receive ICMPv4 messages. If not provided cloudflared will dial a local address to determine the source IP or fallback to 0.0.0.0.",
 		EnvVars: []string{"TUNNEL_ICMPV4_SRC"},
 	}
 	icmpv6SrcFlag = &cli.StringFlag{
-		Name:    "icmpv6-src",
+		Name:    flags.ICMPV6Src,
 		Usage:   "Source address and the interface name to send/receive ICMPv6 messages. If not provided cloudflared will dial a local address to determine the source IP or fallback to ::.",
 		EnvVars: []string{"TUNNEL_ICMPV6_SRC"},
 	}
 	metricsFlag = &cli.StringFlag{
-		Name:  metricsFlagName,
+		Name:  flags.Metrics,
 		Usage: "The metrics server address i.e.: 127.0.0.1:12345. If your instance is running in a Docker/Kubernetes environment you need to setup port forwarding for your application.",
 		Value: "",
 	}
@@ -229,6 +230,11 @@ var (
 		Usage: "Network diagnostics won't be performed",
 		Value: false,
 	}
+	maxActiveFlowsFlag = &cli.Uint64Flag{
+		Name:    flags.MaxActiveFlows,
+		Usage:   "Overrides the remote configuration for max active private network flows (TCP/UDP) that this cloudflared instance supports",
+		EnvVars: []string{"TUNNEL_MAX_ACTIVE_FLOWS"},
+	}
 )
 
 func buildCreateCommand() *cli.Command {
@@ -331,7 +337,7 @@ func listCommand(c *cli.Context) error {
 	if !c.Bool("show-deleted") {
 		filter.NoDeleted()
 	}
-	if name := c.String("name"); name != "" {
+	if name := c.String(flags.Name); name != "" {
 		filter.ByName(name)
 	}
 	if namePrefix := c.String("name-prefix"); namePrefix != "" {
@@ -441,7 +447,7 @@ func fmtConnections(connections []cfapi.Connection, showRecentlyDisconnected boo
 	sort.Strings(sortedColos)
 
 	// Map each colo to its frequency, combine into output string.
-	var output []string
+	output := make([]string, 0, len(sortedColos))
 	for _, coloName := range sortedColos {
 		output = append(output, fmt.Sprintf("%dx%s", numConnsPerColo[coloName], coloName))
 	}
@@ -461,16 +467,21 @@ func buildReadyCommand() *cli.Command {
 }
 
 func readyCommand(c *cli.Context) error {
-	metricsOpts := c.String("metrics")
-	if !c.IsSet("metrics") {
-		return fmt.Errorf("--metrics has to be provided")
+	metricsOpts := c.String(flags.Metrics)
+	if !c.IsSet(flags.Metrics) {
+		return errors.New("--metrics has to be provided")
 	}
 
 	requestURL := fmt.Sprintf("http://%s/ready", metricsOpts)
-	res, err := http.Get(requestURL)
+	req, err := http.NewRequest(http.MethodGet, requestURL, nil)
 	if err != nil {
 		return err
 	}
+	res, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer res.Body.Close()
 	if res.StatusCode != 200 {
 		body, err := io.ReadAll(res.Body)
 		if err != nil {
@@ -699,6 +710,7 @@ func buildRunCommand() *cli.Command {
 		tunnelTokenFlag,
 		icmpv4SrcFlag,
 		icmpv6SrcFlag,
+		maxActiveFlowsFlag,
 	}
 	flags = append(flags, configureProxyFlags(false)...)
 	return &cli.Command{
@@ -1067,7 +1079,7 @@ func diagCommand(ctx *cli.Context) error {
 	log := sctx.log
 	options := diagnostic.Options{
 		KnownAddresses: metrics.GetMetricsKnownAddresses(metrics.Runtime),
-		Address:        sctx.c.String(metricsFlagName),
+		Address:        sctx.c.String(flags.Metrics),
 		ContainerID:    sctx.c.String(diagContainerIDFlagName),
 		PodID:          sctx.c.String(diagPodFlagName),
 		Toggles: diagnostic.Toggles{
diff --git a/cmd/cloudflared/updater/update.go b/cmd/cloudflared/updater/update.go
index 1d3cbc2e..5c201cdc 100644
--- a/cmd/cloudflared/updater/update.go
+++ b/cmd/cloudflared/updater/update.go
@@ -15,6 +15,7 @@ import (
 	"golang.org/x/term"
 
 	"github.com/cloudflare/cloudflared/cmd/cloudflared/cliutil"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/logger"
 )
@@ -38,6 +39,7 @@ var (
 
 // BinaryUpdated implements ExitCoder interface, the app will exit with status code 11
 // https://pkg.go.dev/github.com/urfave/cli/v2?tab=doc#ExitCoder
+// nolint: errname
 type statusSuccess struct {
 	newVersion string
 }
@@ -50,16 +52,16 @@ func (u *statusSuccess) ExitCode() int {
 	return 11
 }
 
-// UpdateErr implements ExitCoder interface, the app will exit with status code 10
-type statusErr struct {
+// statusError implements ExitCoder interface, the app will exit with status code 10
+type statusError struct {
 	err error
 }
 
-func (e *statusErr) Error() string {
+func (e *statusError) Error() string {
 	return fmt.Sprintf("failed to update cloudflared: %v", e.err)
 }
 
-func (e *statusErr) ExitCode() int {
+func (e *statusError) ExitCode() int {
 	return 10
 }
 
@@ -79,7 +81,7 @@ type UpdateOutcome struct {
 }
 
 func (uo *UpdateOutcome) noUpdate() bool {
-	return uo.Error == nil && uo.Updated == false
+	return uo.Error == nil && !uo.Updated
 }
 
 func Init(info *cliutil.BuildInfo) {
@@ -153,7 +155,7 @@ func Update(c *cli.Context) error {
 		log.Info().Msg("cloudflared is set to update from staging")
 	}
 
-	isForced := c.Bool("force")
+	isForced := c.Bool(cfdflags.Force)
 	if isForced {
 		log.Info().Msg("cloudflared is set to upgrade to the latest publish version regardless of the current version")
 	}
@@ -166,7 +168,7 @@ func Update(c *cli.Context) error {
 		intendedVersion: c.String("version"),
 	})
 	if updateOutcome.Error != nil {
-		return &statusErr{updateOutcome.Error}
+		return &statusError{updateOutcome.Error}
 	}
 
 	if updateOutcome.noUpdate() {
@@ -252,7 +254,7 @@ func (a *AutoUpdater) Run(ctx context.Context) error {
 				pid, err := a.listeners.StartProcess()
 				if err != nil {
 					a.log.Err(err).Msg("Unable to restart server automatically")
-					return &statusErr{err: err}
+					return &statusError{err: err}
 				}
 				// stop old process after autoupdate. Otherwise we create a new process
 				// after each update
diff --git a/component-tests/test_pq.py b/component-tests/test_pq.py
index a7b2ed50..2681e935 100644
--- a/component-tests/test_pq.py
+++ b/component-tests/test_pq.py
@@ -1,7 +1,6 @@
-from util import LOGGER, nofips, start_cloudflared, wait_tunnel_ready
+from util import LOGGER, start_cloudflared, wait_tunnel_ready
 
 
-@nofips
 class TestPostQuantum:
     def _extra_config(self):
         config = {
@@ -12,6 +11,11 @@ class TestPostQuantum:
     def test_post_quantum(self, tmp_path, component_tests_config):
         config = component_tests_config(self._extra_config())
         LOGGER.debug(config)
-        with start_cloudflared(tmp_path, config, cfd_pre_args=["tunnel", "--ha-connections", "1"], cfd_args=["run", "--post-quantum"], new_process=True):
-            wait_tunnel_ready(tunnel_url=config.get_url(),
-                              require_min_connections=1)
+        with start_cloudflared(
+            tmp_path,
+            config,
+            cfd_pre_args=["tunnel", "--ha-connections", "1"],
+            cfd_args=["run", "--post-quantum"],
+            new_process=True,
+        ):
+            wait_tunnel_ready(tunnel_url=config.get_url(), require_min_connections=1)
diff --git a/connection/connection.go b/connection/connection.go
index b7376e38..f141d255 100644
--- a/connection/connection.go
+++ b/connection/connection.go
@@ -60,6 +60,7 @@ type Credentials struct {
 	AccountTag   string
 	TunnelSecret []byte
 	TunnelID     uuid.UUID
+	Endpoint     string
 }
 
 func (c *Credentials) Auth() pogs.TunnelAuth {
@@ -74,13 +75,16 @@ type TunnelToken struct {
 	AccountTag   string    `json:"a"`
 	TunnelSecret []byte    `json:"s"`
 	TunnelID     uuid.UUID `json:"t"`
+	Endpoint     string    `json:"e,omitempty"`
 }
 
 func (t TunnelToken) Credentials() Credentials {
+	// nolint: gosimple
 	return Credentials{
 		AccountTag:   t.AccountTag,
 		TunnelSecret: t.TunnelSecret,
 		TunnelID:     t.TunnelID,
+		Endpoint:     t.Endpoint,
 	}
 }
 
diff --git a/connection/protocol.go b/connection/protocol.go
index 417c8b72..fd53c105 100644
--- a/connection/protocol.go
+++ b/connection/protocol.go
@@ -14,7 +14,7 @@ import (
 const (
 	AvailableProtocolFlagMessage = "Available protocols: 'auto' - automatically chooses the best protocol over time (the default; and also the recommended one); 'quic' - based on QUIC, relying on UDP egress to Cloudflare edge; 'http2' - using Go's HTTP2 library, relying on TCP egress to Cloudflare edge"
 	// edgeH2muxTLSServerName is the server name to establish h2mux connection with edge (unused, but kept for legacy reference).
-	edgeH2muxTLSServerName = "cftunnel.com"
+	_ = "cftunnel.com"
 	// edgeH2TLSServerName is the server name to establish http2 connection with edge
 	edgeH2TLSServerName = "h2.cftunnel.com"
 	// edgeQUICServerName is the server name to establish quic connection with edge.
@@ -24,11 +24,9 @@ const (
 	ResolveTTL = time.Hour
 )
 
-var (
-	// ProtocolList represents a list of supported protocols for communication with the edge
-	// in order of precedence for remote percentage fetcher.
-	ProtocolList = []Protocol{QUIC, HTTP2}
-)
+// ProtocolList represents a list of supported protocols for communication with the edge
+// in order of precedence for remote percentage fetcher.
+var ProtocolList = []Protocol{QUIC, HTTP2}
 
 type Protocol int64
 
@@ -58,7 +56,7 @@ func (p Protocol) String() string {
 	case QUIC:
 		return "quic"
 	default:
-		return fmt.Sprintf("unknown protocol")
+		return "unknown protocol"
 	}
 }
 
@@ -246,11 +244,11 @@ func NewProtocolSelector(
 		return newRemoteProtocolSelector(fetchedProtocol, ProtocolList, threshold, protocolFetcher, resolveTTL, log), nil
 	}
 
-	return nil, fmt.Errorf("Unknown protocol %s, %s", protocolFlag, AvailableProtocolFlagMessage)
+	return nil, fmt.Errorf("unknown protocol %s, %s", protocolFlag, AvailableProtocolFlagMessage)
 }
 
 func switchThreshold(accountTag string) int32 {
 	h := fnv.New32a()
 	_, _ = h.Write([]byte(accountTag))
-	return int32(h.Sum32() % 100)
+	return int32(h.Sum32() % 100) // nolint: gosec
 }
diff --git a/connection/quic_connection.go b/connection/quic_connection.go
index 59308db0..6addfd60 100644
--- a/connection/quic_connection.go
+++ b/connection/quic_connection.go
@@ -103,9 +103,15 @@ func (q *quicConnection) Serve(ctx context.Context) error {
 		// amount of the grace period, allowing requests to finish before we cancel the context, which will
 		// make cloudflared exit.
 		if err := q.serveControlStream(ctx, controlStream); err == nil {
-			select {
-			case <-ctx.Done():
-			case <-time.Tick(q.gracePeriod):
+			if q.gracePeriod > 0 {
+				// In Go1.23 this can be removed and replaced with time.Tick
+				// see https://pkg.go.dev/time#Tick
+				ticker := time.NewTicker(q.gracePeriod)
+				defer ticker.Stop()
+				select {
+				case <-ctx.Done():
+				case <-ticker.C:
+				}
 			}
 		}
 		cancel()
@@ -186,7 +192,7 @@ func (q *quicConnection) handleDataStream(ctx context.Context, stream *rpcquic.R
 		var metadata []pogs.Metadata
 		// Check the type of error that was throw and add metadata that will help identify it on OTD.
 		if errors.Is(err, cfdflow.ErrTooManyActiveFlows) {
-			metadata = append(metadata, pogs.ErrorFlowConnectRateLimitedKey)
+			metadata = append(metadata, pogs.ErrorFlowConnectRateLimitedMetadata)
 		}
 
 		if writeRespErr := stream.WriteConnectResponseData(err, metadata...); writeRespErr != nil {
diff --git a/connection/quic_connection_test.go b/connection/quic_connection_test.go
index e8ff6d55..49968372 100644
--- a/connection/quic_connection_test.go
+++ b/connection/quic_connection_test.go
@@ -639,7 +639,7 @@ func TestTCPProxy_FlowRateLimited(t *testing.T) {
 
 		// Got Rate Limited
 		assert.NotEmpty(t, response.Error)
-		assert.Contains(t, response.Metadata, pogs.ErrorFlowConnectRateLimitedKey)
+		assert.Contains(t, response.Metadata, pogs.ErrorFlowConnectRateLimitedMetadata)
 	}()
 
 	tunnelConn, _ := testTunnelConnection(t, netip.MustParseAddrPort(udpListener.LocalAddr().String()), uint8(0))
diff --git a/credentials/credentials.go b/credentials/credentials.go
index 8d1d8908..f5679b25 100644
--- a/credentials/credentials.go
+++ b/credentials/credentials.go
@@ -9,6 +9,7 @@ import (
 
 const (
 	logFieldOriginCertPath = "originCertPath"
+	FedEndpoint            = "fed"
 )
 
 type User struct {
@@ -32,6 +33,10 @@ func (c User) CertPath() string {
 	return c.certPath
 }
 
+func (c User) IsFEDEndpoint() bool {
+	return c.cert.Endpoint == FedEndpoint
+}
+
 // Client uses the user credentials to create a Cloudflare API client
 func (c *User) Client(apiURL string, userAgent string, log *zerolog.Logger) (cfapi.Client, error) {
 	if apiURL == "" {
@@ -45,7 +50,6 @@ func (c *User) Client(apiURL string, userAgent string, log *zerolog.Logger) (cfa
 		userAgent,
 		log,
 	)
-
 	if err != nil {
 		return nil, err
 	}
diff --git a/credentials/origin_cert.go b/credentials/origin_cert.go
index 73a59fa3..e8181b36 100644
--- a/credentials/origin_cert.go
+++ b/credentials/origin_cert.go
@@ -1,11 +1,13 @@
 package credentials
 
 import (
+	"bytes"
 	"encoding/json"
 	"encoding/pem"
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/mitchellh/go-homedir"
 	"github.com/rs/zerolog"
@@ -15,19 +17,30 @@ import (
 
 const (
 	DefaultCredentialFile = "cert.pem"
-	OriginCertFlag        = "origincert"
 )
 
-type namedTunnelToken struct {
+type OriginCert struct {
 	ZoneID    string `json:"zoneID"`
 	AccountID string `json:"accountID"`
 	APIToken  string `json:"apiToken"`
+	Endpoint  string `json:"endpoint,omitempty"`
 }
 
-type OriginCert struct {
-	ZoneID    string
-	APIToken  string
-	AccountID string
+// UnmarshalJSON decodes data into oc and lower-cases Endpoint, so the value
+// can be compared directly against the lower-case FedEndpoint constant.
+// On a decode failure oc is left untouched and the wrapped error is returned.
+func (oc *OriginCert) UnmarshalJSON(data []byte) error {
+	// plain has OriginCert's fields and JSON tags but none of its methods,
+	// so decoding into it cannot recurse back into this UnmarshalJSON.
+	type plain OriginCert
+	var aux plain
+	if err := json.Unmarshal(data, &aux); err != nil {
+		return fmt.Errorf("error parsing OriginCert: %w", err)
+	}
+	aux.Endpoint = strings.ToLower(aux.Endpoint)
+	// Replace all fields in one assignment.
+	*oc = OriginCert(aux)
+	return nil
+}
 
 // FindDefaultOriginCertPath returns the first path that contains a cert.pem file. If none of the
@@ -42,40 +55,56 @@ func FindDefaultOriginCertPath() string {
 	return ""
 }
 
+func DecodeOriginCert(blocks []byte) (*OriginCert, error) {
+	return decodeOriginCert(blocks)
+}
+
+// EncodeOriginCert marshals cert to JSON inside a PEM "ARGO TUNNEL TOKEN" block; a nil receiver is an error.
+func (cert *OriginCert) EncodeOriginCert() ([]byte, error) {
+	if cert == nil {
+		return nil, fmt.Errorf("originCert cannot be nil")
+	}
+	buffer, err := json.Marshal(cert)
+	if err != nil {
+		return nil, fmt.Errorf("originCert marshal failed: %w", err)
+	}
+	block := pem.Block{
+		Type:    "ARGO TUNNEL TOKEN",
+		Headers: map[string]string{},
+		Bytes:   buffer,
+	}
+	var out bytes.Buffer
+	if err := pem.Encode(&out, &block); err != nil {
+		return nil, fmt.Errorf("pem encoding failed: %w", err)
+	}
+	return out.Bytes(), nil
+}
+
 func decodeOriginCert(blocks []byte) (*OriginCert, error) {
 	if len(blocks) == 0 {
-		return nil, fmt.Errorf("Cannot decode empty certificate")
+		return nil, fmt.Errorf("cannot decode empty certificate")
 	}
 	originCert := OriginCert{}
 	block, rest := pem.Decode(blocks)
-	for {
-		if block == nil {
-			break
-		}
+	for block != nil {
 		switch block.Type {
 		case "PRIVATE KEY", "CERTIFICATE":
 			// this is for legacy purposes.
-			break
 		case "ARGO TUNNEL TOKEN":
 			if originCert.ZoneID != "" || originCert.APIToken != "" {
-				return nil, fmt.Errorf("Found multiple tokens in the certificate")
+				return nil, fmt.Errorf("found multiple tokens in the certificate")
 			}
 			// The token is a string,
 			// Try the newer JSON format
-			ntt := namedTunnelToken{}
-			if err := json.Unmarshal(block.Bytes, &ntt); err == nil {
-				originCert.ZoneID = ntt.ZoneID
-				originCert.APIToken = ntt.APIToken
-				originCert.AccountID = ntt.AccountID
-			}
+			_ = json.Unmarshal(block.Bytes, &originCert)
 		default:
-			return nil, fmt.Errorf("Unknown block %s in the certificate", block.Type)
+			return nil, fmt.Errorf("unknown block %s in the certificate", block.Type)
 		}
 		block, rest = pem.Decode(rest)
 	}
 
 	if originCert.ZoneID == "" || originCert.APIToken == "" {
-		return nil, fmt.Errorf("Missing token in the certificate")
+		return nil, fmt.Errorf("missing token in the certificate")
 	}
 
 	return &originCert, nil
diff --git a/credentials/origin_cert_test.go b/credentials/origin_cert_test.go
index 77a473e4..7e2a90a0 100644
--- a/credentials/origin_cert_test.go
+++ b/credentials/origin_cert_test.go
@@ -16,27 +16,25 @@ const (
 	originCertFile = "cert.pem"
 )
 
-var (
-	nopLog = zerolog.Nop().With().Logger()
-)
+var nopLog = zerolog.Nop().With().Logger()
 
 func TestLoadOriginCert(t *testing.T) {
 	cert, err := decodeOriginCert([]byte{})
-	assert.Equal(t, fmt.Errorf("Cannot decode empty certificate"), err)
+	assert.Equal(t, fmt.Errorf("cannot decode empty certificate"), err)
 	assert.Nil(t, cert)
 
 	blocks, err := os.ReadFile("test-cert-unknown-block.pem")
-	assert.NoError(t, err)
+	require.NoError(t, err)
 	cert, err = decodeOriginCert(blocks)
-	assert.Equal(t, fmt.Errorf("Unknown block RSA PRIVATE KEY in the certificate"), err)
+	assert.Equal(t, fmt.Errorf("unknown block RSA PRIVATE KEY in the certificate"), err)
 	assert.Nil(t, cert)
 }
 
 func TestJSONArgoTunnelTokenEmpty(t *testing.T) {
 	blocks, err := os.ReadFile("test-cert-no-token.pem")
-	assert.NoError(t, err)
+	require.NoError(t, err)
 	cert, err := decodeOriginCert(blocks)
-	assert.Equal(t, fmt.Errorf("Missing token in the certificate"), err)
+	assert.Equal(t, fmt.Errorf("missing token in the certificate"), err)
 	assert.Nil(t, cert)
 }
 
@@ -52,51 +50,21 @@ func TestJSONArgoTunnelToken(t *testing.T) {
 
 func CloudflareTunnelTokenTest(t *testing.T, path string) {
 	blocks, err := os.ReadFile(path)
-	assert.NoError(t, err)
+	require.NoError(t, err)
 	cert, err := decodeOriginCert(blocks)
-	assert.NoError(t, err)
+	require.NoError(t, err)
 	assert.NotNil(t, cert)
 	assert.Equal(t, "7b0a4d77dfb881c1a3b7d61ea9443e19", cert.ZoneID)
 	key := "test-service-key"
 	assert.Equal(t, key, cert.APIToken)
 }
 
-type mockFile struct {
-	path string
-	data []byte
-	err  error
-}
-
-type mockFileSystem struct {
-	files map[string]mockFile
-}
-
-func newMockFileSystem(files ...mockFile) *mockFileSystem {
-	fs := mockFileSystem{map[string]mockFile{}}
-	for _, f := range files {
-		fs.files[f.path] = f
-	}
-	return &fs
-}
-
-func (fs *mockFileSystem) ReadFile(path string) ([]byte, error) {
-	if f, ok := fs.files[path]; ok {
-		return f.data, f.err
-	}
-	return nil, os.ErrNotExist
-}
-
-func (fs *mockFileSystem) ValidFilePath(path string) bool {
-	_, exists := fs.files[path]
-	return exists
-}
-
 func TestFindOriginCert_Valid(t *testing.T) {
 	file, err := os.ReadFile("test-cloudflare-tunnel-cert-json.pem")
 	require.NoError(t, err)
 	dir := t.TempDir()
 	certPath := path.Join(dir, originCertFile)
-	os.WriteFile(certPath, file, fs.ModePerm)
+	_ = os.WriteFile(certPath, file, fs.ModePerm)
 	path, err := FindOriginCert(certPath, &nopLog)
 	require.NoError(t, err)
 	require.Equal(t, certPath, path)
@@ -108,3 +76,28 @@ func TestFindOriginCert_Missing(t *testing.T) {
 	_, err := FindOriginCert(certPath, &nopLog)
 	require.Error(t, err)
 }
+
+// TestEncodeDecodeOriginCert round-trips a cert through EncodeOriginCert and
+// DecodeOriginCert and checks that an upper-case Endpoint decodes lower-cased.
+func TestEncodeDecodeOriginCert(t *testing.T) {
+	cert := OriginCert{ZoneID: "zone", AccountID: "account", APIToken: "token", Endpoint: "FED"}
+	blocks, err := cert.EncodeOriginCert()
+	require.NoError(t, err)
+	decodedCert, err := DecodeOriginCert(blocks)
+	require.NoError(t, err)
+	// cert is a struct value and can never be nil; the pointer worth checking
+	// is decodedCert, and require stops the test before it is dereferenced.
+	require.NotNil(t, decodedCert)
+	assert.Equal(t, "zone", decodedCert.ZoneID)
+	assert.Equal(t, "account", decodedCert.AccountID)
+	assert.Equal(t, "token", decodedCert.APIToken)
+	// "FED" went in; decoding is expected to normalize it to FedEndpoint.
+	assert.Equal(t, FedEndpoint, decodedCert.Endpoint)
+}
+
+func TestEncodeDecodeNilOriginCert(t *testing.T) {
+	var cert *OriginCert
+	blocks, err := cert.EncodeOriginCert()
+	assert.Equal(t, fmt.Errorf("originCert cannot be nil"), err)
+	require.Nil(t, blocks)
+}
diff --git a/credentials/test-cert-unknown-block.pem b/credentials/test-cert-unknown-block.pem
index 4a847eb0..86fd4a40 100644
--- a/credentials/test-cert-unknown-block.pem
+++ b/credentials/test-cert-unknown-block.pem
@@ -87,3 +87,4 @@ M2i4QoOFcSKIG+v4SuvgEJHgG8vGvxh2qlSxnMWuPV+7/1P5ATLqDj1PlKms+BNR
 y7sc5AT9PclkL3Y9MNzOu0LXyBkGYcl8M0EQfLv9VPbWT+NXiMg/O2CHiT02pAAz
 uQicoQq3yzeQh20wtrtaXzTNmA==
 -----END RSA PRIVATE KEY-----
+
diff --git a/dev.Dockerfile b/dev.Dockerfile
index 8986040a..8d15784f 100644
--- a/dev.Dockerfile
+++ b/dev.Dockerfile
@@ -1,6 +1,6 @@
-FROM golang:1.22.5 as builder
+FROM golang:1.22.10 as builder
 ENV GO111MODULE=on \
-    CGO_ENABLED=0
+  CGO_ENABLED=0
 WORKDIR /go/src/github.com/cloudflare/cloudflared/
 RUN apt-get update
 COPY . .
diff --git a/diagnostic/client.go b/diagnostic/client.go
index 6e4dc2d3..e05948bf 100644
--- a/diagnostic/client.go
+++ b/diagnostic/client.go
@@ -9,7 +9,7 @@ import (
 	"net/url"
 	"strconv"
 
-	"github.com/cloudflare/cloudflared/logger"
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 )
 
 type httpClient struct {
@@ -86,12 +86,12 @@ func (client *httpClient) GetLogConfiguration(ctx context.Context) (*LogConfigur
 		return nil, fmt.Errorf("error convertin pid to int: %w", err)
 	}
 
-	logFile, exists := data[logger.LogFileFlag]
+	logFile, exists := data[cfdflags.LogFile]
 	if exists {
 		return &LogConfiguration{logFile, "", uid}, nil
 	}
 
-	logDirectory, exists := data[logger.LogDirectoryFlag]
+	logDirectory, exists := data[cfdflags.LogDirectory]
 	if exists {
 		return &LogConfiguration{"", logDirectory, uid}, nil
 	}
diff --git a/features/features.go b/features/features.go
index d1476285..25b5dc8b 100644
--- a/features/features.go
+++ b/features/features.go
@@ -11,15 +11,13 @@ const (
 	FeatureDatagramV3        = "support_datagram_v3"
 )
 
-var (
-	defaultFeatures = []string{
-		FeatureAllowRemoteConfig,
-		FeatureSerializedHeaders,
-		FeatureDatagramV2,
-		FeatureQUICSupportEOF,
-		FeatureManagementLogs,
-	}
-)
+var defaultFeatures = []string{
+	FeatureAllowRemoteConfig,
+	FeatureSerializedHeaders,
+	FeatureDatagramV2,
+	FeatureQUICSupportEOF,
+	FeatureManagementLogs,
+}
 
 // Features set by user provided flags
 type staticFeatures struct {
@@ -47,7 +45,6 @@ const (
 
 // Remove any duplicates from the slice
 func Dedup(slice []string) []string {
-
 	// Convert the slice into a set
 	set := make(map[string]bool, 0)
 	for _, str := range slice {
diff --git a/fips/fips.go b/fips/fips.go
new file mode 100644
index 00000000..cd5d1617
--- /dev/null
+++ b/fips/fips.go
@@ -0,0 +1,11 @@
+//go:build fips
+
+package fips
+
+import (
+	_ "crypto/tls/fipsonly" // blank import: included only for its package-level side effects
+)
+
+func IsFipsEnabled() bool {
+	return true // this file is compiled only when the "fips" build tag is set
+}
diff --git a/fips/fips.go.linux-amd64 b/fips/fips.go.linux-amd64
deleted file mode 100644
index 5075f298..00000000
--- a/fips/fips.go.linux-amd64
+++ /dev/null
@@ -1,12 +0,0 @@
-// +build fips
-
-package main
-
-import (
-    _ "crypto/tls/fipsonly"
-    "github.com/cloudflare/cloudflared/cmd/cloudflared/tunnel"
-)
-
-func init () {
-    tunnel.FipsEnabled = true
-}
diff --git a/fips/nofips.go b/fips/nofips.go
new file mode 100644
index 00000000..c8d98ed2
--- /dev/null
+++ b/fips/nofips.go
@@ -0,0 +1,9 @@
+//go:build !fips
+
+package fips
+
+// IsFipsEnabled reports whether the binary was compiled with the `fips` build tag.
+// This file is only built when that tag is absent, so it always returns false.
+func IsFipsEnabled() bool {
+	return false
+}
diff --git a/go.mod b/go.mod
index 11e8f0f5..a5a24f1a 100644
--- a/go.mod
+++ b/go.mod
@@ -36,11 +36,11 @@ require (
 	go.opentelemetry.io/proto/otlp v1.2.0
 	go.uber.org/automaxprocs v1.4.0
 	go.uber.org/mock v0.5.0
-	golang.org/x/crypto v0.24.0
+	golang.org/x/crypto v0.31.0
 	golang.org/x/net v0.26.0
-	golang.org/x/sync v0.7.0
-	golang.org/x/sys v0.21.0
-	golang.org/x/term v0.21.0
+	golang.org/x/sync v0.10.0
+	golang.org/x/sys v0.28.0
+	golang.org/x/term v0.27.0
 	google.golang.org/protobuf v1.34.1
 	gopkg.in/natefinch/lumberjack.v2 v2.0.0
 	gopkg.in/yaml.v3 v3.0.1
@@ -87,7 +87,7 @@ require (
 	golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
 	golang.org/x/mod v0.18.0 // indirect
 	golang.org/x/oauth2 v0.18.0 // indirect
-	golang.org/x/text v0.16.0 // indirect
+	golang.org/x/text v0.21.0 // indirect
 	golang.org/x/tools v0.22.0 // indirect
 	google.golang.org/appengine v1.6.8 // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20240311132316-a219d84964c2 // indirect
@@ -102,3 +102,6 @@ replace github.com/urfave/cli/v2 => github.com/ipostelnik/cli/v2 v2.3.1-0.202103
 replace github.com/prometheus/golang_client => github.com/prometheus/golang_client v1.12.1
 
 replace gopkg.in/yaml.v3 => gopkg.in/yaml.v3 v3.0.1
+
+// This fork is based on quic-go v0.45
+replace github.com/quic-go/quic-go => github.com/chungthuang/quic-go v0.45.1-0.20250128102735-2687bd175910
diff --git a/go.sum b/go.sum
index 1865726c..cb175989 100644
--- a/go.sum
+++ b/go.sum
@@ -7,6 +7,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
 github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/chungthuang/quic-go v0.45.1-0.20250128102735-2687bd175910 h1:/hTvBpxBDj/3NIzTodi1oEOyNBpirvgDSPKSV7VqAZU=
+github.com/chungthuang/quic-go v0.45.1-0.20250128102735-2687bd175910/go.mod h1:1dLehS7TIR64+vxGR70GDcatWTOtMX2PUtnKsjbTurI=
 github.com/coredns/caddy v1.1.1 h1:2eYKZT7i6yxIfGP3qLJoJ7HAsDJqYB+X68g4NYjSrE0=
 github.com/coredns/caddy v1.1.1/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4=
 github.com/coredns/coredns v1.11.3 h1:8RjnpZc42db5th84/QJKH2i137ecJdzZK1HJwhetSPk=
@@ -173,8 +175,6 @@ github.com/prometheus/common v0.53.0 h1:U2pL9w9nmJwJDa4qqLQ3ZaePJ6ZTwt7cMD3AG3+a
 github.com/prometheus/common v0.53.0/go.mod h1:BrxBKv3FWBIGXw89Mg1AeBq7FSyRzXWI3l3e7W3RN5U=
 github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
 github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
-github.com/quic-go/quic-go v0.45.0 h1:OHmkQGM37luZITyTSu6ff03HP/2IrwDX1ZFiNEhSFUE=
-github.com/quic-go/quic-go v0.45.0/go.mod h1:1dLehS7TIR64+vxGR70GDcatWTOtMX2PUtnKsjbTurI=
 github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
 github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
 github.com/rs/xid v1.2.1/go.mod h1:+uKXf+4Djp6Md1KODXJxgGQPKngRmWyn10oCKFzNHOQ=
@@ -222,8 +222,8 @@ go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
-golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI=
-golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM=
+golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
 golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
 golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
@@ -242,8 +242,8 @@ golang.org/x/oauth2 v0.18.0/go.mod h1:Wf7knwG0MPoWIMMBgFlEaSUDaKskp0dCfrlJRJXbBi
 golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
-golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -254,19 +254,19 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
-golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
-golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA=
-golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0=
+golang.org/x/term v0.27.0 h1:WP60Sv1nlK1T6SupCHbXzSaN0b9wUmsPoRS9b61A23Q=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
-golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
-golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
+golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
 golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
diff --git a/logger/create.go b/logger/create.go
index 4a298ad4..99bcca3c 100644
--- a/logger/create.go
+++ b/logger/create.go
@@ -16,6 +16,7 @@ import (
 	"golang.org/x/term"
 	"gopkg.in/natefinch/lumberjack.v2"
 
+	cfdflags "github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/management"
 )
 
@@ -23,14 +24,6 @@ const (
 	EnableTerminalLog  = false
 	DisableTerminalLog = true
 
-	LogLevelFlag          = "loglevel"
-	LogFileFlag           = "logfile"
-	LogDirectoryFlag      = "log-directory"
-	LogTransportLevelFlag = "transport-loglevel"
-
-	LogSSHDirectoryFlag = "log-directory"
-	LogSSHLevelFlag     = "log-level"
-
 	dirPermMode  = 0744 // rwxr--r--
 	filePermMode = 0644 // rw-r--r--
 
@@ -137,15 +130,15 @@ func newZerolog(loggerConfig *Config) *zerolog.Logger {
 }
 
 func CreateTransportLoggerFromContext(c *cli.Context, disableTerminal bool) *zerolog.Logger {
-	return createFromContext(c, LogTransportLevelFlag, LogDirectoryFlag, disableTerminal)
+	return createFromContext(c, cfdflags.TransportLogLevel, cfdflags.LogDirectory, disableTerminal)
 }
 
 func CreateLoggerFromContext(c *cli.Context, disableTerminal bool) *zerolog.Logger {
-	return createFromContext(c, LogLevelFlag, LogDirectoryFlag, disableTerminal)
+	return createFromContext(c, cfdflags.LogLevel, cfdflags.LogDirectory, disableTerminal)
 }
 
 func CreateSSHLoggerFromContext(c *cli.Context, disableTerminal bool) *zerolog.Logger {
-	return createFromContext(c, LogSSHLevelFlag, LogSSHDirectoryFlag, disableTerminal)
+	return createFromContext(c, cfdflags.LogLevelSSH, cfdflags.LogDirectory, disableTerminal)
 }
 
 func createFromContext(
@@ -155,7 +148,7 @@ func createFromContext(
 	disableTerminal bool,
 ) *zerolog.Logger {
 	logLevel := c.String(logLevelFlagName)
-	logFile := c.String(LogFileFlag)
+	logFile := c.String(cfdflags.LogFile)
 	logDirectory := c.String(logDirectoryFlagName)
 
 	loggerConfig := CreateConfig(
@@ -167,7 +160,7 @@ func createFromContext(
 
 	log := newZerolog(loggerConfig)
 	if incompatibleFlagsSet := logFile != "" && logDirectory != ""; incompatibleFlagsSet {
-		log.Error().Msgf("Your config includes values for both %s (%s) and %s (%s), but they are incompatible. %s takes precedence.", LogFileFlag, logFile, logDirectoryFlagName, logDirectory, LogFileFlag)
+		log.Error().Msgf("Your config includes values for both %s (%s) and %s (%s), but they are incompatible. %s takes precedence.", cfdflags.LogFile, logFile, logDirectoryFlagName, logDirectory, cfdflags.LogFile)
 	}
 	return log
 }
@@ -206,7 +199,6 @@ var (
 
 func createFileWriter(config FileConfig) (io.Writer, error) {
 	singleFileInit.once.Do(func() {
-
 		var logFile io.Writer
 		fullpath := config.Fullpath()
 
diff --git a/orchestration/orchestrator.go b/orchestration/orchestrator.go
index 4c44143e..abfd1f9b 100644
--- a/orchestration/orchestrator.go
+++ b/orchestration/orchestrator.go
@@ -4,16 +4,17 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"strconv"
 	"sync"
 	"sync/atomic"
 
-	"github.com/pkg/errors"
+	pkgerrors "github.com/pkg/errors"
 	"github.com/rs/zerolog"
 
-	cfdflow "github.com/cloudflare/cloudflared/flow"
-
+	"github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/connection"
+	cfdflow "github.com/cloudflare/cloudflared/flow"
 	"github.com/cloudflare/cloudflared/ingress"
 	"github.com/cloudflare/cloudflared/proxy"
 	"github.com/cloudflare/cloudflared/tunnelrpc/pogs"
@@ -117,6 +118,30 @@ func (o *Orchestrator) UpdateConfig(version int32, config []byte) *pogs.UpdateCo
 	}
 }
 
+// overrideRemoteWarpRoutingWithLocalValues applies locally configured flag values on top of the remote
+// ingress.WarpRoutingConfig, modifying remoteWarpRouting in place. Only flags.MaxActiveFlows is handled here.
+func (o *Orchestrator) overrideRemoteWarpRoutingWithLocalValues(remoteWarpRouting *ingress.WarpRoutingConfig) error {
+	return o.overrideMaxActiveFlows(o.config.ConfigurationFlags[flags.MaxActiveFlows], remoteWarpRouting)
+}
+
+// overrideMaxActiveFlows replaces remoteWarpRouting.MaxActiveFlows with the locally configured value when
+// maxActiveFlowsLocalConfig is non-empty. It returns an error if that value is not a base-10 unsigned integer.
+func (o *Orchestrator) overrideMaxActiveFlows(maxActiveFlowsLocalConfig string, remoteWarpRouting *ingress.WarpRoutingConfig) error {
+	// If max active flows isn't defined locally just use the remote value
+	if maxActiveFlowsLocalConfig == "" {
+		return nil
+	}
+
+	maxActiveFlowsLocalOverride, err := strconv.ParseUint(maxActiveFlowsLocalConfig, 10, 64)
+	if err != nil {
+		return pkgerrors.Wrapf(err, "failed to parse %s", flags.MaxActiveFlows)
+	}
+
+	// Override the value that comes from the remote with the local value
+	remoteWarpRouting.MaxActiveFlows = maxActiveFlowsLocalOverride
+	return nil
+}
+
 // The caller is responsible to make sure there is no concurrent access
 func (o *Orchestrator) updateIngress(ingressRules ingress.Ingress, warpRouting ingress.WarpRoutingConfig) error {
 	select {
@@ -125,6 +150,11 @@ func (o *Orchestrator) updateIngress(ingressRules ingress.Ingress, warpRouting i
 	default:
 	}
 
+	// Apply any locally configured values on top of the remote warp routing configuration
+	if err := o.overrideRemoteWarpRoutingWithLocalValues(&warpRouting); err != nil {
+		return pkgerrors.Wrap(err, "failed to merge local overrides into warp routing configuration")
+	}
+
 	// Assign the internal ingress rules to the parsed ingress
 	ingressRules.InternalRules = o.internalRules
 
@@ -139,7 +169,7 @@ func (o *Orchestrator) updateIngress(ingressRules ingress.Ingress, warpRouting i
 	// The downside is minimized because none of the ingress.OriginService implementation have that requirement
 	proxyShutdownC := make(chan struct{})
 	if err := ingressRules.StartOrigins(o.log, proxyShutdownC); err != nil {
-		return errors.Wrap(err, "failed to start origin")
+		return pkgerrors.Wrap(err, "failed to start origin")
 	}
 
 	// Update the flow limit since the configuration might have changed
diff --git a/orchestration/orchestrator_test.go b/orchestration/orchestrator_test.go
index eb2c6f72..a6e0755b 100644
--- a/orchestration/orchestrator_test.go
+++ b/orchestration/orchestrator_test.go
@@ -16,8 +16,11 @@ import (
 	"github.com/google/uuid"
 	gows "github.com/gorilla/websocket"
 	"github.com/rs/zerolog"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"github.com/cloudflare/cloudflared/cmd/cloudflared/flags"
+
 	"github.com/cloudflare/cloudflared/config"
 	"github.com/cloudflare/cloudflared/connection"
 	"github.com/cloudflare/cloudflared/ingress"
@@ -106,25 +109,25 @@ func TestUpdateConfiguration(t *testing.T) {
 	require.Len(t, configV2.Ingress.Rules, 3)
 	// originRequest of this ingress rule overrides global default
 	require.Equal(t, config.CustomDuration{Duration: time.Second * 10}, configV2.Ingress.Rules[0].Config.ConnectTimeout)
-	require.Equal(t, true, configV2.Ingress.Rules[0].Config.NoTLSVerify)
+	require.True(t, configV2.Ingress.Rules[0].Config.NoTLSVerify)
 	// Inherited from global default
-	require.Equal(t, true, configV2.Ingress.Rules[0].Config.NoHappyEyeballs)
+	require.True(t, configV2.Ingress.Rules[0].Config.NoHappyEyeballs)
 	// Validate ingress rule 1
 	require.Equal(t, "jira.tunnel.org", configV2.Ingress.Rules[1].Hostname)
 	require.True(t, configV2.Ingress.Rules[1].Matches("jira.tunnel.org", "/users"))
 	require.Equal(t, "http://172.32.20.6:80", configV2.Ingress.Rules[1].Service.String())
 	// originRequest of this ingress rule overrides global default
 	require.Equal(t, config.CustomDuration{Duration: time.Second * 30}, configV2.Ingress.Rules[1].Config.ConnectTimeout)
-	require.Equal(t, true, configV2.Ingress.Rules[1].Config.NoTLSVerify)
+	require.True(t, configV2.Ingress.Rules[1].Config.NoTLSVerify)
 	// Inherited from global default
-	require.Equal(t, true, configV2.Ingress.Rules[1].Config.NoHappyEyeballs)
+	require.True(t, configV2.Ingress.Rules[1].Config.NoHappyEyeballs)
 	// Validate ingress rule 2, it's the catch-all rule
 	require.True(t, configV2.Ingress.Rules[2].Matches("blogs.tunnel.io", "/2022/02/10"))
 	// Inherited from global default
 	require.Equal(t, config.CustomDuration{Duration: time.Second * 90}, configV2.Ingress.Rules[2].Config.ConnectTimeout)
-	require.Equal(t, false, configV2.Ingress.Rules[2].Config.NoTLSVerify)
-	require.Equal(t, true, configV2.Ingress.Rules[2].Config.NoHappyEyeballs)
-	require.Equal(t, configV2.WarpRouting.ConnectTimeout.Duration, 10*time.Second)
+	require.False(t, configV2.Ingress.Rules[2].Config.NoTLSVerify)
+	require.True(t, configV2.Ingress.Rules[2].Config.NoHappyEyeballs)
+	require.Equal(t, 10*time.Second, configV2.WarpRouting.ConnectTimeout.Duration)
 
 	originProxyV2, err := orchestrator.GetOriginProxy()
 	require.NoError(t, err)
@@ -317,7 +320,11 @@ func TestConcurrentUpdateAndRead(t *testing.T) {
 		go func(i int, originProxy connection.OriginProxy) {
 			defer wg.Done()
 			resp, err := proxyHTTP(originProxy, hostname)
-			require.NoError(t, err, "proxyHTTP %d failed %v", i, err)
+			// assert does not stop the goroutine on failure, so bail out explicitly:
+			// resp is nil when proxyHTTP fails and resp.Body would panic below.
+			if !assert.NoError(t, err, "proxyHTTP %d failed %v", i, err) {
+				return
+			}
 			defer resp.Body.Close()
 
 			var warpRoutingDisabled bool
@@ -326,16 +329,16 @@ func TestConcurrentUpdateAndRead(t *testing.T) {
 			// v1 proxy, warp enabled
 			case 200:
 				body, err := io.ReadAll(resp.Body)
-				require.NoError(t, err)
-				require.Equal(t, t.Name(), string(body))
+				assert.NoError(t, err)
+				assert.Equal(t, t.Name(), string(body))
 				warpRoutingDisabled = false
 			// v2 proxy, warp disabled
 			case 204:
-				require.Greater(t, i, concurrentRequests/4)
+				assert.Greater(t, i, concurrentRequests/4)
 				warpRoutingDisabled = true
 			// v3 proxy, warp enabled
 			case 418:
-				require.Greater(t, i, concurrentRequests/2)
+				assert.Greater(t, i, concurrentRequests/2)
 				warpRoutingDisabled = false
 			}
 
@@ -358,11 +361,10 @@ func TestConcurrentUpdateAndRead(t *testing.T) {
 
 			err = proxyTCP(ctx, originProxy, tcpOrigin.Addr().String(), w, pr)
 			if warpRoutingDisabled {
-				require.Error(t, err, "expect proxyTCP %d to return error", i)
+				assert.Error(t, err, "expect proxyTCP %d to return error", i)
 			} else {
-				require.NoError(t, err, "proxyTCP %d failed %v", i, err)
+				assert.NoError(t, err, "proxyTCP %d failed %v", i, err)
 			}
-
 		}(i, originProxy)
 
 		if i == concurrentRequests/4 {
@@ -388,6 +390,57 @@ func TestConcurrentUpdateAndRead(t *testing.T) {
 	wg.Wait()
 }
 
+// TestOverrideWarpRoutingConfigWithLocalValues tests that if a value is defined in the Config.ConfigurationFlags,
+// it will override the value that comes from the remote result.
+func TestOverrideWarpRoutingConfigWithLocalValues(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	assertMaxActiveFlows := func(orchestrator *Orchestrator, expectedValue uint64) {
+		configJson, err := orchestrator.GetConfigJSON()
+		require.NoError(t, err)
+		var result map[string]interface{}
+		err = json.Unmarshal(configJson, &result)
+		require.NoError(t, err)
+		warpRouting := result["warp-routing"].(map[string]interface{})
+		require.EqualValues(t, expectedValue, warpRouting["maxActiveFlows"])
+	}
+
+	remoteValue := uint64(100)
+	remoteIngress := ingress.Ingress{}
+	remoteWarpConfig := ingress.WarpRoutingConfig{
+		MaxActiveFlows: remoteValue,
+	}
+	remoteConfig := &Config{
+		Ingress:            &remoteIngress,
+		WarpRouting:        remoteWarpConfig,
+		ConfigurationFlags: map[string]string{},
+	}
+	orchestrator, err := NewOrchestrator(ctx, remoteConfig, testTags, []ingress.Rule{}, &testLogger)
+	require.NoError(t, err)
+
+	assertMaxActiveFlows(orchestrator, remoteValue)
+
+	// Add a local override for the maxActiveFlows
+	localValue := uint64(500)
+	remoteConfig.ConfigurationFlags[flags.MaxActiveFlows] = fmt.Sprintf("%d", localValue)
+	// Force a configuration refresh
+	err = orchestrator.updateIngress(remoteIngress, remoteWarpConfig)
+	require.NoError(t, err)
+
+	// Check the value being used is the local one
+	assertMaxActiveFlows(orchestrator, localValue)
+
+	// Remove local override for the maxActiveFlows
+	delete(remoteConfig.ConfigurationFlags, flags.MaxActiveFlows)
+	// Force a configuration refresh
+	err = orchestrator.updateIngress(remoteIngress, remoteWarpConfig)
+	require.NoError(t, err)
+
+	// Check the value being used is now the remote again
+	assertMaxActiveFlows(orchestrator, remoteValue)
+}
+
 func proxyHTTP(originProxy connection.OriginProxy, hostname string) (*http.Response, error) {
 	req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s", hostname), nil)
 	if err != nil {
@@ -409,15 +462,16 @@ func proxyHTTP(originProxy connection.OriginProxy, hostname string) (*http.Respo
 	return w.Result(), nil
 }
 
+// nolint: testifylint // this is used inside go routines so it can't use `require.`
 func tcpEyeball(t *testing.T, reqWriter io.WriteCloser, body string, respReadWriter *respReadWriteFlusher) {
 	writeN, err := reqWriter.Write([]byte(body))
-	require.NoError(t, err)
+	assert.NoError(t, err)
 
 	readBuffer := make([]byte, writeN)
 	n, err := respReadWriter.Read(readBuffer)
-	require.NoError(t, err)
-	require.Equal(t, body, string(readBuffer[:n]))
-	require.Equal(t, writeN, n)
+	assert.NoError(t, err)
+	assert.Equal(t, body, string(readBuffer[:n]))
+	assert.Equal(t, writeN, n)
 }
 
 func proxyTCP(ctx context.Context, originProxy connection.OriginProxy, originAddr string, w http.ResponseWriter, reqBody io.ReadCloser) error {
@@ -458,14 +512,15 @@ func serveTCPOrigin(t *testing.T, tcpOrigin net.Listener, wg *sync.WaitGroup) {
 	}
 }
 
+// nolint: testifylint // this is used inside go routines so it can't use `require.`
 func echoTCP(t *testing.T, conn net.Conn) {
 	readBuf := make([]byte, 1000)
 	readN, err := conn.Read(readBuf)
-	require.NoError(t, err)
+	assert.NoError(t, err)
 
 	writeN, err := conn.Write(readBuf[:readN])
-	require.NoError(t, err)
-	require.Equal(t, readN, writeN)
+	assert.NoError(t, err)
+	assert.Equal(t, readN, writeN)
 }
 
 type validateHostHandler struct {
@@ -479,16 +534,17 @@ func (vhh *validateHostHandler) ServeHTTP(w http.ResponseWriter, r *http.Request
 		return
 	}
 	w.WriteHeader(http.StatusOK)
-	w.Write([]byte(vhh.body))
+	_, _ = w.Write([]byte(vhh.body))
 }
 
+// nolint: testifylint // this is used inside go routines so it can't use `require.`
 func updateWithValidation(t *testing.T, orchestrator *Orchestrator, version int32, config []byte) {
 	resp := orchestrator.UpdateConfig(version, config)
-	require.NoError(t, resp.Err)
-	require.Equal(t, version, resp.LastAppliedVersion)
+	assert.NoError(t, resp.Err)
+	assert.Equal(t, version, resp.LastAppliedVersion)
 }
 
-// TestClosePreviousProxies makes sure proxies started in the pervious configuration version are shutdown
+// TestClosePreviousProxies makes sure proxies started in the previous configuration version are shutdown
 func TestClosePreviousProxies(t *testing.T) {
 	var (
 		hostname             = "hello.tunnel1.org"
@@ -532,6 +588,7 @@ func TestClosePreviousProxies(t *testing.T) {
 
 	originProxyV1, err := orchestrator.GetOriginProxy()
 	require.NoError(t, err)
+	// nolint: bodyclose
 	resp, err := proxyHTTP(originProxyV1, hostname)
 	require.NoError(t, err)
 	require.Equal(t, http.StatusOK, resp.StatusCode)
@@ -540,12 +597,14 @@ func TestClosePreviousProxies(t *testing.T) {
 
 	originProxyV2, err := orchestrator.GetOriginProxy()
 	require.NoError(t, err)
+	// nolint: bodyclose
 	resp, err = proxyHTTP(originProxyV2, hostname)
 	require.NoError(t, err)
 	require.Equal(t, http.StatusTeapot, resp.StatusCode)
 
 	// The hello-world server in config v1 should have been stopped. We wait a bit since it's closed asynchronously.
 	time.Sleep(time.Millisecond * 10)
+	// nolint: bodyclose
 	resp, err = proxyHTTP(originProxyV1, hostname)
 	require.Error(t, err)
 	require.Nil(t, resp)
@@ -557,6 +616,7 @@ func TestClosePreviousProxies(t *testing.T) {
 	require.NoError(t, err)
 	require.NotEqual(t, originProxyV1, originProxyV3)
 
+	// nolint: bodyclose
 	resp, err = proxyHTTP(originProxyV3, hostname)
 	require.NoError(t, err)
 	require.Equal(t, http.StatusOK, resp.StatusCode)
@@ -566,6 +626,7 @@ func TestClosePreviousProxies(t *testing.T) {
 	// Wait for proxies to shutdown
 	time.Sleep(time.Millisecond * 10)
 
+	// nolint: bodyclose
 	resp, err = proxyHTTP(originProxyV3, hostname)
 	require.Error(t, err)
 	require.Nil(t, resp)
@@ -622,7 +683,11 @@ func TestPersistentConnection(t *testing.T) {
 	go func() {
 		defer wg.Done()
 		conn, err := tcpOrigin.Accept()
-		require.NoError(t, err)
+		// assert does not abort the goroutine; return explicitly because conn is nil
+		// when Accept fails and the deferred conn.Close() would panic.
+		if !assert.NoError(t, err) {
+			return
+		}
 		defer conn.Close()
 
 		// Expect 3 TCP messages
@@ -630,26 +691,31 @@ func TestPersistentConnection(t *testing.T) {
 			echoTCP(t, conn)
 		}
 	}()
-	// Simulate cloudflared recieving a TCP connection
+	// Simulate cloudflared receiving a TCP connection
 	go func() {
 		defer wg.Done()
-		require.NoError(t, proxyTCP(ctx, originProxy, tcpOrigin.Addr().String(), tcpRespReadWriter, tcpReqReader))
+		assert.NoError(t, proxyTCP(ctx, originProxy, tcpOrigin.Addr().String(), tcpRespReadWriter, tcpReqReader))
 	}()
-	// Simulate cloudflared recieving a WS connection
+	// Simulate cloudflared receiving a WS connection
 	go func() {
 		defer wg.Done()
 
 		req, err := http.NewRequest(http.MethodGet, hostname, wsReqReader)
-		require.NoError(t, err)
+		// assert does not abort the goroutine, so return explicitly before req is dereferenced
+		if !assert.NoError(t, err) {
+			return
+		}
 		// ProxyHTTP will add Connection, Upgrade and Sec-Websocket-Version headers
 		req.Header.Add("Sec-WebSocket-Key", "dGhlIHNhbXBsZSBub25jZQ==")
 
 		log := zerolog.Nop()
 		respWriter, err := connection.NewHTTP2RespWriter(req, wsRespReadWriter, connection.TypeWebsocket, &log)
-		require.NoError(t, err)
+		if !assert.NoError(t, err) {
+			return
+		}
 
 		err = originProxy.ProxyHTTP(respWriter, tracing.NewTracedHTTPRequest(req, 0, &log), true)
-		require.NoError(t, err)
+		assert.NoError(t, err)
 	}()
 
 	// Simulate eyeball WS and TCP connections
diff --git a/supervisor/pqtunnels.go b/supervisor/pqtunnels.go
index 70a3fd69..2eaad9e8 100644
--- a/supervisor/pqtunnels.go
+++ b/supervisor/pqtunnels.go
@@ -7,30 +7,53 @@ import (
 	"github.com/cloudflare/cloudflared/features"
 )
 
-// When experimental post-quantum tunnels are enabled, and we're hitting an
-// issue creating the tunnel, we'll report the first error
-// to https://pqtunnels.cloudflareresearch.com.
-
 const (
-	PQKex     = tls.CurveID(0x6399) // X25519Kyber768Draft00
-	PQKexName = "X25519Kyber768Draft00"
+	X25519Kyber768Draft00PQKex     = tls.CurveID(0x6399) // X25519Kyber768Draft00
+	X25519Kyber768Draft00PQKexName = "X25519Kyber768Draft00"
+	P256Kyber768Draft00PQKex       = tls.CurveID(0xfe32) // P256Kyber768Draft00
+	P256Kyber768Draft00PQKexName   = "P256Kyber768Draft00"
+	X25519MLKEM768PQKex            = tls.CurveID(0x11ec) // X25519MLKEM768
+	X25519MLKEM768PQKexName        = "X25519MLKEM768"
 )
 
-func curvePreference(pqMode features.PostQuantumMode, currentCurve []tls.CurveID) ([]tls.CurveID, error) {
+var (
+	nonFipsPostQuantumStrictPKex = []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex}
+	nonFipsPostQuantumPreferPKex = []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex}
+	fipsPostQuantumStrictPKex    = []tls.CurveID{P256Kyber768Draft00PQKex}
+	fipsPostQuantumPreferPKex    = []tls.CurveID{P256Kyber768Draft00PQKex, tls.CurveP256}
+)
+
+// removeDuplicates keeps the first occurrence of each curve, preserving order.
+func removeDuplicates(curves []tls.CurveID) (unique []tls.CurveID) {
+	seen := make(map[tls.CurveID]struct{}, len(curves))
+	for _, id := range curves {
+		if _, dup := seen[id]; !dup {
+			seen[id] = struct{}{}
+			unique = append(unique, id)
+		}
+	}
+	return unique
+}
+
+// curvePreference returns a fresh curve list for pqMode; package-level defaults are copied, never returned directly.
+func curvePreference(pqMode features.PostQuantumMode, fipsEnabled bool, currentCurve []tls.CurveID) ([]tls.CurveID, error) {
 	switch pqMode {
 	case features.PostQuantumStrict:
 		// If the user passes the -post-quantum flag, we override
 		// CurvePreferences to only support hybrid post-quantum key agreements.
-		return []tls.CurveID{PQKex}, nil
+		if fipsEnabled {
+			return append([]tls.CurveID(nil), fipsPostQuantumStrictPKex...), nil
+		}
+		return append([]tls.CurveID(nil), nonFipsPostQuantumStrictPKex...), nil
 	case features.PostQuantumPrefer:
-		if len(currentCurve) == 0 {
-			return []tls.CurveID{PQKex}, nil
+		if fipsEnabled {
+			// Ensure that all curves returned are FIPS compliant, with the
+			// post-quantum curves listed ahead of the non post-quantum ones.
+			return append([]tls.CurveID(nil), fipsPostQuantumPreferPKex...), nil
 		}
-
-		if currentCurve[0] != PQKex {
-			return append([]tls.CurveID{PQKex}, currentCurve...), nil
-		}
-		return currentCurve, nil
+		// Copy first so the package-level list is never used as append's destination.
+		curves := append(append([]tls.CurveID(nil), nonFipsPostQuantumPreferPKex...), currentCurve...)
+		return removeDuplicates(curves), nil
 	default:
 		return nil, fmt.Errorf("Unexpected post quantum mode")
 	}
diff --git a/supervisor/pqtunnels_test.go b/supervisor/pqtunnels_test.go
new file mode 100644
index 00000000..383200db
--- /dev/null
+++ b/supervisor/pqtunnels_test.go
@@ -0,0 +1,84 @@
+package supervisor
+
+import (
+	"crypto/tls"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/cloudflare/cloudflared/features"
+)
+
+func TestCurvePreferences(t *testing.T) {
+	// This tests if the correct curves are returned
+	// given a PostQuantumMode and a FIPS enabled bool
+	t.Parallel()
+
+	tests := []struct {
+		name           string
+		currentCurves  []tls.CurveID
+		expectedCurves []tls.CurveID
+		pqMode         features.PostQuantumMode
+		fipsEnabled    bool
+	}{
+		{
+			name:           "FIPS with Prefer PQ",
+			pqMode:         features.PostQuantumPrefer,
+			fipsEnabled:    true,
+			currentCurves:  []tls.CurveID{tls.CurveP384},
+			expectedCurves: []tls.CurveID{P256Kyber768Draft00PQKex, tls.CurveP256},
+		},
+		{
+			name:           "FIPS with Strict PQ",
+			pqMode:         features.PostQuantumStrict,
+			fipsEnabled:    true,
+			currentCurves:  []tls.CurveID{tls.CurveP256, tls.CurveP384},
+			expectedCurves: []tls.CurveID{P256Kyber768Draft00PQKex},
+		},
+		{
+			name:           "FIPS with Prefer PQ - no duplicates",
+			pqMode:         features.PostQuantumPrefer,
+			fipsEnabled:    true,
+			currentCurves:  []tls.CurveID{tls.CurveP256},
+			expectedCurves: []tls.CurveID{P256Kyber768Draft00PQKex, tls.CurveP256},
+		},
+		{
+			name:           "Non FIPS with Prefer PQ",
+			pqMode:         features.PostQuantumPrefer,
+			fipsEnabled:    false,
+			currentCurves:  []tls.CurveID{tls.CurveP256},
+			expectedCurves: []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex, tls.CurveP256},
+		},
+		{
+			name:           "Non FIPS with Prefer PQ - no duplicates",
+			pqMode:         features.PostQuantumPrefer,
+			fipsEnabled:    false,
+			currentCurves:  []tls.CurveID{X25519Kyber768Draft00PQKex, tls.CurveP256},
+			expectedCurves: []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex, tls.CurveP256},
+		},
+		{
+			name:           "Non FIPS with Prefer PQ - correct preference order",
+			pqMode:         features.PostQuantumPrefer,
+			fipsEnabled:    false,
+			currentCurves:  []tls.CurveID{tls.CurveP256, X25519Kyber768Draft00PQKex},
+			expectedCurves: []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex, tls.CurveP256},
+		},
+		{
+			name:           "Non FIPS with Strict PQ",
+			pqMode:         features.PostQuantumStrict,
+			fipsEnabled:    false,
+			currentCurves:  []tls.CurveID{tls.CurveP256},
+			expectedCurves: []tls.CurveID{X25519MLKEM768PQKex, X25519Kyber768Draft00PQKex},
+		},
+	}
+
+	for _, tcase := range tests {
+		t.Run(tcase.name, func(t *testing.T) {
+			t.Parallel()
+			curves, err := curvePreference(tcase.pqMode, tcase.fipsEnabled, tcase.currentCurves)
+			require.NoError(t, err)
+			assert.Equal(t, tcase.expectedCurves, curves)
+		})
+	}
+}
diff --git a/supervisor/supervisor.go b/supervisor/supervisor.go
index 920fbeab..df8bbd46 100644
--- a/supervisor/supervisor.go
+++ b/supervisor/supervisor.go
@@ -247,9 +247,7 @@ func (s *Supervisor) startFirstTunnel(
 	ctx context.Context,
 	connectedSignal *signal.Signal,
 ) {
-	var (
-		err error
-	)
+	var err error
 	const firstConnIndex = 0
 	isStaticEdge := len(s.config.EdgeAddrs) > 0
 	defer func() {
@@ -300,9 +298,7 @@ func (s *Supervisor) startTunnel(
 	index int,
 	connectedSignal *signal.Signal,
 ) {
-	var (
-		err error
-	)
+	var err error
 	defer func() {
 		s.tunnelErrors <- tunnelError{index: index, err: err}
 	}()
diff --git a/supervisor/tunnel.go b/supervisor/tunnel.go
index 6807b56d..01937756 100644
--- a/supervisor/tunnel.go
+++ b/supervisor/tunnel.go
@@ -11,6 +11,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/getsentry/sentry-go"
 	"github.com/pkg/errors"
 	"github.com/quic-go/quic-go"
 	"github.com/rs/zerolog"
@@ -20,6 +21,7 @@ import (
 	"github.com/cloudflare/cloudflared/edgediscovery"
 	"github.com/cloudflare/cloudflared/edgediscovery/allregions"
 	"github.com/cloudflare/cloudflared/features"
+	"github.com/cloudflare/cloudflared/fips"
 	"github.com/cloudflare/cloudflared/ingress"
 	"github.com/cloudflare/cloudflared/management"
 	"github.com/cloudflare/cloudflared/orchestration"
@@ -555,11 +557,13 @@ func (e *EdgeTunnelServer) serveQUIC(
 	tlsConfig := e.config.EdgeTLSConfigs[connection.QUIC]
 
 	pqMode := e.config.FeatureSelector.PostQuantumMode()
-	curvePref, err := curvePreference(pqMode, tlsConfig.CurvePreferences)
+	curvePref, err := curvePreference(pqMode, fips.IsFipsEnabled(), tlsConfig.CurvePreferences)
 	if err != nil {
 		return err, true
 	}
 
+	connLogger.Logger().Info().Msgf("Using %v as curve preferences", curvePref)
+
 	tlsConfig.CurvePreferences = curvePref
 
 	// quic-go 0.44 increases the initial packet size to 1280 by default. That breaks anyone running tunnel through WARP
@@ -595,6 +599,8 @@ func (e *EdgeTunnelServer) serveQUIC(
 	)
 	if err != nil {
 		connLogger.ConnAwareLogger().Err(err).Msgf("Failed to dial a quic connection")
+
+		e.reportErrorToSentry(err)
 		return err, true
 	}
 
@@ -664,6 +670,26 @@ func (e *EdgeTunnelServer) serveQUIC(
 	return errGroup.Wait(), false
 }
 
+// reportErrorToSentry sends err to Sentry only when it is an
+// EdgeQuicDialError whose Cause is a QUIC crypto transport error, FIPS is
+// enabled and the post-quantum mode is strict; any other error is ignored.
+func (e *EdgeTunnelServer) reportErrorToSentry(err error) {
+	dialErr, isDialErr := err.(*connection.EdgeQuicDialError)
+	if !isDialErr {
+		return
+	}
+	// The TransportError provides an Unwrap function, however
+	// the wrapped err MAY not always be set.
+	transportErr, isTransportErr := dialErr.Cause.(*quic.TransportError)
+	if !isTransportErr || !transportErr.ErrorCode.IsCryptoError() {
+		return
+	}
+	// Only report when running with FIPS and strict PQ.
+	if fips.IsFipsEnabled() && e.config.FeatureSelector.PostQuantumMode() == features.PostQuantumStrict {
+		sentry.CaptureException(err)
+	}
+}
+
 func listenReconnect(ctx context.Context, reconnectCh <-chan ReconnectSignal, gracefulShutdownCh <-chan struct{}) error {
 	select {
 	case reconnect := <-reconnectCh:
diff --git a/token/token.go b/token/token.go
index d561dc38..30ab9366 100644
--- a/token/token.go
+++ b/token/token.go
@@ -53,7 +53,7 @@ type signalHandler struct {
 }
 
 type jwtPayload struct {
-	Aud   []string `json:"aud"`
+	Aud   []string `json:"-"`
 	Email string   `json:"email"`
 	Exp   int      `json:"exp"`
 	Iat   int      `json:"iat"`
@@ -68,6 +68,34 @@ type transferServiceResponse struct {
 	OrgToken string `json:"org_token"`
 }
 
+// UnmarshalJSON decodes a JWT payload whose "aud" claim is a string or an array of strings; anything else is an error.
+func (p *jwtPayload) UnmarshalJSON(data []byte) error {
+	type Alias jwtPayload // same fields but no methods, so this decode does not recurse
+	if err := json.Unmarshal(data, (*Alias)(p)); err != nil {
+		return err
+	}
+	var audParser struct{ Aud any `json:"aud"` }
+	if err := json.Unmarshal(data, &audParser); err != nil {
+		return err
+	}
+	switch aud := audParser.Aud.(type) {
+	case string:
+		p.Aud = []string{aud}
+	case []any:
+		p.Aud = make([]string, 0, len(aud)) // replace, never extend, audiences from a previous decode
+		for _, a := range aud {
+			s, ok := a.(string)
+			if !ok {
+				return errors.New("aud array contains non-string elements")
+			}
+			p.Aud = append(p.Aud, s)
+		}
+	default:
+		return errors.New("aud field is not a string or an array of strings")
+	}
+	return nil
+}
+
 func (p jwtPayload) isExpired() bool {
 	return int(time.Now().Unix()) > p.Exp
 }
@@ -182,7 +210,9 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, log *zerolog.
 	if err = fileLockAppToken.Acquire(); err != nil {
 		return "", errors.Wrap(err, "failed to acquire app token lock")
 	}
-	defer fileLockAppToken.Release()
+	defer func() {
+		_ = fileLockAppToken.Release()
+	}()
 
 	// check to see if another process has gotten a token while we waited for the lock
 	if token, err := GetAppTokenIfExists(appInfo); token != "" && err == nil {
@@ -202,7 +232,9 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, log *zerolog.
 		if err = fileLockOrgToken.Acquire(); err != nil {
 			return "", errors.Wrap(err, "failed to acquire org token lock")
 		}
-		defer fileLockOrgToken.Release()
+		defer func() {
+			_ = fileLockOrgToken.Release()
+		}()
 		// check if an org token has been created since the lock was acquired
 		orgToken, err = GetOrgTokenIfExists(appInfo.AuthDomain)
 	}
@@ -218,7 +250,6 @@ func getToken(appURL *url.URL, appInfo *AppInfo, useHostOnly bool, log *zerolog.
 		}
 	}
 	return getTokensFromEdge(appURL, appInfo.AppAUD, appTokenPath, orgTokenPath, useHostOnly, log)
-
 }
 
 // getTokensFromEdge will attempt to use the transfer service to retrieve an app and org token, save them to disk,
@@ -250,7 +281,6 @@ func getTokensFromEdge(appURL *url.URL, appAUD, appTokenPath, orgTokenPath strin
 	}
 
 	return resp.AppToken, nil
-
 }
 
 // GetAppInfo makes a request to the appURL and stops at the first redirect. The 302 location header will contain the
@@ -320,7 +350,6 @@ func handleRedirects(req *http.Request, via []*http.Request, orgToken string) er
 				}
 			}
 		}
-
 	}
 
 	// stop after hitting authorized endpoint since it will contain the app token
@@ -408,7 +437,6 @@ func GetAppTokenIfExists(appInfo *AppInfo) (string, error) {
 		return "", err
 	}
 	return token.CompactSerialize()
-
 }
 
 // GetTokenIfExists will return the token from local storage if it exists and not expired
diff --git a/token/token_test.go b/token/token_test.go
index 5c69352d..da92ed73 100644
--- a/token/token_test.go
+++ b/token/token_test.go
@@ -1,6 +1,7 @@
 package token
 
 import (
+	"encoding/json"
 	"net/http"
 	"net/url"
 	"testing"
@@ -11,7 +12,7 @@ func TestHandleRedirects_AttachOrgToken(t *testing.T) {
 	via := []*http.Request{}
 	orgToken := "orgTokenValue"
 
-	handleRedirects(req, via, orgToken)
+	_ = handleRedirects(req, via, orgToken)
 
 	// Check if the orgToken cookie is attached
 	cookies := req.Cookies()
@@ -80,3 +81,55 @@ func TestHandleRedirects_StopAtAuthorizedEndpoint(t *testing.T) {
 		t.Errorf("Expected ErrUseLastResponse, got %v", err)
 	}
 }
+
+// TestJwtPayloadUnmarshal_AudAsString verifies that a single-string "aud" claim becomes a one-element slice.
+func TestJwtPayloadUnmarshal_AudAsString(t *testing.T) {
+	var decoded jwtPayload
+	if err := json.Unmarshal([]byte(`{"aud":"7afbdaf987054f889b3bdd0d29ebfcd2"}`), &decoded); err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if got := decoded.Aud; !(len(got) == 1 && got[0] == "7afbdaf987054f889b3bdd0d29ebfcd2") {
+		t.Errorf("Expected aud to be 7afbdaf987054f889b3bdd0d29ebfcd2, got %v", got)
+	}
+}
+
+// TestJwtPayloadUnmarshal_AudAsSlice verifies that an "aud" array is decoded into the same elements, in order.
+func TestJwtPayloadUnmarshal_AudAsSlice(t *testing.T) {
+	var decoded jwtPayload
+	if err := json.Unmarshal([]byte(`{"aud":["7afbdaf987054f889b3bdd0d29ebfcd2", "f835c0016f894768976c01e076844efe"]}`), &decoded); err != nil {
+		t.Errorf("Expected no error, got %v", err)
+	}
+	if got := decoded.Aud; !(len(got) == 2 && got[0] == "7afbdaf987054f889b3bdd0d29ebfcd2" && got[1] == "f835c0016f894768976c01e076844efe") {
+		t.Errorf("Expected aud to be [7afbdaf987054f889b3bdd0d29ebfcd2, f835c0016f894768976c01e076844efe], got %v", got)
+	}
+}
+
+// TestJwtPayloadUnmarshal_FailsWhenAudIsInt expects the "not a string or an array" error for a numeric "aud".
+func TestJwtPayloadUnmarshal_FailsWhenAudIsInt(t *testing.T) {
+	var payload jwtPayload
+	err := json.Unmarshal([]byte(`{"aud":123}`), &payload)
+	wantErr := "aud field is not a string or an array of strings"
+	if err == nil || err.Error() != wantErr { // nil check: report a failure instead of panicking on err.Error()
+		t.Errorf("Expected %v, got %v", wantErr, err)
+	}
+}
+
+// TestJwtPayloadUnmarshal_FailsWhenAudIsArrayOfInts expects the "non-string elements" error for a numeric "aud" array.
+func TestJwtPayloadUnmarshal_FailsWhenAudIsArrayOfInts(t *testing.T) {
+	var payload jwtPayload
+	err := json.Unmarshal([]byte(`{"aud": [999, 123] }`), &payload)
+	wantErr := "aud array contains non-string elements"
+	if err == nil || err.Error() != wantErr { // nil check: report a failure instead of panicking on err.Error()
+		t.Errorf("Expected %v, got %v", wantErr, err)
+	}
+}
+
+// TestJwtPayloadUnmarshal_FailsWhenAudIsOmitted expects the "not a string or an array" error when "aud" is absent.
+func TestJwtPayloadUnmarshal_FailsWhenAudIsOmitted(t *testing.T) {
+	var payload jwtPayload
+	err := json.Unmarshal([]byte(`{}`), &payload)
+	wantErr := "aud field is not a string or an array of strings"
+	if err == nil || err.Error() != wantErr { // nil check: report a failure instead of panicking on err.Error()
+		t.Errorf("Expected %v, got %v", wantErr, err)
+	}
+}
diff --git a/token/transfer.go b/token/transfer.go
index 9b035537..fd5d80ed 100644
--- a/token/transfer.go
+++ b/token/transfer.go
@@ -70,7 +70,6 @@ func RunTransfer(transferURL *url.URL, appAUD, resourceName, key, value string,
 	}
 
 	return resourceData, nil
-
 }
 
 // BuildRequestURL creates a request suitable for a resource transfer.
diff --git a/tunnelrpc/pogs/quic_metadata_protocol.go b/tunnelrpc/pogs/quic_metadata_protocol.go
index d73c9732..57333b54 100644
--- a/tunnelrpc/pogs/quic_metadata_protocol.go
+++ b/tunnelrpc/pogs/quic_metadata_protocol.go
@@ -19,8 +19,8 @@ const (
 )
 
 var (
-	// ErrorFlowConnectRateLimitedKey is the Metadata entry that allows to know if a request was rate limited on connect.
-	ErrorFlowConnectRateLimitedKey = Metadata{Key: "FlowConnectRateLimited", Val: "true"}
+	// ErrorFlowConnectRateLimitedMetadata is the Metadata entry that allows to know if a request was rate limited on connect.
+	ErrorFlowConnectRateLimitedMetadata = Metadata{Key: "FlowConnectRateLimited", Val: "true"}
 )
 
 func (c ConnectionType) String() string {
diff --git a/vendor/github.com/quic-go/quic-go/connection.go b/vendor/github.com/quic-go/quic-go/connection.go
index d65f7c2e..0dbfa574 100644
--- a/vendor/github.com/quic-go/quic-go/connection.go
+++ b/vendor/github.com/quic-go/quic-go/connection.go
@@ -8,7 +8,9 @@ import (
 	"fmt"
 	"io"
 	"net"
+	"os"
 	"reflect"
+	"strconv"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -288,6 +290,16 @@ var newConnection = func(
 		s.logger,
 	)
 	s.maxPayloadSizeEstimate.Store(uint32(estimateMaxPayloadSize(protocol.ByteCount(s.config.InitialPacketSize))))
+	// Allow server to define custom MaxUDPPayloadSize
+	maxUDPPayloadSize := protocol.MaxPacketBufferSize
+	if maxPacketSize := os.Getenv("TUNNEL_MAX_QUIC_PACKET_SIZE"); maxPacketSize != "" {
+		if customMaxPacketSize, err := strconv.ParseUint(maxPacketSize, 10, 64); err == nil {
+			maxUDPPayloadSize = int(customMaxPacketSize)
+		} else {
+			utils.DefaultLogger.Errorf("failed to parse TUNNEL_MAX_QUIC_PACKET_SIZE: %v", err)
+		}
+	}
+
 	params := &wire.TransportParameters{
 		InitialMaxStreamDataBidiLocal:   protocol.ByteCount(s.config.InitialStreamReceiveWindow),
 		InitialMaxStreamDataBidiRemote:  protocol.ByteCount(s.config.InitialStreamReceiveWindow),
@@ -298,7 +310,7 @@ var newConnection = func(
 		MaxUniStreamNum:                 protocol.StreamNum(s.config.MaxIncomingUniStreams),
 		MaxAckDelay:                     protocol.MaxAckDelayInclGranularity,
 		AckDelayExponent:                protocol.AckDelayExponent,
-		MaxUDPPayloadSize:               protocol.MaxPacketBufferSize,
+		MaxUDPPayloadSize:               protocol.ByteCount(maxUDPPayloadSize),
 		DisableActiveMigration:          true,
 		StatelessResetToken:             &statelessResetToken,
 		OriginalDestinationConnectionID: origDestConnID,
diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/cipher_suite.go b/vendor/github.com/quic-go/quic-go/internal/handshake/cipher_suite.go
index d8a381da..fd3da738 100644
--- a/vendor/github.com/quic-go/quic-go/internal/handshake/cipher_suite.go
+++ b/vendor/github.com/quic-go/quic-go/internal/handshake/cipher_suite.go
@@ -12,7 +12,9 @@ import (
 
 // These cipher suite implementations are copied from the standard library crypto/tls package.
 
-const aeadNonceLength = 12
+const (
+	aeadNonceLength = 12
+)
 
 type cipherSuite struct {
 	ID     uint16
@@ -44,12 +46,13 @@ func aeadAESGCMTLS13(key, nonceMask []byte) *xorNonceAEAD {
 	if err != nil {
 		panic(err)
 	}
-	aead, err := cipher.NewGCM(aes)
+
+	aead, err := newAEAD(aes)
 	if err != nil {
 		panic(err)
 	}
 
-	ret := &xorNonceAEAD{aead: aead}
+	ret := &xorNonceAEAD{aead: aead, hasSeenNonceZero: false}
 	copy(ret.nonceMask[:], nonceMask)
 	return ret
 }
@@ -71,8 +74,9 @@ func aeadChaCha20Poly1305(key, nonceMask []byte) *xorNonceAEAD {
 // xorNonceAEAD wraps an AEAD by XORing in a fixed pattern to the nonce
 // before each call.
 type xorNonceAEAD struct {
-	nonceMask [aeadNonceLength]byte
-	aead      cipher.AEAD
+	nonceMask        [aeadNonceLength]byte
+	aead             cipher.AEAD
+	hasSeenNonceZero bool // This value denotes if the aead field was used with a nonce = 0
 }
 
 func (f *xorNonceAEAD) NonceSize() int        { return 8 } // 64-bit sequence number
@@ -80,6 +84,10 @@ func (f *xorNonceAEAD) Overhead() int         { return f.aead.Overhead() }
 func (f *xorNonceAEAD) explicitNonceLen() int { return 0 }
 
 func (f *xorNonceAEAD) Seal(out, nonce, plaintext, additionalData []byte) []byte {
+	return f.seal(nonce, out, plaintext, additionalData) // NOTE: seal takes nonce first, then out (reverse of Seal's order)
+}
+
+func (f *xorNonceAEAD) doSeal(nonce, out, plaintext, additionalData []byte) []byte {
 	for i, b := range nonce {
 		f.nonceMask[4+i] ^= b
 	}
diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_boring.go b/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_boring.go
new file mode 100644
index 00000000..cc44c1ac
--- /dev/null
+++ b/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_boring.go
@@ -0,0 +1,51 @@
+//go:build boringcrypto
+
+package handshake
+
+import (
+	"crypto/cipher"
+	"crypto/tls"
+	"os"
+	"strings"
+)
+
+var goBoringDisabled bool = strings.TrimSpace(os.Getenv("QUIC_GO_DISABLE_BORING")) == "1" // true only when QUIC_GO_DISABLE_BORING, trimmed of whitespace, is exactly "1"
+
+// newAEAD builds the AEAD for the given AES block: tls.NewGCMTLS13 by
+// default, or cipher.NewGCM when goBoringDisabled is set.
+func newAEAD(aes cipher.Block) (cipher.AEAD, error) {
+	if !goBoringDisabled {
+		return tls.NewGCMTLS13(aes)
+	}
+	// Go Boring is disabled: fall back to the normal cryptographic procedure.
+	return cipher.NewGCM(aes)
+}
+
+// allZeros reports whether every byte of nonce is zero; an empty or nil nonce yields true.
+func allZeros(nonce []byte) bool {
+	var acc byte
+	for i := range nonce {
+		acc |= nonce[i]
+	}
+	return acc == 0
+}
+
+func (f *xorNonceAEAD) sealZeroNonce() { // throwaway doSeal call with empty nonce, out, plaintext and additional data; result discarded
+	f.doSeal([]byte{}, []byte{}, []byte{}, []byte{})
+}
+
+// seal is the boringcrypto-build implementation behind Seal. Unless Go Boring
+// is disabled, the first call makes sure the AEAD is handed a zero nonce before
+// any other nonce; every call then delegates the encryption to doSeal.
+func (f *xorNonceAEAD) seal(nonce, out, plaintext, additionalData []byte) []byte {
+	if !goBoringDisabled && !f.hasSeenNonceZero {
+		// BoringSSL expects that the first nonce passed to the AEAD
+		// instance is zero. Either the nonce argument already is zero, or an
+		// artificial one is passed to the AEAD through [sealZeroNonce].
+		// The flag is set up front so this only happens on the first call.
+		f.hasSeenNonceZero = true
+		if !allZeros(nonce) {
+			f.sealZeroNonce()
+		}
+	}
+	return f.doSeal(nonce, out, plaintext, additionalData)
+}
diff --git a/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_noboring.go b/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_noboring.go
new file mode 100644
index 00000000..147269b5
--- /dev/null
+++ b/vendor/github.com/quic-go/quic-go/internal/handshake/xor_nonce_aead_noboring.go
@@ -0,0 +1,13 @@
+//go:build !boringcrypto
+
+package handshake
+
+import "crypto/cipher"
+
+func newAEAD(aes cipher.Block) (cipher.AEAD, error) { // build without the boringcrypto tag: always cipher.NewGCM
+	return cipher.NewGCM(aes)
+}
+
+func (f *xorNonceAEAD) seal(nonce, out, plaintext, additionalData []byte) []byte { // build without the boringcrypto tag: forwards straight to doSeal
+	return f.doSeal(nonce, out, plaintext, additionalData)
+}
diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE
index 6a66aea5..2a7cf70d 100644
--- a/vendor/golang.org/x/crypto/LICENSE
+++ b/vendor/golang.org/x/crypto/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
+Copyright 2009 The Go Authors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer.
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
-   * Neither the name of Google Inc. nor the names of its
+   * Neither the name of Google LLC nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
index 9ae8206c..f75162e0 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@@ -1,722 +1,4517 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
-DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
-DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
-
-DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
-
-#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
-#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
-#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
-#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
-#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
-
-#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
-	VPADDQ  m0, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFD $-79, Y3, Y3; \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPSHUFB c40, Y1, Y1;  \
-	VPADDQ  m1, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFB c48, Y3, Y3;  \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPADDQ  Y1, Y1, t;    \
-	VPSRLQ  $63, Y1, Y1;  \
-	VPXOR   t, Y1, Y1;    \
-	VPERMQ_0x39_Y1_Y1;    \
-	VPERMQ_0x4E_Y2_Y2;    \
-	VPERMQ_0x93_Y3_Y3;    \
-	VPADDQ  m2, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFD $-79, Y3, Y3; \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPSHUFB c40, Y1, Y1;  \
-	VPADDQ  m3, Y0, Y0;   \
-	VPADDQ  Y1, Y0, Y0;   \
-	VPXOR   Y0, Y3, Y3;   \
-	VPSHUFB c48, Y3, Y3;  \
-	VPADDQ  Y3, Y2, Y2;   \
-	VPXOR   Y2, Y1, Y1;   \
-	VPADDQ  Y1, Y1, t;    \
-	VPSRLQ  $63, Y1, Y1;  \
-	VPXOR   t, Y1, Y1;    \
-	VPERMQ_0x39_Y3_Y3;    \
-	VPERMQ_0x4E_Y2_Y2;    \
-	VPERMQ_0x93_Y1_Y1
-
-#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
-#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
-#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
-#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
-#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
-
-#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
-#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
-#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
-#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
-#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
-
-#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
-#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
-#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
-#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
-#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
-
-#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
-#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
-
-#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
-#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
-
-// load msg: Y12 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
-	VMOVQ_SI_X12(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X12(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y12, Y12
-
-// load msg: Y13 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
-	VMOVQ_SI_X13(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X13(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y13, Y13
-
-// load msg: Y14 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
-	VMOVQ_SI_X14(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X14(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y14, Y14
-
-// load msg: Y15 = (i0, i1, i2, i3)
-// i0, i1, i2, i3 must not be 0
-#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
-	VMOVQ_SI_X15(i0*8);           \
-	VMOVQ_SI_X11(i2*8);           \
-	VPINSRQ_1_SI_X15(i1*8);       \
-	VPINSRQ_1_SI_X11(i3*8);       \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
-	VMOVQ_SI_X12_0;                   \
-	VMOVQ_SI_X11(4*8);                \
-	VPINSRQ_1_SI_X12(2*8);            \
-	VPINSRQ_1_SI_X11(6*8);            \
-	VINSERTI128 $1, X11, Y12, Y12;    \
-	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
-	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
-	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
-
-#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
-	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
-	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
-	VMOVQ_SI_X11(11*8);              \
-	VPSHUFD     $0x4E, 0*8(SI), X14; \
-	VPINSRQ_1_SI_X11(5*8);           \
-	VINSERTI128 $1, X11, Y14, Y14;   \
-	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
-
-#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
-	VMOVQ_SI_X11(5*8);              \
-	VMOVDQU     11*8(SI), X12;      \
-	VPINSRQ_1_SI_X11(15*8);         \
-	VINSERTI128 $1, X11, Y12, Y12;  \
-	VMOVQ_SI_X13(8*8);              \
-	VMOVQ_SI_X11(2*8);              \
-	VPINSRQ_1_SI_X13_0;             \
-	VPINSRQ_1_SI_X11(13*8);         \
-	VINSERTI128 $1, X11, Y13, Y13;  \
-	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
-	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
-
-#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
-	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
-	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
-	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
-	VMOVQ_SI_X15(6*8);               \
-	VMOVQ_SI_X11_0;                  \
-	VPINSRQ_1_SI_X15(10*8);          \
-	VPINSRQ_1_SI_X11(8*8);           \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
-	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
-	VMOVQ_SI_X13_0;                  \
-	VMOVQ_SI_X11(4*8);               \
-	VPINSRQ_1_SI_X13(7*8);           \
-	VPINSRQ_1_SI_X11(15*8);          \
-	VINSERTI128 $1, X11, Y13, Y13;   \
-	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
-	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
-
-#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
-	VMOVQ_SI_X12(2*8);                \
-	VMOVQ_SI_X11_0;                   \
-	VPINSRQ_1_SI_X12(6*8);            \
-	VPINSRQ_1_SI_X11(8*8);            \
-	VINSERTI128 $1, X11, Y12, Y12;    \
-	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
-	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
-	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
-
-#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
-	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
-	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
-	VMOVQ_SI_X14_0;                   \
-	VPSHUFD     $0x4E, 8*8(SI), X11;  \
-	VPINSRQ_1_SI_X14(6*8);            \
-	VINSERTI128 $1, X11, Y14, Y14;    \
-	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
-
-#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
-	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
-	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
-	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
-	VMOVQ_SI_X15_0;                  \
-	VMOVQ_SI_X11(6*8);               \
-	VPINSRQ_1_SI_X15(4*8);           \
-	VPINSRQ_1_SI_X11(10*8);          \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
-	VMOVQ_SI_X12(6*8);              \
-	VMOVQ_SI_X11(11*8);             \
-	VPINSRQ_1_SI_X12(14*8);         \
-	VPINSRQ_1_SI_X11_0;             \
-	VINSERTI128 $1, X11, Y12, Y12;  \
-	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
-	VMOVQ_SI_X11(1*8);              \
-	VMOVDQU     12*8(SI), X14;      \
-	VPINSRQ_1_SI_X11(10*8);         \
-	VINSERTI128 $1, X11, Y14, Y14;  \
-	VMOVQ_SI_X15(2*8);              \
-	VMOVDQU     4*8(SI), X11;       \
-	VPINSRQ_1_SI_X15(7*8);          \
-	VINSERTI128 $1, X11, Y15, Y15
-
-#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
-	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
-	VMOVQ_SI_X13(2*8);               \
-	VPSHUFD     $0x4E, 5*8(SI), X11; \
-	VPINSRQ_1_SI_X13(4*8);           \
-	VINSERTI128 $1, X11, Y13, Y13;   \
-	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
-	VMOVQ_SI_X15(11*8);              \
-	VMOVQ_SI_X11(12*8);              \
-	VPINSRQ_1_SI_X15(14*8);          \
-	VPINSRQ_1_SI_X11_0;              \
-	VINSERTI128 $1, X11, Y15, Y15
-
 // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, DX
-	ADDQ $31, DX
-	ANDQ $~31, DX
-
-	MOVQ CX, 16(DX)
-	XORQ CX, CX
-	MOVQ CX, 24(DX)
-
-	VMOVDQU ·AVX2_c40<>(SB), Y4
-	VMOVDQU ·AVX2_c48<>(SB), Y5
-
-	VMOVDQU 0(AX), Y8
+// Requires: AVX, AVX2
+TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48
+	MOVQ    h+0(FP), AX
+	MOVQ    c+8(FP), BX
+	MOVQ    flag+16(FP), CX
+	MOVQ    blocks_base+24(FP), SI
+	MOVQ    blocks_len+32(FP), DI
+	MOVQ    SP, DX
+	ADDQ    $+31, DX
+	ANDQ    $-32, DX
+	MOVQ    CX, 16(DX)
+	XORQ    CX, CX
+	MOVQ    CX, 24(DX)
+	VMOVDQU ·AVX2_c40<>+0(SB), Y4
+	VMOVDQU ·AVX2_c48<>+0(SB), Y5
+	VMOVDQU (AX), Y8
 	VMOVDQU 32(AX), Y9
-	VMOVDQU ·AVX2_iv0<>(SB), Y6
-	VMOVDQU ·AVX2_iv1<>(SB), Y7
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
-	MOVQ R9, 8(DX)
+	VMOVDQU ·AVX2_iv0<>+0(SB), Y6
+	VMOVDQU ·AVX2_iv1<>+0(SB), Y7
+	MOVQ    (BX), R8
+	MOVQ    8(BX), R9
+	MOVQ    R9, 8(DX)
 
 loop:
-	ADDQ $128, R8
-	MOVQ R8, 0(DX)
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	MOVQ R8, (DX)
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 	MOVQ R9, 8(DX)
 
 noinc:
-	VMOVDQA Y8, Y0
-	VMOVDQA Y9, Y1
-	VMOVDQA Y6, Y2
-	VPXOR   0(DX), Y7, Y3
-
-	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
-	VMOVDQA Y12, 32(DX)
-	VMOVDQA Y13, 64(DX)
-	VMOVDQA Y14, 96(DX)
-	VMOVDQA Y15, 128(DX)
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
-	VMOVDQA Y12, 160(DX)
-	VMOVDQA Y13, 192(DX)
-	VMOVDQA Y14, 224(DX)
-	VMOVDQA Y15, 256(DX)
-
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
-	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
-
-	ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5)
-	ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5)
-
-	VPXOR Y0, Y8, Y8
-	VPXOR Y1, Y9, Y9
-	VPXOR Y2, Y8, Y8
-	VPXOR Y3, Y9, Y9
-
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
-
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
-
-	VMOVDQU Y8, 0(AX)
-	VMOVDQU Y9, 32(AX)
+	VMOVDQA     Y8, Y0
+	VMOVDQA     Y9, Y1
+	VMOVDQA     Y6, Y2
+	VPXOR       (DX), Y7, Y3
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x26
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x10
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x08
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x28
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x40
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x58
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VMOVDQA     Y12, 32(DX)
+	VMOVDQA     Y13, 64(DX)
+	VMOVDQA     Y14, 96(DX)
+	VMOVDQA     Y15, 128(DX)
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x20
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x40
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	VPSHUFD     $0x4e, (SI), X14
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x28
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x10
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VMOVDQA     Y12, 160(DX)
+	VMOVDQA     Y13, 192(DX)
+	VMOVDQA     Y14, 224(DX)
+	VMOVDQA     Y15, 256(DX)
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x28
+	VMOVDQU     88(SI), X12
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x40
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x2e
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x30
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x38
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x08
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x10
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x30
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x1e
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x48
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x2e
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x70
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x58
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x08
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x60
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x10
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x1e
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x30
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x50
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x20
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x78
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x68
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x28
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x60
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x70
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x08
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x28
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x78
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x36
+	VPSHUFD     $0x4e, 64(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x30
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x38
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x18
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x68
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x38
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x58
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x48
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x28
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x78
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x10
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x3e
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x30
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x20
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x30
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x58
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x1e
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x78
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x48
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x40
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x08
+	VMOVDQU     96(SI), X14
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x50
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x10
+	VMOVDQU     32(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x38
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x66
+	BYTE        $0x50
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x38
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x99
+	BYTE        $0x22
+	BYTE        $0x66
+	BYTE        $0x40
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x08
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y12, Y12
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x6e
+	BYTE        $0x10
+	VPSHUFD     $0x4e, 40(SI), X11
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x91
+	BYTE        $0x22
+	BYTE        $0x6e
+	BYTE        $0x20
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y13, Y13
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x76
+	BYTE        $0x78
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x18
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x89
+	BYTE        $0x22
+	BYTE        $0x76
+	BYTE        $0x48
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x5e
+	BYTE        $0x68
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y14, Y14
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x7e
+	BYTE        $0x58
+	BYTE        $0xc5
+	BYTE        $0x7a
+	BYTE        $0x7e
+	BYTE        $0x5e
+	BYTE        $0x60
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0x81
+	BYTE        $0x22
+	BYTE        $0x7e
+	BYTE        $0x70
+	BYTE        $0x01
+	BYTE        $0xc4
+	BYTE        $0x63
+	BYTE        $0xa1
+	BYTE        $0x22
+	BYTE        $0x1e
+	BYTE        $0x01
+	VINSERTI128 $0x01, X11, Y15, Y15
+	VPADDQ      Y12, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y13, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      Y14, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      Y15, Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPADDQ      32(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      64(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      96(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      128(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPADDQ      160(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      192(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x93
+	VPADDQ      224(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFD     $-79, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPSHUFB     Y4, Y1, Y1
+	VPADDQ      256(DX), Y0, Y0
+	VPADDQ      Y1, Y0, Y0
+	VPXOR       Y0, Y3, Y3
+	VPSHUFB     Y5, Y3, Y3
+	VPADDQ      Y3, Y2, Y2
+	VPXOR       Y2, Y1, Y1
+	VPADDQ      Y1, Y1, Y10
+	VPSRLQ      $0x3f, Y1, Y1
+	VPXOR       Y10, Y1, Y1
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xdb
+	BYTE        $0x39
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xd2
+	BYTE        $0x4e
+	BYTE        $0xc4
+	BYTE        $0xe3
+	BYTE        $0xfd
+	BYTE        $0x00
+	BYTE        $0xc9
+	BYTE        $0x93
+	VPXOR       Y0, Y8, Y8
+	VPXOR       Y1, Y9, Y9
+	VPXOR       Y2, Y8, Y8
+	VPXOR       Y3, Y9, Y9
+	LEAQ        128(SI), SI
+	SUBQ        $0x80, DI
+	JNE         loop
+	MOVQ        R8, (BX)
+	MOVQ        R9, 8(BX)
+	VMOVDQU     Y8, (AX)
+	VMOVDQU     Y9, 32(AX)
 	VZEROUPPER
-
 	RET
 
-#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
-#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
-#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
-#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
-#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
+DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 // byte-index table, bytes 0-7 in memory (little-endian): 03 04 05 06 07 00 01 02
+DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b // bytes 8-15: the same pattern with every index offset by 8
+DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 // bytes 16-23 repeat bytes 0-7
+DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b // bytes 24-31 repeat bytes 8-15
+GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 // 32-byte read-only, pointer-free symbol. Used as a VPSHUFB control mask, this pattern rotates each 64-bit word right by 24 bits (i.e. left by 40) — presumably how it is consumed; confirm at the load site.
 
-#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
-#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
-#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
-#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
-#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
+DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 // byte-index table, bytes 0-7 in memory (little-endian): 02 03 04 05 06 07 00 01
+DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a // bytes 8-15: the same pattern with every index offset by 8
+DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 // bytes 16-23 repeat bytes 0-7
+DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a // bytes 24-31 repeat bytes 8-15
+GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 // 32-byte read-only, pointer-free symbol. Used as a VPSHUFB control mask, this pattern rotates each 64-bit word right by 16 bits (i.e. left by 48) — presumably how it is consumed; confirm at the load site.
 
-#define SHUFFLE_AVX() \
-	VMOVDQA X6, X13;         \
-	VMOVDQA X2, X14;         \
-	VMOVDQA X4, X6;          \
-	VPUNPCKLQDQ_X13_X13_X15; \
-	VMOVDQA X5, X4;          \
-	VMOVDQA X6, X5;          \
-	VPUNPCKHQDQ_X15_X7_X6;   \
-	VPUNPCKLQDQ_X7_X7_X15;   \
-	VPUNPCKHQDQ_X15_X13_X7;  \
-	VPUNPCKLQDQ_X3_X3_X15;   \
-	VPUNPCKHQDQ_X15_X2_X2;   \
-	VPUNPCKLQDQ_X14_X14_X15; \
-	VPUNPCKHQDQ_X15_X3_X3;   \
+DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 // equals BLAKE2b IV[0] as listed in RFC 7693
+DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b // equals BLAKE2b IV[1]
+DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b // equals BLAKE2b IV[2]
+DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 // equals BLAKE2b IV[3]
+GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 // 32-byte read-only, pointer-free symbol holding four consecutive 64-bit words (IV[0..3])
 
-#define SHUFFLE_AVX_INV() \
-	VMOVDQA X2, X13;         \
-	VMOVDQA X4, X14;         \
-	VPUNPCKLQDQ_X2_X2_X15;   \
-	VMOVDQA X5, X4;          \
-	VPUNPCKHQDQ_X15_X3_X2;   \
-	VMOVDQA X14, X5;         \
-	VPUNPCKLQDQ_X3_X3_X15;   \
-	VMOVDQA X6, X14;         \
-	VPUNPCKHQDQ_X15_X13_X3;  \
-	VPUNPCKLQDQ_X7_X7_X15;   \
-	VPUNPCKHQDQ_X15_X6_X6;   \
-	VPUNPCKLQDQ_X14_X14_X15; \
-	VPUNPCKHQDQ_X15_X7_X7;   \
-
-#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
-	VPADDQ  m0, v0, v0;   \
-	VPADDQ  v2, v0, v0;   \
-	VPADDQ  m1, v1, v1;   \
-	VPADDQ  v3, v1, v1;   \
-	VPXOR   v0, v6, v6;   \
-	VPXOR   v1, v7, v7;   \
-	VPSHUFD $-79, v6, v6; \
-	VPSHUFD $-79, v7, v7; \
-	VPADDQ  v6, v4, v4;   \
-	VPADDQ  v7, v5, v5;   \
-	VPXOR   v4, v2, v2;   \
-	VPXOR   v5, v3, v3;   \
-	VPSHUFB c40, v2, v2;  \
-	VPSHUFB c40, v3, v3;  \
-	VPADDQ  m2, v0, v0;   \
-	VPADDQ  v2, v0, v0;   \
-	VPADDQ  m3, v1, v1;   \
-	VPADDQ  v3, v1, v1;   \
-	VPXOR   v0, v6, v6;   \
-	VPXOR   v1, v7, v7;   \
-	VPSHUFB c48, v6, v6;  \
-	VPSHUFB c48, v7, v7;  \
-	VPADDQ  v6, v4, v4;   \
-	VPADDQ  v7, v5, v5;   \
-	VPXOR   v4, v2, v2;   \
-	VPXOR   v5, v3, v3;   \
-	VPADDQ  v2, v2, t0;   \
-	VPSRLQ  $63, v2, v2;  \
-	VPXOR   t0, v2, v2;   \
-	VPADDQ  v3, v3, t0;   \
-	VPSRLQ  $63, v3, v3;  \
-	VPXOR   t0, v3, v3
-
-// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
-// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
-#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
-	VMOVQ_SI_X12(i0*8);     \
-	VMOVQ_SI_X13(i2*8);     \
-	VMOVQ_SI_X14(i4*8);     \
-	VMOVQ_SI_X15(i6*8);     \
-	VPINSRQ_1_SI_X12(i1*8); \
-	VPINSRQ_1_SI_X13(i3*8); \
-	VPINSRQ_1_SI_X14(i5*8); \
-	VPINSRQ_1_SI_X15(i7*8)
-
-// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
-#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
-	VMOVQ_SI_X12_0;        \
-	VMOVQ_SI_X13(4*8);     \
-	VMOVQ_SI_X14(1*8);     \
-	VMOVQ_SI_X15(5*8);     \
-	VPINSRQ_1_SI_X12(2*8); \
-	VPINSRQ_1_SI_X13(6*8); \
-	VPINSRQ_1_SI_X14(3*8); \
-	VPINSRQ_1_SI_X15(7*8)
-
-// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
-#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
-	VPSHUFD $0x4E, 0*8(SI), X12; \
-	VMOVQ_SI_X13(11*8);          \
-	VMOVQ_SI_X14(12*8);          \
-	VMOVQ_SI_X15(7*8);           \
-	VPINSRQ_1_SI_X13(5*8);       \
-	VPINSRQ_1_SI_X14(2*8);       \
-	VPINSRQ_1_SI_X15(3*8)
-
-// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
-#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
-	VMOVDQU 11*8(SI), X12;  \
-	VMOVQ_SI_X13(5*8);      \
-	VMOVQ_SI_X14(8*8);      \
-	VMOVQ_SI_X15(2*8);      \
-	VPINSRQ_1_SI_X13(15*8); \
-	VPINSRQ_1_SI_X14_0;     \
-	VPINSRQ_1_SI_X15(13*8)
-
-// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
-#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
-	VMOVQ_SI_X12(2*8);      \
-	VMOVQ_SI_X13(4*8);      \
-	VMOVQ_SI_X14(6*8);      \
-	VMOVQ_SI_X15_0;         \
-	VPINSRQ_1_SI_X12(5*8);  \
-	VPINSRQ_1_SI_X13(15*8); \
-	VPINSRQ_1_SI_X14(10*8); \
-	VPINSRQ_1_SI_X15(8*8)
-
-// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
-#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
-	VMOVQ_SI_X12(9*8);      \
-	VMOVQ_SI_X13(2*8);      \
-	VMOVQ_SI_X14_0;         \
-	VMOVQ_SI_X15(4*8);      \
-	VPINSRQ_1_SI_X12(5*8);  \
-	VPINSRQ_1_SI_X13(10*8); \
-	VPINSRQ_1_SI_X14(7*8);  \
-	VPINSRQ_1_SI_X15(15*8)
-
-// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
-#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
-	VMOVQ_SI_X12(2*8);      \
-	VMOVQ_SI_X13_0;         \
-	VMOVQ_SI_X14(12*8);     \
-	VMOVQ_SI_X15(11*8);     \
-	VPINSRQ_1_SI_X12(6*8);  \
-	VPINSRQ_1_SI_X13(8*8);  \
-	VPINSRQ_1_SI_X14(10*8); \
-	VPINSRQ_1_SI_X15(3*8)
-
-// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
-#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
-	MOVQ    0*8(SI), X12;        \
-	VPSHUFD $0x4E, 8*8(SI), X13; \
-	MOVQ    7*8(SI), X14;        \
-	MOVQ    2*8(SI), X15;        \
-	VPINSRQ_1_SI_X12(6*8);       \
-	VPINSRQ_1_SI_X14(3*8);       \
-	VPINSRQ_1_SI_X15(11*8)
-
-// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
-#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
-	MOVQ 6*8(SI), X12;      \
-	MOVQ 11*8(SI), X13;     \
-	MOVQ 15*8(SI), X14;     \
-	MOVQ 3*8(SI), X15;      \
-	VPINSRQ_1_SI_X12(14*8); \
-	VPINSRQ_1_SI_X13_0;     \
-	VPINSRQ_1_SI_X14(9*8);  \
-	VPINSRQ_1_SI_X15(8*8)
-
-// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
-#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
-	MOVQ 5*8(SI), X12;      \
-	MOVQ 8*8(SI), X13;      \
-	MOVQ 0*8(SI), X14;      \
-	MOVQ 6*8(SI), X15;      \
-	VPINSRQ_1_SI_X12(15*8); \
-	VPINSRQ_1_SI_X13(2*8);  \
-	VPINSRQ_1_SI_X14(4*8);  \
-	VPINSRQ_1_SI_X15(10*8)
-
-// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
-#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
-	VMOVDQU 12*8(SI), X12;  \
-	MOVQ    1*8(SI), X13;   \
-	MOVQ    2*8(SI), X14;   \
-	VPINSRQ_1_SI_X13(10*8); \
-	VPINSRQ_1_SI_X14(7*8);  \
-	VMOVDQU 4*8(SI), X15
-
-// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
-#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
-	MOVQ 15*8(SI), X12;     \
-	MOVQ 3*8(SI), X13;      \
-	MOVQ 11*8(SI), X14;     \
-	MOVQ 12*8(SI), X15;     \
-	VPINSRQ_1_SI_X12(9*8);  \
-	VPINSRQ_1_SI_X13(13*8); \
-	VPINSRQ_1_SI_X14(14*8); \
-	VPINSRQ_1_SI_X15_0
+DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 // equals BLAKE2b IV[4] as listed in RFC 7693
+DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f // equals BLAKE2b IV[5]
+DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b // equals BLAKE2b IV[6]
+DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 // equals BLAKE2b IV[7]
+GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // 32-byte read-only, pointer-free symbol holding four consecutive 64-bit words (IV[4..7])
 
 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, R10
-	ADDQ $15, R10
-	ANDQ $~15, R10
-
-	VMOVDQU ·AVX_c40<>(SB), X0
-	VMOVDQU ·AVX_c48<>(SB), X1
+// Requires: AVX, SSE2
+TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48
+	MOVQ    h+0(FP), AX
+	MOVQ    c+8(FP), BX
+	MOVQ    flag+16(FP), CX
+	MOVQ    blocks_base+24(FP), SI
+	MOVQ    blocks_len+32(FP), DI
+	MOVQ    SP, R10
+	ADDQ    $0x0f, R10
+	ANDQ    $-16, R10
+	VMOVDQU ·AVX_c40<>+0(SB), X0
+	VMOVDQU ·AVX_c48<>+0(SB), X1
 	VMOVDQA X0, X8
 	VMOVDQA X1, X9
-
-	VMOVDQU ·AVX_iv3<>(SB), X0
-	VMOVDQA X0, 0(R10)
-	XORQ    CX, 0(R10)          // 0(R10) = ·AVX_iv3 ^ (CX || 0)
-
-	VMOVDQU 0(AX), X10
+	VMOVDQU ·AVX_iv3<>+0(SB), X0
+	VMOVDQA X0, (R10)
+	XORQ    CX, (R10)
+	VMOVDQU (AX), X10
 	VMOVDQU 16(AX), X11
 	VMOVDQU 32(AX), X2
 	VMOVDQU 48(AX), X3
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
+	MOVQ    (BX), R8
+	MOVQ    8(BX), R9
 
 loop:
-	ADDQ $128, R8
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 
 noinc:
-	VMOVQ_R8_X15
-	VPINSRQ_1_R9_X15
-
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0xf9
+	BYTE    $0x6e
+	BYTE    $0xf8
+	BYTE    $0xc4
+	BYTE    $0x43
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0xf9
+	BYTE    $0x01
 	VMOVDQA X10, X0
 	VMOVDQA X11, X1
-	VMOVDQU ·AVX_iv0<>(SB), X4
-	VMOVDQU ·AVX_iv1<>(SB), X5
-	VMOVDQU ·AVX_iv2<>(SB), X6
-
+	VMOVDQU ·AVX_iv0<>+0(SB), X4
+	VMOVDQU ·AVX_iv1<>+0(SB), X5
+	VMOVDQU ·AVX_iv2<>+0(SB), X6
 	VPXOR   X15, X6, X6
-	VMOVDQA 0(R10), X7
-
-	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
+	VMOVDQA (R10), X7
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x26
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x28
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x38
+	BYTE    $0x01
 	VMOVDQA X12, 16(R10)
 	VMOVDQA X13, 32(R10)
 	VMOVDQA X14, 48(R10)
 	VMOVDQA X15, 64(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x40
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0x01
 	VMOVDQA X12, 80(R10)
 	VMOVDQA X13, 96(R10)
 	VMOVDQA X14, 112(R10)
 	VMOVDQA X15, 128(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x30
+	BYTE    $0x01
 	VMOVDQA X12, 144(R10)
 	VMOVDQA X13, 160(R10)
 	VMOVDQA X14, 176(R10)
 	VMOVDQA X15, 192(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPSHUFD $0x4e, (SI), X12
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x58
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x38
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x18
+	BYTE    $0x01
 	VMOVDQA X12, 208(R10)
 	VMOVDQA X13, 224(R10)
 	VMOVDQA X14, 240(R10)
 	VMOVDQA X15, 256(R10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX()
-	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9)
-	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9)
-	SHUFFLE_AVX()
-	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9)
-	SHUFFLE_AVX_INV()
-
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VMOVDQU 88(SI), X12
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x28
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x40
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x36
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x08
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x20
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x60
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x70
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x30
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x3e
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x48
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x36
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x78
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x30
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x08
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x58
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x2e
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x58
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x18
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x20
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x78
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x70
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x28
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x48
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x70
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x28
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x68
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x50
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    (SI), X12
+	VPSHUFD $0x4e, 64(SI), X13
+	MOVQ    56(SI), X14
+	MOVQ    16(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x30
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x58
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x68
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x60
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x58
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x08
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x38
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x18
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x48
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    40(SI), X12
+	MOVQ    64(SI), X13
+	MOVQ    (SI), X14
+	MOVQ    48(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x78
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x10
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x50
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	MOVQ    48(SI), X12
+	MOVQ    88(SI), X13
+	MOVQ    120(SI), X14
+	MOVQ    24(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x2e
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x40
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VMOVDQU 96(SI), X12
+	MOVQ    8(SI), X13
+	MOVQ    16(SI), X14
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x50
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x38
+	BYTE    $0x01
+	VMOVDQU 32(SI), X15
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x66
+	BYTE    $0x50
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x6e
+	BYTE    $0x38
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x76
+	BYTE    $0x10
+	BYTE    $0xc5
+	BYTE    $0x7a
+	BYTE    $0x7e
+	BYTE    $0x7e
+	BYTE    $0x30
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x40
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x08
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x20
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x7e
+	BYTE    $0x28
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	MOVQ    120(SI), X12
+	MOVQ    24(SI), X13
+	MOVQ    88(SI), X14
+	MOVQ    96(SI), X15
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x99
+	BYTE    $0x22
+	BYTE    $0x66
+	BYTE    $0x48
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x91
+	BYTE    $0x22
+	BYTE    $0x6e
+	BYTE    $0x68
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x89
+	BYTE    $0x22
+	BYTE    $0x76
+	BYTE    $0x70
+	BYTE    $0x01
+	BYTE    $0xc4
+	BYTE    $0x63
+	BYTE    $0x81
+	BYTE    $0x22
+	BYTE    $0x3e
+	BYTE    $0x01
+	VPADDQ  X12, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X13, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  X14, X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  X15, X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VPADDQ  16(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  32(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  48(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  64(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPADDQ  80(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  96(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  112(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  128(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
+	VPADDQ  144(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  160(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  176(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  192(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X6, X13
+	VMOVDQA X2, X14
+	VMOVDQA X4, X6
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x11
+	BYTE    $0x6c
+	BYTE    $0xfd
+	VMOVDQA X5, X4
+	VMOVDQA X6, X5
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xff
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x69
+	BYTE    $0x6d
+	BYTE    $0xd7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xdf
+	VPADDQ  208(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  224(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFD $-79, X6, X6
+	VPSHUFD $-79, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPSHUFB X8, X2, X2
+	VPSHUFB X8, X3, X3
+	VPADDQ  240(R10), X0, X0
+	VPADDQ  X2, X0, X0
+	VPADDQ  256(R10), X1, X1
+	VPADDQ  X3, X1, X1
+	VPXOR   X0, X6, X6
+	VPXOR   X1, X7, X7
+	VPSHUFB X9, X6, X6
+	VPSHUFB X9, X7, X7
+	VPADDQ  X6, X4, X4
+	VPADDQ  X7, X5, X5
+	VPXOR   X4, X2, X2
+	VPXOR   X5, X3, X3
+	VPADDQ  X2, X2, X15
+	VPSRLQ  $0x3f, X2, X2
+	VPXOR   X15, X2, X2
+	VPADDQ  X3, X3, X15
+	VPSRLQ  $0x3f, X3, X3
+	VPXOR   X15, X3, X3
+	VMOVDQA X2, X13
+	VMOVDQA X4, X14
+	BYTE    $0xc5
+	BYTE    $0x69
+	BYTE    $0x6c
+	BYTE    $0xfa
+	VMOVDQA X5, X4
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x61
+	BYTE    $0x6d
+	BYTE    $0xd7
+	VMOVDQA X14, X5
+	BYTE    $0xc5
+	BYTE    $0x61
+	BYTE    $0x6c
+	BYTE    $0xfb
+	VMOVDQA X6, X14
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x11
+	BYTE    $0x6d
+	BYTE    $0xdf
+	BYTE    $0xc5
+	BYTE    $0x41
+	BYTE    $0x6c
+	BYTE    $0xff
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x49
+	BYTE    $0x6d
+	BYTE    $0xf7
+	BYTE    $0xc4
+	BYTE    $0x41
+	BYTE    $0x09
+	BYTE    $0x6c
+	BYTE    $0xfe
+	BYTE    $0xc4
+	BYTE    $0xc1
+	BYTE    $0x41
+	BYTE    $0x6d
+	BYTE    $0xff
 	VMOVDQU 32(AX), X14
 	VMOVDQU 48(AX), X15
 	VPXOR   X0, X10, X10
@@ -729,16 +4524,36 @@ noinc:
 	VPXOR   X7, X15, X3
 	VMOVDQU X2, 32(AX)
 	VMOVDQU X3, 48(AX)
-
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
-
-	VMOVDQU X10, 0(AX)
+	LEAQ    128(SI), SI
+	SUBQ    $0x80, DI
+	JNE     loop
+	VMOVDQU X10, (AX)
 	VMOVDQU X11, 16(AX)
-
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
+	MOVQ    R8, (BX)
+	MOVQ    R9, 8(BX)
 	VZEROUPPER
-
 	RET
+
+DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 // little-endian byte indices 3,4,5,6,7,0,1,2 (low 64-bit lane)
+DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b // same pattern offset by 8 for the high lane: 11,12,13,14,15,8,9,10
+GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant; used as a (V)PSHUFB control it rotates each 64-bit lane right by 24 bits (left by 40) - NOTE(review): the load site is outside this view, confirm usage
+
+DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 // little-endian byte indices 2,3,4,5,6,7,0,1 (low 64-bit lane)
+DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a // same pattern offset by 8 for the high lane: 10,11,12,13,14,15,8,9
+GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant; used as a (V)PSHUFB control it rotates each 64-bit lane right by 16 bits (left by 48) - NOTE(review): the load site is outside this view, confirm usage
+
+DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b // equals BLAKE2b initialization-vector word IV[6] (RFC 7693, section 2.6)
+DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 // equals BLAKE2b initialization-vector word IV[7]
+GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant holding the IV[6], IV[7] pair
+
+DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 // equals BLAKE2b initialization-vector word IV[0] (RFC 7693, section 2.6)
+DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b // equals BLAKE2b initialization-vector word IV[1]
+GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant holding the IV[0], IV[1] pair
+
+DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b // equals BLAKE2b initialization-vector word IV[2] (RFC 7693, section 2.6)
+DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 // equals BLAKE2b initialization-vector word IV[3]
+GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant holding the IV[2], IV[3] pair
+
+DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 // equals BLAKE2b initialization-vector word IV[4] (RFC 7693, section 2.6)
+DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f // equals BLAKE2b initialization-vector word IV[5]
+GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 // file-local, read-only, pointer-free 16-byte constant holding the IV[4], IV[5] pair
diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
index adfac00c..9a0ce212 100644
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@@ -1,278 +1,1441 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT.
 
 //go:build amd64 && gc && !purego
 
 #include "textflag.h"
 
-DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
-DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
-GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
-DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
-GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
-DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
-GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
-
-DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
-DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
-GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
-DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
-GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
-
-DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
-DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
-GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
-
-#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v6, t1; \
-	PUNPCKLQDQ v6, t2; \
-	PUNPCKHQDQ v7, v6; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ v7, t2; \
-	MOVO       t1, v7; \
-	MOVO       v2, t1; \
-	PUNPCKHQDQ t2, v7; \
-	PUNPCKLQDQ v3, t2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v3
-
-#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
-	MOVO       v4, t1; \
-	MOVO       v5, v4; \
-	MOVO       t1, v5; \
-	MOVO       v2, t1; \
-	PUNPCKLQDQ v2, t2; \
-	PUNPCKHQDQ v3, v2; \
-	PUNPCKHQDQ t2, v2; \
-	PUNPCKLQDQ v3, t2; \
-	MOVO       t1, v3; \
-	MOVO       v6, t1; \
-	PUNPCKHQDQ t2, v3; \
-	PUNPCKLQDQ v7, t2; \
-	PUNPCKHQDQ t2, v6; \
-	PUNPCKLQDQ t1, t2; \
-	PUNPCKHQDQ t2, v7
-
-#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
-	PADDQ  m0, v0;        \
-	PADDQ  m1, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFD $0xB1, v6, v6; \
-	PSHUFD $0xB1, v7, v7; \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	PSHUFB c40, v2;       \
-	PSHUFB c40, v3;       \
-	PADDQ  m2, v0;        \
-	PADDQ  m3, v1;        \
-	PADDQ  v2, v0;        \
-	PADDQ  v3, v1;        \
-	PXOR   v0, v6;        \
-	PXOR   v1, v7;        \
-	PSHUFB c48, v6;       \
-	PSHUFB c48, v7;       \
-	PADDQ  v6, v4;        \
-	PADDQ  v7, v5;        \
-	PXOR   v4, v2;        \
-	PXOR   v5, v3;        \
-	MOVOU  v2, t0;        \
-	PADDQ  v2, t0;        \
-	PSRLQ  $63, v2;       \
-	PXOR   t0, v2;        \
-	MOVOU  v3, t0;        \
-	PADDQ  v3, t0;        \
-	PSRLQ  $63, v3;       \
-	PXOR   t0, v3
-
-#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
-	MOVQ   i0*8(src), m0;     \
-	PINSRQ $1, i1*8(src), m0; \
-	MOVQ   i2*8(src), m1;     \
-	PINSRQ $1, i3*8(src), m1; \
-	MOVQ   i4*8(src), m2;     \
-	PINSRQ $1, i5*8(src), m2; \
-	MOVQ   i6*8(src), m3;     \
-	PINSRQ $1, i7*8(src), m3
-
 // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
-TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
-	MOVQ h+0(FP), AX
-	MOVQ c+8(FP), BX
-	MOVQ flag+16(FP), CX
-	MOVQ blocks_base+24(FP), SI
-	MOVQ blocks_len+32(FP), DI
-
-	MOVQ SP, R10
-	ADDQ $15, R10
-	ANDQ $~15, R10
-
-	MOVOU ·iv3<>(SB), X0
-	MOVO  X0, 0(R10)
-	XORQ  CX, 0(R10)     // 0(R10) = ·iv3 ^ (CX || 0)
-
-	MOVOU ·c40<>(SB), X13
-	MOVOU ·c48<>(SB), X14
-
-	MOVOU 0(AX), X12
+// Requires: SSE2, SSE4.1, SSSE3
+TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48
+	MOVQ  h+0(FP), AX
+	MOVQ  c+8(FP), BX
+	MOVQ  flag+16(FP), CX
+	MOVQ  blocks_base+24(FP), SI
+	MOVQ  blocks_len+32(FP), DI
+	MOVQ  SP, R10
+	ADDQ  $0x0f, R10
+	ANDQ  $-16, R10
+	MOVOU ·iv3<>+0(SB), X0
+	MOVO  X0, (R10)
+	XORQ  CX, (R10)
+	MOVOU ·c40<>+0(SB), X13
+	MOVOU ·c48<>+0(SB), X14
+	MOVOU (AX), X12
 	MOVOU 16(AX), X15
-
-	MOVQ 0(BX), R8
-	MOVQ 8(BX), R9
+	MOVQ  (BX), R8
+	MOVQ  8(BX), R9
 
 loop:
-	ADDQ $128, R8
-	CMPQ R8, $128
+	ADDQ $0x80, R8
+	CMPQ R8, $0x80
 	JGE  noinc
 	INCQ R9
 
 noinc:
-	MOVQ R8, X8
-	PINSRQ $1, R9, X8
-
-	MOVO X12, X0
-	MOVO X15, X1
-	MOVOU 32(AX), X2
-	MOVOU 48(AX), X3
-	MOVOU ·iv0<>(SB), X4
-	MOVOU ·iv1<>(SB), X5
-	MOVOU ·iv2<>(SB), X6
-
-	PXOR X8, X6
-	MOVO 0(R10), X7
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
-	MOVO X8, 16(R10)
-	MOVO X9, 32(R10)
-	MOVO X10, 48(R10)
-	MOVO X11, 64(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
-	MOVO X8, 80(R10)
-	MOVO X9, 96(R10)
-	MOVO X10, 112(R10)
-	MOVO X11, 128(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
-	MOVO X8, 144(R10)
-	MOVO X9, 160(R10)
-	MOVO X10, 176(R10)
-	MOVO X11, 192(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
-	MOVO X8, 208(R10)
-	MOVO X9, 224(R10)
-	MOVO X10, 240(R10)
-	MOVO X11, 256(R10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14)
-	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
-	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14)
-	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
-
-	MOVOU 32(AX), X10
-	MOVOU 48(AX), X11
-	PXOR  X0, X12
-	PXOR  X1, X15
-	PXOR  X2, X10
-	PXOR  X3, X11
-	PXOR  X4, X12
-	PXOR  X5, X15
-	PXOR  X6, X10
-	PXOR  X7, X11
-	MOVOU X10, 32(AX)
-	MOVOU X11, 48(AX)
-
-	LEAQ 128(SI), SI
-	SUBQ $128, DI
-	JNE  loop
-
-	MOVOU X12, 0(AX)
-	MOVOU X15, 16(AX)
-
-	MOVQ R8, 0(BX)
-	MOVQ R9, 8(BX)
-
+	MOVQ       R8, X8
+	PINSRQ     $0x01, R9, X8
+	MOVO       X12, X0
+	MOVO       X15, X1
+	MOVOU      32(AX), X2
+	MOVOU      48(AX), X3
+	MOVOU      ·iv0<>+0(SB), X4
+	MOVOU      ·iv1<>+0(SB), X5
+	MOVOU      ·iv2<>+0(SB), X6
+	PXOR       X8, X6
+	MOVO       (R10), X7
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 16(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 48(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       40(SI), X11
+	PINSRQ     $0x01, 56(SI), X11
+	MOVO       X8, 16(R10)
+	MOVO       X9, 32(R10)
+	MOVO       X10, 48(R10)
+	MOVO       X11, 64(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       64(SI), X8
+	PINSRQ     $0x01, 80(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 112(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 88(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	MOVO       X8, 80(R10)
+	MOVO       X9, 96(R10)
+	MOVO       X10, 112(R10)
+	MOVO       X11, 128(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 32(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       80(SI), X10
+	PINSRQ     $0x01, 64(SI), X10
+	MOVQ       120(SI), X11
+	PINSRQ     $0x01, 48(SI), X11
+	MOVO       X8, 144(R10)
+	MOVO       X9, 160(R10)
+	MOVO       X10, 176(R10)
+	MOVO       X11, 192(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       8(SI), X8
+	PINSRQ     $0x01, (SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, 40(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 16(SI), X10
+	MOVQ       56(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	MOVO       X8, 208(R10)
+	MOVO       X9, 224(R10)
+	MOVO       X10, 240(R10)
+	MOVO       X11, 256(R10)
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       88(SI), X8
+	PINSRQ     $0x01, 96(SI), X8
+	MOVQ       40(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       64(SI), X10
+	PINSRQ     $0x01, (SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 72(SI), X9
+	MOVQ       112(SI), X10
+	PINSRQ     $0x01, 48(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 32(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       56(SI), X8
+	PINSRQ     $0x01, 24(SI), X8
+	MOVQ       104(SI), X9
+	PINSRQ     $0x01, 88(SI), X9
+	MOVQ       72(SI), X10
+	PINSRQ     $0x01, 8(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, 112(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       32(SI), X9
+	PINSRQ     $0x01, 120(SI), X9
+	MOVQ       48(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       (SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       72(SI), X8
+	PINSRQ     $0x01, 40(SI), X8
+	MOVQ       16(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 120(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       112(SI), X8
+	PINSRQ     $0x01, 88(SI), X8
+	MOVQ       48(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       8(SI), X10
+	PINSRQ     $0x01, 96(SI), X10
+	MOVQ       64(SI), X11
+	PINSRQ     $0x01, 104(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       16(SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       (SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       96(SI), X10
+	PINSRQ     $0x01, 80(SI), X10
+	MOVQ       88(SI), X11
+	PINSRQ     $0x01, 24(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       32(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       120(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       104(SI), X10
+	PINSRQ     $0x01, 40(SI), X10
+	MOVQ       112(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 8(SI), X8
+	MOVQ       112(SI), X9
+	PINSRQ     $0x01, 32(SI), X9
+	MOVQ       40(SI), X10
+	PINSRQ     $0x01, 120(SI), X10
+	MOVQ       104(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       (SI), X8
+	PINSRQ     $0x01, 48(SI), X8
+	MOVQ       72(SI), X9
+	PINSRQ     $0x01, 64(SI), X9
+	MOVQ       56(SI), X10
+	PINSRQ     $0x01, 24(SI), X10
+	MOVQ       16(SI), X11
+	PINSRQ     $0x01, 88(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       104(SI), X8
+	PINSRQ     $0x01, 56(SI), X8
+	MOVQ       96(SI), X9
+	PINSRQ     $0x01, 24(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       8(SI), X11
+	PINSRQ     $0x01, 72(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       40(SI), X8
+	PINSRQ     $0x01, 120(SI), X8
+	MOVQ       64(SI), X9
+	PINSRQ     $0x01, 16(SI), X9
+	MOVQ       (SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 80(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       48(SI), X8
+	PINSRQ     $0x01, 112(SI), X8
+	MOVQ       88(SI), X9
+	PINSRQ     $0x01, (SI), X9
+	MOVQ       120(SI), X10
+	PINSRQ     $0x01, 72(SI), X10
+	MOVQ       24(SI), X11
+	PINSRQ     $0x01, 64(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       96(SI), X8
+	PINSRQ     $0x01, 104(SI), X8
+	MOVQ       8(SI), X9
+	PINSRQ     $0x01, 80(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 56(SI), X10
+	MOVQ       32(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVQ       80(SI), X8
+	PINSRQ     $0x01, 64(SI), X8
+	MOVQ       56(SI), X9
+	PINSRQ     $0x01, 8(SI), X9
+	MOVQ       16(SI), X10
+	PINSRQ     $0x01, 32(SI), X10
+	MOVQ       48(SI), X11
+	PINSRQ     $0x01, 40(SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	MOVQ       120(SI), X8
+	PINSRQ     $0x01, 72(SI), X8
+	MOVQ       24(SI), X9
+	PINSRQ     $0x01, 104(SI), X9
+	MOVQ       88(SI), X10
+	PINSRQ     $0x01, 112(SI), X10
+	MOVQ       96(SI), X11
+	PINSRQ     $0x01, (SI), X11
+	PADDQ      X8, X0
+	PADDQ      X9, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      X10, X0
+	PADDQ      X11, X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      16(R10), X0
+	PADDQ      32(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      48(R10), X0
+	PADDQ      64(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      80(R10), X0
+	PADDQ      96(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      112(R10), X0
+	PADDQ      128(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	PADDQ      144(R10), X0
+	PADDQ      160(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      176(R10), X0
+	PADDQ      192(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X6, X8
+	PUNPCKLQDQ X6, X9
+	PUNPCKHQDQ X7, X6
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X7, X9
+	MOVO       X8, X7
+	MOVO       X2, X8
+	PUNPCKHQDQ X9, X7
+	PUNPCKLQDQ X3, X9
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X3
+	PADDQ      208(R10), X0
+	PADDQ      224(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFD     $0xb1, X6, X6
+	PSHUFD     $0xb1, X7, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	PSHUFB     X13, X2
+	PSHUFB     X13, X3
+	PADDQ      240(R10), X0
+	PADDQ      256(R10), X1
+	PADDQ      X2, X0
+	PADDQ      X3, X1
+	PXOR       X0, X6
+	PXOR       X1, X7
+	PSHUFB     X14, X6
+	PSHUFB     X14, X7
+	PADDQ      X6, X4
+	PADDQ      X7, X5
+	PXOR       X4, X2
+	PXOR       X5, X3
+	MOVOU      X2, X11
+	PADDQ      X2, X11
+	PSRLQ      $0x3f, X2
+	PXOR       X11, X2
+	MOVOU      X3, X11
+	PADDQ      X3, X11
+	PSRLQ      $0x3f, X3
+	PXOR       X11, X3
+	MOVO       X4, X8
+	MOVO       X5, X4
+	MOVO       X8, X5
+	MOVO       X2, X8
+	PUNPCKLQDQ X2, X9
+	PUNPCKHQDQ X3, X2
+	PUNPCKHQDQ X9, X2
+	PUNPCKLQDQ X3, X9
+	MOVO       X8, X3
+	MOVO       X6, X8
+	PUNPCKHQDQ X9, X3
+	PUNPCKLQDQ X7, X9
+	PUNPCKHQDQ X9, X6
+	PUNPCKLQDQ X8, X9
+	PUNPCKHQDQ X9, X7
+	MOVOU      32(AX), X10
+	MOVOU      48(AX), X11
+	PXOR       X0, X12
+	PXOR       X1, X15
+	PXOR       X2, X10
+	PXOR       X3, X11
+	PXOR       X4, X12
+	PXOR       X5, X15
+	PXOR       X6, X10
+	PXOR       X7, X11
+	MOVOU      X10, 32(AX)
+	MOVOU      X11, 48(AX)
+	LEAQ       128(SI), SI
+	SUBQ       $0x80, DI
+	JNE        loop
+	MOVOU      X12, (AX)
+	MOVOU      X15, 16(AX)
+	MOVQ       R8, (BX)
+	MOVQ       R9, 8(BX)
 	RET
+
+DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b
+DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179
+GLOBL ·iv3<>(SB), RODATA|NOPTR, $16
+
+DATA ·c40<>+0(SB)/8, $0x0201000706050403
+DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b
+GLOBL ·c40<>(SB), RODATA|NOPTR, $16
+
+DATA ·c48<>+0(SB)/8, $0x0100070605040302
+DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a
+GLOBL ·c48<>(SB), RODATA|NOPTR, $16
+
+DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908
+DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b
+GLOBL ·iv0<>(SB), RODATA|NOPTR, $16
+
+DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b
+DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1
+GLOBL ·iv1<>(SB), RODATA|NOPTR, $16
+
+DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1
+DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f
+GLOBL ·iv2<>(SB), RODATA|NOPTR, $16
diff --git a/vendor/golang.org/x/crypto/blowfish/cipher.go b/vendor/golang.org/x/crypto/blowfish/cipher.go
index 213bf204..08989568 100644
--- a/vendor/golang.org/x/crypto/blowfish/cipher.go
+++ b/vendor/golang.org/x/crypto/blowfish/cipher.go
@@ -11,7 +11,7 @@
 // Deprecated: any new system should use AES (from crypto/aes, if necessary in
 // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
-package blowfish // import "golang.org/x/crypto/blowfish"
+package blowfish
 
 // The code is a port of Bruce Schneier's C implementation.
 // See https://www.schneier.com/blowfish.html.
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
index db42e667..c709b728 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
+//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
 
 package chacha20
 
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
similarity index 89%
rename from vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
rename to vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
index 3a4287f9..bd183d9b 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package chacha20
 
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
similarity index 76%
rename from vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
rename to vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
index c672ccf6..a660b411 100644
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
@@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
@@ -36,32 +36,68 @@
 // for VPERMXOR
 #define MASK  R18
 
-DATA consts<>+0x00(SB)/8, $0x3320646e61707865
-DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
-DATA consts<>+0x10(SB)/8, $0x0000000000000001
-DATA consts<>+0x18(SB)/8, $0x0000000000000000
-DATA consts<>+0x20(SB)/8, $0x0000000000000004
-DATA consts<>+0x28(SB)/8, $0x0000000000000000
-DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
-DATA consts<>+0x38(SB)/8, $0x0203000106070405
-DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
-DATA consts<>+0x48(SB)/8, $0x0102030005060704
-DATA consts<>+0x50(SB)/8, $0x6170786561707865
-DATA consts<>+0x58(SB)/8, $0x6170786561707865
-DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x90(SB)/8, $0x0000000100000000
-DATA consts<>+0x98(SB)/8, $0x0000000300000002
-DATA consts<>+0xa0(SB)/8, $0x5566774411223300
-DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
-DATA consts<>+0xb0(SB)/8, $0x6677445522330011
-DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+DATA consts<>+0x00(SB)/4, $0x61707865
+DATA consts<>+0x04(SB)/4, $0x3320646e
+DATA consts<>+0x08(SB)/4, $0x79622d32
+DATA consts<>+0x0c(SB)/4, $0x6b206574
+DATA consts<>+0x10(SB)/4, $0x00000001
+DATA consts<>+0x14(SB)/4, $0x00000000
+DATA consts<>+0x18(SB)/4, $0x00000000
+DATA consts<>+0x1c(SB)/4, $0x00000000
+DATA consts<>+0x20(SB)/4, $0x00000004
+DATA consts<>+0x24(SB)/4, $0x00000000
+DATA consts<>+0x28(SB)/4, $0x00000000
+DATA consts<>+0x2c(SB)/4, $0x00000000
+DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
+DATA consts<>+0x34(SB)/4, $0x0a0b0809
+DATA consts<>+0x38(SB)/4, $0x06070405
+DATA consts<>+0x3c(SB)/4, $0x02030001
+DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
+DATA consts<>+0x44(SB)/4, $0x090a0b08
+DATA consts<>+0x48(SB)/4, $0x05060704
+DATA consts<>+0x4c(SB)/4, $0x01020300
+DATA consts<>+0x50(SB)/4, $0x61707865
+DATA consts<>+0x54(SB)/4, $0x61707865
+DATA consts<>+0x58(SB)/4, $0x61707865
+DATA consts<>+0x5c(SB)/4, $0x61707865
+DATA consts<>+0x60(SB)/4, $0x3320646e
+DATA consts<>+0x64(SB)/4, $0x3320646e
+DATA consts<>+0x68(SB)/4, $0x3320646e
+DATA consts<>+0x6c(SB)/4, $0x3320646e
+DATA consts<>+0x70(SB)/4, $0x79622d32
+DATA consts<>+0x74(SB)/4, $0x79622d32
+DATA consts<>+0x78(SB)/4, $0x79622d32
+DATA consts<>+0x7c(SB)/4, $0x79622d32
+DATA consts<>+0x80(SB)/4, $0x6b206574
+DATA consts<>+0x84(SB)/4, $0x6b206574
+DATA consts<>+0x88(SB)/4, $0x6b206574
+DATA consts<>+0x8c(SB)/4, $0x6b206574
+DATA consts<>+0x90(SB)/4, $0x00000000
+DATA consts<>+0x94(SB)/4, $0x00000001
+DATA consts<>+0x98(SB)/4, $0x00000002
+DATA consts<>+0x9c(SB)/4, $0x00000003
+DATA consts<>+0xa0(SB)/4, $0x11223300
+DATA consts<>+0xa4(SB)/4, $0x55667744
+DATA consts<>+0xa8(SB)/4, $0x99aabb88
+DATA consts<>+0xac(SB)/4, $0xddeeffcc
+DATA consts<>+0xb0(SB)/4, $0x22330011
+DATA consts<>+0xb4(SB)/4, $0x66774455
+DATA consts<>+0xb8(SB)/4, $0xaabb8899
+DATA consts<>+0xbc(SB)/4, $0xeeffccdd
 GLOBL consts<>(SB), RODATA, $0xc0
 
+#ifdef GOARCH_ppc64
+#define BE_XXBRW_INIT() \
+		LVSL (R0)(R0), V24 \
+		VSPLTISB $3, V25   \
+		VXOR V24, V25, V24 \
+
+#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
+#else
+#define BE_XXBRW_INIT()
+#define BE_XXBRW(vr)
+#endif
+
 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	MOVD out+0(FP), OUT
@@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	// Clear V27
 	VXOR V27, V27, V27
 
+	BE_XXBRW_INIT()
+
 	// V28
 	LXVW4X (CONSTBASE)(R11), VS60
 
@@ -299,6 +337,11 @@ loop_vsx:
 	VADDUWM V8, V18, V8
 	VADDUWM V12, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -327,6 +370,11 @@ loop_vsx:
 	VADDUWM V9, V18, V8
 	VADDUWM V13, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU  LEN, $64
 	BLT   tail_vsx
 
@@ -334,8 +382,8 @@ loop_vsx:
 	LXVW4X (INP)(R8), VS60
 	LXVW4X (INP)(R9), VS61
 	LXVW4X (INP)(R10), VS62
-	VXOR   V27, V0, V27
 
+	VXOR V27, V0, V27
 	VXOR V28, V4, V28
 	VXOR V29, V8, V29
 	VXOR V30, V12, V30
@@ -354,6 +402,11 @@ loop_vsx:
 	VADDUWM V10, V18, V8
 	VADDUWM V14, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT  tail_vsx
 
@@ -381,6 +434,11 @@ loop_vsx:
 	VADDUWM V11, V18, V8
 	VADDUWM V15, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU  LEN, $64
 	BLT   tail_vsx
 
@@ -408,9 +466,9 @@ loop_vsx:
 
 done_vsx:
 	// Increment counter by number of 64 byte blocks
-	MOVD (CNT), R14
+	MOVWZ (CNT), R14
 	ADD  BLOCKS, R14
-	MOVD R14, (CNT)
+	MOVWZ R14, (CNT)
 	RET
 
 tail_vsx:
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
index 93da7322..8cf5d811 100644
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
@@ -5,7 +5,7 @@
 // Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD and its
 // extended nonce variant XChaCha20-Poly1305, as specified in RFC 8439 and
 // draft-irtf-cfrg-xchacha-01.
-package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305"
+package chacha20poly1305
 
 import (
 	"crypto/cipher"
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
index 731d2ac6..fd5ee845 100644
--- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -1,2715 +1,9762 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
 
 //go:build gc && !purego
 
 #include "textflag.h"
-// General register allocation
-#define oup DI
-#define inp SI
-#define inl BX
-#define adp CX // free to reuse, after we hash the additional data
-#define keyp R8 // free to reuse, when we copy the key to stack
-#define itr2 R9 // general iterator
-#define itr1 CX // general iterator
-#define acc0 R10
-#define acc1 R11
-#define acc2 R12
-#define t0 R13
-#define t1 R14
-#define t2 R15
-#define t3 R8
-// Register and stack allocation for the SSE code
-#define rStore (0*16)(BP)
-#define sStore (1*16)(BP)
-#define state1Store (2*16)(BP)
-#define state2Store (3*16)(BP)
-#define tmpStore (4*16)(BP)
-#define ctr0Store (5*16)(BP)
-#define ctr1Store (6*16)(BP)
-#define ctr2Store (7*16)(BP)
-#define ctr3Store (8*16)(BP)
-#define A0 X0
-#define A1 X1
-#define A2 X2
-#define B0 X3
-#define B1 X4
-#define B2 X5
-#define C0 X6
-#define C1 X7
-#define C2 X8
-#define D0 X9
-#define D1 X10
-#define D2 X11
-#define T0 X12
-#define T1 X13
-#define T2 X14
-#define T3 X15
-#define A3 T0
-#define B3 T1
-#define C3 T2
-#define D3 T3
-// Register and stack allocation for the AVX2 code
-#define rsStoreAVX2 (0*32)(BP)
-#define state1StoreAVX2 (1*32)(BP)
-#define state2StoreAVX2 (2*32)(BP)
-#define ctr0StoreAVX2 (3*32)(BP)
-#define ctr1StoreAVX2 (4*32)(BP)
-#define ctr2StoreAVX2 (5*32)(BP)
-#define ctr3StoreAVX2 (6*32)(BP)
-#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
-#define AA0 Y0
-#define AA1 Y5
-#define AA2 Y6
-#define AA3 Y7
-#define BB0 Y14
-#define BB1 Y9
-#define BB2 Y10
-#define BB3 Y11
-#define CC0 Y12
-#define CC1 Y13
-#define CC2 Y8
-#define CC3 Y15
-#define DD0 Y4
-#define DD1 Y1
-#define DD2 Y2
-#define DD3 Y3
-#define TT0 DD3
-#define TT1 AA3
-#define TT2 BB3
-#define TT3 CC3
-// ChaCha20 constants
-DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
-DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
-DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
-DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
-DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
-// <<< 16 with PSHUFB
-DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
-DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
-DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
-// <<< 8 with PSHUFB
-DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
-DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
-DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
 
-DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
-DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
-DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
-
-DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
-DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
-DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
-// Poly1305 key clamp
-DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
-DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
-
-DATA ·sseIncMask<>+0x00(SB)/8, $0x1
-DATA ·sseIncMask<>+0x08(SB)/8, $0x0
-// To load/store the last < 16 bytes in a buffer
-DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
-DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
-DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
-DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
-DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
-DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
-DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
-DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
-DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
-DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
-
-GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
-GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
-GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
-GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
-GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
-// No PALIGNR in Go ASM yet (but VPALIGNR is present).
-#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
-#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
-#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
-#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
-#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
-#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
-#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
-#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
-#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
-#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
-#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
-#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
-#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
-#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
-#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
-#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
-#define shiftC0Right shiftC0Left
-#define shiftC1Right shiftC1Left
-#define shiftC2Right shiftC2Left
-#define shiftC3Right shiftC3Left
-#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
-#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
-#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
-#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
-
-// Some macros
-
-// ROL rotates the uint32s in register R left by N bits, using temporary T.
-#define ROL(N, R, T) \
-	MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R
-
-// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
-#else
-#define ROL16(R, T) ROL(16, R, T)
-#endif
-
-// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
-#ifdef GOAMD64_v2
-#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
-#else
-#define ROL8(R, T) ROL(8, R, T)
-#endif
-
-#define chachaQR(A, B, C, D, T) \
-	PADDD B, A; PXOR A, D; ROL16(D, T) \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
-	PADDD B, A; PXOR A, D; ROL8(D, T) \
-	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
-
-#define chachaQR_AVX2(A, B, C, D, T) \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
-	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
-	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
-
-#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
-#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
-#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
-#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
-
-#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
-#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
-#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
-
-#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
-#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
-// ----------------------------------------------------------------------------
+// func polyHashADInternal<>()
 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
-	// adp points to beginning of additional data
-	// itr2 holds ad length
-	XORQ acc0, acc0
-	XORQ acc1, acc1
-	XORQ acc2, acc2
-	CMPQ itr2, $13
-	JNE  hashADLoop
+	// Hack: Must declare #define macros inside of a function due to Avo constraints
+	// ROL rotates the uint32s in register R left by N bits, using temporary T.
+	#define ROL(N, R, T) \
+		MOVO R, T; \
+		PSLLL $(N), T; \
+		PSRLL $(32-(N)), R; \
+		PXOR T, R
 
-openFastTLSAD:
-	// Special treatment for the TLS case of 13 bytes
-	MOVQ (adp), acc0
-	MOVQ 5(adp), acc1
-	SHRQ $24, acc1
-	MOVQ $1, acc2
-	polyMul
+	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
+	#ifdef GOAMD64_v2
+		#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
+	#else
+		#define ROL8(R, T) ROL(8, R, T)
+	#endif
+
+	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
+	#ifdef GOAMD64_v2
+		#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
+	#else
+		#define ROL16(R, T) ROL(16, R, T)
+	#endif
+	XORQ  R10, R10
+	XORQ  R11, R11
+	XORQ  R12, R12
+	CMPQ  R9, $0x0d
+	JNE   hashADLoop
+	MOVQ  (CX), R10
+	MOVQ  5(CX), R11
+	SHRQ  $0x18, R11
+	MOVQ  $0x00000001, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 	RET
 
 hashADLoop:
 	// Hash in 16 byte chunks
-	CMPQ itr2, $16
-	JB   hashADTail
-	polyAdd(0(adp))
-	LEAQ (1*16)(adp), adp
-	SUBQ $16, itr2
-	polyMul
-	JMP  hashADLoop
+	CMPQ  R9, $0x10
+	JB    hashADTail
+	ADDQ  (CX), R10
+	ADCQ  8(CX), R11
+	ADCQ  $0x01, R12
+	LEAQ  16(CX), CX
+	SUBQ  $0x10, R9
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	JMP   hashADLoop
 
 hashADTail:
-	CMPQ itr2, $0
+	CMPQ R9, $0x00
 	JE   hashADDone
 
 	// Hash last < 16 byte tail
-	XORQ t0, t0
-	XORQ t1, t1
-	XORQ t2, t2
-	ADDQ itr2, adp
+	XORQ R13, R13
+	XORQ R14, R14
+	XORQ R15, R15
+	ADDQ R9, CX
 
 hashADTailLoop:
-	SHLQ $8, t0, t1
-	SHLQ $8, t0
-	MOVB -1(adp), t2
-	XORQ t2, t0
-	DECQ adp
-	DECQ itr2
-	JNE  hashADTailLoop
+	SHLQ  $0x08, R13, R14
+	SHLQ  $0x08, R13
+	MOVB  -1(CX), R15
+	XORQ  R15, R13
+	DECQ  CX
+	DECQ  R9
+	JNE   hashADTailLoop
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
-hashADTailFinish:
-	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	// Finished AD
 hashADDone:
 	RET
 
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
-TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Open(SB), $288-97
 	// For aligned stack access
 	MOVQ SP, BP
-	ADDQ $32, BP
+	ADDQ $0x20, BP
 	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
+	MOVQ dst_base+0(FP), DI
+	MOVQ key_base+24(FP), R8
+	MOVQ src_base+48(FP), SI
+	MOVQ src_len+56(FP), BX
+	MOVQ ad_base+72(FP), CX
 
 	// Check for AVX2 support
-	CMPB ·useAVX2(SB), $1
+	CMPB ·useAVX2+0(SB), $0x01
 	JE   chacha20Poly1305Open_AVX2
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  openSSE128 // About 16% faster
+	CMPQ BX, $0x80
+	JBE  openSSE128
 
 	// For long buffers, prepare the poly key first
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
-	MOVO  D0, T1
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X9, X13
 
 	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
-	MOVO D0, ctr3Store
-	MOVQ $10, itr2
+	MOVO X3, 32(BP)
+	MOVO X6, 48(BP)
+	MOVO X9, 128(BP)
+	MOVQ $0x0000000a, R9
 
 openSSEPreparePolyKey:
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	DECQ          itr2
-	JNE           openSSEPreparePolyKey
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	DECQ  R9
+	JNE   openSSEPreparePolyKey
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL 32(BP), X3
 
 	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore; MOVO B0, sStore
+	PAND ·polyClampMask<>+0(SB), X0
+	MOVO X0, (BP)
+	MOVO X3, 16(BP)
 
 	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 openSSEMainLoop:
-	CMPQ inl, $256
+	CMPQ BX, $0x00000100
 	JB   openSSEMainLoopDone
 
 	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
 
-	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
-	MOVQ $4, itr1
-	MOVQ inp, itr2
+	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
+	// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+	MOVQ $0x00000004, CX
+	MOVQ SI, R9
 
 openSSEInternalLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(itr2))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(itr2), itr2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr1
-	JGE           openSSEInternalLoop
-
-	polyAdd(0(itr2))
-	polyMul
-	LEAQ (2*8)(itr2), itr2
-
-	CMPQ itr1, $-6
-	JG   openSSEInternalLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	LEAQ  16(R9), R9
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  CX
+	JGE   openSSEInternalLoop
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	CMPQ  CX, $-6
+	JG    openSSEInternalLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
 
 	// Load - xor - store
-	MOVO  D3, tmpStore
-	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
-	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
-	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
-	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
-	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
-	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
-	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
-	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
-	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
-	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
-	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
-	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
-	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
-	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
-	LEAQ  256(inp), inp
-	LEAQ  256(oup), oup
-	SUBQ  $256, inl
+	MOVO  X15, 64(BP)
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU X0, (DI)
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU X3, 16(DI)
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU X6, 32(DI)
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X9
+	PXOR  X9, X1
+	MOVOU X1, 64(DI)
+	MOVOU 80(SI), X9
+	PXOR  X9, X4
+	MOVOU X4, 80(DI)
+	MOVOU 96(SI), X9
+	PXOR  X9, X7
+	MOVOU X7, 96(DI)
+	MOVOU 112(SI), X9
+	PXOR  X9, X10
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X9
+	PXOR  X9, X2
+	MOVOU X2, 128(DI)
+	MOVOU 144(SI), X9
+	PXOR  X9, X5
+	MOVOU X5, 144(DI)
+	MOVOU 160(SI), X9
+	PXOR  X9, X8
+	MOVOU X8, 160(DI)
+	MOVOU 176(SI), X9
+	PXOR  X9, X11
+	MOVOU X11, 176(DI)
+	MOVOU 192(SI), X9
+	PXOR  X9, X12
+	MOVOU X12, 192(DI)
+	MOVOU 208(SI), X9
+	PXOR  X9, X13
+	MOVOU X13, 208(DI)
+	MOVOU 224(SI), X9
+	PXOR  X9, X14
+	MOVOU X14, 224(DI)
+	MOVOU 240(SI), X9
+	PXOR  64(BP), X9
+	MOVOU X9, 240(DI)
+	LEAQ  256(SI), SI
+	LEAQ  256(DI), DI
+	SUBQ  $0x00000100, BX
 	JMP   openSSEMainLoop
 
 openSSEMainLoopDone:
 	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    openSSEFinalize
-	CMPQ  inl, $64
+	CMPQ  BX, $0x40
 	JBE   openSSETail64
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80
 	JBE   openSSETail128
-	CMPQ  inl, $192
+	CMPQ  BX, $0xc0
 	JBE   openSSETail192
 	JMP   openSSETail256
 
 openSSEFinalize:
 	// Hash in the PT, AAD lengths
-	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
-	polyMul
+	ADDQ  ad_len+80(FP), R10
+	ADCQ  src_len+56(FP), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
+	MOVQ    R10, R13
+	MOVQ    R11, R14
+	MOVQ    R12, R15
+	SUBQ    $-5, R10
+	SBBQ    $-1, R11
+	SBBQ    $0x03, R12
+	CMOVQCS R13, R10
+	CMOVQCS R14, R11
+	CMOVQCS R15, R12
 
 	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
+	ADDQ 16(BP), R10
+	ADCQ 24(BP), R11
 
 	// Finally, constant time compare to the tag at the end of the message
 	XORQ    AX, AX
-	MOVQ    $1, DX
-	XORQ    (0*8)(inp), acc0
-	XORQ    (1*8)(inp), acc1
-	ORQ     acc1, acc0
+	MOVQ    $0x00000001, DX
+	XORQ    (SI), R10
+	XORQ    8(SI), R11
+	ORQ     R11, R10
 	CMOVQEQ DX, AX
 
 	// Return true iff tags are equal
 	MOVB AX, ret+96(FP)
 	RET
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 129 bytes
 openSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X3, X13
+	MOVO  X6, X14
+	MOVO  X10, X15
+	MOVQ  $0x0000000a, R9
 
 openSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           openSSE128InnerCipherLoop
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  R9
+	JNE   openSSE128InnerCipherLoop
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL X13, X3
+	PADDL X13, X4
+	PADDL X13, X5
+	PADDL X14, X7
+	PADDL X14, X8
+	PADDL X15, X10
+	PADDL ·sseIncMask<>+0(SB), X15
+	PADDL X15, X11
 
 	// Clamp and store the key
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore; MOVOU B0, sStore
+	PAND  ·polyClampMask<>+0(SB), X0
+	MOVOU X0, (BP)
+	MOVOU X3, 16(BP)
 
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 openSSE128Open:
-	CMPQ inl, $16
+	CMPQ BX, $0x10
 	JB   openSSETail16
-	SUBQ $16, inl
+	SUBQ $0x10, BX
 
 	// Load for hashing
-	polyAdd(0(inp))
+	ADDQ (SI), R10
+	ADCQ 8(SI), R11
+	ADCQ $0x01, R12
 
 	// Load for decryption
-	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
-	polyMul
+	MOVOU (SI), X12
+	PXOR  X12, X1
+	MOVOU X1, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
+	MOVO X4, X1
+	MOVO X7, X4
+	MOVO X10, X7
+	MOVO X2, X10
+	MOVO X5, X2
+	MOVO X8, X5
+	MOVO X11, X8
 	JMP  openSSE128Open
 
 openSSETail16:
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    openSSEFinalize
 
 	// We can safely load the CT from the end, because it is padded with the MAC
-	MOVQ   inl, itr2
-	SHLQ   $4, itr2
-	LEAQ   ·andMask<>(SB), t0
-	MOVOU  (inp), T0
-	ADDQ   inl, inp
-	PAND   -16(t0)(itr2*1), T0
-	MOVO   T0, 0+tmpStore
-	MOVQ   T0, t0
-	MOVQ   8+tmpStore, t1
-	PXOR   A1, T0
+	MOVQ  BX, R9
+	SHLQ  $0x04, R9
+	LEAQ  ·andMask<>+0(SB), R13
+	MOVOU (SI), X12
+	ADDQ  BX, SI
+	PAND  -16(R13)(R9*1), X12
+	MOVO  X12, 64(BP)
+	MOVQ  X12, R13
+	MOVQ  72(BP), R14
+	PXOR  X1, X12
 
 	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
 openSSETail16Store:
-	MOVQ T0, t3
-	MOVB t3, (oup)
-	PSRLDQ $1, T0
-	INCQ   oup
-	DECQ   inl
+	MOVQ   X12, R8
+	MOVB   R8, (DI)
+	PSRLDQ $0x01, X12
+	INCQ   DI
+	DECQ   BX
 	JNE    openSSETail16Store
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
 	JMP    openSSEFinalize
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of ciphertext
 openSSETail64:
-	// Need to decrypt up to 64 bytes - prepare single block
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	CMPQ itr1, $16
-	JB   openSSETail64LoopB
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	XORQ  R9, R9
+	MOVQ  BX, CX
+	CMPQ  CX, $0x10
+	JB    openSSETail64LoopB
 
 openSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	SUBQ $16, itr1
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	SUBQ  $0x10, CX
 
 openSSETail64LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	chachaQR(A0, B0, C0, D0, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-
-	CMPQ itr1, $16
-	JAE  openSSETail64LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail64LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	CMPQ  CX, $0x10
+	JAE   openSSETail64LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSETail64LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL 32(BP), X3
+	PADDL 48(BP), X6
+	PADDL 80(BP), X9
 
 openSSETail64DecLoop:
-	CMPQ  inl, $16
+	CMPQ  BX, $0x10
 	JB    openSSETail64DecLoopDone
-	SUBQ  $16, inl
-	MOVOU (inp), T0
-	PXOR  T0, A0
-	MOVOU A0, (oup)
-	LEAQ  16(inp), inp
-	LEAQ  16(oup), oup
-	MOVO  B0, A0
-	MOVO  C0, B0
-	MOVO  D0, C0
+	SUBQ  $0x10, BX
+	MOVOU (SI), X12
+	PXOR  X12, X0
+	MOVOU X0, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
+	MOVO  X3, X0
+	MOVO  X6, X3
+	MOVO  X9, X6
 	JMP   openSSETail64DecLoop
 
 openSSETail64DecLoopDone:
-	MOVO A0, A1
+	MOVO X0, X1
 	JMP  openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 openSSETail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
-	XORQ itr2, itr2
-	MOVQ inl, itr1
-	ANDQ $-16, itr1
+	MOVO  ·chacha20Constants<>+0(SB), X1
+	MOVO  32(BP), X4
+	MOVO  48(BP), X7
+	MOVO  128(BP), X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 80(BP)
+	MOVO  X1, X0
+	MOVO  X4, X3
+	MOVO  X7, X6
+	MOVO  X10, X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 96(BP)
+	XORQ  R9, R9
+	MOVQ  BX, CX
+	ANDQ  $-16, CX
 
 openSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSETail128LoopB:
-	ADDQ          $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	CMPQ  R9, CX
+	JB    openSSETail128LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSETail128LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 96(BP), X9
+	PADDL 80(BP), X10
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, (DI)
+	MOVOU X4, 16(DI)
+	MOVOU X7, 32(DI)
+	MOVOU X10, 48(DI)
+	SUBQ  $0x40, BX
+	LEAQ  64(SI), SI
+	LEAQ  64(DI), DI
+	JMP   openSSETail64DecLoop
 
-	CMPQ itr2, itr1
-	JB   openSSETail128LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr1Store, D0; PADDL ctr0Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-	LEAQ 64(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of ciphertext
 openSSETail192:
-	// Need to decrypt up to 192 bytes - prepare three blocks
-	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
-	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
-
-	MOVQ    inl, itr1
-	MOVQ    $160, itr2
-	CMPQ    itr1, $160
-	CMOVQGT itr2, itr1
-	ANDQ    $-16, itr1
-	XORQ    itr2, itr2
+	MOVO    ·chacha20Constants<>+0(SB), X2
+	MOVO    32(BP), X5
+	MOVO    48(BP), X8
+	MOVO    128(BP), X11
+	PADDL   ·sseIncMask<>+0(SB), X11
+	MOVO    X11, 80(BP)
+	MOVO    X2, X1
+	MOVO    X5, X4
+	MOVO    X8, X7
+	MOVO    X11, X10
+	PADDL   ·sseIncMask<>+0(SB), X10
+	MOVO    X10, 96(BP)
+	MOVO    X1, X0
+	MOVO    X4, X3
+	MOVO    X7, X6
+	MOVO    X10, X9
+	PADDL   ·sseIncMask<>+0(SB), X9
+	MOVO    X9, 112(BP)
+	MOVQ    BX, CX
+	MOVQ    $0x000000a0, R9
+	CMPQ    CX, $0xa0
+	CMOVQGT R9, CX
+	ANDQ    $-16, CX
+	XORQ    R9, R9
 
 openSSLTail192LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMul
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSLTail192LoopB:
-	ADDQ         $16, itr2
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	CMPQ itr2, itr1
-	JB   openSSLTail192LoopA
-
-	CMPQ itr2, $160
-	JNE  openSSLTail192LoopB
-
-	CMPQ inl, $176
-	JB   openSSLTail192Store
-
-	polyAdd(160(inp))
-	polyMul
-
-	CMPQ inl, $192
-	JB   openSSLTail192Store
-
-	polyAdd(176(inp))
-	polyMul
+	ADDQ  $0x10, R9
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	CMPQ  R9, CX
+	JB    openSSLTail192LoopA
+	CMPQ  R9, $0xa0
+	JNE   openSSLTail192LoopB
+	CMPQ  BX, $0xb0
+	JB    openSSLTail192Store
+	ADDQ  160(SI), R10
+	ADCQ  168(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	CMPQ  BX, $0xc0
+	JB    openSSLTail192Store
+	ADDQ  176(SI), R10
+	ADCQ  184(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openSSLTail192Store:
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 32(BP), X5
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 48(BP), X8
+	PADDL 112(BP), X9
+	PADDL 96(BP), X10
+	PADDL 80(BP), X11
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X2
+	PXOR  X13, X5
+	PXOR  X14, X8
+	PXOR  X15, X11
+	MOVOU X2, (DI)
+	MOVOU X5, 16(DI)
+	MOVOU X8, 32(DI)
+	MOVOU X11, 48(DI)
+	MOVOU 64(SI), X12
+	MOVOU 80(SI), X13
+	MOVOU 96(SI), X14
+	MOVOU 112(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	SUBQ  $0x80, BX
+	LEAQ  128(SI), SI
+	LEAQ  128(DI), DI
+	JMP   openSSETail64DecLoop
 
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
-	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
-
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-	LEAQ 128(oup), oup
-	JMP  openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
 openSSETail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	XORQ itr2, itr2
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
+	XORQ R9, R9
 
 openSSETail256Loop:
-	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
-	polyAdd(0(inp)(itr2*1))
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulStage3
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	ADDQ          $2*8, itr2
-	CMPQ          itr2, $160
-	JB            openSSETail256Loop
-	MOVQ          inl, itr1
-	ANDQ          $-16, itr1
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	ADDQ  $0x10, R9
+	CMPQ  R9, $0xa0
+	JB    openSSETail256Loop
+	MOVQ  BX, CX
+	ANDQ  $-16, CX
 
 openSSETail256HashLoop:
-	polyAdd(0(inp)(itr2*1))
-	polyMul
-	ADDQ $2*8, itr2
-	CMPQ itr2, itr1
-	JB   openSSETail256HashLoop
+	ADDQ  (SI)(R9*1), R10
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  $0x10, R9
+	CMPQ  R9, CX
+	JB    openSSETail256HashLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
+	MOVO  X15, 64(BP)
 
 	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	LEAQ  192(inp), inp
-	LEAQ  192(oup), oup
-	SUBQ  $192, inl
-	MOVO  A3, A0
-	MOVO  B3, B0
-	MOVO  C3, C0
-	MOVO  tmpStore, D0
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X0
+	MOVOU 144(SI), X3
+	MOVOU 160(SI), X6
+	MOVOU 176(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 128(DI)
+	MOVOU X5, 144(DI)
+	MOVOU X8, 160(DI)
+	MOVOU X11, 176(DI)
+	LEAQ  192(SI), SI
+	LEAQ  192(DI), DI
+	SUBQ  $0xc0, BX
+	MOVO  X12, X0
+	MOVO  X13, X3
+	MOVO  X14, X6
+	MOVO  64(BP), X9
+	JMP   openSSETail64DecLoop
 
-	JMP openSSETail64DecLoop
-
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
 chacha20Poly1305Open_AVX2:
 	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x70
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0xc2
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x30
+	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $192
+	CMPQ BX, $0xc0
 	JBE  openAVX2192
-	CMPQ inl, $320
+	CMPQ BX, $0x00000140
 	JBE  openAVX2320
 
 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, state2StoreAVX2
-	VMOVDQA DD0, ctr3StoreAVX2
-	MOVQ    $10, itr2
+	VMOVDQA Y14, 32(BP)
+	VMOVDQA Y12, 64(BP)
+	VMOVDQA Y4, 192(BP)
+	MOVQ    $0x0000000a, R9
 
 openAVX2PreparePolyKey:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr2
-	JNE      openAVX2PreparePolyKey
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0
-	VPADDD state1StoreAVX2, BB0, BB0
-	VPADDD state2StoreAVX2, CC0, CC0
-	VPADDD ctr3StoreAVX2, DD0, DD0
-
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	DECQ       R9
+	JNE        openAVX2PreparePolyKey
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     192(BP), Y4, Y4
+	VPERM2I128 $0x02, Y0, Y14, Y3
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP)
 
 	// Stream for the first 64 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
 
 	// Hash AD + first 64 bytes
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	XORQ CX, CX
 
 openAVX2InitialHash64:
-	polyAdd(0(inp)(itr1*1))
-	polyMulAVX2
-	ADDQ $16, itr1
-	CMPQ itr1, $64
-	JNE  openAVX2InitialHash64
+	ADDQ  (SI)(CX*1), R10
+	ADCQ  8(SI)(CX*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  $0x10, CX
+	CMPQ  CX, $0x40
+	JNE   openAVX2InitialHash64
 
 	// Decrypt the first 64 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), BB0, BB0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU BB0, (1*32)(oup)
-	LEAQ    (2*32)(inp), inp
-	LEAQ    (2*32)(oup), oup
-	SUBQ    $64, inl
+	VPXOR   (SI), Y0, Y0
+	VPXOR   32(SI), Y14, Y14
+	VMOVDQU Y0, (DI)
+	VMOVDQU Y14, 32(DI)
+	LEAQ    64(SI), SI
+	LEAQ    64(DI), DI
+	SUBQ    $0x40, BX
 
 openAVX2MainLoop:
-	CMPQ inl, $512
+	CMPQ BX, $0x00000200
 	JB   openAVX2MainLoopDone
 
 	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	XORQ    CX, CX
 
 openAVX2InternalLoop:
-	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
-	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
-	polyAdd(0*8(inp)(itr1*1))
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(inp)(itr1*1))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(inp)(itr1*1))
-	LEAQ     (6*8)(itr1), itr1
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	CMPQ     itr1, $480
+	ADDQ     (SI)(CX*1), R10
+	ADCQ     8(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	ADDQ     16(SI)(CX*1), R10
+	ADCQ     24(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     32(SI)(CX*1), R10
+	ADCQ     40(SI)(CX*1), R11
+	ADCQ     $0x01, R12
+	LEAQ     48(CX), CX
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	CMPQ     CX, $0x000001e0
 	JNE      openAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
+	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD   32(BP), Y14, Y14
+	VPADDD   32(BP), Y9, Y9
+	VPADDD   32(BP), Y10, Y10
+	VPADDD   32(BP), Y11, Y11
+	VPADDD   64(BP), Y12, Y12
+	VPADDD   64(BP), Y13, Y13
+	VPADDD   64(BP), Y8, Y8
+	VPADDD   64(BP), Y15, Y15
+	VPADDD   96(BP), Y4, Y4
+	VPADDD   128(BP), Y1, Y1
+	VPADDD   160(BP), Y2, Y2
+	VPADDD   192(BP), Y3, Y3
+	VMOVDQA  Y15, 224(BP)
 
 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(480(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+	ADDQ       480(SI), R10
+	ADCQ       488(SI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
 
 	// and here
-	polyAdd(496(inp))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	LEAQ       (32*16)(oup), oup
-	SUBQ       $(32*16), inl
+	ADDQ       496(SI), R10
+	ADCQ       504(SI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	VPXOR      384(SI), Y0, Y0
+	VPXOR      416(SI), Y14, Y14
+	VPXOR      448(SI), Y12, Y12
+	VPXOR      480(SI), Y4, Y4
+	VMOVDQU    Y0, 384(DI)
+	VMOVDQU    Y14, 416(DI)
+	VMOVDQU    Y12, 448(DI)
+	VMOVDQU    Y4, 480(DI)
+	LEAQ       512(SI), SI
+	LEAQ       512(DI), DI
+	SUBQ       $0x00000200, BX
 	JMP        openAVX2MainLoop
 
 openAVX2MainLoopDone:
 	// Handle the various tail sizes efficiently
-	TESTQ inl, inl
+	TESTQ BX, BX // BX replaces the former inl alias (bytes still to process); zero goes straight to openSSEFinalize
 	JE    openSSEFinalize
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80 // 0x80 = 128: at most 128 bytes left
 	JBE   openAVX2Tail128
-	CMPQ  inl, $256
+	CMPQ  BX, $0x00000100 // 0x100 = 256: at most 256 bytes left
 	JBE   openAVX2Tail256
-	CMPQ  inl, $384
+	CMPQ  BX, $0x00000180 // 0x180 = 384: at most 384 bytes left; anything larger falls through to openAVX2Tail512
 	JBE   openAVX2Tail384
 	JMP   openAVX2Tail512
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
 openAVX2192:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5 // short path for up to 192 bytes of ciphertext plus 64 bytes of poly key: Y5/Y9/Y13/Y1 become a second copy of the Y0/Y14/Y12/Y4 state
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // second copy's last row is Y4 advanced by avx2IncMask
+	VMOVDQA Y0, Y6 // Y6/Y10/Y8/Y2 keep the starting rows; they are added back after the rounds
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VMOVDQA Y4, Y2
+	VMOVDQA Y1, Y15 // Y15 (formerly TT3) keeps the second copy's starting last row
+	MOVQ    $0x0000000a, R9 // R9 (formerly itr2) = 10 passes through openAVX2192InnerCipherLoop
 
 openAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
+	VPADDD     Y14, Y0, Y0 // expansion of the former chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0): rows Y0/Y14/Y12/Y4, scratch Y3
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3 // shift left 12, shift right 20, XOR: rotates each 32-bit lane left by 12
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3 // shift left 7, shift right 25, XOR: rotates each 32-bit lane left by 7
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5 // same quarter round on the second copy: rows Y5/Y9/Y13/Y1 (formerly AA1, BB1, CC1, DD1)
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate the rows by 4/8/12 bytes inside each 128-bit lane before the second pair of quarter rounds
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y14, Y0, Y0 // second quarter round on rows Y0/Y14/Y12/Y4
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5 // second quarter round on rows Y5/Y9/Y13/Y1
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x0c, Y14, Y14, Y14 // undo the row rotation (12/8/4 bytes)
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	DECQ       R9 // R9 (formerly itr2) counts the remaining loop passes
 	JNE        openAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPADDD     Y6, Y0, Y0 // add the starting rows saved in Y6/Y10/Y8 back into both copies
+	VPADDD     Y6, Y5, Y5
+	VPADDD     Y10, Y14, Y14
+	VPADDD     Y10, Y9, Y9
+	VPADDD     Y8, Y12, Y12
+	VPADDD     Y8, Y13, Y13
+	VPADDD     Y2, Y4, Y4 // each copy gets its own saved last row: Y2 for the first, Y15 for the second
+	VPADDD     Y15, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y3 // Y3 = low 128-bit lanes of Y0 and Y14
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP) // (BP) is the stack slot formerly named rsStoreAVX2
 
 	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x13, Y0, Y14, Y0 // results land in Y0, Y14, Y12, Y4, Y5, Y9, in the order written here
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
 
 openAVX2ShortOpen:
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9 // R9 (formerly itr2) is loaded with the ad_len argument before the polyHashADInternal call
 	CALL polyHashADInternal<>(SB)
 
 openAVX2ShortOpenLoop:
-	CMPQ inl, $32
+	CMPQ BX, $0x20 // BX (formerly inl): fewer than 32 bytes left leaves the loop via openAVX2ShortTail32
 	JB   openAVX2ShortTail32
-	SUBQ $32, inl
+	SUBQ $0x20, BX // consume 32 bytes per pass
 
 	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
-	polyAdd(2*8(inp))
-	polyMulAVX2
+	ADDQ  (SI), R10 // expansion of the former polyAdd(0*8(inp)): add 16 input bytes into R10:R11, and 1 plus carry into R12
+	ADCQ  8(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX // expansion of the former polyMulAVX2: multiply R10:R11:R12 by the quadwords at (BP) and 8(BP)
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10 // fold the product held in R13:R14:R15:R8 back into R10:R11:R12
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12 // R12 keeps only the low two bits of R15
+	MOVQ  R15, R13
+	ANDQ  $-4, R13 // R13 = R15 with its low two bits cleared
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  16(SI), R10 // expansion of the former polyAdd(2*8(inp)): the next 16 input bytes
+	ADCQ  24(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX // second polyMulAVX2 expansion, identical to the one above
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
+	VPXOR   (SI), Y0, Y0 // XOR 32 input bytes at SI (formerly inp) with Y0
+	VMOVDQU Y0, (DI) // write the result to DI (formerly oup)
+	LEAQ    32(SI), SI
+	LEAQ    32(DI), DI
 
 	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
+	VMOVDQA Y14, Y0 // each register takes the next one's value in the chain Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <- Y13 <- Y1 <- Y6 <- Y10
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y4
+	VMOVDQA Y9, Y5
+	VMOVDQA Y13, Y9
+	VMOVDQA Y1, Y13
+	VMOVDQA Y6, Y1
+	VMOVDQA Y10, Y6
 	JMP     openAVX2ShortOpenLoop
 
 openAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	CMPQ    BX, $0x10 // fewer than 16 bytes left (BX, formerly inl) skips to openAVX2ShortDone via the JB below
+	VMOVDQA X0, X1 // copy X0 into X1 (formerly A0 into A1)
 	JB      openAVX2ShortDone
-
-	SUBQ $16, inl
+	SUBQ    $0x10, BX // consume one 16-byte chunk
 
 	// Load for hashing
-	polyAdd(0*8(inp))
-	polyMulAVX2
+	ADDQ  (SI), R10 // expansion of the former polyAdd(0*8(inp)): add 16 input bytes into R10:R11, and 1 plus carry into R12
+	ADCQ  8(SI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX // expansion of the former polyMulAVX2: multiply R10:R11:R12 by the quadwords at (BP) and 8(BP)
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10 // fold the product held in R13:R14:R15:R8 back into R10:R11:R12
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	VPXOR      (SI), X0, X12 // XOR 16 input bytes with X0 into X12 (formerly T0)
+	VMOVDQU    X12, (DI)
+	LEAQ       16(SI), SI
+	LEAQ       16(DI), DI
+	VPERM2I128 $0x11, Y0, Y0, Y0 // selector 0x11 with identical sources: the high 128-bit lane of Y0 is copied into both lanes
+	VMOVDQA    X0, X1 // refresh X1 from the new low lane of Y0
 
 openAVX2ShortDone:
 	VZEROUPPER
 	JMP openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
 openAVX2320:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5 // short path for up to 320 bytes of ciphertext plus 64 bytes of poly key: Y5/Y9/Y13/Y1 become a second copy of the Y0/Y14/Y12/Y4 state
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // second copy's last row is Y4 advanced by avx2IncMask
+	VMOVDQA Y0, Y6 // Y6/Y10/Y8/Y2 become a third copy
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2 // third copy's last row is Y1 advanced by avx2IncMask again
+	VMOVDQA Y14, Y7 // Y7, Y11, Y15 (formerly TT1, TT2, TT3) keep starting rows Y14, Y12, Y4 for the add-back after the rounds
+	VMOVDQA Y12, Y11
+	VMOVDQA Y4, Y15
+	MOVQ    $0x0000000a, R9 // R9 (formerly itr2) = 10 passes through openAVX2320InnerCipherLoop
 
 openAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
+	VPADDD   Y14, Y0, Y0 // expansion of the former chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0): rows Y0/Y14/Y12/Y4, scratch Y3
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3 // shift left 12, shift right 20, XOR: rotates each 32-bit lane left by 12
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3 // shift left 7, shift right 25, XOR: rotates each 32-bit lane left by 7
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5 // same quarter round on the second copy: rows Y5/Y9/Y13/Y1
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6 // same quarter round on the third copy: rows Y6/Y10/Y8/Y2
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x04, Y14, Y14, Y14 // rotate the rows by 4/8/12 bytes inside each 128-bit lane before the second set of quarter rounds
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPADDD   Y14, Y0, Y0 // second quarter round on rows Y0/Y14/Y12/Y4
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5 // second quarter round on rows Y5/Y9/Y13/Y1
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6 // second quarter round on rows Y6/Y10/Y8/Y2
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x0c, Y14, Y14, Y14 // undo the row rotation (12/8/4 bytes)
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	DECQ     R9 // R9 (formerly itr2) counts the remaining loop passes
 	JNE      openAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y3 // Y3 (formerly TT0) temporarily holds the constants row for the add-back
+	VPADDD   Y3, Y0, Y0
+	VPADDD   Y3, Y5, Y5
+	VPADDD   Y3, Y6, Y6
+	VPADDD   Y7, Y14, Y14 // Y7 and Y11 (formerly TT1, TT2) hold the rows saved before the loop
+	VPADDD   Y7, Y9, Y9
+	VPADDD   Y7, Y10, Y10
+	VPADDD   Y11, Y12, Y12
+	VPADDD   Y11, Y13, Y13
+	VPADDD   Y11, Y8, Y8
+	VMOVDQA  ·avx2IncMask<>+0(SB), Y3 // Y3 now holds avx2IncMask
+	VPADDD   Y15, Y4, Y4 // Y15 (formerly TT3) is the saved last row; it is stepped by avx2IncMask before each further copy uses it
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y1, Y1
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y2, Y2
 
 	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
+	VPERM2I128 $0x02, Y0, Y14, Y3 // Y3 = low 128-bit lanes of Y0 and Y14
+	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA    Y3, (BP) // (BP) is the stack slot formerly named rsStoreAVX2
 
 	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
+	VPERM2I128 $0x13, Y0, Y14, Y0 // results land in Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10, in the order written here
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
+	VPERM2I128 $0x02, Y6, Y10, Y13
+	VPERM2I128 $0x02, Y8, Y2, Y1
+	VPERM2I128 $0x13, Y6, Y10, Y6
+	VPERM2I128 $0x13, Y8, Y2, Y10
 	JMP        openAVX2ShortOpen
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 openAVX2Tail128:
 	// Need to decrypt up to 128 bytes - prepare two blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
-	VMOVDQA DD1, DD0
-
-	XORQ  itr2, itr2
-	MOVQ  inl, itr1
-	ANDQ  $-16, itr1
-	TESTQ itr1, itr1
-	JE    openAVX2Tail128LoopB
+	VMOVDQA ·chacha20Constants<>+0(SB), Y5 // state rows are rebuilt in Y5/Y9/Y13/Y1 (formerly AA1/BB1/CC1/DD1)
+	VMOVDQA 32(BP), Y9 // 32(BP) is the slot formerly named state1StoreAVX2
+	VMOVDQA 64(BP), Y13 // 64(BP) is the slot formerly named state2StoreAVX2
+	VMOVDQA 192(BP), Y1 // 192(BP) is the slot formerly named ctr3StoreAVX2
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y1 // advance that row by avx2IncMask
+	VMOVDQA Y1, Y4 // Y4 (formerly DD0) keeps the starting row for the add-back after the rounds
+	XORQ    R9, R9 // R9 (formerly itr2) starts at 0 and advances by 16 per pass of openAVX2Tail128LoopB
+	MOVQ    BX, CX
+	ANDQ    $-16, CX // CX (formerly itr1) = BX rounded down to a multiple of 16
+	TESTQ   CX, CX
+	JE      openAVX2Tail128LoopB // no whole 16-byte chunk: enter the loop at LoopB, skipping the hashing in LoopA
 
 openAVX2Tail128LoopA:
-	// Perform ChaCha rounds, while hashing the remaining input
-	polyAdd(0(inp)(itr2*1))
-	polyMulAVX2
+	ADDQ  (SI)(R9*1), R10 // expansion of the former polyAdd(0(inp)(itr2*1)): add the 16 input bytes at SI+R9 into R10:R11, and 1 plus carry into R12
+	ADCQ  8(SI)(R9*1), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX // expansion of the former polyMulAVX2: multiply R10:R11:R12 by the quadwords at (BP) and 8(BP)
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10 // fold the product held in R13:R14:R15:R8 back into R10:R11:R12
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12 // R12 keeps only the low two bits of R15
+	MOVQ  R15, R13
+	ANDQ  $-4, R13 // R13 = R15 with its low two bits cleared
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 openAVX2Tail128LoopB:
-	ADDQ     $16, itr2
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
-	JB       openAVX2Tail128LoopA
-	CMPQ     itr2, $160
-	JNE      openAVX2Tail128LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC1, CC1
-	VPADDD     DD0, DD1, DD1
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	ADDQ       $0x10, R9 // R9 (formerly itr2) advances by 16 on every pass
+	VPADDD     Y9, Y5, Y5 // expansion of the former chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0): rows Y5/Y9/Y13/Y1, scratch Y3
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3 // shift left 12, shift right 20, XOR: rotates each 32-bit lane left by 12
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3 // shift left 7, shift right 25, XOR: rotates each 32-bit lane left by 7
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x04, Y9, Y9, Y9 // rotate the rows by 4/8/12 bytes inside each 128-bit lane before the second quarter round
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y9, Y5, Y5 // second quarter round on the rotated rows
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPALIGNR   $0x0c, Y9, Y9, Y9 // undo the row rotation (12/8/4 bytes)
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	CMPQ       R9, CX // while R9 is below CX (formerly itr1), go back through LoopA to hash another 16 bytes
+	JB         openAVX2Tail128LoopA
+	CMPQ       R9, $0xa0 // 0xa0 = 160 = 10 passes of 16; keep looping through LoopB only until then
+	JNE        openAVX2Tail128LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5 // add the starting rows back: constants, 32(BP), 64(BP), and the row saved in Y4
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y4, Y1, Y1
+	VPERM2I128 $0x02, Y5, Y9, Y0 // regroup the 128-bit lanes into Y0, Y14, Y12, Y4 (formerly AA0, BB0, CC0, DD0)
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
 
 openAVX2TailLoop:
-	CMPQ inl, $32
+	CMPQ BX, $0x20
 	JB   openAVX2Tail
-	SUBQ $32, inl
+	SUBQ $0x20, BX
 
 	// Load for decryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
-	LEAQ    (1*32)(oup), oup
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
+	VPXOR   (SI), Y0, Y0
+	VMOVDQU Y0, (DI)
+	LEAQ    32(SI), SI
+	LEAQ    32(DI), DI
+	VMOVDQA Y14, Y0
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
 	JMP     openAVX2TailLoop
 
 openAVX2Tail:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	CMPQ    BX, $0x10
+	VMOVDQA X0, X1
 	JB      openAVX2TailDone
-	SUBQ    $16, inl
+	SUBQ    $0x10, BX
 
 	// Load for decryption
-	VPXOR      (inp), A0, T0
-	VMOVDQU    T0, (oup)
-	LEAQ       (1*16)(inp), inp
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	VPXOR      (SI), X0, X12
+	VMOVDQU    X12, (DI)
+	LEAQ       16(SI), SI
+	LEAQ       16(DI), DI
+	VPERM2I128 $0x11, Y0, Y0, Y0
+	VMOVDQA    X0, X1
 
 openAVX2TailDone:
 	VZEROUPPER
 	JMP openSSETail16
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
 openAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare four blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
 
 	// Compute the number of iterations that will hash data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $128, itr1
-	SHRQ    $4, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
+	MOVQ    BX, 224(BP)
+	MOVQ    BX, CX
+	SUBQ    $0x80, CX
+	SHRQ    $0x04, CX
+	MOVQ    $0x0000000a, R9
+	CMPQ    CX, $0x0a
+	CMOVQGT R9, CX
+	MOVQ    SI, BX
+	XORQ    R9, R9
 
 openAVX2Tail256LoopA:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
+	ADDQ  (BX), R10
+	ADCQ  8(BX), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(BX), BX
 
-	// Perform ChaCha rounds, while hashing the remaining input
 openAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	CMPQ     itr2, itr1
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	INCQ     R9
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	CMPQ     R9, CX
 	JB       openAVX2Tail256LoopA
+	CMPQ     R9, $0x0a
+	JNE      openAVX2Tail256LoopB
+	MOVQ     BX, R9
+	SUBQ     SI, BX
+	MOVQ     BX, CX
+	MOVQ     224(BP), BX
 
-	CMPQ itr2, $10
-	JNE  openAVX2Tail256LoopB
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
-
-	// Hash the remainder of data (if any)
 openAVX2Tail256Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail256HashEnd
-	polyAdd (0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail256Hash
+	ADDQ  $0x10, CX
+	CMPQ  CX, BX
+	JGT   openAVX2Tail256HashEnd
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	JMP   openAVX2Tail256Hash
 
-// Store 128 bytes safely, then go to store loop
 openAVX2Tail256HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-
-	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
-	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
-	LEAQ    (4*32)(inp), inp
-	LEAQ    (4*32)(oup), oup
-	SUBQ    $4*32, inl
-
-	JMP openAVX2TailLoop
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
-openAVX2Tail384:
-	// Need to decrypt up to 384 bytes - prepare six blocks
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, ctr0StoreAVX2
-	VMOVDQA DD1, ctr1StoreAVX2
-	VMOVDQA DD2, ctr2StoreAVX2
-
-	// Compute the number of iterations that will hash two blocks of data
-	MOVQ    inl, tmpStoreAVX2
-	MOVQ    inl, itr1
-	SUBQ    $256, itr1
-	SHRQ    $4, itr1
-	ADDQ    $6, itr1
-	MOVQ    $10, itr2
-	CMPQ    itr1, $10
-	CMOVQGT itr2, itr1
-	MOVQ    inp, inl
-	XORQ    itr2, itr2
-
-	// Perform ChaCha rounds, while hashing the remaining input
-openAVX2Tail384LoopB:
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ 16(inl), inl
-
-openAVX2Tail384LoopA:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	polyAdd(0(inl))
-	polyMulAVX2
-	LEAQ     16(inl), inl
-	INCQ     itr2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-
-	CMPQ itr2, itr1
-	JB   openAVX2Tail384LoopB
-
-	CMPQ itr2, $10
-	JNE  openAVX2Tail384LoopA
-
-	MOVQ inl, itr2
-	SUBQ inp, inl
-	MOVQ inl, itr1
-	MOVQ tmpStoreAVX2, inl
-
-openAVX2Tail384Hash:
-	ADDQ $16, itr1
-	CMPQ itr1, inl
-	JGT  openAVX2Tail384HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ 16(itr2), itr2
-	JMP  openAVX2Tail384Hash
-
-// Store 256 bytes safely, then go to store loop
-openAVX2Tail384HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	LEAQ       (8*32)(inp), inp
-	LEAQ       (8*32)(oup), oup
-	SUBQ       $8*32, inl
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y6
+	VPERM2I128 $0x02, Y12, Y4, Y10
+	VPERM2I128 $0x13, Y0, Y14, Y8
+	VPERM2I128 $0x13, Y12, Y4, Y2
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      (SI), Y6, Y6
+	VPXOR      32(SI), Y10, Y10
+	VPXOR      64(SI), Y8, Y8
+	VPXOR      96(SI), Y2, Y2
+	VMOVDQU    Y6, (DI)
+	VMOVDQU    Y10, 32(DI)
+	VMOVDQU    Y8, 64(DI)
+	VMOVDQU    Y2, 96(DI)
+	LEAQ       128(SI), SI
+	LEAQ       128(DI), DI
+	SUBQ       $0x80, BX
+	JMP        openAVX2TailLoop
+
+openAVX2Tail384:
+	// Need to decrypt up to 384 bytes - prepare six blocks
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+
+	// Compute the number of iterations that will hash two blocks of data
+	MOVQ    BX, 224(BP)
+	MOVQ    BX, CX
+	SUBQ    $0x00000100, CX
+	SHRQ    $0x04, CX
+	ADDQ    $0x06, CX
+	MOVQ    $0x0000000a, R9
+	CMPQ    CX, $0x0a
+	CMOVQGT R9, CX
+	MOVQ    SI, BX
+	XORQ    R9, R9
+
+openAVX2Tail384LoopB:
+	ADDQ  (BX), R10
+	ADCQ  8(BX), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(BX), BX
+
+openAVX2Tail384LoopA:
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	ADDQ     (BX), R10
+	ADCQ     8(BX), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	LEAQ     16(BX), BX
+	INCQ     R9
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	CMPQ     R9, CX
+	JB       openAVX2Tail384LoopB
+	CMPQ     R9, $0x0a
+	JNE      openAVX2Tail384LoopA
+	MOVQ     BX, R9
+	SUBQ     SI, BX
+	MOVQ     BX, CX
+	MOVQ     224(BP), BX
+
+openAVX2Tail384Hash:
+	ADDQ  $0x10, CX
+	CMPQ  CX, BX
+	JGT   openAVX2Tail384HashEnd
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	JMP   openAVX2Tail384Hash
+
+openAVX2Tail384HashEnd:
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y3
+	VPERM2I128 $0x02, Y13, Y1, Y7
+	VPERM2I128 $0x13, Y5, Y9, Y11
+	VPERM2I128 $0x13, Y13, Y1, Y15
+	VPXOR      128(SI), Y3, Y3
+	VPXOR      160(SI), Y7, Y7
+	VPXOR      192(SI), Y11, Y11
+	VPXOR      224(SI), Y15, Y15
+	VMOVDQU    Y3, 128(DI)
+	VMOVDQU    Y7, 160(DI)
+	VMOVDQU    Y11, 192(DI)
+	VMOVDQU    Y15, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	LEAQ       256(SI), SI
+	LEAQ       256(DI), DI
+	SUBQ       $0x00000100, BX
 	JMP        openAVX2TailLoop
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
 openAVX2Tail512:
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	XORQ    itr1, itr1
-	MOVQ    inp, itr2
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	XORQ    CX, CX
+	MOVQ    SI, R9
 
 openAVX2Tail512LoopB:
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ (2*8)(itr2), itr2
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
 
 openAVX2Tail512LoopA:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(itr2))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(itr2))
-	polyMulAVX2
-	LEAQ     (4*8)(itr2), itr2
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	INCQ     itr1
-	CMPQ     itr1, $4
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	ADDQ     (R9), R10
+	ADCQ     8(R9), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     16(R9), R10
+	ADCQ     24(R9), R11
+	ADCQ     $0x01, R12
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	LEAQ     32(R9), R9
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	INCQ     CX
+	CMPQ     CX, $0x04
 	JLT      openAVX2Tail512LoopB
-
-	CMPQ itr1, $10
-	JNE  openAVX2Tail512LoopA
-
-	MOVQ inl, itr1
-	SUBQ $384, itr1
-	ANDQ $-16, itr1
+	CMPQ     CX, $0x0a
+	JNE      openAVX2Tail512LoopA
+	MOVQ     BX, CX
+	SUBQ     $0x00000180, CX
+	ANDQ     $-16, CX
 
 openAVX2Tail512HashLoop:
-	TESTQ itr1, itr1
+	TESTQ CX, CX
 	JE    openAVX2Tail512HashEnd
-	polyAdd(0(itr2))
-	polyMulAVX2
-	LEAQ  16(itr2), itr2
-	SUBQ  $16, itr1
+	ADDQ  (R9), R10
+	ADCQ  8(R9), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(R9), R9
+	SUBQ  $0x10, CX
 	JMP   openAVX2Tail512HashLoop
 
 openAVX2Tail512HashEnd:
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	LEAQ       384(SI), SI
+	LEAQ       384(DI), DI
+	SUBQ       $0x00000180, BX
+	JMP        openAVX2TailLoop
 
-	LEAQ (12*32)(inp), inp
-	LEAQ (12*32)(oup), oup
-	SUBQ $12*32, inl
+DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 // ChaCha20 constant row; as little-endian bytes this word is ASCII "expa"
+DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e // "nd 3"
+DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 // "2-by"
+DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 // "te k"
+DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 // same four words repeated, so 32-byte (Y register) operands see the row in both 128-bit halves
+DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e // "nd 3"
+DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 // "2-by"
+DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 // "te k"
+GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 // file-local (<>) read-only symbol, 32 bytes; also read 16 bytes at a time by the SSE paths
 
-	JMP openAVX2TailLoop
+DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff // low 16 bytes: mask PANDed into the first keystream row before it is stored as the Poly1305 key ("Clamp and store the key")
+DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc // second half of the 16-byte clamp mask
+DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff // upper 16 bytes are all ones, so an AND leaves those bytes unchanged
+DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff // NOTE(review): presumably lets a 32-byte AND clamp r while passing s through - confirm at the AVX2 use site
+GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 // file-local read-only symbol, 32 bytes
 
-// ----------------------------------------------------------------------------
-// ----------------------------------------------------------------------------
-// func chacha20Poly1305Seal(dst, key, src, ad []byte)
-TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
-	// For aligned stack access
+DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 // 128-bit value 1: PADDL with this adds 1 to the lowest 32-bit lane only
+DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 // remaining lanes are zero, so they are left unchanged by the add
+GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 // file-local read-only symbol, 16 bytes; used to step the counter row between blocks in the SSE paths
+
+DATA ·andMask<>+0(SB)/8, $0x00000000000000ff // table of 15 16-byte masks; entry k (at offset 16*k) has its low k+1 bytes set to 0xff - this one keeps 1 byte
+DATA ·andMask<>+8(SB)/8, $0x0000000000000000
+DATA ·andMask<>+16(SB)/8, $0x000000000000ffff // keeps low 2 bytes
+DATA ·andMask<>+24(SB)/8, $0x0000000000000000
+DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff // keeps low 3 bytes
+DATA ·andMask<>+40(SB)/8, $0x0000000000000000
+DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff // keeps low 4 bytes
+DATA ·andMask<>+56(SB)/8, $0x0000000000000000
+DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff // keeps low 5 bytes
+DATA ·andMask<>+72(SB)/8, $0x0000000000000000
+DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff // keeps low 6 bytes
+DATA ·andMask<>+88(SB)/8, $0x0000000000000000
+DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff // keeps low 7 bytes
+DATA ·andMask<>+104(SB)/8, $0x0000000000000000
+DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff // keeps low 8 bytes
+DATA ·andMask<>+120(SB)/8, $0x0000000000000000
+DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff // from here on the low qword is fully set and the high qword grows
+DATA ·andMask<>+136(SB)/8, $0x00000000000000ff // keeps low 9 bytes
+DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+152(SB)/8, $0x000000000000ffff // keeps low 10 bytes
+DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff // keeps low 11 bytes
+DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff // keeps low 12 bytes
+DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff // keeps low 13 bytes
+DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff // keeps low 14 bytes
+DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff // keeps low 15 bytes
+GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 // 15 entries * 16 bytes; NOTE(review): presumably used to zero the unused bytes of a partial final block - confirm at the use sites
+
+DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 // lower 128-bit half is all zero
+DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 // upper 128-bit half holds the value 1 in its lowest lane
+DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 // file-local read-only symbol, 32 bytes; NOTE(review): presumably added to a duplicated counter row so the upper half is one block ahead - confirm at the use site
+
+DATA ·rol16<>+0(SB)/8, $0x0504070601000302 // byte-shuffle control (operand of VPSHUFB): source indices 2,3,0,1 / 6,7,4,5 swap the 16-bit halves of each 32-bit word, i.e. rotate by 16
+DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a // same pattern for words 2 and 3 of the 128-bit lane (indices 10,11,8,9 / 14,15,12,13)
+DATA ·rol16<>+16(SB)/8, $0x0504070601000302 // repeated for the upper 128-bit half of a Y register
+DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
+GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 // file-local read-only symbol, 32 bytes
+
+DATA ·rol8<>+0(SB)/8, $0x0605040702010003 // byte-shuffle control (operand of VPSHUFB): source indices 3,0,1,2 / 7,4,5,6 rotate each 32-bit word left by 8 bits
+DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b // same pattern for words 2 and 3 of the 128-bit lane (indices 11,8,9,10 / 15,12,13,14)
+DATA ·rol8<>+16(SB)/8, $0x0605040702010003 // repeated for the upper 128-bit half of a Y register
+DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
+GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 // file-local read-only symbol, 32 bytes
+
+DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 // lower 128-bit half: value 2 in the lowest lane, zero elsewhere
+DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
+DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 // upper 128-bit half: the same value 2
+DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
+GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 // file-local read-only symbol, 32 bytes; NOTE(review): presumably advances the counter in both halves by two blocks per add - confirm at the use sites
+
+// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
+// Requires: AVX, AVX2, BMI2, CMOV, SSE2
+TEXT ·chacha20Poly1305Seal(SB), $288-96
 	MOVQ SP, BP
-	ADDQ $32, BP
+	ADDQ $0x20, BP
 	ANDQ $-32, BP
-	MOVQ dst+0(FP), oup
-	MOVQ key+24(FP), keyp
-	MOVQ src+48(FP), inp
-	MOVQ src_len+56(FP), inl
-	MOVQ ad+72(FP), adp
-
-	CMPB ·useAVX2(SB), $1
+	MOVQ dst_base+0(FP), DI
+	MOVQ key_base+24(FP), R8
+	MOVQ src_base+48(FP), SI
+	MOVQ src_len+56(FP), BX
+	MOVQ ad_base+72(FP), CX
+	CMPB ·useAVX2+0(SB), $0x01
 	JE   chacha20Poly1305Seal_AVX2
 
 	// Special optimization, for very short buffers
-	CMPQ inl, $128
-	JBE  sealSSE128 // About 15% faster
+	CMPQ BX, $0x80
+	JBE  sealSSE128
 
 	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
-	MOVOU ·chacha20Constants<>(SB), A0
-	MOVOU (1*16)(keyp), B0
-	MOVOU (2*16)(keyp), C0
-	MOVOU (3*16)(keyp), D0
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
 
 	// Store state on stack for future use
-	MOVO B0, state1Store
-	MOVO C0, state2Store
+	MOVO X3, 32(BP)
+	MOVO X6, 48(BP)
 
 	// Load state, increment counter blocks
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
-	MOVQ $10, itr2
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
+	MOVQ $0x0000000a, R9
 
 sealSSEIntroLoop:
-	MOVO         C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO         tmpStore, C3
-	MOVO         C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO         tmpStore, C1
-	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
-
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JNE           sealSSEIntroLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  R9
+	JNE   sealSSEIntroLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
 
 	// Clamp and store the key
-	PAND ·polyClampMask<>(SB), A0
-	MOVO A0, rStore
-	MOVO B0, sStore
+	PAND ·polyClampMask<>+0(SB), X0
+	MOVO X0, (BP)
+	MOVO X3, 16(BP)
 
 	// Hash AAD
-	MOVQ ad_len+80(FP), itr2
-	CALL polyHashADInternal<>(SB)
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
-
-	MOVQ $128, itr1
-	SUBQ $128, inl
-	LEAQ 128(inp), inp
-
-	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
-
-	CMPQ inl, $64
-	JBE  sealSSE128SealHash
-
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
-
-	ADDQ $64, itr1
-	SUBQ $64, inl
-	LEAQ 64(inp), inp
-
-	MOVQ $2, itr1
-	MOVQ $8, itr2
-
-	CMPQ inl, $64
-	JBE  sealSSETail64
-	CMPQ inl, $128
-	JBE  sealSSETail128
-	CMPQ inl, $192
-	JBE  sealSSETail192
+	MOVQ  ad_len+80(FP), R9
+	CALL  polyHashADInternal<>(SB)
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, (DI)
+	MOVOU X4, 16(DI)
+	MOVOU X7, 32(DI)
+	MOVOU X10, 48(DI)
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 64(DI)
+	MOVOU X5, 80(DI)
+	MOVOU X8, 96(DI)
+	MOVOU X11, 112(DI)
+	MOVQ  $0x00000080, CX
+	SUBQ  $0x80, BX
+	LEAQ  128(SI), SI
+	MOVO  X12, X1
+	MOVO  X13, X4
+	MOVO  X14, X7
+	MOVO  X15, X10
+	CMPQ  BX, $0x40
+	JBE   sealSSE128SealHash
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X12
+	PXOR  X3, X13
+	PXOR  X6, X14
+	PXOR  X9, X15
+	MOVOU X12, 128(DI)
+	MOVOU X13, 144(DI)
+	MOVOU X14, 160(DI)
+	MOVOU X15, 176(DI)
+	ADDQ  $0x40, CX
+	SUBQ  $0x40, BX
+	LEAQ  64(SI), SI
+	MOVQ  $0x00000002, CX
+	MOVQ  $0x00000008, R9
+	CMPQ  BX, $0x40
+	JBE   sealSSETail64
+	CMPQ  BX, $0x80
+	JBE   sealSSETail128
+	CMPQ  BX, $0xc0
+	JBE   sealSSETail192
 
 sealSSEMainLoop:
 	// Load state, increment counter blocks
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X2, X12
+	MOVO  X5, X13
+	MOVO  X8, X14
+	MOVO  X11, X15
+	PADDL ·sseIncMask<>+0(SB), X15
 
 	// Store counters
-	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	MOVO X9, 80(BP)
+	MOVO X10, 96(BP)
+	MOVO X11, 112(BP)
+	MOVO X15, 128(BP)
 
 sealSSEInnerLoop:
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyAdd(0(oup))
-	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
-	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
-	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
-	polyMulStage1
-	polyMulStage2
-	LEAQ          (2*8)(oup), oup
-	MOVO          C3, tmpStore
-	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
-	MOVO          tmpStore, C3
-	MOVO          C1, tmpStore
-	polyMulStage3
-	chachaQR(A3, B3, C3, D3, C1)
-	MOVO          tmpStore, C1
-	polyMulReduceStage
-	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
-	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
-	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
-	DECQ          itr2
-	JGE           sealSSEInnerLoop
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          (2*8)(oup), oup
-	DECQ          itr1
-	JG            sealSSEInnerLoop
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x0c
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	LEAQ  16(DI), DI
+	MOVO  X14, 64(BP)
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X3
+	PXOR  X14, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X14)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X3
+	PXOR  X14, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X4
+	PXOR  X14, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X14)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X4
+	PXOR  X14, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x0c, X14
+	PSRLL $0x14, X5
+	PXOR  X14, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X14)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X14
+	PSLLL $0x07, X14
+	PSRLL $0x19, X5
+	PXOR  X14, X5
+	MOVO  64(BP), X14
+	MOVO  X7, 64(BP)
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL16(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x0c, X7
+	PSRLL $0x14, X13
+	PXOR  X7, X13
+	PADDD X13, X12
+	PXOR  X12, X15
+	ROL8(X15, X7)
+	PADDD X15, X14
+	PXOR  X14, X13
+	MOVO  X13, X7
+	PSLLL $0x07, X7
+	PSRLL $0x19, X13
+	PXOR  X7, X13
+	MOVO  64(BP), X7
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x04
+	DECQ  R9
+	JGE   sealSSEInnerLoop
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	DECQ  CX
+	JG    sealSSEInnerLoop
 
 	// Add in the state
-	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
-	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
-	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
-	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
-	MOVO  D3, tmpStore
+	PADDD ·chacha20Constants<>+0(SB), X0
+	PADDD ·chacha20Constants<>+0(SB), X1
+	PADDD ·chacha20Constants<>+0(SB), X2
+	PADDD ·chacha20Constants<>+0(SB), X12
+	PADDD 32(BP), X3
+	PADDD 32(BP), X4
+	PADDD 32(BP), X5
+	PADDD 32(BP), X13
+	PADDD 48(BP), X6
+	PADDD 48(BP), X7
+	PADDD 48(BP), X8
+	PADDD 48(BP), X14
+	PADDD 80(BP), X9
+	PADDD 96(BP), X10
+	PADDD 112(BP), X11
+	PADDD 128(BP), X15
+	MOVO  X15, 64(BP)
 
 	// Load - xor - store
-	MOVOU (0*16)(inp), D3; PXOR D3, A0
-	MOVOU (1*16)(inp), D3; PXOR D3, B0
-	MOVOU (2*16)(inp), D3; PXOR D3, C0
-	MOVOU (3*16)(inp), D3; PXOR D3, D0
-	MOVOU A0, (0*16)(oup)
-	MOVOU B0, (1*16)(oup)
-	MOVOU C0, (2*16)(oup)
-	MOVOU D0, (3*16)(oup)
-	MOVO  tmpStore, D3
-
-	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
-	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
-	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
-	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
-	ADDQ  $192, inp
-	MOVQ  $192, itr1
-	SUBQ  $192, inl
-	MOVO  A3, A1
-	MOVO  B3, B1
-	MOVO  C3, C1
-	MOVO  D3, D1
-	CMPQ  inl, $64
+	MOVOU (SI), X15
+	PXOR  X15, X0
+	MOVOU 16(SI), X15
+	PXOR  X15, X3
+	MOVOU 32(SI), X15
+	PXOR  X15, X6
+	MOVOU 48(SI), X15
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVO  64(BP), X15
+	MOVOU 64(SI), X0
+	MOVOU 80(SI), X3
+	MOVOU 96(SI), X6
+	MOVOU 112(SI), X9
+	PXOR  X0, X1
+	PXOR  X3, X4
+	PXOR  X6, X7
+	PXOR  X9, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVOU 128(SI), X0
+	MOVOU 144(SI), X3
+	MOVOU 160(SI), X6
+	MOVOU 176(SI), X9
+	PXOR  X0, X2
+	PXOR  X3, X5
+	PXOR  X6, X8
+	PXOR  X9, X11
+	MOVOU X2, 128(DI)
+	MOVOU X5, 144(DI)
+	MOVOU X8, 160(DI)
+	MOVOU X11, 176(DI)
+	ADDQ  $0xc0, SI
+	MOVQ  $0x000000c0, CX
+	SUBQ  $0xc0, BX
+	MOVO  X12, X1
+	MOVO  X13, X4
+	MOVO  X14, X7
+	MOVO  X15, X10
+	CMPQ  BX, $0x40
 	JBE   sealSSE128SealHash
-	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
-	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
-	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
-	LEAQ  64(inp), inp
-	SUBQ  $64, inl
-	MOVQ  $6, itr1
-	MOVQ  $4, itr2
-	CMPQ  inl, $192
+	MOVOU (SI), X0
+	MOVOU 16(SI), X3
+	MOVOU 32(SI), X6
+	MOVOU 48(SI), X9
+	PXOR  X0, X12
+	PXOR  X3, X13
+	PXOR  X6, X14
+	PXOR  X9, X15
+	MOVOU X12, 192(DI)
+	MOVOU X13, 208(DI)
+	MOVOU X14, 224(DI)
+	MOVOU X15, 240(DI)
+	LEAQ  64(SI), SI
+	SUBQ  $0x40, BX
+	MOVQ  $0x00000006, CX
+	MOVQ  $0x00000004, R9
+	CMPQ  BX, $0xc0
 	JG    sealSSEMainLoop
-
-	MOVQ  inl, itr1
-	TESTQ inl, inl
+	MOVQ  BX, CX
+	TESTQ BX, BX
 	JE    sealSSE128SealHash
-	MOVQ  $6, itr1
-	CMPQ  inl, $64
+	MOVQ  $0x00000006, CX
+	CMPQ  BX, $0x40
 	JBE   sealSSETail64
-	CMPQ  inl, $128
+	CMPQ  BX, $0x80
 	JBE   sealSSETail128
 	JMP   sealSSETail192
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 64 bytes of plaintext
 sealSSETail64:
-	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
-	MOVO  ·chacha20Constants<>(SB), A1
-	MOVO  state1Store, B1
-	MOVO  state2Store, C1
-	MOVO  ctr3Store, D1
-	PADDL ·sseIncMask<>(SB), D1
-	MOVO  D1, ctr0Store
+	MOVO  ·chacha20Constants<>+0(SB), X1
+	MOVO  32(BP), X4
+	MOVO  48(BP), X7
+	MOVO  128(BP), X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 80(BP)
 
 sealSSETail64LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail64LoopB:
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	chachaQR(A1, B1, C1, D1, T1)
-	shiftB1Right; shiftC1Right; shiftD1Right
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-
-	DECQ itr1
-	JG   sealSSETail64LoopA
-
-	DECQ  itr2
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x0c, X13
+	PSRLL $0x14, X4
+	PXOR  X13, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x07, X13
+	PSRLL $0x19, X4
+	PXOR  X13, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x0c, X13
+	PSRLL $0x14, X4
+	PXOR  X13, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X13)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X13
+	PSLLL $0x07, X13
+	PSRLL $0x19, X4
+	PXOR  X13, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	DECQ  CX
+	JG    sealSSETail64LoopA
+	DECQ  R9
 	JGE   sealSSETail64LoopB
-	PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B1
-	PADDL state2Store, C1
-	PADDL ctr0Store, D1
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X4
+	PADDL 48(BP), X7
+	PADDL 80(BP), X10
+	JMP   sealSSE128Seal
 
-	JMP sealSSE128Seal
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of plaintext
 sealSSETail128:
-	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 96(BP)
 
 sealSSETail128LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail128LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Left;  shiftC0Left; shiftD0Left
-	shiftB1Left;  shiftC1Left; shiftD1Left
-	polyAdd(0(oup))
-	polyMul
-	LEAQ          16(oup), oup
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	DECQ  CX
+	JG    sealSSETail128LoopA
+	DECQ  R9
+	JGE   sealSSETail128LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 80(BP), X9
+	PADDL 96(BP), X10
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X0
+	PXOR  X13, X3
+	PXOR  X14, X6
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVQ  $0x00000040, CX
+	LEAQ  64(SI), SI
+	SUBQ  $0x40, BX
+	JMP   sealSSE128SealHash
 
-	DECQ itr1
-	JG   sealSSETail128LoopA
-
-	DECQ itr2
-	JGE  sealSSETail128LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
-	PADDL state1Store, B0; PADDL state1Store, B1
-	PADDL state2Store, C0; PADDL state2Store, C1
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-
-	MOVQ $64, itr1
-	LEAQ 64(inp), inp
-	SUBQ $64, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 192 bytes of plaintext
 sealSSETail192:
-	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
-	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
-	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
-	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+	MOVO  ·chacha20Constants<>+0(SB), X0
+	MOVO  32(BP), X3
+	MOVO  48(BP), X6
+	MOVO  128(BP), X9
+	PADDL ·sseIncMask<>+0(SB), X9
+	MOVO  X9, 80(BP)
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X10, 96(BP)
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X11, 112(BP)
 
 sealSSETail192LoopA:
-	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealSSETail192LoopB:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left; shiftC0Left; shiftD0Left
-	shiftB1Left; shiftC1Left; shiftD1Left
-	shiftB2Left; shiftC2Left; shiftD2Left
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  CX
+	JG    sealSSETail192LoopA
+	DECQ  R9
+	JGE   sealSSETail192LoopB
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL 32(BP), X3
+	PADDL 32(BP), X4
+	PADDL 32(BP), X5
+	PADDL 48(BP), X6
+	PADDL 48(BP), X7
+	PADDL 48(BP), X8
+	PADDL 80(BP), X9
+	PADDL 96(BP), X10
+	PADDL 112(BP), X11
+	MOVOU (SI), X12
+	MOVOU 16(SI), X13
+	MOVOU 32(SI), X14
+	MOVOU 48(SI), X15
+	PXOR  X12, X0
+	PXOR  X13, X3
+	PXOR  X14, X6
+	PXOR  X15, X9
+	MOVOU X0, (DI)
+	MOVOU X3, 16(DI)
+	MOVOU X6, 32(DI)
+	MOVOU X9, 48(DI)
+	MOVOU 64(SI), X12
+	MOVOU 80(SI), X13
+	MOVOU 96(SI), X14
+	MOVOU 112(SI), X15
+	PXOR  X12, X1
+	PXOR  X13, X4
+	PXOR  X14, X7
+	PXOR  X15, X10
+	MOVOU X1, 64(DI)
+	MOVOU X4, 80(DI)
+	MOVOU X7, 96(DI)
+	MOVOU X10, 112(DI)
+	MOVO  X2, X1
+	MOVO  X5, X4
+	MOVO  X8, X7
+	MOVO  X11, X10
+	MOVQ  $0x00000080, CX
+	LEAQ  128(SI), SI
+	SUBQ  $0x80, BX
+	JMP   sealSSE128SealHash
 
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
-
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftC0Right; shiftD0Right
-	shiftB1Right; shiftC1Right; shiftD1Right
-	shiftB2Right; shiftC2Right; shiftD2Right
-
-	DECQ itr1
-	JG   sealSSETail192LoopA
-
-	DECQ itr2
-	JGE  sealSSETail192LoopB
-
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
-	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
-	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
-
-	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
-	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
-	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
-	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
-	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
-	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
-
-	MOVO A2, A1
-	MOVO B2, B1
-	MOVO C2, C1
-	MOVO D2, D1
-	MOVQ $128, itr1
-	LEAQ 128(inp), inp
-	SUBQ $128, inl
-
-	JMP sealSSE128SealHash
-
-// ----------------------------------------------------------------------------
-// Special seal optimization for buffers smaller than 129 bytes
 sealSSE128:
-	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
-	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
-	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
-	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
-	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
-	MOVQ  $10, itr2
+	MOVOU ·chacha20Constants<>+0(SB), X0
+	MOVOU 16(R8), X3
+	MOVOU 32(R8), X6
+	MOVOU 48(R8), X9
+	MOVO  X0, X1
+	MOVO  X3, X4
+	MOVO  X6, X7
+	MOVO  X9, X10
+	PADDL ·sseIncMask<>+0(SB), X10
+	MOVO  X1, X2
+	MOVO  X4, X5
+	MOVO  X7, X8
+	MOVO  X10, X11
+	PADDL ·sseIncMask<>+0(SB), X11
+	MOVO  X3, X13
+	MOVO  X6, X14
+	MOVO  X10, X15
+	MOVQ  $0x0000000a, R9
 
 sealSSE128InnerCipherLoop:
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Left;  shiftB1Left; shiftB2Left
-	shiftC0Left;  shiftC1Left; shiftC2Left
-	shiftD0Left;  shiftD1Left; shiftD2Left
-	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
-	shiftB0Right; shiftB1Right; shiftB2Right
-	shiftC0Right; shiftC1Right; shiftC2Right
-	shiftD0Right; shiftD1Right; shiftD2Right
-	DECQ          itr2
-	JNE           sealSSE128InnerCipherLoop
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL16(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X3
+	PXOR  X12, X3
+	PADDD X3, X0
+	PXOR  X0, X9
+	ROL8(X9, X12)
+	PADDD X9, X6
+	PXOR  X6, X3
+	MOVO  X3, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X3
+	PXOR  X12, X3
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL16(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X4
+	PXOR  X12, X4
+	PADDD X4, X1
+	PXOR  X1, X10
+	ROL8(X10, X12)
+	PADDD X10, X7
+	PXOR  X7, X4
+	MOVO  X4, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X4
+	PXOR  X12, X4
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL16(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x0c, X12
+	PSRLL $0x14, X5
+	PXOR  X12, X5
+	PADDD X5, X2
+	PXOR  X2, X11
+	ROL8(X11, X12)
+	PADDD X11, X8
+	PXOR  X8, X5
+	MOVO  X5, X12
+	PSLLL $0x07, X12
+	PSRLL $0x19, X5
+	PXOR  X12, X5
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xe4
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xed
+	BYTE  $0x0c
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xf6
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xff
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc0
+	BYTE  $0x08
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xc9
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xd2
+	BYTE  $0x04
+	BYTE  $0x66
+	BYTE  $0x45
+	BYTE  $0x0f
+	BYTE  $0x3a
+	BYTE  $0x0f
+	BYTE  $0xdb
+	BYTE  $0x04
+	DECQ  R9
+	JNE   sealSSE128InnerCipherLoop
 
 	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
-	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
-	PADDL T2, C1; PADDL T2, C2
-	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
-	PAND  ·polyClampMask<>(SB), A0
-	MOVOU A0, rStore
-	MOVOU B0, sStore
+	PADDL ·chacha20Constants<>+0(SB), X0
+	PADDL ·chacha20Constants<>+0(SB), X1
+	PADDL ·chacha20Constants<>+0(SB), X2
+	PADDL X13, X3
+	PADDL X13, X4
+	PADDL X13, X5
+	PADDL X14, X7
+	PADDL X14, X8
+	PADDL X15, X10
+	PADDL ·sseIncMask<>+0(SB), X15
+	PADDL X15, X11
+	PAND  ·polyClampMask<>+0(SB), X0
+	MOVOU X0, (BP)
+	MOVOU X3, 16(BP)
 
 	// Hash
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	XORQ CX, CX
 
 sealSSE128SealHash:
-	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealSSE128Seal
-	polyAdd(0(oup))
-	polyMul
-
-	SUBQ $16, itr1
-	ADDQ $16, oup
-
-	JMP sealSSE128SealHash
+	CMPQ  CX, $0x10
+	JB    sealSSE128Seal
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	SUBQ  $0x10, CX
+	ADDQ  $0x10, DI
+	JMP   sealSSE128SealHash
 
 sealSSE128Seal:
-	CMPQ inl, $16
+	CMPQ BX, $0x10
 	JB   sealSSETail
-	SUBQ $16, inl
+	SUBQ $0x10, BX
 
 	// Load for decryption
-	MOVOU (inp), T0
-	PXOR  T0, A1
-	MOVOU A1, (oup)
-	LEAQ  (1*16)(inp), inp
-	LEAQ  (1*16)(oup), oup
+	MOVOU (SI), X12
+	PXOR  X12, X1
+	MOVOU X1, (DI)
+	LEAQ  16(SI), SI
+	LEAQ  16(DI), DI
 
 	// Extract for hashing
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
+	MOVQ   X1, R13
+	PSRLDQ $0x08, X1
+	MOVQ   X1, R14
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
 
 	// Shift the stream "left"
-	MOVO B1, A1
-	MOVO C1, B1
-	MOVO D1, C1
-	MOVO A2, D1
-	MOVO B2, A2
-	MOVO C2, B2
-	MOVO D2, C2
+	MOVO X4, X1
+	MOVO X7, X4
+	MOVO X10, X7
+	MOVO X2, X10
+	MOVO X5, X2
+	MOVO X8, X5
+	MOVO X11, X8
 	JMP  sealSSE128Seal
 
 sealSSETail:
-	TESTQ inl, inl
+	TESTQ BX, BX
 	JE    sealSSEFinalize
 
 	// We can only load the PT one byte at a time to avoid read after end of buffer
-	MOVQ inl, itr2
-	SHLQ $4, itr2
-	LEAQ ·andMask<>(SB), t0
-	MOVQ inl, itr1
-	LEAQ -1(inp)(inl*1), inp
-	XORQ t2, t2
-	XORQ t3, t3
+	MOVQ BX, R9
+	SHLQ $0x04, R9
+	LEAQ ·andMask<>+0(SB), R13
+	MOVQ BX, CX
+	LEAQ -1(SI)(BX*1), SI
+	XORQ R15, R15
+	XORQ R8, R8
 	XORQ AX, AX
 
 sealSSETailLoadLoop:
-	SHLQ $8, t2, t3
-	SHLQ $8, t2
-	MOVB (inp), AX
-	XORQ AX, t2
-	LEAQ   -1(inp), inp
-	DECQ   itr1
+	SHLQ   $0x08, R15, R8
+	SHLQ   $0x08, R15
+	MOVB   (SI), AX
+	XORQ   AX, R15
+	LEAQ   -1(SI), SI
+	DECQ   CX
 	JNE    sealSSETailLoadLoop
-	MOVQ t2, 0+tmpStore
-	MOVQ t3, 8+tmpStore
-	PXOR 0+tmpStore, A1
-	MOVOU  A1, (oup)
-	MOVOU  -16(t0)(itr2*1), T0
-	PAND   T0, A1
-	MOVQ   A1, t0
-	PSRLDQ $8, A1
-	MOVQ   A1, t1
-	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
-	polyMul
-
-	ADDQ inl, oup
+	MOVQ   R15, 64(BP)
+	MOVQ   R8, 72(BP)
+	PXOR   64(BP), X1
+	MOVOU  X1, (DI)
+	MOVOU  -16(R13)(R9*1), X12
+	PAND   X12, X1
+	MOVQ   X1, R13
+	PSRLDQ $0x08, X1
+	MOVQ   X1, R14
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x01, R12
+	MOVQ   (BP), AX
+	MOVQ   AX, R15
+	MULQ   R10
+	MOVQ   AX, R13
+	MOVQ   DX, R14
+	MOVQ   (BP), AX
+	MULQ   R11
+	IMULQ  R12, R15
+	ADDQ   AX, R14
+	ADCQ   DX, R15
+	MOVQ   8(BP), AX
+	MOVQ   AX, R8
+	MULQ   R10
+	ADDQ   AX, R14
+	ADCQ   $0x00, DX
+	MOVQ   DX, R10
+	MOVQ   8(BP), AX
+	MULQ   R11
+	ADDQ   AX, R15
+	ADCQ   $0x00, DX
+	IMULQ  R12, R8
+	ADDQ   R10, R15
+	ADCQ   DX, R8
+	MOVQ   R13, R10
+	MOVQ   R14, R11
+	MOVQ   R15, R12
+	ANDQ   $0x03, R12
+	MOVQ   R15, R13
+	ANDQ   $-4, R13
+	MOVQ   R8, R14
+	SHRQ   $0x02, R8, R15
+	SHRQ   $0x02, R8
+	ADDQ   R13, R10
+	ADCQ   R14, R11
+	ADCQ   $0x00, R12
+	ADDQ   R15, R10
+	ADCQ   R8, R11
+	ADCQ   $0x00, R12
+	ADDQ   BX, DI
 
 sealSSEFinalize:
 	// Hash in the buffer lengths
-	ADDQ ad_len+80(FP), acc0
-	ADCQ src_len+56(FP), acc1
-	ADCQ $1, acc2
-	polyMul
+	ADDQ  ad_len+80(FP), R10
+	ADCQ  src_len+56(FP), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
 
 	// Final reduce
-	MOVQ    acc0, t0
-	MOVQ    acc1, t1
-	MOVQ    acc2, t2
-	SUBQ    $-5, acc0
-	SBBQ    $-1, acc1
-	SBBQ    $3, acc2
-	CMOVQCS t0, acc0
-	CMOVQCS t1, acc1
-	CMOVQCS t2, acc2
+	MOVQ    R10, R13
+	MOVQ    R11, R14
+	MOVQ    R12, R15
+	SUBQ    $-5, R10
+	SBBQ    $-1, R11
+	SBBQ    $0x03, R12
+	CMOVQCS R13, R10
+	CMOVQCS R14, R11
+	CMOVQCS R15, R12
 
 	// Add in the "s" part of the key
-	ADDQ 0+sStore, acc0
-	ADCQ 8+sStore, acc1
+	ADDQ 16(BP), R10
+	ADCQ 24(BP), R11
 
 	// Finally store the tag at the end of the message
-	MOVQ acc0, (0*8)(oup)
-	MOVQ acc1, (1*8)(oup)
+	MOVQ R10, (DI)
+	MOVQ R11, 8(DI)
 	RET
 
-// ----------------------------------------------------------------------------
-// ------------------------- AVX2 Code ----------------------------------------
 chacha20Poly1305Seal_AVX2:
 	VZEROUPPER
-	VMOVDQU ·chacha20Constants<>(SB), AA0
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
-	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
-	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
-	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x70
+	BYTE    $0x10
+	BYTE    $0xc4
+	BYTE    $0x42
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x20
+	BYTE    $0xc4
+	BYTE    $0xc2
+	BYTE    $0x7d
+	BYTE    $0x5a
+	BYTE    $0x60
+	BYTE    $0x30
+	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
 
 	// Special optimizations, for very short buffers
-	CMPQ inl, $192
-	JBE  seal192AVX2 // 33% faster
-	CMPQ inl, $320
-	JBE  seal320AVX2 // 17% faster
+	CMPQ BX, $0x000000c0
+	JBE  seal192AVX2
+	CMPQ BX, $0x00000140
+	JBE  seal320AVX2
 
 	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
-	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
-	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
-	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
-	VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr2
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA Y14, 32(BP)
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA Y12, 64(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VMOVDQA Y4, 96(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y1, 128(BP)
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	MOVQ    $0x0000000a, R9
 
 sealAVX2IntroLoop:
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr2
-	JNE      sealAVX2IntroLoop
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-
-	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
-	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
-	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+	VMOVDQA    Y15, 224(BP)
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VMOVDQA    224(BP), Y15
+	VMOVDQA    Y13, 224(BP)
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x0c, Y11, Y13
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x07, Y11, Y13
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VMOVDQA    224(BP), Y13
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPALIGNR   $0x04, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x0c, Y3, Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VMOVDQA    224(BP), Y15
+	VMOVDQA    Y13, 224(BP)
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x0c, Y11, Y13
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y15, Y11, Y11
+	VPSLLD     $0x07, Y11, Y13
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y13, Y11, Y11
+	VMOVDQA    224(BP), Y13
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	VPALIGNR   $0x0c, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x04, Y3, Y3, Y3
+	DECQ       R9
+	JNE        sealAVX2IntroLoop
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPERM2I128 $0x02, Y0, Y14, Y4
+	VPERM2I128 $0x13, Y0, Y14, Y0
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), DD0, DD0
-	VMOVDQA DD0, rsStoreAVX2
+	VPAND   ·polyClampMask<>+0(SB), Y4, Y4
+	VMOVDQA Y4, (BP)
 
 	// Hash AD
-	MOVQ ad_len+80(FP), itr2
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
 
 	// Can store at least 320 bytes
-	VPXOR   (0*32)(inp), AA0, AA0
-	VPXOR   (1*32)(inp), CC0, CC0
-	VMOVDQU AA0, (0*32)(oup)
-	VMOVDQU CC0, (1*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
-
-	MOVQ $320, itr1
-	SUBQ $320, inl
-	LEAQ 320(inp), inp
-
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
-	CMPQ       inl, $128
+	VPXOR      (SI), Y0, Y0
+	VPXOR      32(SI), Y12, Y12
+	VMOVDQU    Y0, (DI)
+	VMOVDQU    Y12, 32(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      64(SI), Y0, Y0
+	VPXOR      96(SI), Y14, Y14
+	VPXOR      128(SI), Y12, Y12
+	VPXOR      160(SI), Y4, Y4
+	VMOVDQU    Y0, 64(DI)
+	VMOVDQU    Y14, 96(DI)
+	VMOVDQU    Y12, 128(DI)
+	VMOVDQU    Y4, 160(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      192(SI), Y0, Y0
+	VPXOR      224(SI), Y14, Y14
+	VPXOR      256(SI), Y12, Y12
+	VPXOR      288(SI), Y4, Y4
+	VMOVDQU    Y0, 192(DI)
+	VMOVDQU    Y14, 224(DI)
+	VMOVDQU    Y12, 256(DI)
+	VMOVDQU    Y4, 288(DI)
+	MOVQ       $0x00000140, CX
+	SUBQ       $0x00000140, BX
+	LEAQ       320(SI), SI
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, Y15, Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, Y15, Y3, Y4
+	CMPQ       BX, $0x80
 	JBE        sealAVX2SealHash
-
-	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
-	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
-	SUBQ    $128, inl
-	LEAQ    128(inp), inp
-
-	MOVQ $8, itr1
-	MOVQ $2, itr2
-
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	CMPQ inl, $512
-	JBE  sealAVX2Tail512
+	VPXOR      (SI), Y0, Y0
+	VPXOR      32(SI), Y14, Y14
+	VPXOR      64(SI), Y12, Y12
+	VPXOR      96(SI), Y4, Y4
+	VMOVDQU    Y0, 320(DI)
+	VMOVDQU    Y14, 352(DI)
+	VMOVDQU    Y12, 384(DI)
+	VMOVDQU    Y4, 416(DI)
+	SUBQ       $0x80, BX
+	LEAQ       128(SI), SI
+	MOVQ       $0x00000008, CX
+	MOVQ       $0x00000002, R9
+	CMPQ       BX, $0x80
+	JBE        sealAVX2Tail128
+	CMPQ       BX, $0x00000100
+	JBE        sealAVX2Tail256
+	CMPQ       BX, $0x00000180
+	JBE        sealAVX2Tail384
+	CMPQ       BX, $0x00000200
+	JBE        sealAVX2Tail512
 
 	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
-	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
-	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
-	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
-
-	VMOVDQA CC3, tmpStoreAVX2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
-	VMOVDQA tmpStoreAVX2, CC3
-	VMOVDQA CC1, tmpStoreAVX2
-	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
-	VMOVDQA tmpStoreAVX2, CC1
-
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
-	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
-	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
-	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-
-	SUBQ $16, oup                  // Adjust the pointer
-	MOVQ $9, itr1
-	JMP  sealAVX2InternalLoopStart
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA  Y0, Y5
+	VMOVDQA  Y0, Y6
+	VMOVDQA  Y0, Y7
+	VMOVDQA  32(BP), Y14
+	VMOVDQA  Y14, Y9
+	VMOVDQA  Y14, Y10
+	VMOVDQA  Y14, Y11
+	VMOVDQA  64(BP), Y12
+	VMOVDQA  Y12, Y13
+	VMOVDQA  Y12, Y8
+	VMOVDQA  Y12, Y15
+	VMOVDQA  192(BP), Y4
+	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD   ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD   ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA  Y4, 96(BP)
+	VMOVDQA  Y1, 128(BP)
+	VMOVDQA  Y2, 160(BP)
+	VMOVDQA  Y3, 192(BP)
+	VMOVDQA  Y15, 224(BP)
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VMOVDQA  224(BP), Y15
+	VMOVDQA  Y13, 224(BP)
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x0c, Y11, Y13
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x07, Y11, Y13
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VMOVDQA  224(BP), Y13
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VMOVDQA  Y15, 224(BP)
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VMOVDQA  224(BP), Y15
+	VMOVDQA  Y13, 224(BP)
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x0c, Y11, Y13
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y15, Y11, Y11
+	VPSLLD   $0x07, Y11, Y13
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y13, Y11, Y11
+	VMOVDQA  224(BP), Y13
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	SUBQ     $0x10, DI
+	MOVQ     $0x00000009, CX
+	JMP      sealAVX2InternalLoopStart
 
 sealAVX2MainLoop:
-	// Load state, increment counter blocks, store the incremented counters
-	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
-	MOVQ    $10, itr1
+	VMOVDQU ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
+	MOVQ    $0x0000000a, CX
 
 sealAVX2InternalLoop:
-	polyAdd(0*8(oup))
-	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage1_AVX2
-	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulStage2_AVX2
-	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyMulStage3_AVX2
-	VMOVDQA CC3, tmpStoreAVX2
-	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA tmpStoreAVX2, CC3
-	polyMulReduceStage
+	ADDQ    (DI), R10
+	ADCQ    8(DI), R11
+	ADCQ    $0x01, R12
+	VPADDD  Y14, Y0, Y0
+	VPADDD  Y9, Y5, Y5
+	VPADDD  Y10, Y6, Y6
+	VPADDD  Y11, Y7, Y7
+	MOVQ    (BP), DX
+	MOVQ    DX, R15
+	MULXQ   R10, R13, R14
+	IMULQ   R12, R15
+	MULXQ   R11, AX, DX
+	ADDQ    AX, R14
+	ADCQ    DX, R15
+	VPXOR   Y0, Y4, Y4
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	VPSHUFB ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB ·rol16<>+0(SB), Y3, Y3
+	MOVQ    8(BP), DX
+	MULXQ   R10, R10, AX
+	ADDQ    R10, R14
+	MULXQ   R11, R11, R8
+	ADCQ    R11, R15
+	ADCQ    $0x00, R8
+	VPADDD  Y4, Y12, Y12
+	VPADDD  Y1, Y13, Y13
+	VPADDD  Y2, Y8, Y8
+	VPADDD  Y3, Y15, Y15
+	VPXOR   Y12, Y14, Y14
+	VPXOR   Y13, Y9, Y9
+	VPXOR   Y8, Y10, Y10
+	VPXOR   Y15, Y11, Y11
+	IMULQ   R12, DX
+	ADDQ    AX, R15
+	ADCQ    DX, R8
+	VMOVDQA Y15, 224(BP)
+	VPSLLD  $0x0c, Y14, Y15
+	VPSRLD  $0x14, Y14, Y14
+	VPXOR   Y15, Y14, Y14
+	VPSLLD  $0x0c, Y9, Y15
+	VPSRLD  $0x14, Y9, Y9
+	VPXOR   Y15, Y9, Y9
+	VPSLLD  $0x0c, Y10, Y15
+	VPSRLD  $0x14, Y10, Y10
+	VPXOR   Y15, Y10, Y10
+	VPSLLD  $0x0c, Y11, Y15
+	VPSRLD  $0x14, Y11, Y11
+	VPXOR   Y15, Y11, Y11
+	VMOVDQA 224(BP), Y15
+	MOVQ    R13, R10
+	MOVQ    R14, R11
+	MOVQ    R15, R12
+	ANDQ    $0x03, R12
+	MOVQ    R15, R13
+	ANDQ    $-4, R13
+	MOVQ    R8, R14
+	SHRQ    $0x02, R8, R15
+	SHRQ    $0x02, R8
+	ADDQ    R13, R10
+	ADCQ    R14, R11
+	ADCQ    $0x00, R12
+	ADDQ    R15, R10
+	ADCQ    R8, R11
+	ADCQ    $0x00, R12
 
 sealAVX2InternalLoopStart:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	polyAdd(2*8(oup))
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage1_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage2_AVX2
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	polyMulStage3_AVX2
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	polyMulReduceStage
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(4*8(oup))
-	LEAQ     (6*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulStage1_AVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	polyMulStage2_AVX2
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	polyMulStage3_AVX2
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyMulReduceStage
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-	DECQ     itr1
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	ADDQ     16(DI), R10
+	ADCQ     24(DI), R11
+	ADCQ     $0x01, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x04, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	VPALIGNR $0x0c, Y3, Y3, Y3
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	ADDQ     32(DI), R10
+	ADCQ     40(DI), R11
+	ADCQ     $0x01, R12
+	LEAQ     48(DI), DI
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x0c, Y14, Y15
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x0c, Y9, Y15
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x0c, Y10, Y15
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x0c, Y11, Y15
+	VPSRLD   $0x14, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     (BP), DX
+	MOVQ     DX, R15
+	MULXQ    R10, R13, R14
+	IMULQ    R12, R15
+	MULXQ    R11, AX, DX
+	ADDQ     AX, R14
+	ADCQ     DX, R15
+	VPADDD   Y14, Y0, Y0
+	VPADDD   Y9, Y5, Y5
+	VPADDD   Y10, Y6, Y6
+	VPADDD   Y11, Y7, Y7
+	VPXOR    Y0, Y4, Y4
+	VPXOR    Y5, Y1, Y1
+	VPXOR    Y6, Y2, Y2
+	VPXOR    Y7, Y3, Y3
+	MOVQ     8(BP), DX
+	MULXQ    R10, R10, AX
+	ADDQ     R10, R14
+	MULXQ    R11, R11, R8
+	ADCQ     R11, R15
+	ADCQ     $0x00, R8
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
+	VPADDD   Y4, Y12, Y12
+	VPADDD   Y1, Y13, Y13
+	VPADDD   Y2, Y8, Y8
+	VPADDD   Y3, Y15, Y15
+	IMULQ    R12, DX
+	ADDQ     AX, R15
+	ADCQ     DX, R8
+	VPXOR    Y12, Y14, Y14
+	VPXOR    Y13, Y9, Y9
+	VPXOR    Y8, Y10, Y10
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  Y15, 224(BP)
+	VPSLLD   $0x07, Y14, Y15
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y15, Y14, Y14
+	VPSLLD   $0x07, Y9, Y15
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y15, Y9, Y9
+	VPSLLD   $0x07, Y10, Y15
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y15, Y10, Y10
+	VPSLLD   $0x07, Y11, Y15
+	VPSRLD   $0x19, Y11, Y11
+	VPXOR    Y15, Y11, Y11
+	VMOVDQA  224(BP), Y15
+	MOVQ     R13, R10
+	MOVQ     R14, R11
+	MOVQ     R15, R12
+	ANDQ     $0x03, R12
+	MOVQ     R15, R13
+	ANDQ     $-4, R13
+	MOVQ     R8, R14
+	SHRQ     $0x02, R8, R15
+	SHRQ     $0x02, R8
+	ADDQ     R13, R10
+	ADCQ     R14, R11
+	ADCQ     $0x00, R12
+	ADDQ     R15, R10
+	ADCQ     R8, R11
+	ADCQ     $0x00, R12
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x0c, Y11, Y11, Y11
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x08, Y15, Y15, Y15
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	VPALIGNR $0x04, Y3, Y3, Y3
+	DECQ     CX
 	JNE      sealAVX2InternalLoop
-
-	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA CC3, tmpStoreAVX2
+	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD   32(BP), Y14, Y14
+	VPADDD   32(BP), Y9, Y9
+	VPADDD   32(BP), Y10, Y10
+	VPADDD   32(BP), Y11, Y11
+	VPADDD   64(BP), Y12, Y12
+	VPADDD   64(BP), Y13, Y13
+	VPADDD   64(BP), Y8, Y8
+	VPADDD   64(BP), Y15, Y15
+	VPADDD   96(BP), Y4, Y4
+	VPADDD   128(BP), Y1, Y1
+	VPADDD   160(BP), Y2, Y2
+	VPADDD   192(BP), Y3, Y3
+	VMOVDQA  Y15, 224(BP)
 
 	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (4*8)(oup), oup
-	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
-	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
-	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPERM2I128 $0x13, Y0, Y14, Y14
+	VPERM2I128 $0x02, Y12, Y4, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y12
+	VPXOR      (SI), Y15, Y15
+	VPXOR      32(SI), Y0, Y0
+	VPXOR      64(SI), Y14, Y14
+	VPXOR      96(SI), Y12, Y12
+	VMOVDQU    Y15, (DI)
+	VMOVDQU    Y0, 32(DI)
+	VMOVDQU    Y14, 64(DI)
+	VMOVDQU    Y12, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
 
 	// and here
-	polyAdd(-2*8(oup))
-	polyMulAVX2
-	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
-	LEAQ       (32*16)(inp), inp
-	SUBQ       $(32*16), inl
-	CMPQ       inl, $512
+	ADDQ       -16(DI), R10
+	ADCQ       -8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	VPXOR      384(SI), Y0, Y0
+	VPXOR      416(SI), Y14, Y14
+	VPXOR      448(SI), Y12, Y12
+	VPXOR      480(SI), Y4, Y4
+	VMOVDQU    Y0, 384(DI)
+	VMOVDQU    Y14, 416(DI)
+	VMOVDQU    Y12, 448(DI)
+	VMOVDQU    Y4, 480(DI)
+	LEAQ       512(SI), SI
+	SUBQ       $0x00000200, BX
+	CMPQ       BX, $0x00000200
 	JG         sealAVX2MainLoop
 
 	// Tail can only hash 480 bytes
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ 32(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	ADDQ  16(DI), R10
+	ADCQ  24(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  32(DI), DI
+	MOVQ  $0x0000000a, CX
+	MOVQ  $0x00000000, R9
+	CMPQ  BX, $0x80
+	JBE   sealAVX2Tail128
+	CMPQ  BX, $0x00000100
+	JBE   sealAVX2Tail256
+	CMPQ  BX, $0x00000180
+	JBE   sealAVX2Tail384
+	JMP   sealAVX2Tail512
 
-	MOVQ $10, itr1
-	MOVQ $0, itr2
-	CMPQ inl, $128
-	JBE  sealAVX2Tail128
-	CMPQ inl, $256
-	JBE  sealAVX2Tail256
-	CMPQ inl, $384
-	JBE  sealAVX2Tail384
-	JMP  sealAVX2Tail512
-
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 193 bytes
 seal192AVX2:
-	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
-	VMOVDQA AA0, AA1
-	VMOVDQA BB0, BB1
-	VMOVDQA CC0, CC1
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2
-	VMOVDQA BB0, BB2
-	VMOVDQA CC0, CC2
-	VMOVDQA DD0, DD2
-	VMOVDQA DD1, TT3
-	MOVQ    $10, itr2
+	// Special optimization for buffers smaller than 193 bytes.
+	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, four blocks are processed.
+	// Second state (Y5, Y9, Y13, Y1) = first state (Y0, Y14, Y12, Y4) with ·avx2IncMask added to the last row.
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	// Y6, Y10, Y8, Y2 and Y15 keep copies of the initial rows; they are added back after the round loop.
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VMOVDQA Y4, Y2
+	VMOVDQA Y1, Y15
+	// R9 (formerly itr2) counts the 10 iterations of the loop that follows.
+	MOVQ    $0x0000000a, R9
 
 sealAVX2192InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ       itr2
+	// Expanded chachaQR_AVX2 on (Y0, Y14, Y12, Y4); Y3 is the scratch register for the shift-based rotates.
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	// Expanded chachaQR_AVX2 on (Y5, Y9, Y13, Y1); Y3 is scratch again.
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	// Rotate rows Y14/Y9, Y12/Y13 and Y4/Y1 by 4, 8 and 12 bytes within each 128-bit lane before the second pair of quarter rounds.
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	// Second chachaQR_AVX2 on (Y0, Y14, Y12, Y4).
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	// Second chachaQR_AVX2 on (Y5, Y9, Y13, Y1).
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	// Rotate the rows back (12, 8 and 4 bytes), undoing the rotation applied above.
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	// R9 (formerly itr2) is the iteration counter; loop until it reaches zero.
+	DECQ       R9
 	JNE        sealAVX2192InnerCipherLoop
-	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
-	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
-	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
-	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
+	// Add the saved initial rows (Y6, Y10, Y8, Y2 and Y15) back into both states.
+	VPADDD     Y6, Y0, Y0
+	VPADDD     Y6, Y5, Y5
+	VPADDD     Y10, Y14, Y14
+	VPADDD     Y10, Y9, Y9
+	VPADDD     Y8, Y12, Y12
+	VPADDD     Y8, Y13, Y13
+	VPADDD     Y2, Y4, Y4
+	VPADDD     Y15, Y1, Y1
+	// Y3 = low 128-bit lane of Y0 followed by low 128-bit lane of Y14 (used as the poly key below).
+	VPERM2I128 $0x02, Y0, Y14, Y3
 
 	// Clamp and store poly key
-	VPAND   ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA TT0, rsStoreAVX2
+	// (BP) is the slot formerly named rsStoreAVX2; the poly hashing code reads it as (BP) and 8(BP).
+	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA Y3, (BP)
 
 	// Stream for up to 192 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
+	// After these permutes the key stream sits in Y0, Y14, Y12, Y4, Y5, Y9 - the order sealAVX2ShortSealLoop shifts through.
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
 
 sealAVX2ShortSeal:
 	// Hash aad
-	MOVQ ad_len+80(FP), itr2
+	// R9 (formerly itr2) is loaded with ad_len before the call; polyHashADInternal is defined elsewhere
+	// and presumably takes the length in R9 - confirm against its definition.
+	MOVQ ad_len+80(FP), R9
 	CALL polyHashADInternal<>(SB)
-	XORQ itr1, itr1
+	// CX (formerly itr1) = 0: no encrypted bytes are waiting to be hashed yet.
+	XORQ CX, CX
 
 sealAVX2SealHash:
 	// itr1 holds the number of bytes encrypted but not yet hashed
-	CMPQ itr1, $16
-	JB   sealAVX2ShortSealLoop
-	polyAdd(0(oup))
-	polyMul
-	SUBQ $16, itr1
-	ADDQ $16, oup
-	JMP  sealAVX2SealHash
+	// itr1 is CX in the expanded code. While at least 16 bytes are pending, hash one 16-byte block at (DI).
+	CMPQ  CX, $0x10
+	JB    sealAVX2ShortSealLoop
+	// polyAdd(0(oup)): add the 16 bytes at (DI) into the accumulator R10:R11:R12, adding 1 (plus carry) into R12.
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	// polyMul: multiply the accumulator by the two 64-bit words at (BP) and 8(BP); the product is built in R13:R14:R15:R8.
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15 as the new accumulator, then add back
+	// (R15 & ~3):R8 and (R15:R8) >> 2, i.e. 5 times the bits above bit 130.
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	// 16 bytes consumed: decrease the pending count and advance the output pointer.
+	SUBQ  $0x10, CX
+	ADDQ  $0x10, DI
+	JMP   sealAVX2SealHash
 
 sealAVX2ShortSealLoop:
-	CMPQ inl, $32
+	// BX (formerly inl) is the number of input bytes left; handle 32 bytes per iteration while at least 32 remain.
+	CMPQ BX, $0x20
 	JB   sealAVX2ShortTail32
-	SUBQ $32, inl
+	SUBQ $0x20, BX
 
 	// Load for encryption
-	VPXOR   (inp), AA0, AA0
-	VMOVDQU AA0, (oup)
-	LEAQ    (1*32)(inp), inp
+	// XOR 32 input bytes at (SI) with the key stream in Y0, store the result at (DI), advance SI.
+	VPXOR   (SI), Y0, Y0
+	VMOVDQU Y0, (DI)
+	LEAQ    32(SI), SI
 
 	// Now can hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ (1*32)(oup), oup
+	// polyAdd(0*8(oup)): add the first 16 bytes just written at (DI) into the accumulator R10:R11:R12.
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	// polyMulAVX2: MULXQ-based multiply by the words at (BP) and 8(BP); the product is built in R13:R14:R15:R8.
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15, then add back (R15 & ~3):R8 and (R15:R8) >> 2.
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	// polyAdd(2*8(oup)) + polyMulAVX2: same sequence for the second 16 bytes at 16(DI).
+	ADDQ  16(DI), R10
+	ADCQ  24(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), DX
+	MOVQ  DX, R15
+	MULXQ R10, R13, R14
+	IMULQ R12, R15
+	MULXQ R11, AX, DX
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), DX
+	MULXQ R10, R10, AX
+	ADDQ  R10, R14
+	MULXQ R11, R11, R8
+	ADCQ  R11, R15
+	ADCQ  $0x00, R8
+	IMULQ R12, DX
+	ADDQ  AX, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	// Both 16-byte halves are hashed; advance the output pointer past them.
+	LEAQ  32(DI), DI
 
 	// Shift stream left
-	VMOVDQA BB0, AA0
-	VMOVDQA CC0, BB0
-	VMOVDQA DD0, CC0
-	VMOVDQA AA1, DD0
-	VMOVDQA BB1, AA1
-	VMOVDQA CC1, BB1
-	VMOVDQA DD1, CC1
-	VMOVDQA AA2, DD1
-	VMOVDQA BB2, AA2
+	// Key stream register order is Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10; each register takes the value of the next one.
+	VMOVDQA Y14, Y0
+	VMOVDQA Y12, Y14
+	VMOVDQA Y4, Y12
+	VMOVDQA Y5, Y4
+	VMOVDQA Y9, Y5
+	VMOVDQA Y13, Y9
+	VMOVDQA Y1, Y13
+	VMOVDQA Y6, Y1
+	VMOVDQA Y10, Y6
 	JMP     sealAVX2ShortSealLoop
 
 sealAVX2ShortTail32:
-	CMPQ    inl, $16
-	VMOVDQA A0, A1
+	// Fewer than 32 bytes remain in BX. X1 = X0 is copied between the compare and the branch;
+	// VMOVDQA does not modify flags, so JB still tests the CMPQ result.
+	CMPQ    BX, $0x10
+	VMOVDQA X0, X1
 	JB      sealAVX2ShortDone
-
-	SUBQ $16, inl
+	SUBQ    $0x10, BX
 
 	// Load for encryption
-	VPXOR   (inp), A0, T0
-	VMOVDQU T0, (oup)
-	LEAQ    (1*16)(inp), inp
+	// XOR 16 input bytes at (SI) with X0 into X12, store at (DI), advance SI.
+	VPXOR   (SI), X0, X12
+	VMOVDQU X12, (DI)
+	LEAQ    16(SI), SI
 
 	// Hash
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	LEAQ       (1*16)(oup), oup
-	VPERM2I128 $0x11, AA0, AA0, AA0
-	VMOVDQA    A0, A1
+	// polyAdd(0*8(oup)): add the 16 bytes just written at (DI) into the accumulator R10:R11:R12.
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	// polyMulAVX2: MULXQ-based multiply by the words at (BP) and 8(BP); the product is built in R13:R14:R15:R8.
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15, then add back (R15 & ~3):R8 and (R15:R8) >> 2.
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       16(DI), DI
+	// Copy the high 128-bit lane of Y0 into its low lane, then into X1, so X1 holds the next 16 key stream bytes.
+	// NOTE(review): sealSSETail is presumably the consumer of X1 - confirm against its definition.
+	VPERM2I128 $0x11, Y0, Y0, Y0
+	VMOVDQA    X0, X1
 
 sealAVX2ShortDone:
+	// End of the AVX2 short path: zero the upper halves of the YMM registers, then continue at sealSSETail (defined elsewhere).
 	VZEROUPPER
 	JMP sealSSETail
 
-// ----------------------------------------------------------------------------
-// Special optimization for buffers smaller than 321 bytes
 seal320AVX2:
-	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
-	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
-	MOVQ    $10, itr2
+	// Special optimization for buffers smaller than 321 bytes.
+	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, six blocks are processed.
+	// Second state (Y5, Y9, Y13, Y1): copy of (Y0, Y14, Y12, Y4) with ·avx2IncMask added to the last row.
+	VMOVDQA Y0, Y5
+	VMOVDQA Y14, Y9
+	VMOVDQA Y12, Y13
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	// Third state (Y6, Y10, Y8, Y2): last row is Y1 plus ·avx2IncMask.
+	VMOVDQA Y0, Y6
+	VMOVDQA Y14, Y10
+	VMOVDQA Y12, Y8
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	// Y7, Y11 and Y15 keep copies of the initial Y14, Y12 and Y4 rows; they are added back after the round loop.
+	VMOVDQA Y14, Y7
+	VMOVDQA Y12, Y11
+	VMOVDQA Y4, Y15
+	// R9 (formerly itr2) counts the 10 iterations of the loop that follows.
+	MOVQ    $0x0000000a, R9
 
 sealAVX2320InnerCipherLoop:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr2
+	// Expanded chachaQR_AVX2 on (Y0, Y14, Y12, Y4); Y3 is the scratch register for the shift-based rotates.
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	// Expanded chachaQR_AVX2 on (Y5, Y9, Y13, Y1).
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	// Expanded chachaQR_AVX2 on (Y6, Y10, Y8, Y2).
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	// Rotate the second, third and fourth rows of all three states by 4, 8 and 12 bytes within each 128-bit lane.
+	VPALIGNR $0x04, Y14, Y14, Y14
+	VPALIGNR $0x04, Y9, Y9, Y9
+	VPALIGNR $0x04, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x0c, Y4, Y4, Y4
+	VPALIGNR $0x0c, Y1, Y1, Y1
+	VPALIGNR $0x0c, Y2, Y2, Y2
+	// Second chachaQR_AVX2 on (Y0, Y14, Y12, Y4).
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x0c, Y14, Y3
+	VPSRLD   $0x14, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	VPADDD   Y14, Y0, Y0
+	VPXOR    Y0, Y4, Y4
+	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
+	VPADDD   Y4, Y12, Y12
+	VPXOR    Y12, Y14, Y14
+	VPSLLD   $0x07, Y14, Y3
+	VPSRLD   $0x19, Y14, Y14
+	VPXOR    Y3, Y14, Y14
+	// Second chachaQR_AVX2 on (Y5, Y9, Y13, Y1).
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x0c, Y9, Y3
+	VPSRLD   $0x14, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	VPADDD   Y9, Y5, Y5
+	VPXOR    Y5, Y1, Y1
+	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
+	VPADDD   Y1, Y13, Y13
+	VPXOR    Y13, Y9, Y9
+	VPSLLD   $0x07, Y9, Y3
+	VPSRLD   $0x19, Y9, Y9
+	VPXOR    Y3, Y9, Y9
+	// Second chachaQR_AVX2 on (Y6, Y10, Y8, Y2).
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x0c, Y10, Y3
+	VPSRLD   $0x14, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	VPADDD   Y10, Y6, Y6
+	VPXOR    Y6, Y2, Y2
+	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
+	VPADDD   Y2, Y8, Y8
+	VPXOR    Y8, Y10, Y10
+	VPSLLD   $0x07, Y10, Y3
+	VPSRLD   $0x19, Y10, Y10
+	VPXOR    Y3, Y10, Y10
+	// Rotate the rows back (12, 8 and 4 bytes), undoing the rotation applied above.
+	VPALIGNR $0x0c, Y14, Y14, Y14
+	VPALIGNR $0x0c, Y9, Y9, Y9
+	VPALIGNR $0x0c, Y10, Y10, Y10
+	VPALIGNR $0x08, Y12, Y12, Y12
+	VPALIGNR $0x08, Y13, Y13, Y13
+	VPALIGNR $0x08, Y8, Y8, Y8
+	VPALIGNR $0x04, Y4, Y4, Y4
+	VPALIGNR $0x04, Y1, Y1, Y1
+	VPALIGNR $0x04, Y2, Y2, Y2
+	// R9 (formerly itr2) is the iteration counter; loop until it reaches zero.
+	DECQ     R9
 	JNE      sealAVX2320InnerCipherLoop
-
-	VMOVDQA ·chacha20Constants<>(SB), TT0
-	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
-	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
-	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
-	VMOVDQA ·avx2IncMask<>(SB), TT0
-	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
-	VPADDD  TT3, DD2, DD2
+	// Add the initial state back: ·chacha20Constants to the first rows, the saved Y7 and Y11 to the second and third rows.
+	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
+	VPADDD   Y3, Y0, Y0
+	VPADDD   Y3, Y5, Y5
+	VPADDD   Y3, Y6, Y6
+	VPADDD   Y7, Y14, Y14
+	VPADDD   Y7, Y9, Y9
+	VPADDD   Y7, Y10, Y10
+	VPADDD   Y11, Y12, Y12
+	VPADDD   Y11, Y13, Y13
+	VPADDD   Y11, Y8, Y8
+	// Last rows: Y15 holds the initial Y4 row and is stepped by ·avx2IncMask to rebuild the rows of the second and third state.
+	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
+	VPADDD   Y15, Y4, Y4
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y1, Y1
+	VPADDD   Y3, Y15, Y15
+	VPADDD   Y15, Y2, Y2
 
 	// Clamp and store poly key
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPAND      ·polyClampMask<>(SB), TT0, TT0
-	VMOVDQA    TT0, rsStoreAVX2
+	// Y3 = low 128-bit lanes of Y0 and Y14; (BP) is the slot formerly named rsStoreAVX2.
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
+	VMOVDQA    Y3, (BP)
 
 	// Stream for up to 320 bytes
-	VPERM2I128 $0x13, AA0, BB0, AA0
-	VPERM2I128 $0x13, CC0, DD0, BB0
-	VPERM2I128 $0x02, AA1, BB1, CC0
-	VPERM2I128 $0x02, CC1, DD1, DD0
-	VPERM2I128 $0x13, AA1, BB1, AA1
-	VPERM2I128 $0x13, CC1, DD1, BB1
-	VPERM2I128 $0x02, AA2, BB2, CC1
-	VPERM2I128 $0x02, CC2, DD2, DD1
-	VPERM2I128 $0x13, AA2, BB2, AA2
-	VPERM2I128 $0x13, CC2, DD2, BB2
+	// After these permutes the key stream sits in Y0, Y14, Y12, Y4, Y5, Y9, Y13, Y1, Y6, Y10 -
+	// the order sealAVX2ShortSealLoop shifts through.
+	VPERM2I128 $0x13, Y0, Y14, Y0
+	VPERM2I128 $0x13, Y12, Y4, Y14
+	VPERM2I128 $0x02, Y5, Y9, Y12
+	VPERM2I128 $0x02, Y13, Y1, Y4
+	VPERM2I128 $0x13, Y5, Y9, Y5
+	VPERM2I128 $0x13, Y13, Y1, Y9
+	VPERM2I128 $0x02, Y6, Y10, Y13
+	VPERM2I128 $0x02, Y8, Y2, Y1
+	VPERM2I128 $0x13, Y6, Y10, Y6
+	VPERM2I128 $0x13, Y8, Y2, Y10
 	JMP        sealAVX2ShortSeal
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 128 bytes of ciphertext
 sealAVX2Tail128:
-	// Need to decrypt up to 128 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0
-	VMOVDQA state1StoreAVX2, BB0
-	VMOVDQA state2StoreAVX2, CC0
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VMOVDQA DD0, DD1
+	// Special optimization for the last 128 bytes of ciphertext: one YMM state set (two blocks) is prepared.
+	// Previously encrypted output is still waiting to be hashed on entry (the pre-expansion source gave
+	// 512 bytes when arriving after the main loop and 448 bytes when arriving before it).
+	// State rows come from ·chacha20Constants, 32(BP) (state1StoreAVX2), 64(BP) (state2StoreAVX2) and
+	// 192(BP) (ctr3StoreAVX2) plus ·avx2IncMask.
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA 32(BP), Y14
+	VMOVDQA 64(BP), Y12
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	// Y1 keeps a copy of the initial last row; it is added back after the round loop.
+	VMOVDQA Y4, Y1
 
 sealAVX2Tail128LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	// Hash one extra 16-byte block at (DI) before falling through into sealAVX2Tail128LoopB.
+	// polyAdd(0(oup)): add the 16 bytes at (DI) into the accumulator R10:R11:R12, adding 1 (plus carry) into R12.
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	// polyMul: multiply the accumulator by the words at (BP) and 8(BP); the product is built in R13:R14:R15:R8.
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15, then add back (R15 & ~3):R8 and (R15:R8) >> 2.
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail128LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $12, DD0, DD0, DD0
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0
-	VPALIGNR $8, CC0, CC0, CC0
-	VPALIGNR $4, DD0, DD0, DD0
-	DECQ     itr1
-	JG       sealAVX2Tail128LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail128LoopB
-
-	VPADDD ·chacha20Constants<>(SB), AA0, AA1
-	VPADDD state1StoreAVX2, BB0, BB1
-	VPADDD state2StoreAVX2, CC0, CC1
-	VPADDD DD1, DD0, DD1
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
+	// One iteration = two quarter-round passes on (Y0, Y14, Y12, Y4) interleaved with hashing 32 bytes at (DI).
+	// Expanded chachaQR_AVX2 on (Y0, Y14, Y12, Y4); Y3 is scratch.
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	// polyAdd(0(oup)) + polyMul: hash the 16 bytes at (DI) into the accumulator R10:R11:R12 using the words at (BP) and 8(BP).
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15, then add back (R15 & ~3):R8 and (R15:R8) >> 2.
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	// Rotate rows Y14, Y12, Y4 by 4, 8 and 12 bytes within each 128-bit lane before the second quarter-round pass.
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	// Second chachaQR_AVX2 on (Y0, Y14, Y12, Y4).
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	// polyAdd(16(oup)) + polyMul: hash the next 16 bytes at 16(DI).
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	// Rotate the rows back (12, 8 and 4 bytes), undoing the rotation applied above.
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	// While CX (formerly itr1) stays positive, repeat via LoopA (48 bytes hashed per pass);
+	// afterwards R9 (formerly itr2) counts further passes that enter LoopB directly (32 bytes hashed per pass).
+	DECQ       CX
+	JG         sealAVX2Tail128LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail128LoopB
+	// Add the initial state back: ·chacha20Constants, 32(BP), 64(BP) and the last row saved in Y1.
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y5
+	VPADDD     32(BP), Y14, Y9
+	VPADDD     64(BP), Y12, Y13
+	VPADDD     Y1, Y4, Y1
+	// Arrange the key stream in Y0, Y14, Y12, Y4 - the order sealAVX2ShortSealLoop consumes.
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
 	JMP        sealAVX2ShortSealLoop
 
-// ----------------------------------------------------------------------------
-// Special optimization for the last 256 bytes of ciphertext
 sealAVX2Tail256:
-	// Need to decrypt up to 256 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
-	VMOVDQA DD0, TT1
-	VMOVDQA DD1, TT2
+	// Special optimization for the last 256 bytes of ciphertext: two YMM state sets,
+	// (Y0, Y14, Y12, Y4) and (Y5, Y9, Y13, Y1), are prepared.
+	// Previously encrypted output is still waiting to be hashed on entry (the pre-expansion source gave
+	// 512 bytes when arriving after the main loop and 448 bytes when arriving before it).
+	// State rows come from ·chacha20Constants, 32(BP) (state1StoreAVX2) and 64(BP) (state2StoreAVX2).
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA ·chacha20Constants<>+0(SB), Y5
+	VMOVDQA 32(BP), Y14
+	VMOVDQA 32(BP), Y9
+	VMOVDQA 64(BP), Y12
+	VMOVDQA 64(BP), Y13
+	// Last rows: 192(BP) (ctr3StoreAVX2) plus ·avx2IncMask once for Y4 and twice for Y1.
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	// Y7 and Y11 (formerly TT1 and TT2) keep copies of the two initial last rows.
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
 
 sealAVX2Tail256LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	// Hash one extra 16-byte block at (DI) before falling through into sealAVX2Tail256LoopB.
+	// polyAdd(0(oup)): add the 16 bytes at (DI) into the accumulator R10:R11:R12, adding 1 (plus carry) into R12.
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	// polyMul: multiply the accumulator by the words at (BP) and 8(BP); the product is built in R13:R14:R15:R8.
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	// Reduction: keep R13, R14 and the low 2 bits of R15, then add back (R15 & ~3):R8 and (R15:R8) >> 2.
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail256LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
-	DECQ     itr1
-	JG       sealAVX2Tail256LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail256LoopB
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	DECQ       CX
+	JG         sealAVX2Tail256LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail256LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	MOVQ       $0x00000080, CX
+	LEAQ       128(SI), SI
+	SUBQ       $0x80, BX
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	JMP        sealAVX2SealHash
 
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	MOVQ       $128, itr1
-	LEAQ       128(inp), inp
-	SUBQ       $128, inl
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 384 bytes of ciphertext
 sealAVX2Tail384:
-	// Need to decrypt up to 384 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
-	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VMOVDQA Y4, Y7
+	VMOVDQA Y1, Y11
+	VMOVDQA Y2, Y15
 
 sealAVX2Tail384LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail384LoopB:
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(0(oup))
-	polyMul
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
-	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
-	polyAdd(16(oup))
-	polyMul
-	LEAQ     32(oup), oup
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
-	DECQ     itr1
-	JG       sealAVX2Tail384LoopA
-	DECQ     itr2
-	JGE      sealAVX2Tail384LoopB
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y3
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y3
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x0c, Y14, Y3
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y14, Y0, Y0
+	VPXOR      Y0, Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPADDD     Y4, Y12, Y12
+	VPXOR      Y12, Y14, Y14
+	VPSLLD     $0x07, Y14, Y3
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y3, Y14, Y14
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x0c, Y9, Y3
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y9, Y5, Y5
+	VPXOR      Y5, Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPADDD     Y1, Y13, Y13
+	VPXOR      Y13, Y9, Y9
+	VPSLLD     $0x07, Y9, Y3
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y3, Y9, Y9
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x0c, Y10, Y3
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	VPADDD     Y10, Y6, Y6
+	VPXOR      Y6, Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPADDD     Y2, Y8, Y8
+	VPXOR      Y8, Y10, Y10
+	VPSLLD     $0x07, Y10, Y3
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y3, Y10, Y10
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), AX
+	MOVQ       AX, R15
+	MULQ       R10
+	MOVQ       AX, R13
+	MOVQ       DX, R14
+	MOVQ       (BP), AX
+	MULQ       R11
+	IMULQ      R12, R15
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), AX
+	MOVQ       AX, R8
+	MULQ       R10
+	ADDQ       AX, R14
+	ADCQ       $0x00, DX
+	MOVQ       DX, R10
+	MOVQ       8(BP), AX
+	MULQ       R11
+	ADDQ       AX, R15
+	ADCQ       $0x00, DX
+	IMULQ      R12, R8
+	ADDQ       R10, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	DECQ       CX
+	JG         sealAVX2Tail384LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail384LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     Y7, Y4, Y4
+	VPADDD     Y11, Y1, Y1
+	VPADDD     Y15, Y2, Y2
+	VPERM2I128 $0x02, Y0, Y14, Y3
+	VPERM2I128 $0x02, Y12, Y4, Y7
+	VPERM2I128 $0x13, Y0, Y14, Y11
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      (SI), Y3, Y3
+	VPXOR      32(SI), Y7, Y7
+	VPXOR      64(SI), Y11, Y11
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y3, (DI)
+	VMOVDQU    Y7, 32(DI)
+	VMOVDQU    Y11, 64(DI)
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y3
+	VPERM2I128 $0x02, Y13, Y1, Y7
+	VPERM2I128 $0x13, Y5, Y9, Y11
+	VPERM2I128 $0x13, Y13, Y1, Y15
+	VPXOR      128(SI), Y3, Y3
+	VPXOR      160(SI), Y7, Y7
+	VPXOR      192(SI), Y11, Y11
+	VPXOR      224(SI), Y15, Y15
+	VMOVDQU    Y3, 128(DI)
+	VMOVDQU    Y7, 160(DI)
+	VMOVDQU    Y11, 192(DI)
+	VMOVDQU    Y15, 224(DI)
+	MOVQ       $0x00000100, CX
+	LEAQ       256(SI), SI
+	SUBQ       $0x00000100, BX
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	JMP        sealAVX2SealHash
 
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
-	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
-	VPERM2I128 $0x02, AA0, BB0, TT0
-	VPERM2I128 $0x02, CC0, DD0, TT1
-	VPERM2I128 $0x13, AA0, BB0, TT2
-	VPERM2I128 $0x13, CC0, DD0, TT3
-	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
-	VPERM2I128 $0x02, AA1, BB1, TT0
-	VPERM2I128 $0x02, CC1, DD1, TT1
-	VPERM2I128 $0x13, AA1, BB1, TT2
-	VPERM2I128 $0x13, CC1, DD1, TT3
-	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
-	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
-	MOVQ       $256, itr1
-	LEAQ       256(inp), inp
-	SUBQ       $256, inl
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-
-	JMP sealAVX2SealHash
-
-// ----------------------------------------------------------------------------
-// Special optimization for the last 512 bytes of ciphertext
 sealAVX2Tail512:
-	// Need to decrypt up to 512 bytes - prepare two blocks
-	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
-	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
-	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
-	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
-	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
-	VMOVDQA ctr3StoreAVX2, DD0
-	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
-	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+	VMOVDQA ·chacha20Constants<>+0(SB), Y0
+	VMOVDQA Y0, Y5
+	VMOVDQA Y0, Y6
+	VMOVDQA Y0, Y7
+	VMOVDQA 32(BP), Y14
+	VMOVDQA Y14, Y9
+	VMOVDQA Y14, Y10
+	VMOVDQA Y14, Y11
+	VMOVDQA 64(BP), Y12
+	VMOVDQA Y12, Y13
+	VMOVDQA Y12, Y8
+	VMOVDQA Y12, Y15
+	VMOVDQA 192(BP), Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
+	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
+	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
+	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
+	VMOVDQA Y4, 96(BP)
+	VMOVDQA Y1, 128(BP)
+	VMOVDQA Y2, 160(BP)
+	VMOVDQA Y3, 192(BP)
 
 sealAVX2Tail512LoopA:
-	polyAdd(0(oup))
-	polyMul
-	LEAQ 16(oup), oup
+	ADDQ  (DI), R10
+	ADCQ  8(DI), R11
+	ADCQ  $0x01, R12
+	MOVQ  (BP), AX
+	MOVQ  AX, R15
+	MULQ  R10
+	MOVQ  AX, R13
+	MOVQ  DX, R14
+	MOVQ  (BP), AX
+	MULQ  R11
+	IMULQ R12, R15
+	ADDQ  AX, R14
+	ADCQ  DX, R15
+	MOVQ  8(BP), AX
+	MOVQ  AX, R8
+	MULQ  R10
+	ADDQ  AX, R14
+	ADCQ  $0x00, DX
+	MOVQ  DX, R10
+	MOVQ  8(BP), AX
+	MULQ  R11
+	ADDQ  AX, R15
+	ADCQ  $0x00, DX
+	IMULQ R12, R8
+	ADDQ  R10, R15
+	ADCQ  DX, R8
+	MOVQ  R13, R10
+	MOVQ  R14, R11
+	MOVQ  R15, R12
+	ANDQ  $0x03, R12
+	MOVQ  R15, R13
+	ANDQ  $-4, R13
+	MOVQ  R8, R14
+	SHRQ  $0x02, R8, R15
+	SHRQ  $0x02, R8
+	ADDQ  R13, R10
+	ADCQ  R14, R11
+	ADCQ  $0x00, R12
+	ADDQ  R15, R10
+	ADCQ  R8, R11
+	ADCQ  $0x00, R12
+	LEAQ  16(DI), DI
 
 sealAVX2Tail512LoopB:
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	polyAdd(0*8(oup))
-	polyMulAVX2
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	polyAdd(2*8(oup))
-	polyMulAVX2
-	LEAQ     (4*8)(oup), oup
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
-	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
-	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
-	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
-	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
-	VMOVDQA  CC3, tmpStoreAVX2
-	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
-	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
-	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
-	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
-	VMOVDQA  tmpStoreAVX2, CC3
-	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
-	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
-	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
-
-	DECQ itr1
-	JG   sealAVX2Tail512LoopA
-	DECQ itr2
-	JGE  sealAVX2Tail512LoopB
-
-	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
-	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
-	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
-	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
-	VMOVDQA    CC3, tmpStoreAVX2
-	VPERM2I128 $0x02, AA0, BB0, CC3
-	VPXOR      (0*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (0*32)(oup)
-	VPERM2I128 $0x02, CC0, DD0, CC3
-	VPXOR      (1*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (1*32)(oup)
-	VPERM2I128 $0x13, AA0, BB0, CC3
-	VPXOR      (2*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (2*32)(oup)
-	VPERM2I128 $0x13, CC0, DD0, CC3
-	VPXOR      (3*32)(inp), CC3, CC3
-	VMOVDQU    CC3, (3*32)(oup)
-
-	VPERM2I128 $0x02, AA1, BB1, AA0
-	VPERM2I128 $0x02, CC1, DD1, BB0
-	VPERM2I128 $0x13, AA1, BB1, CC0
-	VPERM2I128 $0x13, CC1, DD1, DD0
-	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
-
-	VPERM2I128 $0x02, AA2, BB2, AA0
-	VPERM2I128 $0x02, CC2, DD2, BB0
-	VPERM2I128 $0x13, AA2, BB2, CC0
-	VPERM2I128 $0x13, CC2, DD2, DD0
-	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
-	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
-
-	MOVQ       $384, itr1
-	LEAQ       384(inp), inp
-	SUBQ       $384, inl
-	VPERM2I128 $0x02, AA3, BB3, AA0
-	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
-	VPERM2I128 $0x13, AA3, BB3, CC0
-	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
-
-	JMP sealAVX2SealHash
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x0c, Y11, Y15
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	ADDQ       (DI), R10
+	ADCQ       8(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x07, Y11, Y15
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPALIGNR   $0x04, Y14, Y14, Y14
+	VPALIGNR   $0x04, Y9, Y9, Y9
+	VPALIGNR   $0x04, Y10, Y10, Y10
+	VPALIGNR   $0x04, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x0c, Y4, Y4, Y4
+	VPALIGNR   $0x0c, Y1, Y1, Y1
+	VPALIGNR   $0x0c, Y2, Y2, Y2
+	VPALIGNR   $0x0c, Y3, Y3, Y3
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	ADDQ       16(DI), R10
+	ADCQ       24(DI), R11
+	ADCQ       $0x01, R12
+	MOVQ       (BP), DX
+	MOVQ       DX, R15
+	MULXQ      R10, R13, R14
+	IMULQ      R12, R15
+	MULXQ      R11, AX, DX
+	ADDQ       AX, R14
+	ADCQ       DX, R15
+	MOVQ       8(BP), DX
+	MULXQ      R10, R10, AX
+	ADDQ       R10, R14
+	MULXQ      R11, R11, R8
+	ADCQ       R11, R15
+	ADCQ       $0x00, R8
+	IMULQ      R12, DX
+	ADDQ       AX, R15
+	ADCQ       DX, R8
+	MOVQ       R13, R10
+	MOVQ       R14, R11
+	MOVQ       R15, R12
+	ANDQ       $0x03, R12
+	MOVQ       R15, R13
+	ANDQ       $-4, R13
+	MOVQ       R8, R14
+	SHRQ       $0x02, R8, R15
+	SHRQ       $0x02, R8
+	ADDQ       R13, R10
+	ADCQ       R14, R11
+	ADCQ       $0x00, R12
+	ADDQ       R15, R10
+	ADCQ       R8, R11
+	ADCQ       $0x00, R12
+	LEAQ       32(DI), DI
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x0c, Y14, Y15
+	VPSRLD     $0x14, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x0c, Y9, Y15
+	VPSRLD     $0x14, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x0c, Y10, Y15
+	VPSRLD     $0x14, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x0c, Y11, Y15
+	VPSRLD     $0x14, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPADDD     Y14, Y0, Y0
+	VPADDD     Y9, Y5, Y5
+	VPADDD     Y10, Y6, Y6
+	VPADDD     Y11, Y7, Y7
+	VPXOR      Y0, Y4, Y4
+	VPXOR      Y5, Y1, Y1
+	VPXOR      Y6, Y2, Y2
+	VPXOR      Y7, Y3, Y3
+	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
+	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
+	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
+	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
+	VPADDD     Y4, Y12, Y12
+	VPADDD     Y1, Y13, Y13
+	VPADDD     Y2, Y8, Y8
+	VPADDD     Y3, Y15, Y15
+	VPXOR      Y12, Y14, Y14
+	VPXOR      Y13, Y9, Y9
+	VPXOR      Y8, Y10, Y10
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    Y15, 224(BP)
+	VPSLLD     $0x07, Y14, Y15
+	VPSRLD     $0x19, Y14, Y14
+	VPXOR      Y15, Y14, Y14
+	VPSLLD     $0x07, Y9, Y15
+	VPSRLD     $0x19, Y9, Y9
+	VPXOR      Y15, Y9, Y9
+	VPSLLD     $0x07, Y10, Y15
+	VPSRLD     $0x19, Y10, Y10
+	VPXOR      Y15, Y10, Y10
+	VPSLLD     $0x07, Y11, Y15
+	VPSRLD     $0x19, Y11, Y11
+	VPXOR      Y15, Y11, Y11
+	VMOVDQA    224(BP), Y15
+	VPALIGNR   $0x0c, Y14, Y14, Y14
+	VPALIGNR   $0x0c, Y9, Y9, Y9
+	VPALIGNR   $0x0c, Y10, Y10, Y10
+	VPALIGNR   $0x0c, Y11, Y11, Y11
+	VPALIGNR   $0x08, Y12, Y12, Y12
+	VPALIGNR   $0x08, Y13, Y13, Y13
+	VPALIGNR   $0x08, Y8, Y8, Y8
+	VPALIGNR   $0x08, Y15, Y15, Y15
+	VPALIGNR   $0x04, Y4, Y4, Y4
+	VPALIGNR   $0x04, Y1, Y1, Y1
+	VPALIGNR   $0x04, Y2, Y2, Y2
+	VPALIGNR   $0x04, Y3, Y3, Y3
+	DECQ       CX
+	JG         sealAVX2Tail512LoopA
+	DECQ       R9
+	JGE        sealAVX2Tail512LoopB
+	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD     32(BP), Y14, Y14
+	VPADDD     32(BP), Y9, Y9
+	VPADDD     32(BP), Y10, Y10
+	VPADDD     32(BP), Y11, Y11
+	VPADDD     64(BP), Y12, Y12
+	VPADDD     64(BP), Y13, Y13
+	VPADDD     64(BP), Y8, Y8
+	VPADDD     64(BP), Y15, Y15
+	VPADDD     96(BP), Y4, Y4
+	VPADDD     128(BP), Y1, Y1
+	VPADDD     160(BP), Y2, Y2
+	VPADDD     192(BP), Y3, Y3
+	VMOVDQA    Y15, 224(BP)
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPXOR      (SI), Y15, Y15
+	VMOVDQU    Y15, (DI)
+	VPERM2I128 $0x02, Y12, Y4, Y15
+	VPXOR      32(SI), Y15, Y15
+	VMOVDQU    Y15, 32(DI)
+	VPERM2I128 $0x13, Y0, Y14, Y15
+	VPXOR      64(SI), Y15, Y15
+	VMOVDQU    Y15, 64(DI)
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR      96(SI), Y15, Y15
+	VMOVDQU    Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR      128(SI), Y0, Y0
+	VPXOR      160(SI), Y14, Y14
+	VPXOR      192(SI), Y12, Y12
+	VPXOR      224(SI), Y4, Y4
+	VMOVDQU    Y0, 128(DI)
+	VMOVDQU    Y14, 160(DI)
+	VMOVDQU    Y12, 192(DI)
+	VMOVDQU    Y4, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR      256(SI), Y0, Y0
+	VPXOR      288(SI), Y14, Y14
+	VPXOR      320(SI), Y12, Y12
+	VPXOR      352(SI), Y4, Y4
+	VMOVDQU    Y0, 256(DI)
+	VMOVDQU    Y14, 288(DI)
+	VMOVDQU    Y12, 320(DI)
+	VMOVDQU    Y4, 352(DI)
+	MOVQ       $0x00000180, CX
+	LEAQ       384(SI), SI
+	SUBQ       $0x00000180, BX
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	JMP        sealAVX2SealHash
diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519.go b/vendor/golang.org/x/crypto/curve25519/curve25519.go
index 00f963ea..21ca3b2e 100644
--- a/vendor/golang.org/x/crypto/curve25519/curve25519.go
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519.go
@@ -6,9 +6,11 @@
 // performs scalar multiplication on the elliptic curve known as Curve25519.
 // See RFC 7748.
 //
-// Starting in Go 1.20, this package is a wrapper for the X25519 implementation
+// This package is a wrapper for the X25519 implementation
 // in the crypto/ecdh package.
-package curve25519 // import "golang.org/x/crypto/curve25519"
+package curve25519
+
+import "crypto/ecdh"
 
 // ScalarMult sets dst to the product scalar * point.
 //
@@ -16,7 +18,13 @@ package curve25519 // import "golang.org/x/crypto/curve25519"
 // zeroes, irrespective of the scalar. Instead, use the X25519 function, which
 // will return an error.
 func ScalarMult(dst, scalar, point *[32]byte) {
-	scalarMult(dst, scalar, point)
+	if _, err := x25519(dst, scalar[:], point[:]); err != nil {
+		// The only error condition for x25519 when the inputs are 32 bytes long
+		// is if the output would have been the all-zero value.
+		for i := range dst {
+			dst[i] = 0
+		}
+	}
 }
 
 // ScalarBaseMult sets dst to the product scalar * base where base is the
@@ -25,7 +33,12 @@ func ScalarMult(dst, scalar, point *[32]byte) {
 // It is recommended to use the X25519 function with Basepoint instead, as
 // copying into fixed size arrays can lead to unexpected bugs.
 func ScalarBaseMult(dst, scalar *[32]byte) {
-	scalarBaseMult(dst, scalar)
+	curve := ecdh.X25519()
+	priv, err := curve.NewPrivateKey(scalar[:])
+	if err != nil {
+		panic("curve25519: internal error: scalarBaseMult was not 32 bytes")
+	}
+	copy(dst[:], priv.PublicKey().Bytes())
 }
 
 const (
@@ -57,3 +70,21 @@ func X25519(scalar, point []byte) ([]byte, error) {
 	var dst [32]byte
 	return x25519(&dst, scalar, point)
 }
+
+func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) {
+	curve := ecdh.X25519()
+	pub, err := curve.NewPublicKey(point)
+	if err != nil {
+		return nil, err
+	}
+	priv, err := curve.NewPrivateKey(scalar)
+	if err != nil {
+		return nil, err
+	}
+	out, err := priv.ECDH(pub)
+	if err != nil {
+		return nil, err
+	}
+	copy(dst[:], out)
+	return dst[:], nil
+}
diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go b/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go
deleted file mode 100644
index ba647e8d..00000000
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_compat.go
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !go1.20
-
-package curve25519
-
-import (
-	"crypto/subtle"
-	"errors"
-	"strconv"
-
-	"golang.org/x/crypto/curve25519/internal/field"
-)
-
-func scalarMult(dst, scalar, point *[32]byte) {
-	var e [32]byte
-
-	copy(e[:], scalar[:])
-	e[0] &= 248
-	e[31] &= 127
-	e[31] |= 64
-
-	var x1, x2, z2, x3, z3, tmp0, tmp1 field.Element
-	x1.SetBytes(point[:])
-	x2.One()
-	x3.Set(&x1)
-	z3.One()
-
-	swap := 0
-	for pos := 254; pos >= 0; pos-- {
-		b := e[pos/8] >> uint(pos&7)
-		b &= 1
-		swap ^= int(b)
-		x2.Swap(&x3, swap)
-		z2.Swap(&z3, swap)
-		swap = int(b)
-
-		tmp0.Subtract(&x3, &z3)
-		tmp1.Subtract(&x2, &z2)
-		x2.Add(&x2, &z2)
-		z2.Add(&x3, &z3)
-		z3.Multiply(&tmp0, &x2)
-		z2.Multiply(&z2, &tmp1)
-		tmp0.Square(&tmp1)
-		tmp1.Square(&x2)
-		x3.Add(&z3, &z2)
-		z2.Subtract(&z3, &z2)
-		x2.Multiply(&tmp1, &tmp0)
-		tmp1.Subtract(&tmp1, &tmp0)
-		z2.Square(&z2)
-
-		z3.Mult32(&tmp1, 121666)
-		x3.Square(&x3)
-		tmp0.Add(&tmp0, &z3)
-		z3.Multiply(&x1, &z2)
-		z2.Multiply(&tmp1, &tmp0)
-	}
-
-	x2.Swap(&x3, swap)
-	z2.Swap(&z3, swap)
-
-	z2.Invert(&z2)
-	x2.Multiply(&x2, &z2)
-	copy(dst[:], x2.Bytes())
-}
-
-func scalarBaseMult(dst, scalar *[32]byte) {
-	checkBasepoint()
-	scalarMult(dst, scalar, &basePoint)
-}
-
-func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) {
-	var in [32]byte
-	if l := len(scalar); l != 32 {
-		return nil, errors.New("bad scalar length: " + strconv.Itoa(l) + ", expected 32")
-	}
-	if l := len(point); l != 32 {
-		return nil, errors.New("bad point length: " + strconv.Itoa(l) + ", expected 32")
-	}
-	copy(in[:], scalar)
-	if &point[0] == &Basepoint[0] {
-		scalarBaseMult(dst, &in)
-	} else {
-		var base, zero [32]byte
-		copy(base[:], point)
-		scalarMult(dst, &in, &base)
-		if subtle.ConstantTimeCompare(dst[:], zero[:]) == 1 {
-			return nil, errors.New("bad input point: low order point")
-		}
-	}
-	return dst[:], nil
-}
-
-func checkBasepoint() {
-	if subtle.ConstantTimeCompare(Basepoint, []byte{
-		0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	}) != 1 {
-		panic("curve25519: global Basepoint value was modified")
-	}
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go b/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go
deleted file mode 100644
index 627df497..00000000
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_go120.go
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright 2022 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build go1.20
-
-package curve25519
-
-import "crypto/ecdh"
-
-func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) {
-	curve := ecdh.X25519()
-	pub, err := curve.NewPublicKey(point)
-	if err != nil {
-		return nil, err
-	}
-	priv, err := curve.NewPrivateKey(scalar)
-	if err != nil {
-		return nil, err
-	}
-	out, err := priv.ECDH(pub)
-	if err != nil {
-		return nil, err
-	}
-	copy(dst[:], out)
-	return dst[:], nil
-}
-
-func scalarMult(dst, scalar, point *[32]byte) {
-	if _, err := x25519(dst, scalar[:], point[:]); err != nil {
-		// The only error condition for x25519 when the inputs are 32 bytes long
-		// is if the output would have been the all-zero value.
-		for i := range dst {
-			dst[i] = 0
-		}
-	}
-}
-
-func scalarBaseMult(dst, scalar *[32]byte) {
-	curve := ecdh.X25519()
-	priv, err := curve.NewPrivateKey(scalar[:])
-	if err != nil {
-		panic("curve25519: internal error: scalarBaseMult was not 32 bytes")
-	}
-	copy(dst[:], priv.PublicKey().Bytes())
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/README b/vendor/golang.org/x/crypto/curve25519/internal/field/README
deleted file mode 100644
index e25bca7d..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/README
+++ /dev/null
@@ -1,7 +0,0 @@
-This package is kept in sync with crypto/ed25519/internal/edwards25519/field in
-the standard library.
-
-If there are any changes in the standard library that need to be synced to this
-package, run sync.sh. It will not overwrite any local changes made since the
-previous sync, so it's ok to land changes in this package first, and then sync
-to the standard library later.
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go
deleted file mode 100644
index ca841ad9..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe.go
+++ /dev/null
@@ -1,416 +0,0 @@
-// Copyright (c) 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package field implements fast arithmetic modulo 2^255-19.
-package field
-
-import (
-	"crypto/subtle"
-	"encoding/binary"
-	"math/bits"
-)
-
-// Element represents an element of the field GF(2^255-19). Note that this
-// is not a cryptographically secure group, and should only be used to interact
-// with edwards25519.Point coordinates.
-//
-// This type works similarly to math/big.Int, and all arguments and receivers
-// are allowed to alias.
-//
-// The zero value is a valid zero element.
-type Element struct {
-	// An element t represents the integer
-	//     t.l0 + t.l1*2^51 + t.l2*2^102 + t.l3*2^153 + t.l4*2^204
-	//
-	// Between operations, all limbs are expected to be lower than 2^52.
-	l0 uint64
-	l1 uint64
-	l2 uint64
-	l3 uint64
-	l4 uint64
-}
-
-const maskLow51Bits uint64 = (1 << 51) - 1
-
-var feZero = &Element{0, 0, 0, 0, 0}
-
-// Zero sets v = 0, and returns v.
-func (v *Element) Zero() *Element {
-	*v = *feZero
-	return v
-}
-
-var feOne = &Element{1, 0, 0, 0, 0}
-
-// One sets v = 1, and returns v.
-func (v *Element) One() *Element {
-	*v = *feOne
-	return v
-}
-
-// reduce reduces v modulo 2^255 - 19 and returns it.
-func (v *Element) reduce() *Element {
-	v.carryPropagate()
-
-	// After the light reduction we now have a field element representation
-	// v < 2^255 + 2^13 * 19, but need v < 2^255 - 19.
-
-	// If v >= 2^255 - 19, then v + 19 >= 2^255, which would overflow 2^255 - 1,
-	// generating a carry. That is, c will be 0 if v < 2^255 - 19, and 1 otherwise.
-	c := (v.l0 + 19) >> 51
-	c = (v.l1 + c) >> 51
-	c = (v.l2 + c) >> 51
-	c = (v.l3 + c) >> 51
-	c = (v.l4 + c) >> 51
-
-	// If v < 2^255 - 19 and c = 0, this will be a no-op. Otherwise, it's
-	// effectively applying the reduction identity to the carry.
-	v.l0 += 19 * c
-
-	v.l1 += v.l0 >> 51
-	v.l0 = v.l0 & maskLow51Bits
-	v.l2 += v.l1 >> 51
-	v.l1 = v.l1 & maskLow51Bits
-	v.l3 += v.l2 >> 51
-	v.l2 = v.l2 & maskLow51Bits
-	v.l4 += v.l3 >> 51
-	v.l3 = v.l3 & maskLow51Bits
-	// no additional carry
-	v.l4 = v.l4 & maskLow51Bits
-
-	return v
-}
-
-// Add sets v = a + b, and returns v.
-func (v *Element) Add(a, b *Element) *Element {
-	v.l0 = a.l0 + b.l0
-	v.l1 = a.l1 + b.l1
-	v.l2 = a.l2 + b.l2
-	v.l3 = a.l3 + b.l3
-	v.l4 = a.l4 + b.l4
-	// Using the generic implementation here is actually faster than the
-	// assembly. Probably because the body of this function is so simple that
-	// the compiler can figure out better optimizations by inlining the carry
-	// propagation. TODO
-	return v.carryPropagateGeneric()
-}
-
-// Subtract sets v = a - b, and returns v.
-func (v *Element) Subtract(a, b *Element) *Element {
-	// We first add 2 * p, to guarantee the subtraction won't underflow, and
-	// then subtract b (which can be up to 2^255 + 2^13 * 19).
-	v.l0 = (a.l0 + 0xFFFFFFFFFFFDA) - b.l0
-	v.l1 = (a.l1 + 0xFFFFFFFFFFFFE) - b.l1
-	v.l2 = (a.l2 + 0xFFFFFFFFFFFFE) - b.l2
-	v.l3 = (a.l3 + 0xFFFFFFFFFFFFE) - b.l3
-	v.l4 = (a.l4 + 0xFFFFFFFFFFFFE) - b.l4
-	return v.carryPropagate()
-}
-
-// Negate sets v = -a, and returns v.
-func (v *Element) Negate(a *Element) *Element {
-	return v.Subtract(feZero, a)
-}
-
-// Invert sets v = 1/z mod p, and returns v.
-//
-// If z == 0, Invert returns v = 0.
-func (v *Element) Invert(z *Element) *Element {
-	// Inversion is implemented as exponentiation with exponent p − 2. It uses the
-	// same sequence of 255 squarings and 11 multiplications as [Curve25519].
-	var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t Element
-
-	z2.Square(z)             // 2
-	t.Square(&z2)            // 4
-	t.Square(&t)             // 8
-	z9.Multiply(&t, z)       // 9
-	z11.Multiply(&z9, &z2)   // 11
-	t.Square(&z11)           // 22
-	z2_5_0.Multiply(&t, &z9) // 31 = 2^5 - 2^0
-
-	t.Square(&z2_5_0) // 2^6 - 2^1
-	for i := 0; i < 4; i++ {
-		t.Square(&t) // 2^10 - 2^5
-	}
-	z2_10_0.Multiply(&t, &z2_5_0) // 2^10 - 2^0
-
-	t.Square(&z2_10_0) // 2^11 - 2^1
-	for i := 0; i < 9; i++ {
-		t.Square(&t) // 2^20 - 2^10
-	}
-	z2_20_0.Multiply(&t, &z2_10_0) // 2^20 - 2^0
-
-	t.Square(&z2_20_0) // 2^21 - 2^1
-	for i := 0; i < 19; i++ {
-		t.Square(&t) // 2^40 - 2^20
-	}
-	t.Multiply(&t, &z2_20_0) // 2^40 - 2^0
-
-	t.Square(&t) // 2^41 - 2^1
-	for i := 0; i < 9; i++ {
-		t.Square(&t) // 2^50 - 2^10
-	}
-	z2_50_0.Multiply(&t, &z2_10_0) // 2^50 - 2^0
-
-	t.Square(&z2_50_0) // 2^51 - 2^1
-	for i := 0; i < 49; i++ {
-		t.Square(&t) // 2^100 - 2^50
-	}
-	z2_100_0.Multiply(&t, &z2_50_0) // 2^100 - 2^0
-
-	t.Square(&z2_100_0) // 2^101 - 2^1
-	for i := 0; i < 99; i++ {
-		t.Square(&t) // 2^200 - 2^100
-	}
-	t.Multiply(&t, &z2_100_0) // 2^200 - 2^0
-
-	t.Square(&t) // 2^201 - 2^1
-	for i := 0; i < 49; i++ {
-		t.Square(&t) // 2^250 - 2^50
-	}
-	t.Multiply(&t, &z2_50_0) // 2^250 - 2^0
-
-	t.Square(&t) // 2^251 - 2^1
-	t.Square(&t) // 2^252 - 2^2
-	t.Square(&t) // 2^253 - 2^3
-	t.Square(&t) // 2^254 - 2^4
-	t.Square(&t) // 2^255 - 2^5
-
-	return v.Multiply(&t, &z11) // 2^255 - 21
-}
-
-// Set sets v = a, and returns v.
-func (v *Element) Set(a *Element) *Element {
-	*v = *a
-	return v
-}
-
-// SetBytes sets v to x, which must be a 32-byte little-endian encoding.
-//
-// Consistent with RFC 7748, the most significant bit (the high bit of the
-// last byte) is ignored, and non-canonical values (2^255-19 through 2^255-1)
-// are accepted. Note that this is laxer than specified by RFC 8032.
-func (v *Element) SetBytes(x []byte) *Element {
-	if len(x) != 32 {
-		panic("edwards25519: invalid field element input size")
-	}
-
-	// Bits 0:51 (bytes 0:8, bits 0:64, shift 0, mask 51).
-	v.l0 = binary.LittleEndian.Uint64(x[0:8])
-	v.l0 &= maskLow51Bits
-	// Bits 51:102 (bytes 6:14, bits 48:112, shift 3, mask 51).
-	v.l1 = binary.LittleEndian.Uint64(x[6:14]) >> 3
-	v.l1 &= maskLow51Bits
-	// Bits 102:153 (bytes 12:20, bits 96:160, shift 6, mask 51).
-	v.l2 = binary.LittleEndian.Uint64(x[12:20]) >> 6
-	v.l2 &= maskLow51Bits
-	// Bits 153:204 (bytes 19:27, bits 152:216, shift 1, mask 51).
-	v.l3 = binary.LittleEndian.Uint64(x[19:27]) >> 1
-	v.l3 &= maskLow51Bits
-	// Bits 204:251 (bytes 24:32, bits 192:256, shift 12, mask 51).
-	// Note: not bytes 25:33, shift 4, to avoid overread.
-	v.l4 = binary.LittleEndian.Uint64(x[24:32]) >> 12
-	v.l4 &= maskLow51Bits
-
-	return v
-}
-
-// Bytes returns the canonical 32-byte little-endian encoding of v.
-func (v *Element) Bytes() []byte {
-	// This function is outlined to make the allocations inline in the caller
-	// rather than happen on the heap.
-	var out [32]byte
-	return v.bytes(&out)
-}
-
-func (v *Element) bytes(out *[32]byte) []byte {
-	t := *v
-	t.reduce()
-
-	var buf [8]byte
-	for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} {
-		bitsOffset := i * 51
-		binary.LittleEndian.PutUint64(buf[:], l<<uint(bitsOffset%8))
-		for i, bb := range buf {
-			off := bitsOffset/8 + i
-			if off >= len(out) {
-				break
-			}
-			out[off] |= bb
-		}
-	}
-
-	return out[:]
-}
-
-// Equal returns 1 if v and u are equal, and 0 otherwise.
-func (v *Element) Equal(u *Element) int {
-	sa, sv := u.Bytes(), v.Bytes()
-	return subtle.ConstantTimeCompare(sa, sv)
-}
-
-// mask64Bits returns 0xffffffff if cond is 1, and 0 otherwise.
-func mask64Bits(cond int) uint64 { return ^(uint64(cond) - 1) }
-
-// Select sets v to a if cond == 1, and to b if cond == 0.
-func (v *Element) Select(a, b *Element, cond int) *Element {
-	m := mask64Bits(cond)
-	v.l0 = (m & a.l0) | (^m & b.l0)
-	v.l1 = (m & a.l1) | (^m & b.l1)
-	v.l2 = (m & a.l2) | (^m & b.l2)
-	v.l3 = (m & a.l3) | (^m & b.l3)
-	v.l4 = (m & a.l4) | (^m & b.l4)
-	return v
-}
-
-// Swap swaps v and u if cond == 1 or leaves them unchanged if cond == 0, and returns v.
-func (v *Element) Swap(u *Element, cond int) {
-	m := mask64Bits(cond)
-	t := m & (v.l0 ^ u.l0)
-	v.l0 ^= t
-	u.l0 ^= t
-	t = m & (v.l1 ^ u.l1)
-	v.l1 ^= t
-	u.l1 ^= t
-	t = m & (v.l2 ^ u.l2)
-	v.l2 ^= t
-	u.l2 ^= t
-	t = m & (v.l3 ^ u.l3)
-	v.l3 ^= t
-	u.l3 ^= t
-	t = m & (v.l4 ^ u.l4)
-	v.l4 ^= t
-	u.l4 ^= t
-}
-
-// IsNegative returns 1 if v is negative, and 0 otherwise.
-func (v *Element) IsNegative() int {
-	return int(v.Bytes()[0] & 1)
-}
-
-// Absolute sets v to |u|, and returns v.
-func (v *Element) Absolute(u *Element) *Element {
-	return v.Select(new(Element).Negate(u), u, u.IsNegative())
-}
-
-// Multiply sets v = x * y, and returns v.
-func (v *Element) Multiply(x, y *Element) *Element {
-	feMul(v, x, y)
-	return v
-}
-
-// Square sets v = x * x, and returns v.
-func (v *Element) Square(x *Element) *Element {
-	feSquare(v, x)
-	return v
-}
-
-// Mult32 sets v = x * y, and returns v.
-func (v *Element) Mult32(x *Element, y uint32) *Element {
-	x0lo, x0hi := mul51(x.l0, y)
-	x1lo, x1hi := mul51(x.l1, y)
-	x2lo, x2hi := mul51(x.l2, y)
-	x3lo, x3hi := mul51(x.l3, y)
-	x4lo, x4hi := mul51(x.l4, y)
-	v.l0 = x0lo + 19*x4hi // carried over per the reduction identity
-	v.l1 = x1lo + x0hi
-	v.l2 = x2lo + x1hi
-	v.l3 = x3lo + x2hi
-	v.l4 = x4lo + x3hi
-	// The hi portions are going to be only 32 bits, plus any previous excess,
-	// so we can skip the carry propagation.
-	return v
-}
-
-// mul51 returns lo + hi * 2⁵¹ = a * b.
-func mul51(a uint64, b uint32) (lo uint64, hi uint64) {
-	mh, ml := bits.Mul64(a, uint64(b))
-	lo = ml & maskLow51Bits
-	hi = (mh << 13) | (ml >> 51)
-	return
-}
-
-// Pow22523 set v = x^((p-5)/8), and returns v. (p-5)/8 is 2^252-3.
-func (v *Element) Pow22523(x *Element) *Element {
-	var t0, t1, t2 Element
-
-	t0.Square(x)             // x^2
-	t1.Square(&t0)           // x^4
-	t1.Square(&t1)           // x^8
-	t1.Multiply(x, &t1)      // x^9
-	t0.Multiply(&t0, &t1)    // x^11
-	t0.Square(&t0)           // x^22
-	t0.Multiply(&t1, &t0)    // x^31
-	t1.Square(&t0)           // x^62
-	for i := 1; i < 5; i++ { // x^992
-		t1.Square(&t1)
-	}
-	t0.Multiply(&t1, &t0)     // x^1023 -> 1023 = 2^10 - 1
-	t1.Square(&t0)            // 2^11 - 2
-	for i := 1; i < 10; i++ { // 2^20 - 2^10
-		t1.Square(&t1)
-	}
-	t1.Multiply(&t1, &t0)     // 2^20 - 1
-	t2.Square(&t1)            // 2^21 - 2
-	for i := 1; i < 20; i++ { // 2^40 - 2^20
-		t2.Square(&t2)
-	}
-	t1.Multiply(&t2, &t1)     // 2^40 - 1
-	t1.Square(&t1)            // 2^41 - 2
-	for i := 1; i < 10; i++ { // 2^50 - 2^10
-		t1.Square(&t1)
-	}
-	t0.Multiply(&t1, &t0)     // 2^50 - 1
-	t1.Square(&t0)            // 2^51 - 2
-	for i := 1; i < 50; i++ { // 2^100 - 2^50
-		t1.Square(&t1)
-	}
-	t1.Multiply(&t1, &t0)      // 2^100 - 1
-	t2.Square(&t1)             // 2^101 - 2
-	for i := 1; i < 100; i++ { // 2^200 - 2^100
-		t2.Square(&t2)
-	}
-	t1.Multiply(&t2, &t1)     // 2^200 - 1
-	t1.Square(&t1)            // 2^201 - 2
-	for i := 1; i < 50; i++ { // 2^250 - 2^50
-		t1.Square(&t1)
-	}
-	t0.Multiply(&t1, &t0)     // 2^250 - 1
-	t0.Square(&t0)            // 2^251 - 2
-	t0.Square(&t0)            // 2^252 - 4
-	return v.Multiply(&t0, x) // 2^252 - 3 -> x^(2^252-3)
-}
-
-// sqrtM1 is 2^((p-1)/4), which squared is equal to -1 by Euler's Criterion.
-var sqrtM1 = &Element{1718705420411056, 234908883556509,
-	2233514472574048, 2117202627021982, 765476049583133}
-
-// SqrtRatio sets r to the non-negative square root of the ratio of u and v.
-//
-// If u/v is square, SqrtRatio returns r and 1. If u/v is not square, SqrtRatio
-// sets r according to Section 4.3 of draft-irtf-cfrg-ristretto255-decaf448-00,
-// and returns r and 0.
-func (r *Element) SqrtRatio(u, v *Element) (rr *Element, wasSquare int) {
-	var a, b Element
-
-	// r = (u * v3) * (u * v7)^((p-5)/8)
-	v2 := a.Square(v)
-	uv3 := b.Multiply(u, b.Multiply(v2, v))
-	uv7 := a.Multiply(uv3, a.Square(v2))
-	r.Multiply(uv3, r.Pow22523(uv7))
-
-	check := a.Multiply(v, a.Square(r)) // check = v * r^2
-
-	uNeg := b.Negate(u)
-	correctSignSqrt := check.Equal(u)
-	flippedSignSqrt := check.Equal(uNeg)
-	flippedSignSqrtI := check.Equal(uNeg.Multiply(uNeg, sqrtM1))
-
-	rPrime := b.Multiply(r, sqrtM1) // r_prime = SQRT_M1 * r
-	// r = CT_SELECT(r_prime IF flipped_sign_sqrt | flipped_sign_sqrt_i ELSE r)
-	r.Select(rPrime, r, flippedSignSqrt|flippedSignSqrtI)
-
-	r.Absolute(r) // Choose the nonnegative square root.
-	return r, correctSignSqrt | flippedSignSqrt
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go
deleted file mode 100644
index 70c54169..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
-
-//go:build amd64 && gc && !purego
-
-package field
-
-// feMul sets out = a * b. It works like feMulGeneric.
-//
-//go:noescape
-func feMul(out *Element, a *Element, b *Element)
-
-// feSquare sets out = a * a. It works like feSquareGeneric.
-//
-//go:noescape
-func feSquare(out *Element, a *Element)
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s
deleted file mode 100644
index 60817acc..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64.s
+++ /dev/null
@@ -1,378 +0,0 @@
-// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
-
-//go:build amd64 && gc && !purego
-
-#include "textflag.h"
-
-// func feMul(out *Element, a *Element, b *Element)
-TEXT ·feMul(SB), NOSPLIT, $0-24
-	MOVQ a+8(FP), CX
-	MOVQ b+16(FP), BX
-
-	// r0 = a0×b0
-	MOVQ (CX), AX
-	MULQ (BX)
-	MOVQ AX, DI
-	MOVQ DX, SI
-
-	// r0 += 19×a1×b4
-	MOVQ   8(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
-
-	// r0 += 19×a2×b3
-	MOVQ   16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
-
-	// r0 += 19×a3×b2
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   16(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
-
-	// r0 += 19×a4×b1
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   8(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
-
-	// r1 = a0×b1
-	MOVQ (CX), AX
-	MULQ 8(BX)
-	MOVQ AX, R9
-	MOVQ DX, R8
-
-	// r1 += a1×b0
-	MOVQ 8(CX), AX
-	MULQ (BX)
-	ADDQ AX, R9
-	ADCQ DX, R8
-
-	// r1 += 19×a2×b4
-	MOVQ   16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
-
-	// r1 += 19×a3×b3
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
-
-	// r1 += 19×a4×b2
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   16(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
-
-	// r2 = a0×b2
-	MOVQ (CX), AX
-	MULQ 16(BX)
-	MOVQ AX, R11
-	MOVQ DX, R10
-
-	// r2 += a1×b1
-	MOVQ 8(CX), AX
-	MULQ 8(BX)
-	ADDQ AX, R11
-	ADCQ DX, R10
-
-	// r2 += a2×b0
-	MOVQ 16(CX), AX
-	MULQ (BX)
-	ADDQ AX, R11
-	ADCQ DX, R10
-
-	// r2 += 19×a3×b4
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R11
-	ADCQ   DX, R10
-
-	// r2 += 19×a4×b3
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, R11
-	ADCQ   DX, R10
-
-	// r3 = a0×b3
-	MOVQ (CX), AX
-	MULQ 24(BX)
-	MOVQ AX, R13
-	MOVQ DX, R12
-
-	// r3 += a1×b2
-	MOVQ 8(CX), AX
-	MULQ 16(BX)
-	ADDQ AX, R13
-	ADCQ DX, R12
-
-	// r3 += a2×b1
-	MOVQ 16(CX), AX
-	MULQ 8(BX)
-	ADDQ AX, R13
-	ADCQ DX, R12
-
-	// r3 += a3×b0
-	MOVQ 24(CX), AX
-	MULQ (BX)
-	ADDQ AX, R13
-	ADCQ DX, R12
-
-	// r3 += 19×a4×b4
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R13
-	ADCQ   DX, R12
-
-	// r4 = a0×b4
-	MOVQ (CX), AX
-	MULQ 32(BX)
-	MOVQ AX, R15
-	MOVQ DX, R14
-
-	// r4 += a1×b3
-	MOVQ 8(CX), AX
-	MULQ 24(BX)
-	ADDQ AX, R15
-	ADCQ DX, R14
-
-	// r4 += a2×b2
-	MOVQ 16(CX), AX
-	MULQ 16(BX)
-	ADDQ AX, R15
-	ADCQ DX, R14
-
-	// r4 += a3×b1
-	MOVQ 24(CX), AX
-	MULQ 8(BX)
-	ADDQ AX, R15
-	ADCQ DX, R14
-
-	// r4 += a4×b0
-	MOVQ 32(CX), AX
-	MULQ (BX)
-	ADDQ AX, R15
-	ADCQ DX, R14
-
-	// First reduction chain
-	MOVQ   $0x0007ffffffffffff, AX
-	SHLQ   $0x0d, DI, SI
-	SHLQ   $0x0d, R9, R8
-	SHLQ   $0x0d, R11, R10
-	SHLQ   $0x0d, R13, R12
-	SHLQ   $0x0d, R15, R14
-	ANDQ   AX, DI
-	IMUL3Q $0x13, R14, R14
-	ADDQ   R14, DI
-	ANDQ   AX, R9
-	ADDQ   SI, R9
-	ANDQ   AX, R11
-	ADDQ   R8, R11
-	ANDQ   AX, R13
-	ADDQ   R10, R13
-	ANDQ   AX, R15
-	ADDQ   R12, R15
-
-	// Second reduction chain (carryPropagate)
-	MOVQ   DI, SI
-	SHRQ   $0x33, SI
-	MOVQ   R9, R8
-	SHRQ   $0x33, R8
-	MOVQ   R11, R10
-	SHRQ   $0x33, R10
-	MOVQ   R13, R12
-	SHRQ   $0x33, R12
-	MOVQ   R15, R14
-	SHRQ   $0x33, R14
-	ANDQ   AX, DI
-	IMUL3Q $0x13, R14, R14
-	ADDQ   R14, DI
-	ANDQ   AX, R9
-	ADDQ   SI, R9
-	ANDQ   AX, R11
-	ADDQ   R8, R11
-	ANDQ   AX, R13
-	ADDQ   R10, R13
-	ANDQ   AX, R15
-	ADDQ   R12, R15
-
-	// Store output
-	MOVQ out+0(FP), AX
-	MOVQ DI, (AX)
-	MOVQ R9, 8(AX)
-	MOVQ R11, 16(AX)
-	MOVQ R13, 24(AX)
-	MOVQ R15, 32(AX)
-	RET
-
-// func feSquare(out *Element, a *Element)
-TEXT ·feSquare(SB), NOSPLIT, $0-16
-	MOVQ a+8(FP), CX
-
-	// r0 = l0×l0
-	MOVQ (CX), AX
-	MULQ (CX)
-	MOVQ AX, SI
-	MOVQ DX, BX
-
-	// r0 += 38×l1×l4
-	MOVQ   8(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, SI
-	ADCQ   DX, BX
-
-	// r0 += 38×l2×l3
-	MOVQ   16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, SI
-	ADCQ   DX, BX
-
-	// r1 = 2×l0×l1
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 8(CX)
-	MOVQ AX, R8
-	MOVQ DX, DI
-
-	// r1 += 38×l2×l4
-	MOVQ   16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R8
-	ADCQ   DX, DI
-
-	// r1 += 19×l3×l3
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, R8
-	ADCQ   DX, DI
-
-	// r2 = 2×l0×l2
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 16(CX)
-	MOVQ AX, R10
-	MOVQ DX, R9
-
-	// r2 += l1×l1
-	MOVQ 8(CX), AX
-	MULQ 8(CX)
-	ADDQ AX, R10
-	ADCQ DX, R9
-
-	// r2 += 38×l3×l4
-	MOVQ   24(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R10
-	ADCQ   DX, R9
-
-	// r3 = 2×l0×l3
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 24(CX)
-	MOVQ AX, R12
-	MOVQ DX, R11
-
-	// r3 += 2×l1×l2
-	MOVQ   8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ   16(CX)
-	ADDQ   AX, R12
-	ADCQ   DX, R11
-
-	// r3 += 19×l4×l4
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R12
-	ADCQ   DX, R11
-
-	// r4 = 2×l0×l4
-	MOVQ (CX), AX
-	SHLQ $0x01, AX
-	MULQ 32(CX)
-	MOVQ AX, R14
-	MOVQ DX, R13
-
-	// r4 += 2×l1×l3
-	MOVQ   8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, R14
-	ADCQ   DX, R13
-
-	// r4 += l2×l2
-	MOVQ 16(CX), AX
-	MULQ 16(CX)
-	ADDQ AX, R14
-	ADCQ DX, R13
-
-	// First reduction chain
-	MOVQ   $0x0007ffffffffffff, AX
-	SHLQ   $0x0d, SI, BX
-	SHLQ   $0x0d, R8, DI
-	SHLQ   $0x0d, R10, R9
-	SHLQ   $0x0d, R12, R11
-	SHLQ   $0x0d, R14, R13
-	ANDQ   AX, SI
-	IMUL3Q $0x13, R13, R13
-	ADDQ   R13, SI
-	ANDQ   AX, R8
-	ADDQ   BX, R8
-	ANDQ   AX, R10
-	ADDQ   DI, R10
-	ANDQ   AX, R12
-	ADDQ   R9, R12
-	ANDQ   AX, R14
-	ADDQ   R11, R14
-
-	// Second reduction chain (carryPropagate)
-	MOVQ   SI, BX
-	SHRQ   $0x33, BX
-	MOVQ   R8, DI
-	SHRQ   $0x33, DI
-	MOVQ   R10, R9
-	SHRQ   $0x33, R9
-	MOVQ   R12, R11
-	SHRQ   $0x33, R11
-	MOVQ   R14, R13
-	SHRQ   $0x33, R13
-	ANDQ   AX, SI
-	IMUL3Q $0x13, R13, R13
-	ADDQ   R13, SI
-	ANDQ   AX, R8
-	ADDQ   BX, R8
-	ANDQ   AX, R10
-	ADDQ   DI, R10
-	ANDQ   AX, R12
-	ADDQ   R9, R12
-	ANDQ   AX, R14
-	ADDQ   R11, R14
-
-	// Store output
-	MOVQ out+0(FP), AX
-	MOVQ SI, (AX)
-	MOVQ R8, 8(AX)
-	MOVQ R10, 16(AX)
-	MOVQ R12, 24(AX)
-	MOVQ R14, 32(AX)
-	RET
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go
deleted file mode 100644
index 9da280d1..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_amd64_noasm.go
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !amd64 || !gc || purego
-
-package field
-
-func feMul(v, x, y *Element) { feMulGeneric(v, x, y) }
-
-func feSquare(v, x *Element) { feSquareGeneric(v, x) }
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go
deleted file mode 100644
index 075fe9b9..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build arm64 && gc && !purego
-
-package field
-
-//go:noescape
-func carryPropagate(v *Element)
-
-func (v *Element) carryPropagate() *Element {
-	carryPropagate(v)
-	return v
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s
deleted file mode 100644
index 3126a434..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64.s
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build arm64 && gc && !purego
-
-#include "textflag.h"
-
-// carryPropagate works exactly like carryPropagateGeneric and uses the
-// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but
-// avoids loading R0-R4 twice and uses LDP and STP.
-//
-// See https://golang.org/issues/43145 for the main compiler issue.
-//
-// func carryPropagate(v *Element)
-TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8
-	MOVD v+0(FP), R20
-
-	LDP 0(R20), (R0, R1)
-	LDP 16(R20), (R2, R3)
-	MOVD 32(R20), R4
-
-	AND $0x7ffffffffffff, R0, R10
-	AND $0x7ffffffffffff, R1, R11
-	AND $0x7ffffffffffff, R2, R12
-	AND $0x7ffffffffffff, R3, R13
-	AND $0x7ffffffffffff, R4, R14
-
-	ADD R0>>51, R11, R11
-	ADD R1>>51, R12, R12
-	ADD R2>>51, R13, R13
-	ADD R3>>51, R14, R14
-	// R4>>51 * 19 + R10 -> R10
-	LSR $51, R4, R21
-	MOVD $19, R22
-	MADD R22, R10, R21, R10
-
-	STP (R10, R11), 0(R20)
-	STP (R12, R13), 16(R20)
-	MOVD R14, 32(R20)
-
-	RET
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go
deleted file mode 100644
index fc029ac1..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_arm64_noasm.go
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !arm64 || !gc || purego
-
-package field
-
-func (v *Element) carryPropagate() *Element {
-	return v.carryPropagateGeneric()
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go b/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go
deleted file mode 100644
index 2671217d..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/fe_generic.go
+++ /dev/null
@@ -1,264 +0,0 @@
-// Copyright (c) 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package field
-
-import "math/bits"
-
-// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
-// bits.Mul64 and bits.Add64 intrinsics.
-type uint128 struct {
-	lo, hi uint64
-}
-
-// mul64 returns a * b.
-func mul64(a, b uint64) uint128 {
-	hi, lo := bits.Mul64(a, b)
-	return uint128{lo, hi}
-}
-
-// addMul64 returns v + a * b.
-func addMul64(v uint128, a, b uint64) uint128 {
-	hi, lo := bits.Mul64(a, b)
-	lo, c := bits.Add64(lo, v.lo, 0)
-	hi, _ = bits.Add64(hi, v.hi, c)
-	return uint128{lo, hi}
-}
-
-// shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits.
-func shiftRightBy51(a uint128) uint64 {
-	return (a.hi << (64 - 51)) | (a.lo >> 51)
-}
-
-func feMulGeneric(v, a, b *Element) {
-	a0 := a.l0
-	a1 := a.l1
-	a2 := a.l2
-	a3 := a.l3
-	a4 := a.l4
-
-	b0 := b.l0
-	b1 := b.l1
-	b2 := b.l2
-	b3 := b.l3
-	b4 := b.l4
-
-	// Limb multiplication works like pen-and-paper columnar multiplication, but
-	// with 51-bit limbs instead of digits.
-	//
-	//                          a4   a3   a2   a1   a0  x
-	//                          b4   b3   b2   b1   b0  =
-	//                         ------------------------
-	//                        a4b0 a3b0 a2b0 a1b0 a0b0  +
-	//                   a4b1 a3b1 a2b1 a1b1 a0b1       +
-	//              a4b2 a3b2 a2b2 a1b2 a0b2            +
-	//         a4b3 a3b3 a2b3 a1b3 a0b3                 +
-	//    a4b4 a3b4 a2b4 a1b4 a0b4                      =
-	//   ----------------------------------------------
-	//      r8   r7   r6   r5   r4   r3   r2   r1   r0
-	//
-	// We can then use the reduction identity (a * 2²⁵⁵ + b = a * 19 + b) to
-	// reduce the limbs that would overflow 255 bits. r5 * 2²⁵⁵ becomes 19 * r5,
-	// r6 * 2³⁰⁶ becomes 19 * r6 * 2⁵¹, etc.
-	//
-	// Reduction can be carried out simultaneously to multiplication. For
-	// example, we do not compute r5: whenever the result of a multiplication
-	// belongs to r5, like a1b4, we multiply it by 19 and add the result to r0.
-	//
-	//            a4b0    a3b0    a2b0    a1b0    a0b0  +
-	//            a3b1    a2b1    a1b1    a0b1 19×a4b1  +
-	//            a2b2    a1b2    a0b2 19×a4b2 19×a3b2  +
-	//            a1b3    a0b3 19×a4b3 19×a3b3 19×a2b3  +
-	//            a0b4 19×a4b4 19×a3b4 19×a2b4 19×a1b4  =
-	//           --------------------------------------
-	//              r4      r3      r2      r1      r0
-	//
-	// Finally we add up the columns into wide, overlapping limbs.
-
-	a1_19 := a1 * 19
-	a2_19 := a2 * 19
-	a3_19 := a3 * 19
-	a4_19 := a4 * 19
-
-	// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
-	r0 := mul64(a0, b0)
-	r0 = addMul64(r0, a1_19, b4)
-	r0 = addMul64(r0, a2_19, b3)
-	r0 = addMul64(r0, a3_19, b2)
-	r0 = addMul64(r0, a4_19, b1)
-
-	// r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2)
-	r1 := mul64(a0, b1)
-	r1 = addMul64(r1, a1, b0)
-	r1 = addMul64(r1, a2_19, b4)
-	r1 = addMul64(r1, a3_19, b3)
-	r1 = addMul64(r1, a4_19, b2)
-
-	// r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3)
-	r2 := mul64(a0, b2)
-	r2 = addMul64(r2, a1, b1)
-	r2 = addMul64(r2, a2, b0)
-	r2 = addMul64(r2, a3_19, b4)
-	r2 = addMul64(r2, a4_19, b3)
-
-	// r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4
-	r3 := mul64(a0, b3)
-	r3 = addMul64(r3, a1, b2)
-	r3 = addMul64(r3, a2, b1)
-	r3 = addMul64(r3, a3, b0)
-	r3 = addMul64(r3, a4_19, b4)
-
-	// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
-	r4 := mul64(a0, b4)
-	r4 = addMul64(r4, a1, b3)
-	r4 = addMul64(r4, a2, b2)
-	r4 = addMul64(r4, a3, b1)
-	r4 = addMul64(r4, a4, b0)
-
-	// After the multiplication, we need to reduce (carry) the five coefficients
-	// to obtain a result with limbs that are at most slightly larger than 2⁵¹,
-	// to respect the Element invariant.
-	//
-	// Overall, the reduction works the same as carryPropagate, except with
-	// wider inputs: we take the carry for each coefficient by shifting it right
-	// by 51, and add it to the limb above it. The top carry is multiplied by 19
-	// according to the reduction identity and added to the lowest limb.
-	//
-	// The largest coefficient (r0) will be at most 111 bits, which guarantees
-	// that all carries are at most 111 - 51 = 60 bits, which fits in a uint64.
-	//
-	//     r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
-	//     r0 < 2⁵²×2⁵² + 19×(2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵² + 2⁵²×2⁵²)
-	//     r0 < (1 + 19 × 4) × 2⁵² × 2⁵²
-	//     r0 < 2⁷ × 2⁵² × 2⁵²
-	//     r0 < 2¹¹¹
-	//
-	// Moreover, the top coefficient (r4) is at most 107 bits, so c4 is at most
-	// 56 bits, and c4 * 19 is at most 61 bits, which again fits in a uint64 and
-	// allows us to easily apply the reduction identity.
-	//
-	//     r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
-	//     r4 < 5 × 2⁵² × 2⁵²
-	//     r4 < 2¹⁰⁷
-	//
-
-	c0 := shiftRightBy51(r0)
-	c1 := shiftRightBy51(r1)
-	c2 := shiftRightBy51(r2)
-	c3 := shiftRightBy51(r3)
-	c4 := shiftRightBy51(r4)
-
-	rr0 := r0.lo&maskLow51Bits + c4*19
-	rr1 := r1.lo&maskLow51Bits + c0
-	rr2 := r2.lo&maskLow51Bits + c1
-	rr3 := r3.lo&maskLow51Bits + c2
-	rr4 := r4.lo&maskLow51Bits + c3
-
-	// Now all coefficients fit into 64-bit registers but are still too large to
-	// be passed around as a Element. We therefore do one last carry chain,
-	// where the carries will be small enough to fit in the wiggle room above 2⁵¹.
-	*v = Element{rr0, rr1, rr2, rr3, rr4}
-	v.carryPropagate()
-}
-
-func feSquareGeneric(v, a *Element) {
-	l0 := a.l0
-	l1 := a.l1
-	l2 := a.l2
-	l3 := a.l3
-	l4 := a.l4
-
-	// Squaring works precisely like multiplication above, but thanks to its
-	// symmetry we get to group a few terms together.
-	//
-	//                          l4   l3   l2   l1   l0  x
-	//                          l4   l3   l2   l1   l0  =
-	//                         ------------------------
-	//                        l4l0 l3l0 l2l0 l1l0 l0l0  +
-	//                   l4l1 l3l1 l2l1 l1l1 l0l1       +
-	//              l4l2 l3l2 l2l2 l1l2 l0l2            +
-	//         l4l3 l3l3 l2l3 l1l3 l0l3                 +
-	//    l4l4 l3l4 l2l4 l1l4 l0l4                      =
-	//   ----------------------------------------------
-	//      r8   r7   r6   r5   r4   r3   r2   r1   r0
-	//
-	//            l4l0    l3l0    l2l0    l1l0    l0l0  +
-	//            l3l1    l2l1    l1l1    l0l1 19×l4l1  +
-	//            l2l2    l1l2    l0l2 19×l4l2 19×l3l2  +
-	//            l1l3    l0l3 19×l4l3 19×l3l3 19×l2l3  +
-	//            l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4  =
-	//           --------------------------------------
-	//              r4      r3      r2      r1      r0
-	//
-	// With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with
-	// only three Mul64 and four Add64, instead of five and eight.
-
-	l0_2 := l0 * 2
-	l1_2 := l1 * 2
-
-	l1_38 := l1 * 38
-	l2_38 := l2 * 38
-	l3_38 := l3 * 38
-
-	l3_19 := l3 * 19
-	l4_19 := l4 * 19
-
-	// r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3)
-	r0 := mul64(l0, l0)
-	r0 = addMul64(r0, l1_38, l4)
-	r0 = addMul64(r0, l2_38, l3)
-
-	// r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
-	r1 := mul64(l0_2, l1)
-	r1 = addMul64(r1, l2_38, l4)
-	r1 = addMul64(r1, l3_19, l3)
-
-	// r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4
-	r2 := mul64(l0_2, l2)
-	r2 = addMul64(r2, l1, l1)
-	r2 = addMul64(r2, l3_38, l4)
-
-	// r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
-	r3 := mul64(l0_2, l3)
-	r3 = addMul64(r3, l1_2, l2)
-	r3 = addMul64(r3, l4_19, l4)
-
-	// r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2
-	r4 := mul64(l0_2, l4)
-	r4 = addMul64(r4, l1_2, l3)
-	r4 = addMul64(r4, l2, l2)
-
-	c0 := shiftRightBy51(r0)
-	c1 := shiftRightBy51(r1)
-	c2 := shiftRightBy51(r2)
-	c3 := shiftRightBy51(r3)
-	c4 := shiftRightBy51(r4)
-
-	rr0 := r0.lo&maskLow51Bits + c4*19
-	rr1 := r1.lo&maskLow51Bits + c0
-	rr2 := r2.lo&maskLow51Bits + c1
-	rr3 := r3.lo&maskLow51Bits + c2
-	rr4 := r4.lo&maskLow51Bits + c3
-
-	*v = Element{rr0, rr1, rr2, rr3, rr4}
-	v.carryPropagate()
-}
-
-// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction
-// identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry. TODO inline
-func (v *Element) carryPropagateGeneric() *Element {
-	c0 := v.l0 >> 51
-	c1 := v.l1 >> 51
-	c2 := v.l2 >> 51
-	c3 := v.l3 >> 51
-	c4 := v.l4 >> 51
-
-	v.l0 = v.l0&maskLow51Bits + c4*19
-	v.l1 = v.l1&maskLow51Bits + c0
-	v.l2 = v.l2&maskLow51Bits + c1
-	v.l3 = v.l3&maskLow51Bits + c2
-	v.l4 = v.l4&maskLow51Bits + c3
-
-	return v
-}
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint b/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint
deleted file mode 100644
index e3685f95..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.checkpoint
+++ /dev/null
@@ -1 +0,0 @@
-b0c49ae9f59d233526f8934262c5bbbe14d4358d
diff --git a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh b/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh
deleted file mode 100644
index 1ba22a8b..00000000
--- a/vendor/golang.org/x/crypto/curve25519/internal/field/sync.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#! /bin/bash
-set -euo pipefail
-
-cd "$(git rev-parse --show-toplevel)"
-
-STD_PATH=src/crypto/ed25519/internal/edwards25519/field
-LOCAL_PATH=curve25519/internal/field
-LAST_SYNC_REF=$(cat $LOCAL_PATH/sync.checkpoint)
-
-git fetch https://go.googlesource.com/go master
-
-if git diff --quiet $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH; then
-    echo "No changes."
-else
-    NEW_REF=$(git rev-parse FETCH_HEAD | tee $LOCAL_PATH/sync.checkpoint)
-    echo "Applying changes from $LAST_SYNC_REF to $NEW_REF..."
-    git diff $LAST_SYNC_REF:$STD_PATH FETCH_HEAD:$STD_PATH | \
-        git apply -3 --directory=$LOCAL_PATH
-fi
diff --git a/vendor/golang.org/x/crypto/hkdf/hkdf.go b/vendor/golang.org/x/crypto/hkdf/hkdf.go
index f4ded5fe..3bee6629 100644
--- a/vendor/golang.org/x/crypto/hkdf/hkdf.go
+++ b/vendor/golang.org/x/crypto/hkdf/hkdf.go
@@ -8,7 +8,7 @@
 // HKDF is a cryptographic key derivation function (KDF) with the goal of
 // expanding limited input keying material into one or more cryptographically
 // strong secret keys.
-package hkdf // import "golang.org/x/crypto/hkdf"
+package hkdf
 
 import (
 	"crypto/hmac"
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
index 333da285..bd896bdc 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
 
 package poly1305
 
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index e0d3c647..13375738 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
 
 //go:build gc && !purego
 
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
-	ADDQ 0(msg), h0;  \
-	ADCQ 8(msg), h1;  \
-	ADCQ $1, h2;      \
-	LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
-	MOVQ  r0, AX;                  \
-	MULQ  h0;                      \
-	MOVQ  AX, t0;                  \
-	MOVQ  DX, t1;                  \
-	MOVQ  r0, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  r0, t2;                  \
-	IMULQ h2, t2;                  \
-	ADDQ  DX, t2;                  \
-	                               \
-	MOVQ  r1, AX;                  \
-	MULQ  h0;                      \
-	ADDQ  AX, t1;                  \
-	ADCQ  $0, DX;                  \
-	MOVQ  DX, h0;                  \
-	MOVQ  r1, t3;                  \
-	IMULQ h2, t3;                  \
-	MOVQ  r1, AX;                  \
-	MULQ  h1;                      \
-	ADDQ  AX, t2;                  \
-	ADCQ  DX, t3;                  \
-	ADDQ  h0, t2;                  \
-	ADCQ  $0, t3;                  \
-	                               \
-	MOVQ  t0, h0;                  \
-	MOVQ  t1, h1;                  \
-	MOVQ  t2, h2;                  \
-	ANDQ  $3, h2;                  \
-	MOVQ  t2, t0;                  \
-	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
-	ADDQ  t0, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2;                  \
-	SHRQ  $2, t3, t2;              \
-	SHRQ  $2, t3;                  \
-	ADDQ  t2, h0;                  \
-	ADCQ  t3, h1;                  \
-	ADCQ  $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVQ state+0(FP), DI
 	MOVQ msg_base+8(FP), SI
 	MOVQ msg_len+16(FP), R15
-
-	MOVQ 0(DI), R8   // h0
-	MOVQ 8(DI), R9   // h1
-	MOVQ 16(DI), R10 // h2
-	MOVQ 24(DI), R11 // r0
-	MOVQ 32(DI), R12 // r1
-
-	CMPQ R15, $16
+	MOVQ (DI), R8
+	MOVQ 8(DI), R9
+	MOVQ 16(DI), R10
+	MOVQ 24(DI), R11
+	MOVQ 32(DI), R12
+	CMPQ R15, $0x10
 	JB   bytes_between_0_and_15
 
 loop:
-	POLY1305_ADD(SI, R8, R9, R10)
+	ADDQ (SI), R8
+	ADCQ 8(SI), R9
+	ADCQ $0x01, R10
+	LEAQ 16(SI), SI
 
 multiply:
-	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
-	SUBQ $16, R15
-	CMPQ R15, $16
-	JAE  loop
+	MOVQ  R11, AX
+	MULQ  R8
+	MOVQ  AX, BX
+	MOVQ  DX, CX
+	MOVQ  R11, AX
+	MULQ  R9
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  R11, R13
+	IMULQ R10, R13
+	ADDQ  DX, R13
+	MOVQ  R12, AX
+	MULQ  R8
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  DX, R8
+	MOVQ  R12, R14
+	IMULQ R10, R14
+	MOVQ  R12, AX
+	MULQ  R9
+	ADDQ  AX, R13
+	ADCQ  DX, R14
+	ADDQ  R8, R13
+	ADCQ  $0x00, R14
+	MOVQ  BX, R8
+	MOVQ  CX, R9
+	MOVQ  R13, R10
+	ANDQ  $0x03, R10
+	MOVQ  R13, BX
+	ANDQ  $-4, BX
+	ADDQ  BX, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SHRQ  $0x02, R14, R13
+	SHRQ  $0x02, R14
+	ADDQ  R13, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SUBQ  $0x10, R15
+	CMPQ  R15, $0x10
+	JAE   loop
 
 bytes_between_0_and_15:
 	TESTQ R15, R15
 	JZ    done
-	MOVQ  $1, BX
+	MOVQ  $0x00000001, BX
 	XORQ  CX, CX
 	XORQ  R13, R13
 	ADDQ  R15, SI
 
 flush_buffer:
-	SHLQ $8, BX, CX
-	SHLQ $8, BX
+	SHLQ $0x08, BX, CX
+	SHLQ $0x08, BX
 	MOVB -1(SI), R13
 	XORQ R13, BX
 	DECQ SI
 	DECQ R15
 	JNZ  flush_buffer
-
 	ADDQ BX, R8
 	ADCQ CX, R9
-	ADCQ $0, R10
-	MOVQ $16, R15
+	ADCQ $0x00, R10
+	MOVQ $0x00000010, R15
 	JMP  multiply
 
 done:
-	MOVQ R8, 0(DI)
+	MOVQ R8, (DI)
 	MOVQ R9, 8(DI)
 	MOVQ R10, 16(DI)
 	RET
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
similarity index 95%
rename from vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
rename to vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
index 4aec4874..1a1679aa 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package poly1305
 
diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
similarity index 89%
rename from vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
rename to vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
index b3c1699b..6899a1da 100644
--- a/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
+++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
@@ -2,15 +2,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
 // This was ported from the amd64 implementation.
 
+#ifdef GOARCH_ppc64le
+#define LE_MOVD MOVD
+#define LE_MOVWZ MOVWZ
+#define LE_MOVHZ MOVHZ
+#else
+#define LE_MOVD MOVDBR
+#define LE_MOVWZ MOVWBR
+#define LE_MOVHZ MOVHBR
+#endif
+
 #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
-	MOVD (msg), t0;  \
-	MOVD 8(msg), t1; \
+	LE_MOVD (msg)( R0), t0; \
+	LE_MOVD (msg)(R24), t1; \
 	MOVD $1, t2;     \
 	ADDC t0, h0, h0; \
 	ADDE t1, h1, h1; \
@@ -50,10 +60,6 @@
 	ADDE   t3, h1, h1;  \
 	ADDZE  h2
 
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVD state+0(FP), R3
@@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32
 	MOVD 24(R3), R11 // r0
 	MOVD 32(R3), R12 // r1
 
+	MOVD $8, R24
+
 	CMP R5, $16
 	BLT bytes_between_0_and_15
 
@@ -94,7 +102,7 @@ flush_buffer:
 
 	// Greater than 8 -- load the rightmost remaining bytes in msg
 	// and put into R17 (h1)
-	MOVD (R4)(R21), R17
+	LE_MOVD (R4)(R21), R17
 	MOVD $16, R22
 
 	// Find the offset to those bytes
@@ -118,7 +126,7 @@ just1:
 	BLT less8
 
 	// Exactly 8
-	MOVD (R4), R16
+	LE_MOVD (R4), R16
 
 	CMP R17, $0
 
@@ -133,7 +141,7 @@ less8:
 	MOVD  $0, R22   // shift count
 	CMP   R5, $4
 	BLT   less4
-	MOVWZ (R4), R16
+	LE_MOVWZ (R4), R16
 	ADD   $4, R4
 	ADD   $-4, R5
 	MOVD  $32, R22
@@ -141,7 +149,7 @@ less8:
 less4:
 	CMP   R5, $2
 	BLT   less2
-	MOVHZ (R4), R21
+	LE_MOVHZ (R4), R21
 	SLD   R22, R21, R21
 	OR    R16, R21, R16
 	ADD   $16, R22
diff --git a/vendor/golang.org/x/crypto/nacl/box/box.go b/vendor/golang.org/x/crypto/nacl/box/box.go
index 7f3b830e..357bdc77 100644
--- a/vendor/golang.org/x/crypto/nacl/box/box.go
+++ b/vendor/golang.org/x/crypto/nacl/box/box.go
@@ -35,7 +35,7 @@ Anonymous sealing/opening is an extension of NaCl defined by and interoperable
 with libsodium:
 https://libsodium.gitbook.io/doc/public-key_cryptography/sealed_boxes.
 */
-package box // import "golang.org/x/crypto/nacl/box"
+package box
 
 import (
 	cryptorand "crypto/rand"
diff --git a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
index f3c3242a..1fe600ad 100644
--- a/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
+++ b/vendor/golang.org/x/crypto/nacl/secretbox/secretbox.go
@@ -32,7 +32,7 @@ chunk size.
 
 This package is interoperable with NaCl: https://nacl.cr.yp.to/secretbox.html.
 */
-package secretbox // import "golang.org/x/crypto/nacl/secretbox"
+package secretbox
 
 import (
 	"golang.org/x/crypto/internal/alias"
diff --git a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
index 904b57e0..28cd99c7 100644
--- a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
+++ b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go
@@ -16,7 +16,7 @@ Hash Functions SHA-1, SHA-224, SHA-256, SHA-384 and SHA-512 for HMAC. To
 choose, you can pass the `New` functions from the different SHA packages to
 pbkdf2.Key.
 */
-package pbkdf2 // import "golang.org/x/crypto/pbkdf2"
+package pbkdf2
 
 import (
 	"crypto/hmac"
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
index 3fd05b27..3685b344 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/hsalsa20.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // Package salsa provides low-level access to functions in the Salsa family.
-package salsa // import "golang.org/x/crypto/salsa20/salsa"
+package salsa
 
 import "math/bits"
 
diff --git a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
index fcce0234..3883e0ec 100644
--- a/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
+++ b/vendor/golang.org/x/crypto/salsa20/salsa/salsa20_amd64.s
@@ -1,880 +1,880 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run salsa20_amd64_asm.go -out ../salsa20_amd64.s -pkg salsa. DO NOT EDIT.
 
 //go:build amd64 && !purego && gc
 
-// This code was translated into a form compatible with 6a from the public
-// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
+// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte)
+// Requires: SSE2
+TEXT ·salsa2020XORKeyStream(SB), $456-40
+	// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
+	MOVQ   out+0(FP), DI
+	MOVQ   in+8(FP), SI
+	MOVQ   n+16(FP), DX
+	MOVQ   nonce+24(FP), CX
+	MOVQ   key+32(FP), R8
+	MOVQ   SP, R12
+	ADDQ   $0x1f, R12
+	ANDQ   $-32, R12
+	MOVQ   DX, R9
+	MOVQ   CX, DX
+	MOVQ   R8, R10
+	CMPQ   R9, $0x00
+	JBE    DONE
+	MOVL   20(R10), CX
+	MOVL   (R10), R8
+	MOVL   (DX), AX
+	MOVL   16(R10), R11
+	MOVL   CX, (R12)
+	MOVL   R8, 4(R12)
+	MOVL   AX, 8(R12)
+	MOVL   R11, 12(R12)
+	MOVL   8(DX), CX
+	MOVL   24(R10), R8
+	MOVL   4(R10), AX
+	MOVL   4(DX), R11
+	MOVL   CX, 16(R12)
+	MOVL   R8, 20(R12)
+	MOVL   AX, 24(R12)
+	MOVL   R11, 28(R12)
+	MOVL   12(DX), CX
+	MOVL   12(R10), DX
+	MOVL   28(R10), R8
+	MOVL   8(R10), AX
+	MOVL   DX, 32(R12)
+	MOVL   CX, 36(R12)
+	MOVL   R8, 40(R12)
+	MOVL   AX, 44(R12)
+	MOVQ   $0x61707865, DX
+	MOVQ   $0x3320646e, CX
+	MOVQ   $0x79622d32, R8
+	MOVQ   $0x6b206574, AX
+	MOVL   DX, 48(R12)
+	MOVL   CX, 52(R12)
+	MOVL   R8, 56(R12)
+	MOVL   AX, 60(R12)
+	CMPQ   R9, $0x00000100
+	JB     BYTESBETWEEN1AND255
+	MOVOA  48(R12), X0
+	PSHUFL $0x55, X0, X1
+	PSHUFL $0xaa, X0, X2
+	PSHUFL $0xff, X0, X3
+	PSHUFL $0x00, X0, X0
+	MOVOA  X1, 64(R12)
+	MOVOA  X2, 80(R12)
+	MOVOA  X3, 96(R12)
+	MOVOA  X0, 112(R12)
+	MOVOA  (R12), X0
+	PSHUFL $0xaa, X0, X1
+	PSHUFL $0xff, X0, X2
+	PSHUFL $0x00, X0, X3
+	PSHUFL $0x55, X0, X0
+	MOVOA  X1, 128(R12)
+	MOVOA  X2, 144(R12)
+	MOVOA  X3, 160(R12)
+	MOVOA  X0, 176(R12)
+	MOVOA  16(R12), X0
+	PSHUFL $0xff, X0, X1
+	PSHUFL $0x55, X0, X2
+	PSHUFL $0xaa, X0, X0
+	MOVOA  X1, 192(R12)
+	MOVOA  X2, 208(R12)
+	MOVOA  X0, 224(R12)
+	MOVOA  32(R12), X0
+	PSHUFL $0x00, X0, X1
+	PSHUFL $0xaa, X0, X2
+	PSHUFL $0xff, X0, X0
+	MOVOA  X1, 240(R12)
+	MOVOA  X2, 256(R12)
+	MOVOA  X0, 272(R12)
 
-// func salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte)
-// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.
-TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment
-	MOVQ out+0(FP),DI
-	MOVQ in+8(FP),SI
-	MOVQ n+16(FP),DX
-	MOVQ nonce+24(FP),CX
-	MOVQ key+32(FP),R8
+BYTESATLEAST256:
+	MOVL  16(R12), DX
+	MOVL  36(R12), CX
+	MOVL  DX, 288(R12)
+	MOVL  CX, 304(R12)
+	SHLQ  $0x20, CX
+	ADDQ  CX, DX
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 292(R12)
+	MOVL  CX, 308(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 296(R12)
+	MOVL  CX, 312(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 300(R12)
+	MOVL  CX, 316(R12)
+	ADDQ  $0x01, DX
+	MOVQ  DX, CX
+	SHRQ  $0x20, CX
+	MOVL  DX, 16(R12)
+	MOVL  CX, 36(R12)
+	MOVQ  R9, 352(R12)
+	MOVQ  $0x00000014, DX
+	MOVOA 64(R12), X0
+	MOVOA 80(R12), X1
+	MOVOA 96(R12), X2
+	MOVOA 256(R12), X3
+	MOVOA 272(R12), X4
+	MOVOA 128(R12), X5
+	MOVOA 144(R12), X6
+	MOVOA 176(R12), X7
+	MOVOA 192(R12), X8
+	MOVOA 208(R12), X9
+	MOVOA 224(R12), X10
+	MOVOA 304(R12), X11
+	MOVOA 112(R12), X12
+	MOVOA 160(R12), X13
+	MOVOA 240(R12), X14
+	MOVOA 288(R12), X15
 
-	MOVQ SP,R12
-	ADDQ $31, R12
-	ANDQ $~31, R12
+MAINLOOP1:
+	MOVOA  X1, 320(R12)
+	MOVOA  X2, 336(R12)
+	MOVOA  X13, X1
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X14
+	PSRLL  $0x19, X2
+	PXOR   X2, X14
+	MOVOA  X7, X1
+	PADDL  X0, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X11
+	PSRLL  $0x19, X2
+	PXOR   X2, X11
+	MOVOA  X12, X1
+	PADDL  X14, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X15
+	PSRLL  $0x17, X2
+	PXOR   X2, X15
+	MOVOA  X0, X1
+	PADDL  X11, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X9
+	PSRLL  $0x17, X2
+	PXOR   X2, X9
+	MOVOA  X14, X1
+	PADDL  X15, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X13
+	PSRLL  $0x13, X2
+	PXOR   X2, X13
+	MOVOA  X11, X1
+	PADDL  X9, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X7
+	PSRLL  $0x13, X2
+	PXOR   X2, X7
+	MOVOA  X15, X1
+	PADDL  X13, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  320(R12), X1
+	MOVOA  X12, 320(R12)
+	MOVOA  X9, X2
+	PADDL  X7, X2
+	MOVOA  X2, X12
+	PSLLL  $0x12, X2
+	PXOR   X2, X0
+	PSRLL  $0x0e, X12
+	PXOR   X12, X0
+	MOVOA  X5, X2
+	PADDL  X1, X2
+	MOVOA  X2, X12
+	PSLLL  $0x07, X2
+	PXOR   X2, X3
+	PSRLL  $0x19, X12
+	PXOR   X12, X3
+	MOVOA  336(R12), X2
+	MOVOA  X0, 336(R12)
+	MOVOA  X6, X0
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X4
+	PSRLL  $0x19, X12
+	PXOR   X12, X4
+	MOVOA  X1, X0
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X10
+	PSRLL  $0x17, X12
+	PXOR   X12, X10
+	MOVOA  X2, X0
+	PADDL  X4, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X8
+	PSRLL  $0x17, X12
+	PXOR   X12, X8
+	MOVOA  X3, X0
+	PADDL  X10, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X5
+	PSRLL  $0x13, X12
+	PXOR   X12, X5
+	MOVOA  X4, X0
+	PADDL  X8, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X6
+	PSRLL  $0x13, X12
+	PXOR   X12, X6
+	MOVOA  X10, X0
+	PADDL  X5, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  320(R12), X0
+	MOVOA  X1, 320(R12)
+	MOVOA  X4, X1
+	PADDL  X0, X1
+	MOVOA  X1, X12
+	PSLLL  $0x07, X1
+	PXOR   X1, X7
+	PSRLL  $0x19, X12
+	PXOR   X12, X7
+	MOVOA  X8, X1
+	PADDL  X6, X1
+	MOVOA  X1, X12
+	PSLLL  $0x12, X1
+	PXOR   X1, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  336(R12), X12
+	MOVOA  X2, 336(R12)
+	MOVOA  X14, X1
+	PADDL  X12, X1
+	MOVOA  X1, X2
+	PSLLL  $0x07, X1
+	PXOR   X1, X5
+	PSRLL  $0x19, X2
+	PXOR   X2, X5
+	MOVOA  X0, X1
+	PADDL  X7, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X10
+	PSRLL  $0x17, X2
+	PXOR   X2, X10
+	MOVOA  X12, X1
+	PADDL  X5, X1
+	MOVOA  X1, X2
+	PSLLL  $0x09, X1
+	PXOR   X1, X8
+	PSRLL  $0x17, X2
+	PXOR   X2, X8
+	MOVOA  X7, X1
+	PADDL  X10, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X4
+	PSRLL  $0x13, X2
+	PXOR   X2, X4
+	MOVOA  X5, X1
+	PADDL  X8, X1
+	MOVOA  X1, X2
+	PSLLL  $0x0d, X1
+	PXOR   X1, X14
+	PSRLL  $0x13, X2
+	PXOR   X2, X14
+	MOVOA  X10, X1
+	PADDL  X4, X1
+	MOVOA  X1, X2
+	PSLLL  $0x12, X1
+	PXOR   X1, X0
+	PSRLL  $0x0e, X2
+	PXOR   X2, X0
+	MOVOA  320(R12), X1
+	MOVOA  X0, 320(R12)
+	MOVOA  X8, X0
+	PADDL  X14, X0
+	MOVOA  X0, X2
+	PSLLL  $0x12, X0
+	PXOR   X0, X12
+	PSRLL  $0x0e, X2
+	PXOR   X2, X12
+	MOVOA  X11, X0
+	PADDL  X1, X0
+	MOVOA  X0, X2
+	PSLLL  $0x07, X0
+	PXOR   X0, X6
+	PSRLL  $0x19, X2
+	PXOR   X2, X6
+	MOVOA  336(R12), X2
+	MOVOA  X12, 336(R12)
+	MOVOA  X3, X0
+	PADDL  X2, X0
+	MOVOA  X0, X12
+	PSLLL  $0x07, X0
+	PXOR   X0, X13
+	PSRLL  $0x19, X12
+	PXOR   X12, X13
+	MOVOA  X1, X0
+	PADDL  X6, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X15
+	PSRLL  $0x17, X12
+	PXOR   X12, X15
+	MOVOA  X2, X0
+	PADDL  X13, X0
+	MOVOA  X0, X12
+	PSLLL  $0x09, X0
+	PXOR   X0, X9
+	PSRLL  $0x17, X12
+	PXOR   X12, X9
+	MOVOA  X6, X0
+	PADDL  X15, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X11
+	PSRLL  $0x13, X12
+	PXOR   X12, X11
+	MOVOA  X13, X0
+	PADDL  X9, X0
+	MOVOA  X0, X12
+	PSLLL  $0x0d, X0
+	PXOR   X0, X3
+	PSRLL  $0x13, X12
+	PXOR   X12, X3
+	MOVOA  X15, X0
+	PADDL  X11, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X1
+	PSRLL  $0x0e, X12
+	PXOR   X12, X1
+	MOVOA  X9, X0
+	PADDL  X3, X0
+	MOVOA  X0, X12
+	PSLLL  $0x12, X0
+	PXOR   X0, X2
+	PSRLL  $0x0e, X12
+	PXOR   X12, X2
+	MOVOA  320(R12), X12
+	MOVOA  336(R12), X0
+	SUBQ   $0x02, DX
+	JA     MAINLOOP1
+	PADDL  112(R12), X12
+	PADDL  176(R12), X7
+	PADDL  224(R12), X10
+	PADDL  272(R12), X4
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   (SI), DX
+	XORL   4(SI), CX
+	XORL   8(SI), R8
+	XORL   12(SI), R9
+	MOVL   DX, (DI)
+	MOVL   CX, 4(DI)
+	MOVL   R8, 8(DI)
+	MOVL   R9, 12(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   64(SI), DX
+	XORL   68(SI), CX
+	XORL   72(SI), R8
+	XORL   76(SI), R9
+	MOVL   DX, 64(DI)
+	MOVL   CX, 68(DI)
+	MOVL   R8, 72(DI)
+	MOVL   R9, 76(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	PSHUFL $0x39, X12, X12
+	PSHUFL $0x39, X7, X7
+	PSHUFL $0x39, X10, X10
+	PSHUFL $0x39, X4, X4
+	XORL   128(SI), DX
+	XORL   132(SI), CX
+	XORL   136(SI), R8
+	XORL   140(SI), R9
+	MOVL   DX, 128(DI)
+	MOVL   CX, 132(DI)
+	MOVL   R8, 136(DI)
+	MOVL   R9, 140(DI)
+	MOVD   X12, DX
+	MOVD   X7, CX
+	MOVD   X10, R8
+	MOVD   X4, R9
+	XORL   192(SI), DX
+	XORL   196(SI), CX
+	XORL   200(SI), R8
+	XORL   204(SI), R9
+	MOVL   DX, 192(DI)
+	MOVL   CX, 196(DI)
+	MOVL   R8, 200(DI)
+	MOVL   R9, 204(DI)
+	PADDL  240(R12), X14
+	PADDL  64(R12), X0
+	PADDL  128(R12), X5
+	PADDL  192(R12), X8
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   16(SI), DX
+	XORL   20(SI), CX
+	XORL   24(SI), R8
+	XORL   28(SI), R9
+	MOVL   DX, 16(DI)
+	MOVL   CX, 20(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 28(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   80(SI), DX
+	XORL   84(SI), CX
+	XORL   88(SI), R8
+	XORL   92(SI), R9
+	MOVL   DX, 80(DI)
+	MOVL   CX, 84(DI)
+	MOVL   R8, 88(DI)
+	MOVL   R9, 92(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	PSHUFL $0x39, X14, X14
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X5, X5
+	PSHUFL $0x39, X8, X8
+	XORL   144(SI), DX
+	XORL   148(SI), CX
+	XORL   152(SI), R8
+	XORL   156(SI), R9
+	MOVL   DX, 144(DI)
+	MOVL   CX, 148(DI)
+	MOVL   R8, 152(DI)
+	MOVL   R9, 156(DI)
+	MOVD   X14, DX
+	MOVD   X0, CX
+	MOVD   X5, R8
+	MOVD   X8, R9
+	XORL   208(SI), DX
+	XORL   212(SI), CX
+	XORL   216(SI), R8
+	XORL   220(SI), R9
+	MOVL   DX, 208(DI)
+	MOVL   CX, 212(DI)
+	MOVL   R8, 216(DI)
+	MOVL   R9, 220(DI)
+	PADDL  288(R12), X15
+	PADDL  304(R12), X11
+	PADDL  80(R12), X1
+	PADDL  144(R12), X6
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   32(SI), DX
+	XORL   36(SI), CX
+	XORL   40(SI), R8
+	XORL   44(SI), R9
+	MOVL   DX, 32(DI)
+	MOVL   CX, 36(DI)
+	MOVL   R8, 40(DI)
+	MOVL   R9, 44(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   96(SI), DX
+	XORL   100(SI), CX
+	XORL   104(SI), R8
+	XORL   108(SI), R9
+	MOVL   DX, 96(DI)
+	MOVL   CX, 100(DI)
+	MOVL   R8, 104(DI)
+	MOVL   R9, 108(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	PSHUFL $0x39, X15, X15
+	PSHUFL $0x39, X11, X11
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X6, X6
+	XORL   160(SI), DX
+	XORL   164(SI), CX
+	XORL   168(SI), R8
+	XORL   172(SI), R9
+	MOVL   DX, 160(DI)
+	MOVL   CX, 164(DI)
+	MOVL   R8, 168(DI)
+	MOVL   R9, 172(DI)
+	MOVD   X15, DX
+	MOVD   X11, CX
+	MOVD   X1, R8
+	MOVD   X6, R9
+	XORL   224(SI), DX
+	XORL   228(SI), CX
+	XORL   232(SI), R8
+	XORL   236(SI), R9
+	MOVL   DX, 224(DI)
+	MOVL   CX, 228(DI)
+	MOVL   R8, 232(DI)
+	MOVL   R9, 236(DI)
+	PADDL  160(R12), X13
+	PADDL  208(R12), X9
+	PADDL  256(R12), X3
+	PADDL  96(R12), X2
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   48(SI), DX
+	XORL   52(SI), CX
+	XORL   56(SI), R8
+	XORL   60(SI), R9
+	MOVL   DX, 48(DI)
+	MOVL   CX, 52(DI)
+	MOVL   R8, 56(DI)
+	MOVL   R9, 60(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   112(SI), DX
+	XORL   116(SI), CX
+	XORL   120(SI), R8
+	XORL   124(SI), R9
+	MOVL   DX, 112(DI)
+	MOVL   CX, 116(DI)
+	MOVL   R8, 120(DI)
+	MOVL   R9, 124(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	PSHUFL $0x39, X13, X13
+	PSHUFL $0x39, X9, X9
+	PSHUFL $0x39, X3, X3
+	PSHUFL $0x39, X2, X2
+	XORL   176(SI), DX
+	XORL   180(SI), CX
+	XORL   184(SI), R8
+	XORL   188(SI), R9
+	MOVL   DX, 176(DI)
+	MOVL   CX, 180(DI)
+	MOVL   R8, 184(DI)
+	MOVL   R9, 188(DI)
+	MOVD   X13, DX
+	MOVD   X9, CX
+	MOVD   X3, R8
+	MOVD   X2, R9
+	XORL   240(SI), DX
+	XORL   244(SI), CX
+	XORL   248(SI), R8
+	XORL   252(SI), R9
+	MOVL   DX, 240(DI)
+	MOVL   CX, 244(DI)
+	MOVL   R8, 248(DI)
+	MOVL   R9, 252(DI)
+	MOVQ   352(R12), R9
+	SUBQ   $0x00000100, R9
+	ADDQ   $0x00000100, SI
+	ADDQ   $0x00000100, DI
+	CMPQ   R9, $0x00000100
+	JAE    BYTESATLEAST256
+	CMPQ   R9, $0x00
+	JBE    DONE
 
-	MOVQ DX,R9
-	MOVQ CX,DX
-	MOVQ R8,R10
-	CMPQ R9,$0
-	JBE DONE
-	START:
-	MOVL 20(R10),CX
-	MOVL 0(R10),R8
-	MOVL 0(DX),AX
-	MOVL 16(R10),R11
-	MOVL CX,0(R12)
-	MOVL R8, 4 (R12)
-	MOVL AX, 8 (R12)
-	MOVL R11, 12 (R12)
-	MOVL 8(DX),CX
-	MOVL 24(R10),R8
-	MOVL 4(R10),AX
-	MOVL 4(DX),R11
-	MOVL CX,16(R12)
-	MOVL R8, 20 (R12)
-	MOVL AX, 24 (R12)
-	MOVL R11, 28 (R12)
-	MOVL 12(DX),CX
-	MOVL 12(R10),DX
-	MOVL 28(R10),R8
-	MOVL 8(R10),AX
-	MOVL DX,32(R12)
-	MOVL CX, 36 (R12)
-	MOVL R8, 40 (R12)
-	MOVL AX, 44 (R12)
-	MOVQ $1634760805,DX
-	MOVQ $857760878,CX
-	MOVQ $2036477234,R8
-	MOVQ $1797285236,AX
-	MOVL DX,48(R12)
-	MOVL CX, 52 (R12)
-	MOVL R8, 56 (R12)
-	MOVL AX, 60 (R12)
-	CMPQ R9,$256
-	JB BYTESBETWEEN1AND255
-	MOVOA 48(R12),X0
-	PSHUFL $0X55,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X3
-	PSHUFL $0X00,X0,X0
-	MOVOA X1,64(R12)
-	MOVOA X2,80(R12)
-	MOVOA X3,96(R12)
-	MOVOA X0,112(R12)
-	MOVOA 0(R12),X0
-	PSHUFL $0XAA,X0,X1
-	PSHUFL $0XFF,X0,X2
-	PSHUFL $0X00,X0,X3
-	PSHUFL $0X55,X0,X0
-	MOVOA X1,128(R12)
-	MOVOA X2,144(R12)
-	MOVOA X3,160(R12)
-	MOVOA X0,176(R12)
-	MOVOA 16(R12),X0
-	PSHUFL $0XFF,X0,X1
-	PSHUFL $0X55,X0,X2
-	PSHUFL $0XAA,X0,X0
-	MOVOA X1,192(R12)
-	MOVOA X2,208(R12)
-	MOVOA X0,224(R12)
-	MOVOA 32(R12),X0
-	PSHUFL $0X00,X0,X1
-	PSHUFL $0XAA,X0,X2
-	PSHUFL $0XFF,X0,X0
-	MOVOA X1,240(R12)
-	MOVOA X2,256(R12)
-	MOVOA X0,272(R12)
-	BYTESATLEAST256:
-	MOVL 16(R12),DX
-	MOVL  36 (R12),CX
-	MOVL DX,288(R12)
-	MOVL CX,304(R12)
-	SHLQ $32,CX
-	ADDQ CX,DX
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 292 (R12)
-	MOVL CX, 308 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 296 (R12)
-	MOVL CX, 312 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX, 300 (R12)
-	MOVL CX, 316 (R12)
-	ADDQ $1,DX
-	MOVQ DX,CX
-	SHRQ $32,CX
-	MOVL DX,16(R12)
-	MOVL CX, 36 (R12)
-	MOVQ R9,352(R12)
-	MOVQ $20,DX
-	MOVOA 64(R12),X0
-	MOVOA 80(R12),X1
-	MOVOA 96(R12),X2
-	MOVOA 256(R12),X3
-	MOVOA 272(R12),X4
-	MOVOA 128(R12),X5
-	MOVOA 144(R12),X6
-	MOVOA 176(R12),X7
-	MOVOA 192(R12),X8
-	MOVOA 208(R12),X9
-	MOVOA 224(R12),X10
-	MOVOA 304(R12),X11
-	MOVOA 112(R12),X12
-	MOVOA 160(R12),X13
-	MOVOA 240(R12),X14
-	MOVOA 288(R12),X15
-	MAINLOOP1:
-	MOVOA X1,320(R12)
-	MOVOA X2,336(R12)
-	MOVOA X13,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X14
-	PSRLL $25,X2
-	PXOR X2,X14
-	MOVOA X7,X1
-	PADDL X0,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X11
-	PSRLL $25,X2
-	PXOR X2,X11
-	MOVOA X12,X1
-	PADDL X14,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X15
-	PSRLL $23,X2
-	PXOR X2,X15
-	MOVOA X0,X1
-	PADDL X11,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X9
-	PSRLL $23,X2
-	PXOR X2,X9
-	MOVOA X14,X1
-	PADDL X15,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X13
-	PSRLL $19,X2
-	PXOR X2,X13
-	MOVOA X11,X1
-	PADDL X9,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X7
-	PSRLL $19,X2
-	PXOR X2,X7
-	MOVOA X15,X1
-	PADDL X13,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA 320(R12),X1
-	MOVOA X12,320(R12)
-	MOVOA X9,X2
-	PADDL X7,X2
-	MOVOA X2,X12
-	PSLLL $18,X2
-	PXOR X2,X0
-	PSRLL $14,X12
-	PXOR X12,X0
-	MOVOA X5,X2
-	PADDL X1,X2
-	MOVOA X2,X12
-	PSLLL $7,X2
-	PXOR X2,X3
-	PSRLL $25,X12
-	PXOR X12,X3
-	MOVOA 336(R12),X2
-	MOVOA X0,336(R12)
-	MOVOA X6,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X4
-	PSRLL $25,X12
-	PXOR X12,X4
-	MOVOA X1,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X10
-	PSRLL $23,X12
-	PXOR X12,X10
-	MOVOA X2,X0
-	PADDL X4,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X8
-	PSRLL $23,X12
-	PXOR X12,X8
-	MOVOA X3,X0
-	PADDL X10,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X5
-	PSRLL $19,X12
-	PXOR X12,X5
-	MOVOA X4,X0
-	PADDL X8,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X6
-	PSRLL $19,X12
-	PXOR X12,X6
-	MOVOA X10,X0
-	PADDL X5,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA 320(R12),X0
-	MOVOA X1,320(R12)
-	MOVOA X4,X1
-	PADDL X0,X1
-	MOVOA X1,X12
-	PSLLL $7,X1
-	PXOR X1,X7
-	PSRLL $25,X12
-	PXOR X12,X7
-	MOVOA X8,X1
-	PADDL X6,X1
-	MOVOA X1,X12
-	PSLLL $18,X1
-	PXOR X1,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 336(R12),X12
-	MOVOA X2,336(R12)
-	MOVOA X14,X1
-	PADDL X12,X1
-	MOVOA X1,X2
-	PSLLL $7,X1
-	PXOR X1,X5
-	PSRLL $25,X2
-	PXOR X2,X5
-	MOVOA X0,X1
-	PADDL X7,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X10
-	PSRLL $23,X2
-	PXOR X2,X10
-	MOVOA X12,X1
-	PADDL X5,X1
-	MOVOA X1,X2
-	PSLLL $9,X1
-	PXOR X1,X8
-	PSRLL $23,X2
-	PXOR X2,X8
-	MOVOA X7,X1
-	PADDL X10,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X4
-	PSRLL $19,X2
-	PXOR X2,X4
-	MOVOA X5,X1
-	PADDL X8,X1
-	MOVOA X1,X2
-	PSLLL $13,X1
-	PXOR X1,X14
-	PSRLL $19,X2
-	PXOR X2,X14
-	MOVOA X10,X1
-	PADDL X4,X1
-	MOVOA X1,X2
-	PSLLL $18,X1
-	PXOR X1,X0
-	PSRLL $14,X2
-	PXOR X2,X0
-	MOVOA 320(R12),X1
-	MOVOA X0,320(R12)
-	MOVOA X8,X0
-	PADDL X14,X0
-	MOVOA X0,X2
-	PSLLL $18,X0
-	PXOR X0,X12
-	PSRLL $14,X2
-	PXOR X2,X12
-	MOVOA X11,X0
-	PADDL X1,X0
-	MOVOA X0,X2
-	PSLLL $7,X0
-	PXOR X0,X6
-	PSRLL $25,X2
-	PXOR X2,X6
-	MOVOA 336(R12),X2
-	MOVOA X12,336(R12)
-	MOVOA X3,X0
-	PADDL X2,X0
-	MOVOA X0,X12
-	PSLLL $7,X0
-	PXOR X0,X13
-	PSRLL $25,X12
-	PXOR X12,X13
-	MOVOA X1,X0
-	PADDL X6,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X15
-	PSRLL $23,X12
-	PXOR X12,X15
-	MOVOA X2,X0
-	PADDL X13,X0
-	MOVOA X0,X12
-	PSLLL $9,X0
-	PXOR X0,X9
-	PSRLL $23,X12
-	PXOR X12,X9
-	MOVOA X6,X0
-	PADDL X15,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X11
-	PSRLL $19,X12
-	PXOR X12,X11
-	MOVOA X13,X0
-	PADDL X9,X0
-	MOVOA X0,X12
-	PSLLL $13,X0
-	PXOR X0,X3
-	PSRLL $19,X12
-	PXOR X12,X3
-	MOVOA X15,X0
-	PADDL X11,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X1
-	PSRLL $14,X12
-	PXOR X12,X1
-	MOVOA X9,X0
-	PADDL X3,X0
-	MOVOA X0,X12
-	PSLLL $18,X0
-	PXOR X0,X2
-	PSRLL $14,X12
-	PXOR X12,X2
-	MOVOA 320(R12),X12
-	MOVOA 336(R12),X0
-	SUBQ $2,DX
-	JA MAINLOOP1
-	PADDL 112(R12),X12
-	PADDL 176(R12),X7
-	PADDL 224(R12),X10
-	PADDL 272(R12),X4
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 0(SI),DX
-	XORL 4(SI),CX
-	XORL 8(SI),R8
-	XORL 12(SI),R9
-	MOVL DX,0(DI)
-	MOVL CX,4(DI)
-	MOVL R8,8(DI)
-	MOVL R9,12(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 64(SI),DX
-	XORL 68(SI),CX
-	XORL 72(SI),R8
-	XORL 76(SI),R9
-	MOVL DX,64(DI)
-	MOVL CX,68(DI)
-	MOVL R8,72(DI)
-	MOVL R9,76(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	PSHUFL $0X39,X12,X12
-	PSHUFL $0X39,X7,X7
-	PSHUFL $0X39,X10,X10
-	PSHUFL $0X39,X4,X4
-	XORL 128(SI),DX
-	XORL 132(SI),CX
-	XORL 136(SI),R8
-	XORL 140(SI),R9
-	MOVL DX,128(DI)
-	MOVL CX,132(DI)
-	MOVL R8,136(DI)
-	MOVL R9,140(DI)
-	MOVD X12,DX
-	MOVD X7,CX
-	MOVD X10,R8
-	MOVD X4,R9
-	XORL 192(SI),DX
-	XORL 196(SI),CX
-	XORL 200(SI),R8
-	XORL 204(SI),R9
-	MOVL DX,192(DI)
-	MOVL CX,196(DI)
-	MOVL R8,200(DI)
-	MOVL R9,204(DI)
-	PADDL 240(R12),X14
-	PADDL 64(R12),X0
-	PADDL 128(R12),X5
-	PADDL 192(R12),X8
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 16(SI),DX
-	XORL 20(SI),CX
-	XORL 24(SI),R8
-	XORL 28(SI),R9
-	MOVL DX,16(DI)
-	MOVL CX,20(DI)
-	MOVL R8,24(DI)
-	MOVL R9,28(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 80(SI),DX
-	XORL 84(SI),CX
-	XORL 88(SI),R8
-	XORL 92(SI),R9
-	MOVL DX,80(DI)
-	MOVL CX,84(DI)
-	MOVL R8,88(DI)
-	MOVL R9,92(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	PSHUFL $0X39,X14,X14
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X5,X5
-	PSHUFL $0X39,X8,X8
-	XORL 144(SI),DX
-	XORL 148(SI),CX
-	XORL 152(SI),R8
-	XORL 156(SI),R9
-	MOVL DX,144(DI)
-	MOVL CX,148(DI)
-	MOVL R8,152(DI)
-	MOVL R9,156(DI)
-	MOVD X14,DX
-	MOVD X0,CX
-	MOVD X5,R8
-	MOVD X8,R9
-	XORL 208(SI),DX
-	XORL 212(SI),CX
-	XORL 216(SI),R8
-	XORL 220(SI),R9
-	MOVL DX,208(DI)
-	MOVL CX,212(DI)
-	MOVL R8,216(DI)
-	MOVL R9,220(DI)
-	PADDL 288(R12),X15
-	PADDL 304(R12),X11
-	PADDL 80(R12),X1
-	PADDL 144(R12),X6
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 32(SI),DX
-	XORL 36(SI),CX
-	XORL 40(SI),R8
-	XORL 44(SI),R9
-	MOVL DX,32(DI)
-	MOVL CX,36(DI)
-	MOVL R8,40(DI)
-	MOVL R9,44(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 96(SI),DX
-	XORL 100(SI),CX
-	XORL 104(SI),R8
-	XORL 108(SI),R9
-	MOVL DX,96(DI)
-	MOVL CX,100(DI)
-	MOVL R8,104(DI)
-	MOVL R9,108(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	PSHUFL $0X39,X15,X15
-	PSHUFL $0X39,X11,X11
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X6,X6
-	XORL 160(SI),DX
-	XORL 164(SI),CX
-	XORL 168(SI),R8
-	XORL 172(SI),R9
-	MOVL DX,160(DI)
-	MOVL CX,164(DI)
-	MOVL R8,168(DI)
-	MOVL R9,172(DI)
-	MOVD X15,DX
-	MOVD X11,CX
-	MOVD X1,R8
-	MOVD X6,R9
-	XORL 224(SI),DX
-	XORL 228(SI),CX
-	XORL 232(SI),R8
-	XORL 236(SI),R9
-	MOVL DX,224(DI)
-	MOVL CX,228(DI)
-	MOVL R8,232(DI)
-	MOVL R9,236(DI)
-	PADDL 160(R12),X13
-	PADDL 208(R12),X9
-	PADDL 256(R12),X3
-	PADDL 96(R12),X2
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 48(SI),DX
-	XORL 52(SI),CX
-	XORL 56(SI),R8
-	XORL 60(SI),R9
-	MOVL DX,48(DI)
-	MOVL CX,52(DI)
-	MOVL R8,56(DI)
-	MOVL R9,60(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 112(SI),DX
-	XORL 116(SI),CX
-	XORL 120(SI),R8
-	XORL 124(SI),R9
-	MOVL DX,112(DI)
-	MOVL CX,116(DI)
-	MOVL R8,120(DI)
-	MOVL R9,124(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	PSHUFL $0X39,X13,X13
-	PSHUFL $0X39,X9,X9
-	PSHUFL $0X39,X3,X3
-	PSHUFL $0X39,X2,X2
-	XORL 176(SI),DX
-	XORL 180(SI),CX
-	XORL 184(SI),R8
-	XORL 188(SI),R9
-	MOVL DX,176(DI)
-	MOVL CX,180(DI)
-	MOVL R8,184(DI)
-	MOVL R9,188(DI)
-	MOVD X13,DX
-	MOVD X9,CX
-	MOVD X3,R8
-	MOVD X2,R9
-	XORL 240(SI),DX
-	XORL 244(SI),CX
-	XORL 248(SI),R8
-	XORL 252(SI),R9
-	MOVL DX,240(DI)
-	MOVL CX,244(DI)
-	MOVL R8,248(DI)
-	MOVL R9,252(DI)
-	MOVQ 352(R12),R9
-	SUBQ $256,R9
-	ADDQ $256,SI
-	ADDQ $256,DI
-	CMPQ R9,$256
-	JAE BYTESATLEAST256
-	CMPQ R9,$0
-	JBE DONE
-	BYTESBETWEEN1AND255:
-	CMPQ R9,$64
-	JAE NOCOPY
-	MOVQ DI,DX
-	LEAQ 360(R12),DI
-	MOVQ R9,CX
+BYTESBETWEEN1AND255:
+	CMPQ R9, $0x40
+	JAE  NOCOPY
+	MOVQ DI, DX
+	LEAQ 360(R12), DI
+	MOVQ R9, CX
 	REP; MOVSB
-	LEAQ 360(R12),DI
-	LEAQ 360(R12),SI
-	NOCOPY:
-	MOVQ R9,352(R12)
-	MOVOA 48(R12),X0
-	MOVOA 0(R12),X1
-	MOVOA 16(R12),X2
-	MOVOA 32(R12),X3
-	MOVOA X1,X4
-	MOVQ $20,CX
-	MAINLOOP2:
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X3
-	PXOR X6,X3
-	PADDL X3,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X3,X3
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X1
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X3,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X1,X1
-	PXOR X6,X0
-	PADDL X0,X4
-	MOVOA X0,X5
-	MOVOA X4,X6
-	PSLLL $7,X4
-	PSRLL $25,X6
-	PXOR X4,X1
-	PXOR X6,X1
-	PADDL X1,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $9,X5
-	PSRLL $23,X6
-	PXOR X5,X2
-	PSHUFL $0X93,X1,X1
-	PXOR X6,X2
-	PADDL X2,X4
-	MOVOA X2,X5
-	MOVOA X4,X6
-	PSLLL $13,X4
-	PSRLL $19,X6
-	PXOR X4,X3
-	PSHUFL $0X4E,X2,X2
-	PXOR X6,X3
-	SUBQ $4,CX
-	PADDL X3,X5
-	MOVOA X1,X4
-	MOVOA X5,X6
-	PSLLL $18,X5
-	PXOR X7,X7
-	PSRLL $14,X6
-	PXOR X5,X0
-	PSHUFL $0X39,X3,X3
-	PXOR X6,X0
-	JA MAINLOOP2
-	PADDL 48(R12),X0
-	PADDL 0(R12),X1
-	PADDL 16(R12),X2
-	PADDL 32(R12),X3
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 0(SI),CX
-	XORL 48(SI),R8
-	XORL 32(SI),R9
-	XORL 16(SI),AX
-	MOVL CX,0(DI)
-	MOVL R8,48(DI)
-	MOVL R9,32(DI)
-	MOVL AX,16(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 20(SI),CX
-	XORL 4(SI),R8
-	XORL 52(SI),R9
-	XORL 36(SI),AX
-	MOVL CX,20(DI)
-	MOVL R8,4(DI)
-	MOVL R9,52(DI)
-	MOVL AX,36(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	PSHUFL $0X39,X0,X0
-	PSHUFL $0X39,X1,X1
-	PSHUFL $0X39,X2,X2
-	PSHUFL $0X39,X3,X3
-	XORL 40(SI),CX
-	XORL 24(SI),R8
-	XORL 8(SI),R9
-	XORL 56(SI),AX
-	MOVL CX,40(DI)
-	MOVL R8,24(DI)
-	MOVL R9,8(DI)
-	MOVL AX,56(DI)
-	MOVD X0,CX
-	MOVD X1,R8
-	MOVD X2,R9
-	MOVD X3,AX
-	XORL 60(SI),CX
-	XORL 44(SI),R8
-	XORL 28(SI),R9
-	XORL 12(SI),AX
-	MOVL CX,60(DI)
-	MOVL R8,44(DI)
-	MOVL R9,28(DI)
-	MOVL AX,12(DI)
-	MOVQ 352(R12),R9
-	MOVL 16(R12),CX
-	MOVL  36 (R12),R8
-	ADDQ $1,CX
-	SHLQ $32,R8
-	ADDQ R8,CX
-	MOVQ CX,R8
-	SHRQ $32,R8
-	MOVL CX,16(R12)
-	MOVL R8, 36 (R12)
-	CMPQ R9,$64
-	JA BYTESATLEAST65
-	JAE BYTESATLEAST64
-	MOVQ DI,SI
-	MOVQ DX,DI
-	MOVQ R9,CX
+	LEAQ 360(R12), DI
+	LEAQ 360(R12), SI
+
+NOCOPY:
+	MOVQ  R9, 352(R12)
+	MOVOA 48(R12), X0
+	MOVOA (R12), X1
+	MOVOA 16(R12), X2
+	MOVOA 32(R12), X3
+	MOVOA X1, X4
+	MOVQ  $0x00000014, CX
+
+MAINLOOP2:
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X3
+	PXOR   X6, X3
+	PADDL  X3, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X3, X3
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X1
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X3, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X1, X1
+	PXOR   X6, X0
+	PADDL  X0, X4
+	MOVOA  X0, X5
+	MOVOA  X4, X6
+	PSLLL  $0x07, X4
+	PSRLL  $0x19, X6
+	PXOR   X4, X1
+	PXOR   X6, X1
+	PADDL  X1, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x09, X5
+	PSRLL  $0x17, X6
+	PXOR   X5, X2
+	PSHUFL $0x93, X1, X1
+	PXOR   X6, X2
+	PADDL  X2, X4
+	MOVOA  X2, X5
+	MOVOA  X4, X6
+	PSLLL  $0x0d, X4
+	PSRLL  $0x13, X6
+	PXOR   X4, X3
+	PSHUFL $0x4e, X2, X2
+	PXOR   X6, X3
+	SUBQ   $0x04, CX
+	PADDL  X3, X5
+	MOVOA  X1, X4
+	MOVOA  X5, X6
+	PSLLL  $0x12, X5
+	PXOR   X7, X7
+	PSRLL  $0x0e, X6
+	PXOR   X5, X0
+	PSHUFL $0x39, X3, X3
+	PXOR   X6, X0
+	JA     MAINLOOP2
+	PADDL  48(R12), X0
+	PADDL  (R12), X1
+	PADDL  16(R12), X2
+	PADDL  32(R12), X3
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   (SI), CX
+	XORL   48(SI), R8
+	XORL   32(SI), R9
+	XORL   16(SI), AX
+	MOVL   CX, (DI)
+	MOVL   R8, 48(DI)
+	MOVL   R9, 32(DI)
+	MOVL   AX, 16(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   20(SI), CX
+	XORL   4(SI), R8
+	XORL   52(SI), R9
+	XORL   36(SI), AX
+	MOVL   CX, 20(DI)
+	MOVL   R8, 4(DI)
+	MOVL   R9, 52(DI)
+	MOVL   AX, 36(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	PSHUFL $0x39, X0, X0
+	PSHUFL $0x39, X1, X1
+	PSHUFL $0x39, X2, X2
+	PSHUFL $0x39, X3, X3
+	XORL   40(SI), CX
+	XORL   24(SI), R8
+	XORL   8(SI), R9
+	XORL   56(SI), AX
+	MOVL   CX, 40(DI)
+	MOVL   R8, 24(DI)
+	MOVL   R9, 8(DI)
+	MOVL   AX, 56(DI)
+	MOVD   X0, CX
+	MOVD   X1, R8
+	MOVD   X2, R9
+	MOVD   X3, AX
+	XORL   60(SI), CX
+	XORL   44(SI), R8
+	XORL   28(SI), R9
+	XORL   12(SI), AX
+	MOVL   CX, 60(DI)
+	MOVL   R8, 44(DI)
+	MOVL   R9, 28(DI)
+	MOVL   AX, 12(DI)
+	MOVQ   352(R12), R9
+	MOVL   16(R12), CX
+	MOVL   36(R12), R8
+	ADDQ   $0x01, CX
+	SHLQ   $0x20, R8
+	ADDQ   R8, CX
+	MOVQ   CX, R8
+	SHRQ   $0x20, R8
+	MOVL   CX, 16(R12)
+	MOVL   R8, 36(R12)
+	CMPQ   R9, $0x40
+	JA     BYTESATLEAST65
+	JAE    BYTESATLEAST64
+	MOVQ   DI, SI
+	MOVQ   DX, DI
+	MOVQ   R9, CX
 	REP; MOVSB
-	BYTESATLEAST64:
-	DONE:
+
+BYTESATLEAST64:
+DONE:
 	RET
-	BYTESATLEAST65:
-	SUBQ $64,R9
-	ADDQ $64,DI
-	ADDQ $64,SI
-	JMP BYTESBETWEEN1AND255
+
+BYTESATLEAST65:
+	SUBQ $0x40, R9
+	ADDQ $0x40, DI
+	ADDQ $0x40, SI
+	JMP  BYTESBETWEEN1AND255
diff --git a/vendor/golang.org/x/crypto/ssh/client_auth.go b/vendor/golang.org/x/crypto/ssh/client_auth.go
index 9486c598..b86dde15 100644
--- a/vendor/golang.org/x/crypto/ssh/client_auth.go
+++ b/vendor/golang.org/x/crypto/ssh/client_auth.go
@@ -71,6 +71,10 @@ func (c *connection) clientAuthenticate(config *ClientConfig) error {
 	for auth := AuthMethod(new(noneAuth)); auth != nil; {
 		ok, methods, err := auth.auth(sessionID, config.User, c.transport, config.Rand, extensions)
 		if err != nil {
+			// On disconnect, return error immediately
+			if _, ok := err.(*disconnectMsg); ok {
+				return err
+			}
 			// We return the error later if there is no other method left to
 			// try.
 			ok = authFailure
@@ -551,6 +555,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe
 	}
 
 	gotMsgExtInfo := false
+	gotUserAuthInfoRequest := false
 	for {
 		packet, err := c.readPacket()
 		if err != nil {
@@ -581,6 +586,9 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe
 			if msg.PartialSuccess {
 				return authPartialSuccess, msg.Methods, nil
 			}
+			if !gotUserAuthInfoRequest {
+				return authFailure, msg.Methods, unexpectedMessageError(msgUserAuthInfoRequest, packet[0])
+			}
 			return authFailure, msg.Methods, nil
 		case msgUserAuthSuccess:
 			return authSuccess, nil, nil
@@ -592,6 +600,7 @@ func (cb KeyboardInteractiveChallenge) auth(session []byte, user string, c packe
 		if err := Unmarshal(packet, &msg); err != nil {
 			return authFailure, nil, err
 		}
+		gotUserAuthInfoRequest = true
 
 		// Manually unpack the prompt/echo pairs.
 		rest := msg.Prompts
diff --git a/vendor/golang.org/x/crypto/ssh/doc.go b/vendor/golang.org/x/crypto/ssh/doc.go
index edbe6334..f5d352fe 100644
--- a/vendor/golang.org/x/crypto/ssh/doc.go
+++ b/vendor/golang.org/x/crypto/ssh/doc.go
@@ -20,4 +20,4 @@ References:
 This package does not fall under the stability promise of the Go language itself,
 so its API may be changed when pressing needs arise.
 */
-package ssh // import "golang.org/x/crypto/ssh"
+package ssh
diff --git a/vendor/golang.org/x/crypto/ssh/keys.go b/vendor/golang.org/x/crypto/ssh/keys.go
index 7967665f..98e6706d 100644
--- a/vendor/golang.org/x/crypto/ssh/keys.go
+++ b/vendor/golang.org/x/crypto/ssh/keys.go
@@ -488,7 +488,49 @@ func (r *rsaPublicKey) Verify(data []byte, sig *Signature) error {
 	h := hash.New()
 	h.Write(data)
 	digest := h.Sum(nil)
-	return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, sig.Blob)
+
+	// Signatures in PKCS1v15 must match the key's modulus in
+	// length. However with SSH, some signers provide RSA
+	// signatures which are missing the MSB 0's of the bignum
+	// represented. With ssh-rsa signatures, this is encouraged by
+	// the spec (even though e.g. OpenSSH will give the full
+	// length unconditionally). With rsa-sha2-* signatures, the
+	// verifier is allowed to support these, even though they are
+	// out of spec. See RFC 4253 Section 6.6 for ssh-rsa and RFC
+	// 8332 Section 3 for rsa-sha2-* details.
+	//
+	// In practice:
+	// * OpenSSH always allows "short" signatures:
+	//   https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L526
+	//   but always generates padded signatures:
+	//   https://github.com/openssh/openssh-portable/blob/V_9_8_P1/ssh-rsa.c#L439
+	//
+	// * PuTTY versions 0.81 and earlier will generate short
+	//   signatures for all RSA signature variants. Note that
+	//   PuTTY is embedded in other software, such as WinSCP and
+	//   FileZilla. At the time of writing, a patch has been
+	//   applied to PuTTY to generate padded signatures for
+	//   rsa-sha2-*, but not yet released:
+	//   https://git.tartarus.org/?p=simon/putty.git;a=commitdiff;h=a5bcf3d384e1bf15a51a6923c3724cbbee022d8e
+	//
+	// * SSH.NET versions 2024.0.0 and earlier will generate short
+	//   signatures for all RSA signature variants, fixed in 2024.1.0:
+	//   https://github.com/sshnet/SSH.NET/releases/tag/2024.1.0
+	//
+	// As a result, we pad these up to the key size by inserting
+	// leading 0's.
+	//
+	// Note that support for short signatures with rsa-sha2-* may
+	// be removed in the future due to such signatures not being
+	// allowed by the spec.
+	blob := sig.Blob
+	keySize := (*rsa.PublicKey)(r).Size()
+	if len(blob) < keySize {
+		padded := make([]byte, keySize)
+		copy(padded[keySize-len(blob):], blob)
+		blob = padded
+	}
+	return rsa.VerifyPKCS1v15((*rsa.PublicKey)(r), hash, digest, blob)
 }
 
 func (r *rsaPublicKey) CryptoPublicKey() crypto.PublicKey {
diff --git a/vendor/golang.org/x/crypto/ssh/server.go b/vendor/golang.org/x/crypto/ssh/server.go
index 3ca9e89e..5b5ccd96 100644
--- a/vendor/golang.org/x/crypto/ssh/server.go
+++ b/vendor/golang.org/x/crypto/ssh/server.go
@@ -149,7 +149,7 @@ func (s *ServerConfig) AddHostKey(key Signer) {
 }
 
 // cachedPubKey contains the results of querying whether a public key is
-// acceptable for a user.
+// acceptable for a user. This is a FIFO cache.
 type cachedPubKey struct {
 	user       string
 	pubKeyData []byte
@@ -157,7 +157,13 @@ type cachedPubKey struct {
 	perms      *Permissions
 }
 
-const maxCachedPubKeys = 16
+// maxCachedPubKeys is the number of cache entries we store.
+//
+// Due to consistent misuse of the PublicKeyCallback API, we have reduced this
+// to 1, such that the only key in the cache is the most recently seen one. This
+// forces the behavior that the last call to PublicKeyCallback will always be
+// with the key that is used for authentication.
+const maxCachedPubKeys = 1
 
 // pubKeyCache caches tests for public keys.  Since SSH clients
 // will query whether a public key is acceptable before attempting to
@@ -179,9 +185,10 @@ func (c *pubKeyCache) get(user string, pubKeyData []byte) (cachedPubKey, bool) {
 
 // add adds the given tuple to the cache.
 func (c *pubKeyCache) add(candidate cachedPubKey) {
-	if len(c.keys) < maxCachedPubKeys {
-		c.keys = append(c.keys, candidate)
+	if len(c.keys) >= maxCachedPubKeys {
+		c.keys = c.keys[1:]
 	}
+	c.keys = append(c.keys, candidate)
 }
 
 // ServerConn is an authenticated SSH connection, as seen from the
@@ -510,8 +517,8 @@ userAuthLoop:
 			if err := s.transport.writePacket(Marshal(discMsg)); err != nil {
 				return nil, err
 			}
-
-			return nil, discMsg
+			authErrs = append(authErrs, discMsg)
+			return nil, &ServerAuthError{Errors: authErrs}
 		}
 
 		var userAuthReq userAuthRequestMsg
diff --git a/vendor/golang.org/x/sync/LICENSE b/vendor/golang.org/x/sync/LICENSE
index 6a66aea5..2a7cf70d 100644
--- a/vendor/golang.org/x/sync/LICENSE
+++ b/vendor/golang.org/x/sync/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
+Copyright 2009 The Go Authors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer.
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
-   * Neither the name of Google Inc. nor the names of its
+   * Neither the name of Google LLC nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/vendor/golang.org/x/sys/LICENSE b/vendor/golang.org/x/sys/LICENSE
index 6a66aea5..2a7cf70d 100644
--- a/vendor/golang.org/x/sys/LICENSE
+++ b/vendor/golang.org/x/sys/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
+Copyright 2009 The Go Authors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer.
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
-   * Neither the name of Google Inc. nor the names of its
+   * Neither the name of Google LLC nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s b/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s
new file mode 100644
index 00000000..ec2acfe5
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/asm_darwin_x86_gc.s
@@ -0,0 +1,17 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+#include "textflag.h"
+
+TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctl(SB)
+GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
+
+TEXT libc_sysctlbyname_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_sysctlbyname(SB)
+GLOBL	·libc_sysctlbyname_trampoline_addr(SB), RODATA, $8
+DATA	·libc_sysctlbyname_trampoline_addr(SB)/8, $libc_sysctlbyname_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go
index 8fa707aa..02609d5b 100644
--- a/vendor/golang.org/x/sys/cpu/cpu.go
+++ b/vendor/golang.org/x/sys/cpu/cpu.go
@@ -105,6 +105,8 @@ var ARM64 struct {
 	HasSVE      bool // Scalable Vector Extensions
 	HasSVE2     bool // Scalable Vector Extensions 2
 	HasASIMDFHM bool // Advanced SIMD multiplication FP16 to FP32
+	HasDIT      bool // Data Independent Timing support
+	HasI8MM     bool // Advanced SIMD Int8 matrix multiplication instructions
 	_           CacheLinePad
 }
 
@@ -199,6 +201,25 @@ var S390X struct {
 	_         CacheLinePad
 }
 
+// RISCV64 contains the supported CPU features and performance characteristics for riscv64
+// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate
+// the presence of RISC-V extensions.
+//
+// It is safe to assume that all the RV64G extensions are supported and so they are omitted from
+// this structure. As riscv64 Go programs require at least RV64G, the code that populates
+// this structure cannot run successfully if some of the RV64G extensions are missing.
+// The struct is padded to avoid false sharing.
+var RISCV64 struct {
+	_                 CacheLinePad
+	HasFastMisaligned bool // Fast misaligned accesses
+	HasC              bool // Compressed instruction-set extension
+	HasV              bool // Vector extension compatible with RVV 1.0
+	HasZba            bool // Address generation instructions extension
+	HasZbb            bool // Basic bit-manipulation extension
+	HasZbs            bool // Single-bit instructions extension
+	_                 CacheLinePad
+}
+
 func init() {
 	archInit()
 	initOptions()
diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
index 0e27a21e..af2aa99f 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.go
@@ -38,6 +38,8 @@ func initOptions() {
 		{Name: "dcpop", Feature: &ARM64.HasDCPOP},
 		{Name: "asimddp", Feature: &ARM64.HasASIMDDP},
 		{Name: "asimdfhm", Feature: &ARM64.HasASIMDFHM},
+		{Name: "dit", Feature: &ARM64.HasDIT},
+		{Name: "i8mm", Feature: &ARM64.HasI8MM},
 	}
 }
 
@@ -145,6 +147,11 @@ func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) {
 		ARM64.HasLRCPC = true
 	}
 
+	switch extractBits(isar1, 52, 55) {
+	case 1:
+		ARM64.HasI8MM = true
+	}
+
 	// ID_AA64PFR0_EL1
 	switch extractBits(pfr0, 16, 19) {
 	case 0:
@@ -168,6 +175,11 @@ func parseARM64SystemRegisters(isar0, isar1, pfr0 uint64) {
 
 		parseARM64SVERegister(getzfr0())
 	}
+
+	switch extractBits(pfr0, 48, 51) {
+	case 1:
+		ARM64.HasDIT = true
+	}
 }
 
 func parseARM64SVERegister(zfr0 uint64) {
diff --git a/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go b/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go
new file mode 100644
index 00000000..b838cb9e
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_darwin_x86.go
@@ -0,0 +1,61 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+// darwinSupportsAVX512 checks Darwin kernel for AVX512 support via sysctl
+// call (see issue 43089). It also restricts AVX512 support for Darwin to
+// kernel version 21.3.0 (MacOS 12.2.0) or later (see issue 49233).
+//
+// Background:
+// Darwin implements a special mechanism to economize on thread state when
+// AVX512 specific registers are not in use. This scheme minimizes state when
+// preempting threads that haven't yet used any AVX512 instructions, but adds
+// special requirements to check for AVX512 hardware support at runtime (e.g.
+// via sysctl call or commpage inspection). See issue 43089 and link below for
+// full background:
+// https://github.com/apple-oss-distributions/xnu/blob/xnu-11215.1.10/osfmk/i386/fpu.c#L214-L240
+//
+// Additionally, all versions of the Darwin kernel from 19.6.0 through 21.2.0
+// (corresponding to MacOS 10.15.6 - 12.1) have a bug that can cause corruption
+// of the AVX512 mask registers (K0-K7) upon signal return. For this reason
+// AVX512 is considered unsafe to use on Darwin for kernel versions prior to
+// 21.3.0, where a fix has been confirmed. See issue 49233 for full background.
+func darwinSupportsAVX512() bool {
+	return darwinSysctlEnabled([]byte("hw.optional.avx512f\x00")) && darwinKernelVersionCheck(21, 3, 0)
+}
+
+// darwinKernelVersionCheck reports whether the Darwin kernel release (from darwinOSRelease) is at least major.minor.patch; parsed by hand, avoiding dependencies
+func darwinKernelVersionCheck(major, minor, patch int) bool {
+	var release [256]byte
+	err := darwinOSRelease(&release)
+	if err != nil {
+		return false
+	}
+
+	var mmp [3]int // parsed major, minor, patch components
+	c := 0         // index into mmp of the component currently being parsed
+Loop:
+	for _, b := range release[:] {
+		switch {
+		case b >= '0' && b <= '9':
+			mmp[c] = 10*mmp[c] + int(b-'0')
+		case b == '.':
+			c++
+			if c > 2 { // more than three dot-separated components
+				return false
+			}
+		case b == 0: // zero byte marks the end of the release string
+			break Loop
+		default: // any other byte makes the release string unparseable
+			return false
+		}
+	}
+	if c != 2 { // fewer than three components were seen
+		return false
+	}
+	return mmp[0] > major || mmp[0] == major && (mmp[1] > minor || mmp[1] == minor && mmp[2] >= patch)
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
index 910728fb..32a44514 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go
@@ -6,10 +6,10 @@
 
 package cpu
 
-// cpuid is implemented in cpu_x86.s for gc compiler
+// cpuid is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
 
-// xgetbv with ecx = 0 is implemented in cpu_x86.s for gc compiler
+// xgetbv with ecx = 0 is implemented in cpu_gc_x86.s for gc compiler
 // and in cpu_gccgo.c for gccgo.
 func xgetbv() (eax, edx uint32)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.s b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
similarity index 94%
rename from vendor/golang.org/x/sys/cpu/cpu_x86.s
rename to vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
index 7d7ba33e..ce208ce6 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.s
+++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.s
@@ -18,7 +18,7 @@ TEXT ·cpuid(SB), NOSPLIT, $0-24
 	RET
 
 // func xgetbv() (eax, edx uint32)
-TEXT ·xgetbv(SB),NOSPLIT,$0-8
+TEXT ·xgetbv(SB), NOSPLIT, $0-8
 	MOVL $0, CX
 	XGETBV
 	MOVL AX, eax+0(FP)
diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
index 99c60fe9..170d21dd 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go
@@ -23,9 +23,3 @@ func xgetbv() (eax, edx uint32) {
 	gccgoXgetbv(&a, &d)
 	return a, d
 }
-
-// gccgo doesn't build on Darwin, per:
-// https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/gcc.rb#L76
-func darwinSupportsAVX512() bool {
-	return false
-}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
index 3d386d0f..f1caf0f7 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_arm64.go
@@ -35,8 +35,10 @@ const (
 	hwcap_SHA512   = 1 << 21
 	hwcap_SVE      = 1 << 22
 	hwcap_ASIMDFHM = 1 << 23
+	hwcap_DIT      = 1 << 24
 
 	hwcap2_SVE2 = 1 << 1
+	hwcap2_I8MM = 1 << 13
 )
 
 // linuxKernelCanEmulateCPUID reports whether we're running
@@ -106,9 +108,11 @@ func doinit() {
 	ARM64.HasSHA512 = isSet(hwCap, hwcap_SHA512)
 	ARM64.HasSVE = isSet(hwCap, hwcap_SVE)
 	ARM64.HasASIMDFHM = isSet(hwCap, hwcap_ASIMDFHM)
+	ARM64.HasDIT = isSet(hwCap, hwcap_DIT)
 
 	// HWCAP2 feature bits
 	ARM64.HasSVE2 = isSet(hwCap2, hwcap2_SVE2)
+	ARM64.HasI8MM = isSet(hwCap2, hwcap2_I8MM)
 }
 
 func isSet(hwc uint, value uint) bool {
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
index cd63e733..7d902b68 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x
+//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x && !riscv64
 
 package cpu
 
diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
new file mode 100644
index 00000000..cb4a0c57
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go
@@ -0,0 +1,137 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpu
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// RISC-V extension discovery code for Linux. The approach here is to first try the riscv_hwprobe
+// syscall falling back to HWCAP to check for the C extension if riscv_hwprobe is not available.
+//
+// A note on detection of the Vector extension using HWCAP.
+//
+// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5.
+// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe
+// syscall is not available then neither is the Vector extension (which needs kernel support).
+// The riscv_hwprobe syscall should then be all we need to detect the Vector extension.
+// However, some RISC-V board manufacturers ship boards with an older kernel on top of which
+// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe
+// patches. These kernels advertise support for the Vector extension using HWCAP. Falling
+// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not
+// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option.
+//
+// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by
+// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board
+// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified
+// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use
+// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector
+// extension are binary incompatible. HWCAP can then not be used in isolation to populate the
+// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0.
+//
+// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector
+// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype
+// register. This check would allow us to safely detect version 1.0 of the Vector extension
+// with HWCAP, if riscv_hwprobe were not available. However, the check cannot
+// be added until the assembler supports the Vector instructions.
+//
+// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the
+// extensions it advertises support for are explicitly versioned. It's also worth noting that
+// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zba.
+// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority
+// of RISC-V extensions.
+//
+// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information.
+
+// golang.org/x/sys/cpu is not allowed to depend on golang.org/x/sys/unix so we must
+// reproduce the constants, types and functions needed to make the riscv_hwprobe syscall
+// here.
+
+const (
+	// Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+	riscv_HWPROBE_KEY_IMA_EXT_0   = 0x4
+	riscv_HWPROBE_IMA_C           = 0x2
+	riscv_HWPROBE_IMA_V           = 0x4
+	riscv_HWPROBE_EXT_ZBA         = 0x8
+	riscv_HWPROBE_EXT_ZBB         = 0x10
+	riscv_HWPROBE_EXT_ZBS         = 0x20
+	riscv_HWPROBE_KEY_CPUPERF_0   = 0x5
+	riscv_HWPROBE_MISALIGNED_FAST = 0x3
+	riscv_HWPROBE_MISALIGNED_MASK = 0x7
+)
+
+const (
+	// sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go.
+	sys_RISCV_HWPROBE = 258
+)
+
+// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go.
+type riscvHWProbePairs struct {
+	key   int64
+	value uint64
+}
+
+const (
+	// CPU features
+	hwcap_RISCV_ISA_C = 1 << ('C' - 'A')
+)
+
+func doinit() {
+	// A slice of key/value pair structures is passed to the riscv_hwprobe syscall (via riscvHWProbe). The key
+	// field should be initialised with one of the key constants defined above, e.g.,
+	// riscv_HWPROBE_KEY_IMA_EXT_0. The syscall will set the value field to the appropriate value.
+	// If the kernel does not recognise a key it will set the key field to -1 and the value field to 0.
+
+	pairs := []riscvHWProbePairs{
+		{riscv_HWPROBE_KEY_IMA_EXT_0, 0},
+		{riscv_HWPROBE_KEY_CPUPERF_0, 0},
+	}
+
+	// This call only indicates that extensions are supported if they are implemented on all cores.
+	if riscvHWProbe(pairs, 0) {
+		if pairs[0].key != -1 {
+			v := uint(pairs[0].value)
+			RISCV64.HasC = isSet(v, riscv_HWPROBE_IMA_C)
+			RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V)
+			RISCV64.HasZba = isSet(v, riscv_HWPROBE_EXT_ZBA)
+			RISCV64.HasZbb = isSet(v, riscv_HWPROBE_EXT_ZBB)
+			RISCV64.HasZbs = isSet(v, riscv_HWPROBE_EXT_ZBS)
+		}
+		if pairs[1].key != -1 {
+			v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK
+			RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST
+		}
+	}
+
+	// Let's double check with HWCAP if the C extension does not appear to be supported.
+	// This may happen if we're running on a kernel older than 6.4.
+
+	if !RISCV64.HasC {
+		RISCV64.HasC = isSet(hwCap, hwcap_RISCV_ISA_C)
+	}
+}
+
+func isSet(hwc uint, value uint) bool {
+	return hwc&value != 0
+}
+
+// riscvHWProbe is a simplified version of the generated wrapper function found in
+// golang.org/x/sys/unix/zsyscall_linux_riscv64.go. We simplify it by removing the
+// cpuCount and cpus parameters which we do not need. We always want to pass 0 for
+// these parameters here so the kernel only reports the extensions that are present
+// on all cores.
+func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool {
+	var _zero uintptr
+	var p0 unsafe.Pointer
+	if len(pairs) > 0 {
+		p0 = unsafe.Pointer(&pairs[0])
+	} else {
+		p0 = unsafe.Pointer(&_zero)
+	}
+
+	_, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(p0), uintptr(len(pairs)), uintptr(0), uintptr(0), uintptr(flags), 0)
+	return e1 == 0
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_x86.go b/vendor/golang.org/x/sys/cpu/cpu_other_x86.go
new file mode 100644
index 00000000..a0fd7e2f
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/cpu_other_x86.go
@@ -0,0 +1,11 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build 386 || amd64p32 || (amd64 && (!darwin || !gc))
+
+package cpu
+
+func darwinSupportsAVX512() bool {
+	panic("only implemented for gc && amd64 && darwin")
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
index 7f0c79c0..aca3199c 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go
@@ -8,4 +8,13 @@ package cpu
 
 const cacheLineSize = 64
 
-func initOptions() {}
+func initOptions() {
+	options = []option{
+		{Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned},
+		{Name: "c", Feature: &RISCV64.HasC},
+		{Name: "v", Feature: &RISCV64.HasV},
+		{Name: "zba", Feature: &RISCV64.HasZba},
+		{Name: "zbb", Feature: &RISCV64.HasZbb},
+		{Name: "zbs", Feature: &RISCV64.HasZbs},
+	}
+}
diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go
index c29f5e4c..600a6807 100644
--- a/vendor/golang.org/x/sys/cpu/cpu_x86.go
+++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go
@@ -92,10 +92,8 @@ func archInit() {
 		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
 
 		if runtime.GOOS == "darwin" {
-			// Darwin doesn't save/restore AVX-512 mask registers correctly across signal handlers.
-			// Since users can't rely on mask register contents, let's not advertise AVX-512 support.
-			// See issue 49233.
-			osSupportsAVX512 = false
+			// Darwin requires special AVX512 checks, see cpu_darwin_x86.go
+			osSupportsAVX512 = osSupportsAVX && darwinSupportsAVX512()
 		} else {
 			// Check if OPMASK and ZMM registers have OS support.
 			osSupportsAVX512 = osSupportsAVX && isSet(5, eax) && isSet(6, eax) && isSet(7, eax)
diff --git a/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go b/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go
new file mode 100644
index 00000000..4d0888b0
--- /dev/null
+++ b/vendor/golang.org/x/sys/cpu/syscall_darwin_x86_gc.go
@@ -0,0 +1,98 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Minimal copy of x/sys/unix so the cpu package can make a
+// system call on Darwin without depending on x/sys/unix.
+
+//go:build darwin && amd64 && gc
+
+package cpu
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+type _C_int int32
+
+// darwinOSRelease fills release via the {CTL_KERN, KERN_OSRELEASE} sysctl; adapted from unix.Uname() at x/sys/unix/syscall_darwin.go L419
+func darwinOSRelease(release *[256]byte) error {
+	// from x/sys/unix/zerrors_openbsd_amd64.go (NOTE(review): OpenBSD file cited in a darwin-only file; presumably the Darwin values are identical — confirm)
+	const (
+		CTL_KERN       = 0x1
+		KERN_OSRELEASE = 0x2
+	)
+
+	mib := []_C_int{CTL_KERN, KERN_OSRELEASE}
+	n := unsafe.Sizeof(*release) // size of the output buffer, passed to sysctl as oldlen
+
+	return sysctl(mib, &release[0], &n, nil, 0)
+}
+
+type Errno = syscall.Errno
+
+var _zero uintptr // Single-word zero for use when we need a valid pointer to 0 bytes.
+
+// from x/sys/unix/zsyscall_darwin_amd64.go L791-807
+func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	var _p0 unsafe.Pointer
+	if len(mib) > 0 {
+		_p0 = unsafe.Pointer(&mib[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	if _, _, err := syscall_syscall6(
+		libc_sysctl_trampoline_addr,
+		uintptr(_p0),
+		uintptr(len(mib)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+var libc_sysctl_trampoline_addr uintptr
+
+// adapted from internal/cpu/cpu_arm64_darwin.go
+func darwinSysctlEnabled(name []byte) bool {
+	out := int32(0)
+	nout := unsafe.Sizeof(out)
+	if ret := sysctlbyname(&name[0], (*byte)(unsafe.Pointer(&out)), &nout, nil, 0); ret != nil {
+		return false
+	}
+	return out > 0
+}
+
+//go:cgo_import_dynamic libc_sysctl sysctl "/usr/lib/libSystem.B.dylib"
+
+var libc_sysctlbyname_trampoline_addr uintptr
+
+// adapted from runtime/sys_darwin.go in the pattern of sysctl() above, as defined in x/sys/unix
+func sysctlbyname(name *byte, old *byte, oldlen *uintptr, new *byte, newlen uintptr) error {
+	if _, _, err := syscall_syscall6(
+		libc_sysctlbyname_trampoline_addr,
+		uintptr(unsafe.Pointer(name)),
+		uintptr(unsafe.Pointer(old)),
+		uintptr(unsafe.Pointer(oldlen)),
+		uintptr(unsafe.Pointer(new)),
+		uintptr(newlen),
+		0,
+	); err != 0 {
+		return err
+	}
+
+	return nil
+}
+
+//go:cgo_import_dynamic libc_sysctlbyname sysctlbyname "/usr/lib/libSystem.B.dylib"
+
+// Implemented in the runtime package (runtime/sys_darwin.go)
+func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno)
+
+//go:linkname syscall_syscall6 syscall.syscall6
diff --git a/vendor/golang.org/x/sys/unix/README.md b/vendor/golang.org/x/sys/unix/README.md
index 7d3c060e..6e08a76a 100644
--- a/vendor/golang.org/x/sys/unix/README.md
+++ b/vendor/golang.org/x/sys/unix/README.md
@@ -156,7 +156,7 @@ from the generated architecture-specific files listed below, and merge these
 into a common file for each OS.
 
 The merge is performed in the following steps:
-1. Construct the set of common code that is idential in all architecture-specific files.
+1. Construct the set of common code that is identical in all architecture-specific files.
 2. Write this common code to the merged file.
 3. Remove the common code from all architecture-specific files.
 
diff --git a/vendor/golang.org/x/sys/unix/ioctl_linux.go b/vendor/golang.org/x/sys/unix/ioctl_linux.go
index dbe680ea..7ca4fa12 100644
--- a/vendor/golang.org/x/sys/unix/ioctl_linux.go
+++ b/vendor/golang.org/x/sys/unix/ioctl_linux.go
@@ -58,6 +58,102 @@ func IoctlGetEthtoolDrvinfo(fd int, ifname string) (*EthtoolDrvinfo, error) {
 	return &value, err
 }
 
+// IoctlGetEthtoolTsInfo fetches ethtool timestamping and PHC
+// association for the network device specified by ifname.
+func IoctlGetEthtoolTsInfo(fd int, ifname string) (*EthtoolTsInfo, error) {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return nil, err
+	}
+
+	value := EthtoolTsInfo{Cmd: ETHTOOL_GET_TS_INFO}
+	ifrd := ifr.withData(unsafe.Pointer(&value))
+
+	err = ioctlIfreqData(fd, SIOCETHTOOL, &ifrd)
+	return &value, err
+}
+
+// IoctlGetHwTstamp retrieves the hardware timestamping configuration
+// for the network device specified by ifname.
+func IoctlGetHwTstamp(fd int, ifname string) (*HwTstampConfig, error) {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return nil, err
+	}
+
+	value := HwTstampConfig{}
+	ifrd := ifr.withData(unsafe.Pointer(&value))
+
+	err = ioctlIfreqData(fd, SIOCGHWTSTAMP, &ifrd)
+	return &value, err
+}
+
+// IoctlSetHwTstamp updates the hardware timestamping configuration for
+// the network device specified by ifname.
+func IoctlSetHwTstamp(fd int, ifname string, cfg *HwTstampConfig) error {
+	ifr, err := NewIfreq(ifname)
+	if err != nil {
+		return err
+	}
+	ifrd := ifr.withData(unsafe.Pointer(cfg))
+	return ioctlIfreqData(fd, SIOCSHWTSTAMP, &ifrd)
+}
+
+// FdToClockID derives the clock ID from the file descriptor number
+// - see clock_gettime(3), FD_TO_CLOCKID macros. The resulting ID is
+// suitable for system calls like ClockGettime.
+func FdToClockID(fd int) int32 { return int32((int(^fd) << 3) | 3) }
+
+// IoctlPtpClockGetcaps returns the description of a given PTP device.
+func IoctlPtpClockGetcaps(fd int) (*PtpClockCaps, error) {
+	var value PtpClockCaps
+	err := ioctlPtr(fd, PTP_CLOCK_GETCAPS2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpSysOffsetPrecise returns a description of the clock
+// offset compared to the system clock.
+func IoctlPtpSysOffsetPrecise(fd int) (*PtpSysOffsetPrecise, error) {
+	var value PtpSysOffsetPrecise
+	err := ioctlPtr(fd, PTP_SYS_OFFSET_PRECISE2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpSysOffsetExtended returns an extended description of the
+// clock offset compared to the system clock. The samples parameter
+// specifies the desired number of measurements.
+func IoctlPtpSysOffsetExtended(fd int, samples uint) (*PtpSysOffsetExtended, error) {
+	value := PtpSysOffsetExtended{Samples: uint32(samples)}
+	err := ioctlPtr(fd, PTP_SYS_OFFSET_EXTENDED2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpPinGetfunc returns the configuration of the specified
+// I/O pin on given PTP device.
+func IoctlPtpPinGetfunc(fd int, index uint) (*PtpPinDesc, error) {
+	value := PtpPinDesc{Index: uint32(index)}
+	err := ioctlPtr(fd, PTP_PIN_GETFUNC2, unsafe.Pointer(&value))
+	return &value, err
+}
+
+// IoctlPtpPinSetfunc updates configuration of the specified PTP
+// I/O pin.
+func IoctlPtpPinSetfunc(fd int, pd *PtpPinDesc) error {
+	return ioctlPtr(fd, PTP_PIN_SETFUNC2, unsafe.Pointer(pd))
+}
+
+// IoctlPtpPeroutRequest configures the periodic output mode of the
+// PTP I/O pins.
+func IoctlPtpPeroutRequest(fd int, r *PtpPeroutRequest) error {
+	return ioctlPtr(fd, PTP_PEROUT_REQUEST2, unsafe.Pointer(r))
+}
+
+// IoctlPtpExttsRequest configures the external timestamping mode
+// of the PTP I/O pins.
+func IoctlPtpExttsRequest(fd int, r *PtpExttsRequest) error {
+	return ioctlPtr(fd, PTP_EXTTS_REQUEST2, unsafe.Pointer(r))
+}
+
 // IoctlGetWatchdogInfo fetches information about a watchdog device from the
 // Linux watchdog API. For more information, see:
 // https://www.kernel.org/doc/html/latest/watchdog/watchdog-api.html.
diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh
index 4ed2e488..6ab02b6c 100644
--- a/vendor/golang.org/x/sys/unix/mkerrors.sh
+++ b/vendor/golang.org/x/sys/unix/mkerrors.sh
@@ -58,6 +58,7 @@ includes_Darwin='
 #define _DARWIN_USE_64_BIT_INODE
 #define __APPLE_USE_RFC_3542
 #include <stdint.h>
+#include <sys/stdio.h>
 #include <sys/attr.h>
 #include <sys/clonefile.h>
 #include <sys/kern_control.h>
@@ -157,6 +158,16 @@ includes_Linux='
 #endif
 #define _GNU_SOURCE
 
+// See the description in unix/linux/types.go
+#if defined(__ARM_EABI__) || \
+	(defined(__mips__) && (_MIPS_SIM == _ABIO32)) || \
+	(defined(__powerpc__) && (!defined(__powerpc64__)))
+# ifdef   _TIME_BITS
+#  undef  _TIME_BITS
+# endif
+# define  _TIME_BITS 32
+#endif
+
 // <sys/ioctl.h> is broken on powerpc64, as it fails to include definitions of
 // these structures. We just include them copied from <bits/termios.h>.
 #if defined(__powerpc__)
@@ -255,6 +266,7 @@ struct ltchars {
 #include <linux/nsfs.h>
 #include <linux/perf_event.h>
 #include <linux/pps.h>
+#include <linux/ptp_clock.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
 #include <linux/reboot.h>
@@ -526,6 +538,7 @@ ccflags="$@"
 		$2 ~ /^(AF|SOCK|SO|SOL|IPPROTO|IP|IPV6|TCP|MCAST|EVFILT|NOTE|SHUT|PROT|MAP|MREMAP|MFD|T?PACKET|MSG|SCM|MCL|DT|MADV|PR|LOCAL|TCPOPT|UDP)_/ ||
 		$2 ~ /^NFC_(GENL|PROTO|COMM|RF|SE|DIRECTION|LLCP|SOCKPROTO)_/ ||
 		$2 ~ /^NFC_.*_(MAX)?SIZE$/ ||
+		$2 ~ /^PTP_/ ||
 		$2 ~ /^RAW_PAYLOAD_/ ||
 		$2 ~ /^[US]F_/ ||
 		$2 ~ /^TP_STATUS_/ ||
@@ -551,6 +564,7 @@ ccflags="$@"
 		$2 !~ /^RTC_VL_(ACCURACY|BACKUP|DATA)/ &&
 		$2 ~ /^(NETLINK|NLM|NLMSG|NLA|IFA|IFAN|RT|RTC|RTCF|RTN|RTPROT|RTNH|ARPHRD|ETH_P|NETNSA)_/ ||
 		$2 ~ /^SOCK_|SK_DIAG_|SKNLGRP_$/ ||
+		$2 ~ /^(CONNECT|SAE)_/ ||
 		$2 ~ /^FIORDCHK$/ ||
 		$2 ~ /^SIOC/ ||
 		$2 ~ /^TIOC/ ||
@@ -654,7 +668,7 @@ errors=$(
 signals=$(
 	echo '#include <signal.h>' | $CC -x c - -E -dM $ccflags |
 	awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print $2 }' |
-	grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' |
+	grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' |
 	sort
 )
 
@@ -664,7 +678,7 @@ echo '#include <errno.h>' | $CC -x c - -E -dM $ccflags |
 	sort >_error.grep
 echo '#include <signal.h>' | $CC -x c - -E -dM $ccflags |
 	awk '$1=="#define" && $2 ~ /^SIG[A-Z0-9]+$/ { print "^\t" $2 "[ \t]*=" }' |
-	grep -v 'SIGSTKSIZE\|SIGSTKSZ\|SIGRT\|SIGMAX64' |
+	grep -E -v '(SIGSTKSIZE|SIGSTKSZ|SIGRT|SIGMAX64)' |
 	sort >_signal.grep
 
 echo '// mkerrors.sh' "$@"
diff --git a/vendor/golang.org/x/sys/unix/mremap.go b/vendor/golang.org/x/sys/unix/mremap.go
index fd45fe52..3a5e776f 100644
--- a/vendor/golang.org/x/sys/unix/mremap.go
+++ b/vendor/golang.org/x/sys/unix/mremap.go
@@ -50,3 +50,8 @@ func (m *mremapMmapper) Mremap(oldData []byte, newLength int, flags int) (data [
 func Mremap(oldData []byte, newLength int, flags int) (data []byte, err error) {
 	return mapper.Mremap(oldData, newLength, flags)
 }
+
+func MremapPtr(oldAddr unsafe.Pointer, oldSize uintptr, newAddr unsafe.Pointer, newSize uintptr, flags int) (ret unsafe.Pointer, err error) {
+	xaddr, err := mapper.mremap(uintptr(oldAddr), oldSize, newSize, flags, uintptr(newAddr))
+	return unsafe.Pointer(xaddr), err
+}
diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go
index 67ce6cef..6f15ba1e 100644
--- a/vendor/golang.org/x/sys/unix/syscall_aix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_aix.go
@@ -360,7 +360,7 @@ func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int,
 	var status _C_int
 	var r Pid_t
 	err = ERESTART
-	// AIX wait4 may return with ERESTART errno, while the processus is still
+	// AIX wait4 may return with ERESTART errno, while the process is still
 	// active.
 	for err == ERESTART {
 		r, err = wait4(Pid_t(pid), &status, options, rusage)
diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go
index 59542a89..099867de 100644
--- a/vendor/golang.org/x/sys/unix/syscall_darwin.go
+++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go
@@ -402,6 +402,18 @@ func IoctlSetIfreqMTU(fd int, ifreq *IfreqMTU) error {
 	return ioctlPtr(fd, SIOCSIFMTU, unsafe.Pointer(ifreq))
 }
 
+//sys	renamexNp(from string, to string, flag uint32) (err error)
+
+func RenamexNp(from string, to string, flag uint32) (err error) {
+	return renamexNp(from, to, flag)
+}
+
+//sys	renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error)
+
+func RenameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) {
+	return renameatxNp(fromfd, from, tofd, to, flag)
+}
+
 //sys	sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) = SYS_SYSCTL
 
 func Uname(uname *Utsname) error {
@@ -542,6 +554,55 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) {
 	}
 }
 
+//sys	pthread_chdir_np(path string) (err error)
+
+func PthreadChdir(path string) (err error) {
+	return pthread_chdir_np(path)
+}
+
+//sys	pthread_fchdir_np(fd int) (err error)
+
+func PthreadFchdir(fd int) (err error) {
+	return pthread_fchdir_np(fd)
+}
+
+// Connectx calls connectx(2) to initiate a connection on a socket.
+//
+// srcIf, srcAddr, and dstAddr are filled into a [SaEndpoints] struct and passed as the endpoints argument.
+//
+//   - srcIf is the optional source interface index. 0 means unspecified.
+//   - srcAddr is the optional source address. nil means unspecified.
+//   - dstAddr is the destination address.
+//
+// On success, Connectx returns the number of bytes enqueued for transmission.
+func Connectx(fd int, srcIf uint32, srcAddr, dstAddr Sockaddr, associd SaeAssocID, flags uint32, iov []Iovec, connid *SaeConnID) (n uintptr, err error) {
+	endpoints := SaEndpoints{
+		Srcif: srcIf,
+	}
+
+	if srcAddr != nil {
+		addrp, addrlen, err := srcAddr.sockaddr()
+		if err != nil {
+			return 0, err
+		}
+		endpoints.Srcaddr = (*RawSockaddr)(addrp)
+		endpoints.Srcaddrlen = uint32(addrlen)
+	}
+
+	if dstAddr != nil {
+		addrp, addrlen, err := dstAddr.sockaddr()
+		if err != nil {
+			return 0, err
+		}
+		endpoints.Dstaddr = (*RawSockaddr)(addrp)
+		endpoints.Dstaddrlen = uint32(addrlen)
+	}
+
+	err = connectx(fd, &endpoints, associd, flags, iov, &n, connid)
+	return
+}
+
+//sys	connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error)
 //sys	sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error)
 
 //sys	shmat(id int, addr uintptr, flag int) (ret uintptr, err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go
index ba46651f..a6a2d2fc 100644
--- a/vendor/golang.org/x/sys/unix/syscall_hurd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go
@@ -11,6 +11,7 @@ package unix
 int ioctl(int, unsigned long int, uintptr_t);
 */
 import "C"
+import "unsafe"
 
 func ioctl(fd int, req uint, arg uintptr) (err error) {
 	r0, er := C.ioctl(C.int(fd), C.ulong(req), C.uintptr_t(arg))
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go
index 5682e262..230a9454 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux.go
@@ -1295,6 +1295,48 @@ func GetsockoptTCPInfo(fd, level, opt int) (*TCPInfo, error) {
 	return &value, err
 }
 
+// GetsockoptTCPCCVegasInfo returns algorithm specific congestion control information for a socket using the "vegas"
+// algorithm.
+//
+// The socket's congestion control algorithm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCVegasInfo(fd, level, opt int) (*TCPVegasInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPVegasInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
+// GetsockoptTCPCCDCTCPInfo returns algorithm specific congestion control information for a socket using the "dctcp"
+// algorithm.
+//
+// The socket's congestion control algorithm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCDCTCPInfo(fd, level, opt int) (*TCPDCTCPInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPDCTCPInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
+// GetsockoptTCPCCBBRInfo returns algorithm specific congestion control information for a socket using the "bbr"
+// algorithm.
+//
+// The socket's congestion control algorithm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
+//
+//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
+func GetsockoptTCPCCBBRInfo(fd, level, opt int) (*TCPBBRInfo, error) {
+	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
+	vallen := _Socklen(SizeofTCPCCInfo)
+	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
+	out := (*TCPBBRInfo)(unsafe.Pointer(&value[0]))
+	return out, err
+}
+
 // GetsockoptString returns the string value of the socket option opt for the
 // socket associated with fd at the given socket level.
 func GetsockoptString(fd, level, opt int) (string, error) {
@@ -1818,6 +1860,7 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e
 //sys	ClockAdjtime(clockid int32, buf *Timex) (state int, err error)
 //sys	ClockGetres(clockid int32, res *Timespec) (err error)
 //sys	ClockGettime(clockid int32, time *Timespec) (err error)
+//sys	ClockSettime(clockid int32, time *Timespec) (err error)
 //sys	ClockNanosleep(clockid int32, flags int, request *Timespec, remain *Timespec) (err error)
 //sys	Close(fd int) (err error)
 //sys	CloseRange(first uint, last uint, flags uint) (err error)
@@ -1959,7 +2002,26 @@ func Getpgrp() (pid int) {
 //sysnb	Getpid() (pid int)
 //sysnb	Getppid() (ppid int)
 //sys	Getpriority(which int, who int) (prio int, err error)
-//sys	Getrandom(buf []byte, flags int) (n int, err error)
+
+func Getrandom(buf []byte, flags int) (n int, err error) {
+	if got, ok := vgetrandom(buf, uint32(flags)); ok {
+		if got < 0 { // the vgetrandom path reports failure as a negated errno value
+			return 0, errnoErr(syscall.Errno(-got))
+		}
+		return got, nil
+	}
+	// vgetrandom is not supported: fall back to the raw syscall, passing nil for an empty buf.
+	var base unsafe.Pointer
+	if len(buf) != 0 {
+		base = unsafe.Pointer(&buf[0])
+	}
+	res, _, errno := Syscall(SYS_GETRANDOM, uintptr(base), uintptr(len(buf)), uintptr(flags))
+	if errno != 0 {
+		return 0, errnoErr(errno)
+	}
+	return int(res), nil
+}
+
 //sysnb	Getrusage(who int, rusage *Rusage) (err error)
 //sysnb	Getsid(pid int) (sid int, err error)
 //sysnb	Gettid() (tid int)
@@ -2592,3 +2654,4 @@ func SchedGetAttr(pid int, flags uint) (*SchedAttr, error) {
 }
 
 //sys	Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error)
+//sys	Mseal(b []byte, flags uint) (err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
index cf2ee6c7..745e5c7e 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go
@@ -182,3 +182,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error
 	}
 	return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT // alias: SYS_FSTATAT resolves to SYS_NEWFSTATAT in this file's build
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
index 3d0e9845..dd2262a4 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go
@@ -214,3 +214,5 @@ func KexecFileLoad(kernelFd int, initrdFd int, cmdline string, flags int) error
 	}
 	return kexecFileLoad(kernelFd, initrdFd, cmdlineLen, cmdline, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT // alias: SYS_FSTATAT resolves to SYS_NEWFSTATAT in this file's build
diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
index 6f5a2889..8cf3670b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go
@@ -187,3 +187,5 @@ func RISCVHWProbe(pairs []RISCVHWProbePairs, set *CPUSet, flags uint) (err error
 	}
 	return riscvHWProbe(pairs, setSize, set, flags)
 }
+
+const SYS_FSTATAT = SYS_NEWFSTATAT // alias: SYS_FSTATAT resolves to SYS_NEWFSTATAT in this file's build
diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd.go b/vendor/golang.org/x/sys/unix/syscall_openbsd.go
index b25343c7..b86ded54 100644
--- a/vendor/golang.org/x/sys/unix/syscall_openbsd.go
+++ b/vendor/golang.org/x/sys/unix/syscall_openbsd.go
@@ -293,6 +293,7 @@ func Uname(uname *Utsname) error {
 //sys	Mkfifoat(dirfd int, path string, mode uint32) (err error)
 //sys	Mknod(path string, mode uint32, dev int) (err error)
 //sys	Mknodat(dirfd int, path string, mode uint32, dev int) (err error)
+//sys	Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error)
 //sys	Nanosleep(time *Timespec, leftover *Timespec) (err error)
 //sys	Open(path string, mode int, perm uint32) (fd int, err error)
 //sys	Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error)
diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go
index 77081de8..4e92e5aa 100644
--- a/vendor/golang.org/x/sys/unix/syscall_unix.go
+++ b/vendor/golang.org/x/sys/unix/syscall_unix.go
@@ -154,6 +154,15 @@ func Munmap(b []byte) (err error) {
 	return mapper.Munmap(b)
 }
 
+func MmapPtr(fd int, offset int64, addr unsafe.Pointer, length uintptr, prot int, flags int) (ret unsafe.Pointer, err error) { // pointer-based mmap: takes and returns raw addresses
+	xaddr, err := mapper.mmap(uintptr(addr), length, prot, flags, fd, offset) // note: mapper.mmap's argument order differs from MmapPtr's
+	return unsafe.Pointer(xaddr), err // the address is converted back even when err is non-nil
+}
+
+func MunmapPtr(addr unsafe.Pointer, length uintptr) (err error) { // presumably the counterpart of MmapPtr - TODO confirm
+	return mapper.munmap(uintptr(addr), length) // calls mapper.munmap directly, unlike Munmap which goes through mapper.Munmap
+}
+
 func Read(fd int, p []byte) (n int, err error) {
 	n, err = read(fd, p)
 	if raceenabled {
diff --git a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
index 312ae6ac..7bf5c04b 100644
--- a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go
@@ -768,6 +768,15 @@ func Munmap(b []byte) (err error) {
 	return mapper.Munmap(b)
 }
 
+func MmapPtr(fd int, offset int64, addr unsafe.Pointer, length uintptr, prot int, flags int) (ret unsafe.Pointer, err error) { // pointer-based mmap: takes and returns raw addresses
+	xaddr, err := mapper.mmap(uintptr(addr), length, prot, flags, fd, offset) // note: mapper.mmap's argument order differs from MmapPtr's
+	return unsafe.Pointer(xaddr), err // the address is converted back even when err is non-nil
+}
+
+func MunmapPtr(addr unsafe.Pointer, length uintptr) (err error) { // presumably the counterpart of MmapPtr - TODO confirm
+	return mapper.munmap(uintptr(addr), length) // calls mapper.munmap directly, unlike Munmap which goes through mapper.Munmap
+}
+
 //sys   Gethostname(buf []byte) (err error) = SYS___GETHOSTNAME_A
 //sysnb	Getgid() (gid int)
 //sysnb	Getpid() (pid int)
@@ -816,10 +825,10 @@ func Lstat(path string, stat *Stat_t) (err error) {
 // for checking symlinks begins with $VERSION/ $SYSNAME/ $SYSSYMR/ $SYSSYMA/
 func isSpecialPath(path []byte) (v bool) {
 	var special = [4][8]byte{
-		[8]byte{'V', 'E', 'R', 'S', 'I', 'O', 'N', '/'},
-		[8]byte{'S', 'Y', 'S', 'N', 'A', 'M', 'E', '/'},
-		[8]byte{'S', 'Y', 'S', 'S', 'Y', 'M', 'R', '/'},
-		[8]byte{'S', 'Y', 'S', 'S', 'Y', 'M', 'A', '/'}}
+		{'V', 'E', 'R', 'S', 'I', 'O', 'N', '/'},
+		{'S', 'Y', 'S', 'N', 'A', 'M', 'E', '/'},
+		{'S', 'Y', 'S', 'S', 'Y', 'M', 'R', '/'},
+		{'S', 'Y', 'S', 'S', 'Y', 'M', 'A', '/'}}
 
 	var i, j int
 	for i = 0; i < len(special); i++ {
@@ -3115,3 +3124,90 @@ func legacy_Mkfifoat(dirfd int, path string, mode uint32) (err error) {
 //sys	Posix_openpt(oflag int) (fd int, err error) = SYS_POSIX_OPENPT
 //sys	Grantpt(fildes int) (rc int, err error) = SYS_GRANTPT
 //sys	Unlockpt(fildes int) (rc int, err error) = SYS_UNLOCKPT
+
+func fcntlAsIs(fd uintptr, cmd int, arg uintptr) (val int, err error) { // arg is handed to the call unmodified
+	runtime.EnterSyscall() // must stay paired with ExitSyscall around the call below
+	r0, e2, e1 := CallLeFuncWithErr(GetZosLibVec()+SYS_FCNTL<<4, uintptr(fd), uintptr(cmd), arg)
+	runtime.ExitSyscall()
+	val = int(r0) // val is set from r0 even on failure
+	if int64(r0) == -1 { // -1 marks failure; only then are e1/e2 turned into an error
+		err = errnoErr2(e1, e2)
+	}
+	return
+}
+
+// Fcntl dispatches an fcntl request for fd by op's dynamic type; unsupported types give (-1, EINVAL).
+func Fcntl(fd uintptr, cmd int, op interface{}) (ret int, err error) {
+	switch arg := op.(type) { // bind once instead of re-asserting op in every case
+	case *Flock_t:
+		err = FcntlFlock(fd, cmd, arg)
+		if err != nil {
+			ret = -1 // FcntlFlock yields only an error, so the -1 result is set here
+		}
+		return
+	case int:
+		return FcntlInt(fd, cmd, arg)
+	case *F_cnvrt:
+		return fcntlAsIs(fd, cmd, uintptr(unsafe.Pointer(arg)))
+	case unsafe.Pointer:
+		return fcntlAsIs(fd, cmd, uintptr(arg))
+	default:
+		return -1, EINVAL
+	}
+}
+
+func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) { // exported entry point
+	if raceenabled { // race bookkeeping on ioSync happens before the transfer; skipped when raceenabled is false
+		raceReleaseMerge(unsafe.Pointer(&ioSync))
+	}
+	return sendfile(outfd, infd, offset, count) // the unexported sendfile performs the actual copy
+}
+
+func sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) {
+	// TODO: use LE call instead if the call is implemented
+	originalOffset, err := Seek(infd, 0, SEEK_CUR)
+	if err != nil {
+		return -1, err
+	}
+	//start reading data from in_fd
+	if offset != nil {
+		_, err := Seek(infd, *offset, SEEK_SET)
+		if err != nil {
+			return -1, err
+		}
+	}
+
+	buf := make([]byte, count)
+	readBuf := make([]byte, 0)
+	var n int = 0
+	for i := 0; i < count; i += n {
+		n, err := Read(infd, buf)
+		if n == 0 {
+			if err != nil {
+				return -1, err
+			} else { // EOF
+				break
+			}
+		}
+		readBuf = append(readBuf, buf...)
+		buf = buf[0:0]
+	}
+
+	n2, err := Write(outfd, readBuf)
+	if err != nil {
+		return -1, err
+	}
+
+	//When sendfile() returns, this variable will be set to the
+	// offset of the byte following the last byte that was read.
+	if offset != nil {
+		*offset = *offset + int64(n)
+		// If offset is not NULL, then sendfile() does not modify the file
+		// offset of in_fd
+		_, err := Seek(infd, originalOffset, SEEK_SET)
+		if err != nil {
+			return -1, err
+		}
+	}
+	return n2, nil
+}
diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_linux.go b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go
new file mode 100644
index 00000000..07ac8e09
--- /dev/null
+++ b/vendor/golang.org/x/sys/unix/vgetrandom_linux.go
@@ -0,0 +1,13 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && go1.24
+
+package unix
+
+import _ "unsafe"
+
+//go:linkname vgetrandom runtime.vgetrandom
+//go:noescape
+func vgetrandom(p []byte, flags uint32) (ret int, supported bool) // body-less: bound to runtime.vgetrandom by the linkname directive above
diff --git a/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go
new file mode 100644
index 00000000..297e97bc
--- /dev/null
+++ b/vendor/golang.org/x/sys/unix/vgetrandom_unsupported.go
@@ -0,0 +1,11 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !linux || !go1.24
+
+package unix
+
+func vgetrandom(p []byte, flags uint32) (ret int, supported bool) { // stub for builds matching !linux || !go1.24
+	return -1, false // supported=false makes Getrandom take its syscall fallback
+}
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
index e40fa852..d73c4652 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go
@@ -237,6 +237,9 @@ const (
 	CLOCK_UPTIME_RAW_APPROX                 = 0x9
 	CLONE_NOFOLLOW                          = 0x1
 	CLONE_NOOWNERCOPY                       = 0x2
+	CONNECT_DATA_AUTHENTICATED              = 0x4
+	CONNECT_DATA_IDEMPOTENT                 = 0x2
+	CONNECT_RESUME_ON_READ_WRITE            = 0x1
 	CR0                                     = 0x0
 	CR1                                     = 0x1000
 	CR2                                     = 0x2000
@@ -1169,6 +1172,11 @@ const (
 	PT_WRITE_D                              = 0x5
 	PT_WRITE_I                              = 0x4
 	PT_WRITE_U                              = 0x6
+	RENAME_EXCL                             = 0x4
+	RENAME_NOFOLLOW_ANY                     = 0x10
+	RENAME_RESERVED1                        = 0x8
+	RENAME_SECLUDE                          = 0x1
+	RENAME_SWAP                             = 0x2
 	RLIMIT_AS                               = 0x5
 	RLIMIT_CORE                             = 0x4
 	RLIMIT_CPU                              = 0x0
@@ -1260,6 +1268,10 @@ const (
 	RTV_SSTHRESH                            = 0x20
 	RUSAGE_CHILDREN                         = -0x1
 	RUSAGE_SELF                             = 0x0
+	SAE_ASSOCID_ALL                         = 0xffffffff
+	SAE_ASSOCID_ANY                         = 0x0
+	SAE_CONNID_ALL                          = 0xffffffff
+	SAE_CONNID_ANY                          = 0x0
 	SCM_CREDS                               = 0x3
 	SCM_RIGHTS                              = 0x1
 	SCM_TIMESTAMP                           = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
index bb02aa6c..4a55a400 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go
@@ -237,6 +237,9 @@ const (
 	CLOCK_UPTIME_RAW_APPROX                 = 0x9
 	CLONE_NOFOLLOW                          = 0x1
 	CLONE_NOOWNERCOPY                       = 0x2
+	CONNECT_DATA_AUTHENTICATED              = 0x4
+	CONNECT_DATA_IDEMPOTENT                 = 0x2
+	CONNECT_RESUME_ON_READ_WRITE            = 0x1
 	CR0                                     = 0x0
 	CR1                                     = 0x1000
 	CR2                                     = 0x2000
@@ -1169,6 +1172,11 @@ const (
 	PT_WRITE_D                              = 0x5
 	PT_WRITE_I                              = 0x4
 	PT_WRITE_U                              = 0x6
+	RENAME_EXCL                             = 0x4
+	RENAME_NOFOLLOW_ANY                     = 0x10
+	RENAME_RESERVED1                        = 0x8
+	RENAME_SECLUDE                          = 0x1
+	RENAME_SWAP                             = 0x2
 	RLIMIT_AS                               = 0x5
 	RLIMIT_CORE                             = 0x4
 	RLIMIT_CPU                              = 0x0
@@ -1260,6 +1268,10 @@ const (
 	RTV_SSTHRESH                            = 0x20
 	RUSAGE_CHILDREN                         = -0x1
 	RUSAGE_SELF                             = 0x0
+	SAE_ASSOCID_ALL                         = 0xffffffff
+	SAE_ASSOCID_ANY                         = 0x0
+	SAE_CONNID_ALL                          = 0xffffffff
+	SAE_CONNID_ANY                          = 0x0
 	SCM_CREDS                               = 0x3
 	SCM_RIGHTS                              = 0x1
 	SCM_TIMESTAMP                           = 0x2
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go
index 877a62b4..6ebc48b3 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go
@@ -321,6 +321,9 @@ const (
 	AUDIT_INTEGRITY_STATUS                      = 0x70a
 	AUDIT_IPC                                   = 0x517
 	AUDIT_IPC_SET_PERM                          = 0x51f
+	AUDIT_IPE_ACCESS                            = 0x58c
+	AUDIT_IPE_CONFIG_CHANGE                     = 0x58d
+	AUDIT_IPE_POLICY_LOAD                       = 0x58e
 	AUDIT_KERNEL                                = 0x7d0
 	AUDIT_KERNEL_OTHER                          = 0x524
 	AUDIT_KERN_MODULE                           = 0x532
@@ -457,6 +460,7 @@ const (
 	B600                                        = 0x8
 	B75                                         = 0x2
 	B9600                                       = 0xd
+	BCACHEFS_SUPER_MAGIC                        = 0xca451a4e
 	BDEVFS_MAGIC                                = 0x62646576
 	BINDERFS_SUPER_MAGIC                        = 0x6c6f6f70
 	BINFMTFS_MAGIC                              = 0x42494e4d
@@ -488,12 +492,14 @@ const (
 	BPF_F_ID                                    = 0x20
 	BPF_F_NETFILTER_IP_DEFRAG                   = 0x1
 	BPF_F_QUERY_EFFECTIVE                       = 0x1
+	BPF_F_REDIRECT_FLAGS                        = 0x19
 	BPF_F_REPLACE                               = 0x4
 	BPF_F_SLEEPABLE                             = 0x10
 	BPF_F_STRICT_ALIGNMENT                      = 0x1
 	BPF_F_TEST_REG_INVARIANTS                   = 0x80
 	BPF_F_TEST_RND_HI32                         = 0x4
 	BPF_F_TEST_RUN_ON_CPU                       = 0x1
+	BPF_F_TEST_SKB_CHECKSUM_COMPLETE            = 0x4
 	BPF_F_TEST_STATE_FREQ                       = 0x8
 	BPF_F_TEST_XDP_LIVE_FRAMES                  = 0x2
 	BPF_F_XDP_DEV_BOUND_ONLY                    = 0x40
@@ -928,6 +934,7 @@ const (
 	EPOLL_CTL_ADD                               = 0x1
 	EPOLL_CTL_DEL                               = 0x2
 	EPOLL_CTL_MOD                               = 0x3
+	EPOLL_IOC_TYPE                              = 0x8a
 	EROFS_SUPER_MAGIC_V1                        = 0xe0f5e1e2
 	ESP_V4_FLOW                                 = 0xa
 	ESP_V6_FLOW                                 = 0xc
@@ -941,9 +948,6 @@ const (
 	ETHTOOL_FEC_OFF                             = 0x4
 	ETHTOOL_FEC_RS                              = 0x8
 	ETHTOOL_FLAG_ALL                            = 0x7
-	ETHTOOL_FLAG_COMPACT_BITSETS                = 0x1
-	ETHTOOL_FLAG_OMIT_REPLY                     = 0x2
-	ETHTOOL_FLAG_STATS                          = 0x4
 	ETHTOOL_FLASHDEV                            = 0x33
 	ETHTOOL_FLASH_MAX_FILENAME                  = 0x80
 	ETHTOOL_FWVERS_LEN                          = 0x20
@@ -1166,6 +1170,7 @@ const (
 	EXTA                                        = 0xe
 	EXTB                                        = 0xf
 	F2FS_SUPER_MAGIC                            = 0xf2f52010
+	FALLOC_FL_ALLOCATE_RANGE                    = 0x0
 	FALLOC_FL_COLLAPSE_RANGE                    = 0x8
 	FALLOC_FL_INSERT_RANGE                      = 0x20
 	FALLOC_FL_KEEP_SIZE                         = 0x1
@@ -1705,6 +1710,7 @@ const (
 	KEXEC_ARCH_S390                             = 0x160000
 	KEXEC_ARCH_SH                               = 0x2a0000
 	KEXEC_ARCH_X86_64                           = 0x3e0000
+	KEXEC_CRASH_HOTPLUG_SUPPORT                 = 0x8
 	KEXEC_FILE_DEBUG                            = 0x8
 	KEXEC_FILE_NO_INITRAMFS                     = 0x4
 	KEXEC_FILE_ON_CRASH                         = 0x2
@@ -1780,6 +1786,7 @@ const (
 	KEY_SPEC_USER_KEYRING                       = -0x4
 	KEY_SPEC_USER_SESSION_KEYRING               = -0x5
 	LANDLOCK_ACCESS_FS_EXECUTE                  = 0x1
+	LANDLOCK_ACCESS_FS_IOCTL_DEV                = 0x8000
 	LANDLOCK_ACCESS_FS_MAKE_BLOCK               = 0x800
 	LANDLOCK_ACCESS_FS_MAKE_CHAR                = 0x40
 	LANDLOCK_ACCESS_FS_MAKE_DIR                 = 0x80
@@ -1797,6 +1804,8 @@ const (
 	LANDLOCK_ACCESS_NET_BIND_TCP                = 0x1
 	LANDLOCK_ACCESS_NET_CONNECT_TCP             = 0x2
 	LANDLOCK_CREATE_RULESET_VERSION             = 0x1
+	LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET         = 0x1
+	LANDLOCK_SCOPE_SIGNAL                       = 0x2
 	LINUX_REBOOT_CMD_CAD_OFF                    = 0x0
 	LINUX_REBOOT_CMD_CAD_ON                     = 0x89abcdef
 	LINUX_REBOOT_CMD_HALT                       = 0xcdef0123
@@ -1861,6 +1870,19 @@ const (
 	MAP_FILE                                    = 0x0
 	MAP_FIXED                                   = 0x10
 	MAP_FIXED_NOREPLACE                         = 0x100000
+	MAP_HUGE_16GB                               = 0x88000000
+	MAP_HUGE_16KB                               = 0x38000000
+	MAP_HUGE_16MB                               = 0x60000000
+	MAP_HUGE_1GB                                = 0x78000000
+	MAP_HUGE_1MB                                = 0x50000000
+	MAP_HUGE_256MB                              = 0x70000000
+	MAP_HUGE_2GB                                = 0x7c000000
+	MAP_HUGE_2MB                                = 0x54000000
+	MAP_HUGE_32MB                               = 0x64000000
+	MAP_HUGE_512KB                              = 0x4c000000
+	MAP_HUGE_512MB                              = 0x74000000
+	MAP_HUGE_64KB                               = 0x40000000
+	MAP_HUGE_8MB                                = 0x5c000000
 	MAP_HUGE_MASK                               = 0x3f
 	MAP_HUGE_SHIFT                              = 0x1a
 	MAP_PRIVATE                                 = 0x2
@@ -1908,6 +1930,8 @@ const (
 	MNT_EXPIRE                                  = 0x4
 	MNT_FORCE                                   = 0x1
 	MNT_ID_REQ_SIZE_VER0                        = 0x18
+	MNT_ID_REQ_SIZE_VER1                        = 0x20
+	MNT_NS_INFO_SIZE_VER0                       = 0x10
 	MODULE_INIT_COMPRESSED_FILE                 = 0x4
 	MODULE_INIT_IGNORE_MODVERSIONS              = 0x1
 	MODULE_INIT_IGNORE_VERMAGIC                 = 0x2
@@ -2173,7 +2197,7 @@ const (
 	NFT_REG_SIZE                                = 0x10
 	NFT_REJECT_ICMPX_MAX                        = 0x3
 	NFT_RT_MAX                                  = 0x4
-	NFT_SECMARK_CTX_MAXLEN                      = 0x100
+	NFT_SECMARK_CTX_MAXLEN                      = 0x1000
 	NFT_SET_MAXNAMELEN                          = 0x100
 	NFT_SOCKET_MAX                              = 0x3
 	NFT_TABLE_F_MASK                            = 0x7
@@ -2342,9 +2366,11 @@ const (
 	PERF_MEM_LVLNUM_IO                          = 0xa
 	PERF_MEM_LVLNUM_L1                          = 0x1
 	PERF_MEM_LVLNUM_L2                          = 0x2
+	PERF_MEM_LVLNUM_L2_MHB                      = 0x5
 	PERF_MEM_LVLNUM_L3                          = 0x3
 	PERF_MEM_LVLNUM_L4                          = 0x4
 	PERF_MEM_LVLNUM_LFB                         = 0xc
+	PERF_MEM_LVLNUM_MSC                         = 0x6
 	PERF_MEM_LVLNUM_NA                          = 0xf
 	PERF_MEM_LVLNUM_PMEM                        = 0xe
 	PERF_MEM_LVLNUM_RAM                         = 0xd
@@ -2417,6 +2443,7 @@ const (
 	PRIO_PGRP                                   = 0x1
 	PRIO_PROCESS                                = 0x0
 	PRIO_USER                                   = 0x2
+	PROCFS_IOCTL_MAGIC                          = 'f'
 	PROC_SUPER_MAGIC                            = 0x9fa0
 	PROT_EXEC                                   = 0x4
 	PROT_GROWSDOWN                              = 0x1000000
@@ -2498,6 +2525,23 @@ const (
 	PR_PAC_GET_ENABLED_KEYS                     = 0x3d
 	PR_PAC_RESET_KEYS                           = 0x36
 	PR_PAC_SET_ENABLED_KEYS                     = 0x3c
+	PR_PPC_DEXCR_CTRL_CLEAR                     = 0x4
+	PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC              = 0x10
+	PR_PPC_DEXCR_CTRL_EDITABLE                  = 0x1
+	PR_PPC_DEXCR_CTRL_MASK                      = 0x1f
+	PR_PPC_DEXCR_CTRL_SET                       = 0x2
+	PR_PPC_DEXCR_CTRL_SET_ONEXEC                = 0x8
+	PR_PPC_DEXCR_IBRTPD                         = 0x1
+	PR_PPC_DEXCR_NPHIE                          = 0x3
+	PR_PPC_DEXCR_SBHE                           = 0x0
+	PR_PPC_DEXCR_SRAPD                          = 0x2
+	PR_PPC_GET_DEXCR                            = 0x48
+	PR_PPC_SET_DEXCR                            = 0x49
+	PR_RISCV_CTX_SW_FENCEI_OFF                  = 0x1
+	PR_RISCV_CTX_SW_FENCEI_ON                   = 0x0
+	PR_RISCV_SCOPE_PER_PROCESS                  = 0x0
+	PR_RISCV_SCOPE_PER_THREAD                   = 0x1
+	PR_RISCV_SET_ICACHE_FLUSH_CTX               = 0x47
 	PR_RISCV_V_GET_CONTROL                      = 0x46
 	PR_RISCV_V_SET_CONTROL                      = 0x45
 	PR_RISCV_V_VSTATE_CTRL_CUR_MASK             = 0x3
@@ -2589,6 +2633,28 @@ const (
 	PR_UNALIGN_NOPRINT                          = 0x1
 	PR_UNALIGN_SIGBUS                           = 0x2
 	PSTOREFS_MAGIC                              = 0x6165676c
+	PTP_CLK_MAGIC                               = '='
+	PTP_ENABLE_FEATURE                          = 0x1
+	PTP_EXTTS_EDGES                             = 0x6
+	PTP_EXTTS_EVENT_VALID                       = 0x1
+	PTP_EXTTS_V1_VALID_FLAGS                    = 0x7
+	PTP_EXTTS_VALID_FLAGS                       = 0x1f
+	PTP_EXT_OFFSET                              = 0x10
+	PTP_FALLING_EDGE                            = 0x4
+	PTP_MAX_SAMPLES                             = 0x19
+	PTP_PEROUT_DUTY_CYCLE                       = 0x2
+	PTP_PEROUT_ONE_SHOT                         = 0x1
+	PTP_PEROUT_PHASE                            = 0x4
+	PTP_PEROUT_V1_VALID_FLAGS                   = 0x0
+	PTP_PEROUT_VALID_FLAGS                      = 0x7
+	PTP_PIN_GETFUNC                             = 0xc0603d06
+	PTP_PIN_GETFUNC2                            = 0xc0603d0f
+	PTP_RISING_EDGE                             = 0x2
+	PTP_STRICT_FLAGS                            = 0x8
+	PTP_SYS_OFFSET_EXTENDED                     = 0xc4c03d09
+	PTP_SYS_OFFSET_EXTENDED2                    = 0xc4c03d12
+	PTP_SYS_OFFSET_PRECISE                      = 0xc0403d08
+	PTP_SYS_OFFSET_PRECISE2                     = 0xc0403d11
 	PTRACE_ATTACH                               = 0x10
 	PTRACE_CONT                                 = 0x7
 	PTRACE_DETACH                               = 0x11
@@ -2902,15 +2968,17 @@ const (
 	RUSAGE_SELF                                 = 0x0
 	RUSAGE_THREAD                               = 0x1
 	RWF_APPEND                                  = 0x10
+	RWF_ATOMIC                                  = 0x40
 	RWF_DSYNC                                   = 0x2
 	RWF_HIPRI                                   = 0x1
 	RWF_NOAPPEND                                = 0x20
 	RWF_NOWAIT                                  = 0x8
-	RWF_SUPPORTED                               = 0x3f
+	RWF_SUPPORTED                               = 0x7f
 	RWF_SYNC                                    = 0x4
 	RWF_WRITE_LIFE_NOT_SET                      = 0x0
 	SCHED_BATCH                                 = 0x3
 	SCHED_DEADLINE                              = 0x6
+	SCHED_EXT                                   = 0x7
 	SCHED_FIFO                                  = 0x1
 	SCHED_FLAG_ALL                              = 0x7f
 	SCHED_FLAG_DL_OVERRUN                       = 0x4
@@ -3179,6 +3247,7 @@ const (
 	STATX_ATTR_MOUNT_ROOT                       = 0x2000
 	STATX_ATTR_NODUMP                           = 0x40
 	STATX_ATTR_VERITY                           = 0x100000
+	STATX_ATTR_WRITE_ATOMIC                     = 0x400000
 	STATX_BASIC_STATS                           = 0x7ff
 	STATX_BLOCKS                                = 0x400
 	STATX_BTIME                                 = 0x800
@@ -3192,8 +3261,10 @@ const (
 	STATX_MTIME                                 = 0x40
 	STATX_NLINK                                 = 0x4
 	STATX_SIZE                                  = 0x200
+	STATX_SUBVOL                                = 0x8000
 	STATX_TYPE                                  = 0x1
 	STATX_UID                                   = 0x8
+	STATX_WRITE_ATOMIC                          = 0x10000
 	STATX__RESERVED                             = 0x80000000
 	SYNC_FILE_RANGE_WAIT_AFTER                  = 0x4
 	SYNC_FILE_RANGE_WAIT_BEFORE                 = 0x1
@@ -3592,6 +3663,7 @@ const (
 	XDP_UMEM_PGOFF_COMPLETION_RING              = 0x180000000
 	XDP_UMEM_PGOFF_FILL_RING                    = 0x100000000
 	XDP_UMEM_REG                                = 0x4
+	XDP_UMEM_TX_METADATA_LEN                    = 0x4
 	XDP_UMEM_TX_SW_CSUM                         = 0x2
 	XDP_UMEM_UNALIGNED_CHUNK_FLAG               = 0x1
 	XDP_USE_NEED_WAKEUP                         = 0x8
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
index e4bc0bd5..c0d45e32 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -107,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -151,9 +154,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -230,6 +238,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPXREGS                = 0x12
 	PTRACE_GET_THREAD_AREA           = 0x19
@@ -276,6 +298,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -314,6 +338,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
index 689317af..c731d24f 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -107,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -151,9 +154,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -230,6 +238,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_ARCH_PRCTL                = 0x1e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPXREGS                = 0x12
@@ -277,6 +299,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -315,6 +339,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
index 5cca668a..680018a4 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETCRUNCHREGS             = 0x19
 	PTRACE_GETFDPIC                  = 0x1f
 	PTRACE_GETFDPIC_EXEC             = 0x0
@@ -282,6 +304,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -320,6 +344,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
index 14270508..a63909f3 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	ESR_MAGIC                        = 0x45535201
 	EXTPROC                          = 0x10000
@@ -110,6 +112,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -152,9 +155,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -198,6 +206,7 @@ const (
 	PERF_EVENT_IOC_SET_BPF           = 0x40042408
 	PERF_EVENT_IOC_SET_FILTER        = 0x40082406
 	PERF_EVENT_IOC_SET_OUTPUT        = 0x2405
+	POE_MAGIC                        = 0x504f4530
 	PPPIOCATTACH                     = 0x4004743d
 	PPPIOCATTCHAN                    = 0x40047438
 	PPPIOCBRIDGECHAN                 = 0x40047435
@@ -233,6 +242,20 @@ const (
 	PROT_BTI                         = 0x10
 	PROT_MTE                         = 0x20
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_PEEKMTETAGS               = 0x21
 	PTRACE_POKEMTETAGS               = 0x22
 	PTRACE_SYSEMU                    = 0x1f
@@ -273,6 +296,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -311,6 +336,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
index 28e39afd..9b0a2573 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -107,6 +109,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -152,9 +155,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -231,6 +239,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_SYSEMU                    = 0x1f
 	PTRACE_SYSEMU_SINGLESTEP         = 0x20
 	RLIMIT_AS                        = 0x9
@@ -269,6 +291,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -307,6 +331,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
index cd66e92c..958e6e06 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x80
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -275,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -313,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
index c1595eba..50c7f25b 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x80
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -275,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -313,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
index ee9456b0..ced21d66 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x80
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -275,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -313,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
index 8cfca81e..226c0441 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x80
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x100
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x20
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GET_THREAD_AREA           = 0x19
 	PTRACE_GET_THREAD_AREA_3264      = 0xc4
@@ -275,6 +297,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -313,6 +337,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
index 60b0deb3..3122737c 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x20
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000000
 	FF1                              = 0x4000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -150,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -230,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -330,6 +352,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -368,6 +392,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
index f90aa728..eb5d3467 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x20
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000000
 	FF1                              = 0x4000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -150,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -230,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -334,6 +356,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -372,6 +396,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
index ba9e0150..e921ebc6 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x20
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000000
 	FF1                              = 0x4000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x4000
 	ICANON                           = 0x100
 	IEXTEN                           = 0x400
@@ -150,9 +153,14 @@ const (
 	NL3                              = 0x300
 	NLDLY                            = 0x300
 	NOFLSH                           = 0x80000000
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x4
 	ONLCR                            = 0x2
@@ -230,6 +238,20 @@ const (
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PROT_SAO                         = 0x10
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETEVRREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETREGS64                 = 0x16
@@ -334,6 +356,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -372,6 +396,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
index 07cdfd6e..38ba81c5 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_GETFDPIC                  = 0x21
 	PTRACE_GETFDPIC_EXEC             = 0x0
 	PTRACE_GETFDPIC_INTERP           = 0x1
@@ -266,6 +288,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -304,6 +328,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
index 2f1dd214..71f04009 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go
@@ -78,6 +78,8 @@ const (
 	ECHOPRT                          = 0x400
 	EFD_CLOEXEC                      = 0x80000
 	EFD_NONBLOCK                     = 0x800
+	EPIOCGPARAMS                     = 0x80088a02
+	EPIOCSPARAMS                     = 0x40088a01
 	EPOLL_CLOEXEC                    = 0x80000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -106,6 +108,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x80084803
 	HIDIOCGRDESC                     = 0x90044802
 	HIDIOCGRDESCSIZE                 = 0x80044801
+	HIDIOCREVOKE                     = 0x4004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -148,9 +151,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x8008b705
 	NS_GET_NSTYPE                    = 0xb703
 	NS_GET_OWNER_UID                 = 0xb704
 	NS_GET_PARENT                    = 0xb702
+	NS_GET_PID_FROM_PIDNS            = 0x8004b706
+	NS_GET_PID_IN_PIDNS              = 0x8004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x8004b707
+	NS_GET_TGID_IN_PIDNS             = 0x8004b709
 	NS_GET_USERNS                    = 0xb701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -227,6 +235,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x7434
 	PPPIOCXFERUNIT                   = 0x744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x80503d01
+	PTP_CLOCK_GETCAPS2               = 0x80503d0a
+	PTP_ENABLE_PPS                   = 0x40043d04
+	PTP_ENABLE_PPS2                  = 0x40043d0d
+	PTP_EXTTS_REQUEST                = 0x40103d02
+	PTP_EXTTS_REQUEST2               = 0x40103d0b
+	PTP_MASK_CLEAR_ALL               = 0x3d13
+	PTP_MASK_EN_SINGLE               = 0x40043d14
+	PTP_PEROUT_REQUEST               = 0x40383d03
+	PTP_PEROUT_REQUEST2              = 0x40383d0c
+	PTP_PIN_SETFUNC                  = 0x40603d07
+	PTP_PIN_SETFUNC2                 = 0x40603d10
+	PTP_SYS_OFFSET                   = 0x43403d05
+	PTP_SYS_OFFSET2                  = 0x43403d0e
 	PTRACE_DISABLE_TE                = 0x5010
 	PTRACE_ENABLE_TE                 = 0x5009
 	PTRACE_GET_LAST_BREAK            = 0x5006
@@ -338,6 +360,8 @@ const (
 	RTC_WIE_ON                       = 0x700f
 	RTC_WKALM_RD                     = 0x80287010
 	RTC_WKALM_SET                    = 0x4028700f
+	SCM_DEVMEM_DMABUF                = 0x4f
+	SCM_DEVMEM_LINEAR                = 0x4e
 	SCM_TIMESTAMPING                 = 0x25
 	SCM_TIMESTAMPING_OPT_STATS       = 0x36
 	SCM_TIMESTAMPING_PKTINFO         = 0x3a
@@ -376,6 +400,9 @@ const (
 	SO_CNX_ADVICE                    = 0x35
 	SO_COOKIE                        = 0x39
 	SO_DETACH_REUSEPORT_BPF          = 0x44
+	SO_DEVMEM_DMABUF                 = 0x4f
+	SO_DEVMEM_DONTNEED               = 0x50
+	SO_DEVMEM_LINEAR                 = 0x4e
 	SO_DOMAIN                        = 0x27
 	SO_DONTROUTE                     = 0x5
 	SO_ERROR                         = 0x4
diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
index f40519d9..c44a3133 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go
@@ -82,6 +82,8 @@ const (
 	EFD_CLOEXEC                      = 0x400000
 	EFD_NONBLOCK                     = 0x4000
 	EMT_TAGOVF                       = 0x1
+	EPIOCGPARAMS                     = 0x40088a02
+	EPIOCSPARAMS                     = 0x80088a01
 	EPOLL_CLOEXEC                    = 0x400000
 	EXTPROC                          = 0x10000
 	FF1                              = 0x8000
@@ -110,6 +112,7 @@ const (
 	HIDIOCGRAWINFO                   = 0x40084803
 	HIDIOCGRDESC                     = 0x50044802
 	HIDIOCGRDESCSIZE                 = 0x40044801
+	HIDIOCREVOKE                     = 0x8004480d
 	HUPCL                            = 0x400
 	ICANON                           = 0x2
 	IEXTEN                           = 0x8000
@@ -153,9 +156,14 @@ const (
 	NFDBITS                          = 0x40
 	NLDLY                            = 0x100
 	NOFLSH                           = 0x80
+	NS_GET_MNTNS_ID                  = 0x4008b705
 	NS_GET_NSTYPE                    = 0x2000b703
 	NS_GET_OWNER_UID                 = 0x2000b704
 	NS_GET_PARENT                    = 0x2000b702
+	NS_GET_PID_FROM_PIDNS            = 0x4004b706
+	NS_GET_PID_IN_PIDNS              = 0x4004b708
+	NS_GET_TGID_FROM_PIDNS           = 0x4004b707
+	NS_GET_TGID_IN_PIDNS             = 0x4004b709
 	NS_GET_USERNS                    = 0x2000b701
 	OLCUC                            = 0x2
 	ONLCR                            = 0x4
@@ -232,6 +240,20 @@ const (
 	PPPIOCUNBRIDGECHAN               = 0x20007434
 	PPPIOCXFERUNIT                   = 0x2000744e
 	PR_SET_PTRACER_ANY               = 0xffffffffffffffff
+	PTP_CLOCK_GETCAPS                = 0x40503d01
+	PTP_CLOCK_GETCAPS2               = 0x40503d0a
+	PTP_ENABLE_PPS                   = 0x80043d04
+	PTP_ENABLE_PPS2                  = 0x80043d0d
+	PTP_EXTTS_REQUEST                = 0x80103d02
+	PTP_EXTTS_REQUEST2               = 0x80103d0b
+	PTP_MASK_CLEAR_ALL               = 0x20003d13
+	PTP_MASK_EN_SINGLE               = 0x80043d14
+	PTP_PEROUT_REQUEST               = 0x80383d03
+	PTP_PEROUT_REQUEST2              = 0x80383d0c
+	PTP_PIN_SETFUNC                  = 0x80603d07
+	PTP_PIN_SETFUNC2                 = 0x80603d10
+	PTP_SYS_OFFSET                   = 0x83403d05
+	PTP_SYS_OFFSET2                  = 0x83403d0e
 	PTRACE_GETFPAREGS                = 0x14
 	PTRACE_GETFPREGS                 = 0xe
 	PTRACE_GETFPREGS64               = 0x19
@@ -329,6 +351,8 @@ const (
 	RTC_WIE_ON                       = 0x2000700f
 	RTC_WKALM_RD                     = 0x40287010
 	RTC_WKALM_SET                    = 0x8028700f
+	SCM_DEVMEM_DMABUF                = 0x58
+	SCM_DEVMEM_LINEAR                = 0x57
 	SCM_TIMESTAMPING                 = 0x23
 	SCM_TIMESTAMPING_OPT_STATS       = 0x38
 	SCM_TIMESTAMPING_PKTINFO         = 0x3c
@@ -415,6 +439,9 @@ const (
 	SO_CNX_ADVICE                    = 0x37
 	SO_COOKIE                        = 0x3b
 	SO_DETACH_REUSEPORT_BPF          = 0x47
+	SO_DEVMEM_DMABUF                 = 0x58
+	SO_DEVMEM_DONTNEED               = 0x59
+	SO_DEVMEM_LINEAR                 = 0x57
 	SO_DOMAIN                        = 0x1029
 	SO_DONTROUTE                     = 0x10
 	SO_ERROR                         = 0x1007
diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
index da08b2ab..1ec2b140 100644
--- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go
@@ -581,6 +581,8 @@ const (
 	AT_EMPTY_PATH                   = 0x1000
 	AT_REMOVEDIR                    = 0x200
 	RENAME_NOREPLACE                = 1 << 0
+	ST_RDONLY                       = 1
+	ST_NOSUID                       = 2
 )
 
 const (
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
index ccb02f24..24b346e1 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go
@@ -740,6 +740,54 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func renamexNp(from string, to string, flag uint32) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(from)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(to)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_renamex_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flag))
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_renamex_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_renamex_np renamex_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(from)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(to)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_renameatx_np_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), uintptr(flag), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_renameatx_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_renameatx_np renameatx_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
 	var _p0 unsafe.Pointer
 	if len(mib) > 0 {
@@ -760,6 +808,59 @@ var libc_sysctl_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func pthread_chdir_np(path string) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(path)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_chdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pthread_fchdir_np(fd int) (err error) {
+	_, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_fchdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+	var _p0 unsafe.Pointer
+	if len(iov) > 0 {
+		_p0 = unsafe.Pointer(&iov[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	_, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
index 8b8bb284..ebd21310 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s
@@ -223,11 +223,36 @@ TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_ioctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB)
 
+TEXT libc_renamex_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_renamex_np(SB)
+GLOBL	·libc_renamex_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_renamex_np_trampoline_addr(SB)/8, $libc_renamex_np_trampoline<>(SB)
+
+TEXT libc_renameatx_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_renameatx_np(SB)
+GLOBL	·libc_renameatx_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_renameatx_np_trampoline_addr(SB)/8, $libc_renameatx_np_trampoline<>(SB)
+
 TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sysctl(SB)
 GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
 
+TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_chdir_np(SB)
+GLOBL	·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB)
+
+TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_fchdir_np(SB)
+GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_connectx(SB)
+GLOBL	·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA	·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 1b40b997..824b9c2d 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -740,6 +740,54 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func renamexNp(from string, to string, flag uint32) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(from)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(to)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_renamex_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flag))
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_renamex_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_renamex_np renamex_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func renameatxNp(fromfd int, from string, tofd int, to string, flag uint32) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(from)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(to)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_renameatx_np_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), uintptr(flag), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_renameatx_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_renameatx_np renameatx_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
 	var _p0 unsafe.Pointer
 	if len(mib) > 0 {
@@ -760,6 +808,59 @@ var libc_sysctl_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func pthread_chdir_np(path string) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(path)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall(libc_pthread_chdir_np_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_chdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_chdir_np pthread_chdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func pthread_fchdir_np(fd int) (err error) {
+	_, _, e1 := syscall_syscall(libc_pthread_fchdir_np_trampoline_addr, uintptr(fd), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_pthread_fchdir_np_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_pthread_fchdir_np pthread_fchdir_np "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) {
+	var _p0 unsafe.Pointer
+	if len(iov) > 0 {
+		_p0 = unsafe.Pointer(&iov[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	_, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_connectx_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) {
 	_, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags))
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index 08362c1a..4f178a22 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -223,11 +223,36 @@ TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_ioctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB)
 
+TEXT libc_renamex_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_renamex_np(SB)
+GLOBL	·libc_renamex_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_renamex_np_trampoline_addr(SB)/8, $libc_renamex_np_trampoline<>(SB)
+
+TEXT libc_renameatx_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_renameatx_np(SB)
+GLOBL	·libc_renameatx_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_renameatx_np_trampoline_addr(SB)/8, $libc_renameatx_np_trampoline<>(SB)
+
 TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sysctl(SB)
 GLOBL	·libc_sysctl_trampoline_addr(SB), RODATA, $8
 DATA	·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB)
 
+TEXT libc_pthread_chdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_chdir_np(SB)
+GLOBL	·libc_pthread_chdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_chdir_np_trampoline_addr(SB)/8, $libc_pthread_chdir_np_trampoline<>(SB)
+
+TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_pthread_fchdir_np(SB)
+GLOBL	·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8
+DATA	·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB)
+
+TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_connectx(SB)
+GLOBL	·libc_connectx_trampoline_addr(SB), RODATA, $8
+DATA	·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB)
+
 TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_sendfile(SB)
 GLOBL	·libc_sendfile_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
index 87d8612a..5cc1e8eb 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
@@ -592,6 +592,16 @@ func ClockGettime(clockid int32, time *Timespec) (err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func ClockSettime(clockid int32, time *Timespec) (err error) {
+	_, _, e1 := Syscall(SYS_CLOCK_SETTIME, uintptr(clockid), uintptr(unsafe.Pointer(time)), 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func ClockNanosleep(clockid int32, flags int, request *Timespec, remain *Timespec) (err error) {
 	_, _, e1 := Syscall6(SYS_CLOCK_NANOSLEEP, uintptr(clockid), uintptr(flags), uintptr(unsafe.Pointer(request)), uintptr(unsafe.Pointer(remain)), 0, 0)
 	if e1 != 0 {
@@ -971,23 +981,6 @@ func Getpriority(which int, who int) (prio int, err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
-func Getrandom(buf []byte, flags int) (n int, err error) {
-	var _p0 unsafe.Pointer
-	if len(buf) > 0 {
-		_p0 = unsafe.Pointer(&buf[0])
-	} else {
-		_p0 = unsafe.Pointer(&_zero)
-	}
-	r0, _, e1 := Syscall(SYS_GETRANDOM, uintptr(_p0), uintptr(len(buf)), uintptr(flags))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
 func Getrusage(who int, rusage *Rusage) (err error) {
 	_, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0)
 	if e1 != 0 {
@@ -2229,3 +2222,19 @@ func Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint)
 	}
 	return
 }
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
+func Mseal(b []byte, flags uint) (err error) {
+	var _p0 unsafe.Pointer
+	if len(b) > 0 {
+		_p0 = unsafe.Pointer(&b[0])
+	} else {
+		_p0 = unsafe.Pointer(&_zero)
+	}
+	_, _, e1 := Syscall(SYS_MSEAL, uintptr(_p0), uintptr(len(b)), uintptr(flags))
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
index 9dc42410..1851df14 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
index 41b56173..0b43c693 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $4
 DATA	·libc_mknodat_trampoline_addr(SB)/4, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $4
+DATA	·libc_mount_trampoline_addr(SB)/4, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $4
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
index 0d3a0751..e1ec0dbe 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
index 4019a656..880c6d6e 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $8
 DATA	·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $8
+DATA	·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
index c39f7776..7c8452a6 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
index ac4af24f..b8ef95b0 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $4
 DATA	·libc_mknodat_trampoline_addr(SB)/4, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $4
+DATA	·libc_mount_trampoline_addr(SB)/4, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $4
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
index 57571d07..2ffdf861 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
index f77d5321..2af3b5c7 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $8
 DATA	·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $8
+DATA	·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
index e62963e6..1da08d52 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
index fae140b6..b7a25135 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $8
 DATA	·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $8
+DATA	·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
index 00831354..6e85b0aa 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
index 9d1e0ff0..f15dadf0 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s
@@ -555,6 +555,12 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $8
 DATA	·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	CALL	libc_mount(SB)
+	RET
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $8
+DATA	·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	CALL	libc_nanosleep(SB)
 	RET
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
index 79029ed5..28b487df 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go
@@ -1493,6 +1493,30 @@ var libc_mknodat_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
+func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) {
+	var _p0 *byte
+	_p0, err = BytePtrFromString(fsType)
+	if err != nil {
+		return
+	}
+	var _p1 *byte
+	_p1, err = BytePtrFromString(dir)
+	if err != nil {
+		return
+	}
+	_, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0)
+	if e1 != 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
+var libc_mount_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_mount mount "libc.so"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func Nanosleep(time *Timespec, leftover *Timespec) (err error) {
 	_, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0)
 	if e1 != 0 {
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
index da115f9a..1e7f321e 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s
@@ -463,6 +463,11 @@ TEXT libc_mknodat_trampoline<>(SB),NOSPLIT,$0-0
 GLOBL	·libc_mknodat_trampoline_addr(SB), RODATA, $8
 DATA	·libc_mknodat_trampoline_addr(SB)/8, $libc_mknodat_trampoline<>(SB)
 
+TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0
+	JMP	libc_mount(SB)
+GLOBL	·libc_mount_trampoline_addr(SB), RODATA, $8
+DATA	·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB)
+
 TEXT libc_nanosleep_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_nanosleep(SB)
 GLOBL	·libc_nanosleep_trampoline_addr(SB), RODATA, $8
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
index 53aef5dc..524b0820 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go
@@ -457,4 +457,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR            = 459
 	SYS_LSM_SET_SELF_ATTR            = 460
 	SYS_LSM_LIST_MODULES             = 461
+	SYS_MSEAL                        = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
index 71d52476..f485dbf4 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go
@@ -341,6 +341,7 @@ const (
 	SYS_STATX                   = 332
 	SYS_IO_PGETEVENTS           = 333
 	SYS_RSEQ                    = 334
+	SYS_URETPROBE               = 335
 	SYS_PIDFD_SEND_SIGNAL       = 424
 	SYS_IO_URING_SETUP          = 425
 	SYS_IO_URING_ENTER          = 426
@@ -379,4 +380,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
index c7477061..70b35bf3 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go
@@ -421,4 +421,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR            = 459
 	SYS_LSM_SET_SELF_ATTR            = 460
 	SYS_LSM_LIST_MODULES             = 461
+	SYS_MSEAL                        = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
index f96e214f..1893e2fe 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go
@@ -85,7 +85,7 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
-	SYS_FSTATAT                 = 79
+	SYS_NEWFSTATAT              = 79
 	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
@@ -324,4 +324,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
index 28425346..16a4017d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go
@@ -84,6 +84,8 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
+	SYS_NEWFSTATAT              = 79
+	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
 	SYS_FDATASYNC               = 83
@@ -318,4 +320,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
index d0953018..7e567f1e 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go
@@ -441,4 +441,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR            = 4459
 	SYS_LSM_SET_SELF_ATTR            = 4460
 	SYS_LSM_LIST_MODULES             = 4461
+	SYS_MSEAL                        = 4462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
index 295c7f4b..38ae55e5 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go
@@ -371,4 +371,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 5459
 	SYS_LSM_SET_SELF_ATTR       = 5460
 	SYS_LSM_LIST_MODULES        = 5461
+	SYS_MSEAL                   = 5462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
index d1a9eaca..55e92e60 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go
@@ -371,4 +371,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 5459
 	SYS_LSM_SET_SELF_ATTR       = 5460
 	SYS_LSM_LIST_MODULES        = 5461
+	SYS_MSEAL                   = 5462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
index bec157c3..60658d6a 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go
@@ -441,4 +441,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR            = 4459
 	SYS_LSM_SET_SELF_ATTR            = 4460
 	SYS_LSM_LIST_MODULES             = 4461
+	SYS_MSEAL                        = 4462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
index 7ee7bdc4..e203e8a7 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go
@@ -448,4 +448,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR            = 459
 	SYS_LSM_SET_SELF_ATTR            = 460
 	SYS_LSM_LIST_MODULES             = 461
+	SYS_MSEAL                        = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
index fad1f25b..5944b97d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go
@@ -420,4 +420,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
index 7d3e1635..c66d416d 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go
@@ -420,4 +420,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
index 0ed53ad9..a5459e76 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go
@@ -84,7 +84,7 @@ const (
 	SYS_SPLICE                  = 76
 	SYS_TEE                     = 77
 	SYS_READLINKAT              = 78
-	SYS_FSTATAT                 = 79
+	SYS_NEWFSTATAT              = 79
 	SYS_FSTAT                   = 80
 	SYS_SYNC                    = 81
 	SYS_FSYNC                   = 82
@@ -325,4 +325,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
index 2fba04ad..01d86825 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go
@@ -386,4 +386,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
index 621d00d7..7b703e77 100644
--- a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
+++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go
@@ -399,4 +399,5 @@ const (
 	SYS_LSM_GET_SELF_ATTR       = 459
 	SYS_LSM_SET_SELF_ATTR       = 460
 	SYS_LSM_LIST_MODULES        = 461
+	SYS_MSEAL                   = 462
 )
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
index 091d107f..17c53bd9 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
 
 type _Socklen uint32
 
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+	Srcif      uint32
+	Srcaddr    *RawSockaddr
+	Srcaddrlen uint32
+	Dstaddr    *RawSockaddr
+	Dstaddrlen uint32
+	_          [4]byte
+}
+
 type Xucred struct {
 	Version uint32
 	Uid     uint32
@@ -449,11 +462,14 @@ type FdSet struct {
 
 const (
 	SizeofIfMsghdr    = 0x70
+	SizeofIfMsghdr2   = 0xa0
 	SizeofIfData      = 0x60
+	SizeofIfData64    = 0x80
 	SizeofIfaMsghdr   = 0x14
 	SizeofIfmaMsghdr  = 0x10
 	SizeofIfmaMsghdr2 = 0x14
 	SizeofRtMsghdr    = 0x5c
+	SizeofRtMsghdr2   = 0x5c
 	SizeofRtMetrics   = 0x38
 )
 
@@ -467,6 +483,20 @@ type IfMsghdr struct {
 	Data    IfData
 }
 
+type IfMsghdr2 struct {
+	Msglen     uint16
+	Version    uint8
+	Type       uint8
+	Addrs      int32
+	Flags      int32
+	Index      uint16
+	Snd_len    int32
+	Snd_maxlen int32
+	Snd_drops  int32
+	Timer      int32
+	Data       IfData64
+}
+
 type IfData struct {
 	Type       uint8
 	Typelen    uint8
@@ -499,6 +529,34 @@ type IfData struct {
 	Reserved2  uint32
 }
 
+type IfData64 struct {
+	Type       uint8
+	Typelen    uint8
+	Physical   uint8
+	Addrlen    uint8
+	Hdrlen     uint8
+	Recvquota  uint8
+	Xmitquota  uint8
+	Unused1    uint8
+	Mtu        uint32
+	Metric     uint32
+	Baudrate   uint64
+	Ipackets   uint64
+	Ierrors    uint64
+	Opackets   uint64
+	Oerrors    uint64
+	Collisions uint64
+	Ibytes     uint64
+	Obytes     uint64
+	Imcasts    uint64
+	Omcasts    uint64
+	Iqdrops    uint64
+	Noproto    uint64
+	Recvtiming uint32
+	Xmittiming uint32
+	Lastchange Timeval32
+}
+
 type IfaMsghdr struct {
 	Msglen  uint16
 	Version uint8
@@ -544,6 +602,21 @@ type RtMsghdr struct {
 	Rmx     RtMetrics
 }
 
+type RtMsghdr2 struct {
+	Msglen      uint16
+	Version     uint8
+	Type        uint8
+	Index       uint16
+	Flags       int32
+	Addrs       int32
+	Refcnt      int32
+	Parentflags int32
+	Reserved    int32
+	Use         int32
+	Inits       uint32
+	Rmx         RtMetrics
+}
+
 type RtMetrics struct {
 	Locks    uint32
 	Mtu      uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
index 28ff4ef7..2392226a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go
@@ -306,6 +306,19 @@ type XVSockPgen struct {
 
 type _Socklen uint32
 
+type SaeAssocID uint32
+
+type SaeConnID uint32
+
+type SaEndpoints struct {
+	Srcif      uint32
+	Srcaddr    *RawSockaddr
+	Srcaddrlen uint32
+	Dstaddr    *RawSockaddr
+	Dstaddrlen uint32
+	_          [4]byte
+}
+
 type Xucred struct {
 	Version uint32
 	Uid     uint32
@@ -449,11 +462,14 @@ type FdSet struct {
 
 const (
 	SizeofIfMsghdr    = 0x70
+	SizeofIfMsghdr2   = 0xa0
 	SizeofIfData      = 0x60
+	SizeofIfData64    = 0x80
 	SizeofIfaMsghdr   = 0x14
 	SizeofIfmaMsghdr  = 0x10
 	SizeofIfmaMsghdr2 = 0x14
 	SizeofRtMsghdr    = 0x5c
+	SizeofRtMsghdr2   = 0x5c
 	SizeofRtMetrics   = 0x38
 )
 
@@ -467,6 +483,20 @@ type IfMsghdr struct {
 	Data    IfData
 }
 
+type IfMsghdr2 struct {
+	Msglen     uint16
+	Version    uint8
+	Type       uint8
+	Addrs      int32
+	Flags      int32
+	Index      uint16
+	Snd_len    int32
+	Snd_maxlen int32
+	Snd_drops  int32
+	Timer      int32
+	Data       IfData64
+}
+
 type IfData struct {
 	Type       uint8
 	Typelen    uint8
@@ -499,6 +529,34 @@ type IfData struct {
 	Reserved2  uint32
 }
 
+type IfData64 struct {
+	Type       uint8
+	Typelen    uint8
+	Physical   uint8
+	Addrlen    uint8
+	Hdrlen     uint8
+	Recvquota  uint8
+	Xmitquota  uint8
+	Unused1    uint8
+	Mtu        uint32
+	Metric     uint32
+	Baudrate   uint64
+	Ipackets   uint64
+	Ierrors    uint64
+	Opackets   uint64
+	Oerrors    uint64
+	Collisions uint64
+	Ibytes     uint64
+	Obytes     uint64
+	Imcasts    uint64
+	Omcasts    uint64
+	Iqdrops    uint64
+	Noproto    uint64
+	Recvtiming uint32
+	Xmittiming uint32
+	Lastchange Timeval32
+}
+
 type IfaMsghdr struct {
 	Msglen  uint16
 	Version uint8
@@ -544,6 +602,21 @@ type RtMsghdr struct {
 	Rmx     RtMetrics
 }
 
+type RtMsghdr2 struct {
+	Msglen      uint16
+	Version     uint8
+	Type        uint8
+	Index       uint16
+	Flags       int32
+	Addrs       int32
+	Refcnt      int32
+	Parentflags int32
+	Reserved    int32
+	Use         int32
+	Inits       uint32
+	Rmx         RtMetrics
+}
+
 type RtMetrics struct {
 	Locks    uint32
 	Mtu      uint32
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
index 6cbd094a..51e13eb0 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go
@@ -625,6 +625,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
index 7c03b6ee..d002d8ef 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go
@@ -630,6 +630,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
index 422107ee..3f863d89 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go
@@ -616,6 +616,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
index 505a12ac..61c72931 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go
@@ -610,6 +610,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
index cc986c79..b5d17414 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go
@@ -612,6 +612,7 @@ const (
 	POLLRDNORM   = 0x40
 	POLLWRBAND   = 0x100
 	POLLWRNORM   = 0x4
+	POLLRDHUP    = 0x4000
 )
 
 type CapRights struct {
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go
index 4740b834..5537148d 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go
@@ -87,30 +87,35 @@ type StatxTimestamp struct {
 }
 
 type Statx_t struct {
-	Mask             uint32
-	Blksize          uint32
-	Attributes       uint64
-	Nlink            uint32
-	Uid              uint32
-	Gid              uint32
-	Mode             uint16
-	_                [1]uint16
-	Ino              uint64
-	Size             uint64
-	Blocks           uint64
-	Attributes_mask  uint64
-	Atime            StatxTimestamp
-	Btime            StatxTimestamp
-	Ctime            StatxTimestamp
-	Mtime            StatxTimestamp
-	Rdev_major       uint32
-	Rdev_minor       uint32
-	Dev_major        uint32
-	Dev_minor        uint32
-	Mnt_id           uint64
-	Dio_mem_align    uint32
-	Dio_offset_align uint32
-	_                [12]uint64
+	Mask                      uint32
+	Blksize                   uint32
+	Attributes                uint64
+	Nlink                     uint32
+	Uid                       uint32
+	Gid                       uint32
+	Mode                      uint16
+	_                         [1]uint16
+	Ino                       uint64
+	Size                      uint64
+	Blocks                    uint64
+	Attributes_mask           uint64
+	Atime                     StatxTimestamp
+	Btime                     StatxTimestamp
+	Ctime                     StatxTimestamp
+	Mtime                     StatxTimestamp
+	Rdev_major                uint32
+	Rdev_minor                uint32
+	Dev_major                 uint32
+	Dev_minor                 uint32
+	Mnt_id                    uint64
+	Dio_mem_align             uint32
+	Dio_offset_align          uint32
+	Subvol                    uint64
+	Atomic_write_unit_min     uint32
+	Atomic_write_unit_max     uint32
+	Atomic_write_segments_max uint32
+	_                         [1]uint32
+	_                         [9]uint64
 }
 
 type Fsid struct {
@@ -515,6 +520,29 @@ type TCPInfo struct {
 	Total_rto_time       uint32
 }
 
+type TCPVegasInfo struct {
+	Enabled uint32
+	Rttcnt  uint32
+	Rtt     uint32
+	Minrtt  uint32
+}
+
+type TCPDCTCPInfo struct {
+	Enabled  uint16
+	Ce_state uint16
+	Alpha    uint32
+	Ab_ecn   uint32
+	Ab_tot   uint32
+}
+
+type TCPBBRInfo struct {
+	Bw_lo       uint32
+	Bw_hi       uint32
+	Min_rtt     uint32
+	Pacing_gain uint32
+	Cwnd_gain   uint32
+}
+
 type CanFilter struct {
 	Id   uint32
 	Mask uint32
@@ -556,6 +584,7 @@ const (
 	SizeofICMPv6Filter      = 0x20
 	SizeofUcred             = 0xc
 	SizeofTCPInfo           = 0xf8
+	SizeofTCPCCInfo         = 0x14
 	SizeofCanFilter         = 0x8
 	SizeofTCPRepairOpt      = 0x8
 )
@@ -1723,12 +1752,6 @@ const (
 	IFLA_IPVLAN_UNSPEC                         = 0x0
 	IFLA_IPVLAN_MODE                           = 0x1
 	IFLA_IPVLAN_FLAGS                          = 0x2
-	NETKIT_NEXT                                = -0x1
-	NETKIT_PASS                                = 0x0
-	NETKIT_DROP                                = 0x2
-	NETKIT_REDIRECT                            = 0x7
-	NETKIT_L2                                  = 0x0
-	NETKIT_L3                                  = 0x1
 	IFLA_NETKIT_UNSPEC                         = 0x0
 	IFLA_NETKIT_PEER_INFO                      = 0x1
 	IFLA_NETKIT_PRIMARY                        = 0x2
@@ -1767,6 +1790,7 @@ const (
 	IFLA_VXLAN_DF                              = 0x1d
 	IFLA_VXLAN_VNIFILTER                       = 0x1e
 	IFLA_VXLAN_LOCALBYPASS                     = 0x1f
+	IFLA_VXLAN_LABEL_POLICY                    = 0x20
 	IFLA_GENEVE_UNSPEC                         = 0x0
 	IFLA_GENEVE_ID                             = 0x1
 	IFLA_GENEVE_REMOTE                         = 0x2
@@ -1796,6 +1820,8 @@ const (
 	IFLA_GTP_ROLE                              = 0x4
 	IFLA_GTP_CREATE_SOCKETS                    = 0x5
 	IFLA_GTP_RESTART_COUNT                     = 0x6
+	IFLA_GTP_LOCAL                             = 0x7
+	IFLA_GTP_LOCAL6                            = 0x8
 	IFLA_BOND_UNSPEC                           = 0x0
 	IFLA_BOND_MODE                             = 0x1
 	IFLA_BOND_ACTIVE_SLAVE                     = 0x2
@@ -1828,6 +1854,7 @@ const (
 	IFLA_BOND_AD_LACP_ACTIVE                   = 0x1d
 	IFLA_BOND_MISSED_MAX                       = 0x1e
 	IFLA_BOND_NS_IP6_TARGET                    = 0x1f
+	IFLA_BOND_COUPLED_CONTROL                  = 0x20
 	IFLA_BOND_AD_INFO_UNSPEC                   = 0x0
 	IFLA_BOND_AD_INFO_AGGREGATOR               = 0x1
 	IFLA_BOND_AD_INFO_NUM_PORTS                = 0x2
@@ -1896,6 +1923,7 @@ const (
 	IFLA_HSR_SEQ_NR                            = 0x5
 	IFLA_HSR_VERSION                           = 0x6
 	IFLA_HSR_PROTOCOL                          = 0x7
+	IFLA_HSR_INTERLINK                         = 0x8
 	IFLA_STATS_UNSPEC                          = 0x0
 	IFLA_STATS_LINK_64                         = 0x1
 	IFLA_STATS_LINK_XSTATS                     = 0x2
@@ -1948,6 +1976,15 @@ const (
 	IFLA_DSA_MASTER                            = 0x1
 )
 
+const (
+	NETKIT_NEXT     = -0x1
+	NETKIT_PASS     = 0x0
+	NETKIT_DROP     = 0x2
+	NETKIT_REDIRECT = 0x7
+	NETKIT_L2       = 0x0
+	NETKIT_L3       = 0x1
+)
+
 const (
 	NF_INET_PRE_ROUTING  = 0x0
 	NF_INET_LOCAL_IN     = 0x1
@@ -2485,7 +2522,7 @@ type XDPMmapOffsets struct {
 type XDPUmemReg struct {
 	Addr            uint64
 	Len             uint64
-	Chunk_size      uint32
+	Size            uint32
 	Headroom        uint32
 	Flags           uint32
 	Tx_metadata_len uint32
@@ -2557,8 +2594,8 @@ const (
 	SOF_TIMESTAMPING_BIND_PHC     = 0x8000
 	SOF_TIMESTAMPING_OPT_ID_TCP   = 0x10000
 
-	SOF_TIMESTAMPING_LAST = 0x10000
-	SOF_TIMESTAMPING_MASK = 0x1ffff
+	SOF_TIMESTAMPING_LAST = 0x20000
+	SOF_TIMESTAMPING_MASK = 0x3ffff
 
 	SCM_TSTAMP_SND   = 0x0
 	SCM_TSTAMP_SCHED = 0x1
@@ -3473,7 +3510,7 @@ const (
 	DEVLINK_PORT_FN_ATTR_STATE                         = 0x2
 	DEVLINK_PORT_FN_ATTR_OPSTATE                       = 0x3
 	DEVLINK_PORT_FN_ATTR_CAPS                          = 0x4
-	DEVLINK_PORT_FUNCTION_ATTR_MAX                     = 0x5
+	DEVLINK_PORT_FUNCTION_ATTR_MAX                     = 0x6
 )
 
 type FsverityDigest struct {
@@ -3504,7 +3541,7 @@ type Nhmsg struct {
 type NexthopGrp struct {
 	Id     uint32
 	Weight uint8
-	Resvd1 uint8
+	High   uint8
 	Resvd2 uint16
 }
 
@@ -3765,7 +3802,7 @@ const (
 	ETHTOOL_MSG_PSE_GET                       = 0x24
 	ETHTOOL_MSG_PSE_SET                       = 0x25
 	ETHTOOL_MSG_RSS_GET                       = 0x26
-	ETHTOOL_MSG_USER_MAX                      = 0x2b
+	ETHTOOL_MSG_USER_MAX                      = 0x2d
 	ETHTOOL_MSG_KERNEL_NONE                   = 0x0
 	ETHTOOL_MSG_STRSET_GET_REPLY              = 0x1
 	ETHTOOL_MSG_LINKINFO_GET_REPLY            = 0x2
@@ -3805,12 +3842,15 @@ const (
 	ETHTOOL_MSG_MODULE_NTF                    = 0x24
 	ETHTOOL_MSG_PSE_GET_REPLY                 = 0x25
 	ETHTOOL_MSG_RSS_GET_REPLY                 = 0x26
-	ETHTOOL_MSG_KERNEL_MAX                    = 0x2b
+	ETHTOOL_MSG_KERNEL_MAX                    = 0x2e
+	ETHTOOL_FLAG_COMPACT_BITSETS              = 0x1
+	ETHTOOL_FLAG_OMIT_REPLY                   = 0x2
+	ETHTOOL_FLAG_STATS                        = 0x4
 	ETHTOOL_A_HEADER_UNSPEC                   = 0x0
 	ETHTOOL_A_HEADER_DEV_INDEX                = 0x1
 	ETHTOOL_A_HEADER_DEV_NAME                 = 0x2
 	ETHTOOL_A_HEADER_FLAGS                    = 0x3
-	ETHTOOL_A_HEADER_MAX                      = 0x3
+	ETHTOOL_A_HEADER_MAX                      = 0x4
 	ETHTOOL_A_BITSET_BIT_UNSPEC               = 0x0
 	ETHTOOL_A_BITSET_BIT_INDEX                = 0x1
 	ETHTOOL_A_BITSET_BIT_NAME                 = 0x2
@@ -3947,7 +3987,7 @@ const (
 	ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL   = 0x17
 	ETHTOOL_A_COALESCE_USE_CQE_MODE_TX        = 0x18
 	ETHTOOL_A_COALESCE_USE_CQE_MODE_RX        = 0x19
-	ETHTOOL_A_COALESCE_MAX                    = 0x1c
+	ETHTOOL_A_COALESCE_MAX                    = 0x1e
 	ETHTOOL_A_PAUSE_UNSPEC                    = 0x0
 	ETHTOOL_A_PAUSE_HEADER                    = 0x1
 	ETHTOOL_A_PAUSE_AUTONEG                   = 0x2
@@ -3975,7 +4015,7 @@ const (
 	ETHTOOL_A_TSINFO_TX_TYPES                 = 0x3
 	ETHTOOL_A_TSINFO_RX_FILTERS               = 0x4
 	ETHTOOL_A_TSINFO_PHC_INDEX                = 0x5
-	ETHTOOL_A_TSINFO_MAX                      = 0x5
+	ETHTOOL_A_TSINFO_MAX                      = 0x6
 	ETHTOOL_A_CABLE_TEST_UNSPEC               = 0x0
 	ETHTOOL_A_CABLE_TEST_HEADER               = 0x1
 	ETHTOOL_A_CABLE_TEST_MAX                  = 0x1
@@ -3991,11 +4031,11 @@ const (
 	ETHTOOL_A_CABLE_RESULT_UNSPEC             = 0x0
 	ETHTOOL_A_CABLE_RESULT_PAIR               = 0x1
 	ETHTOOL_A_CABLE_RESULT_CODE               = 0x2
-	ETHTOOL_A_CABLE_RESULT_MAX                = 0x2
+	ETHTOOL_A_CABLE_RESULT_MAX                = 0x3
 	ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC       = 0x0
 	ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR         = 0x1
 	ETHTOOL_A_CABLE_FAULT_LENGTH_CM           = 0x2
-	ETHTOOL_A_CABLE_FAULT_LENGTH_MAX          = 0x2
+	ETHTOOL_A_CABLE_FAULT_LENGTH_MAX          = 0x3
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC    = 0x0
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED   = 0x1
 	ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED = 0x2
@@ -4078,6 +4118,107 @@ type EthtoolDrvinfo struct {
 	Regdump_len  uint32
 }
 
+type EthtoolTsInfo struct {
+	Cmd             uint32
+	So_timestamping uint32
+	Phc_index       int32
+	Tx_types        uint32
+	Tx_reserved     [3]uint32
+	Rx_filters      uint32
+	Rx_reserved     [3]uint32
+}
+
+type HwTstampConfig struct {
+	Flags     int32
+	Tx_type   int32
+	Rx_filter int32
+}
+
+const (
+	HWTSTAMP_FILTER_NONE            = 0x0
+	HWTSTAMP_FILTER_ALL             = 0x1
+	HWTSTAMP_FILTER_SOME            = 0x2
+	HWTSTAMP_FILTER_PTP_V1_L4_EVENT = 0x3
+	HWTSTAMP_FILTER_PTP_V2_L4_EVENT = 0x6
+	HWTSTAMP_FILTER_PTP_V2_L2_EVENT = 0x9
+	HWTSTAMP_FILTER_PTP_V2_EVENT    = 0xc
+)
+
+const (
+	HWTSTAMP_TX_OFF          = 0x0
+	HWTSTAMP_TX_ON           = 0x1
+	HWTSTAMP_TX_ONESTEP_SYNC = 0x2
+)
+
+type (
+	PtpClockCaps struct {
+		Max_adj            int32
+		N_alarm            int32
+		N_ext_ts           int32
+		N_per_out          int32
+		Pps                int32
+		N_pins             int32
+		Cross_timestamping int32
+		Adjust_phase       int32
+		Max_phase_adj      int32
+		Rsv                [11]int32
+	}
+	PtpClockTime struct {
+		Sec      int64
+		Nsec     uint32
+		Reserved uint32
+	}
+	PtpExttsEvent struct {
+		T     PtpClockTime
+		Index uint32
+		Flags uint32
+		Rsv   [2]uint32
+	}
+	PtpExttsRequest struct {
+		Index uint32
+		Flags uint32
+		Rsv   [2]uint32
+	}
+	PtpPeroutRequest struct {
+		StartOrPhase PtpClockTime
+		Period       PtpClockTime
+		Index        uint32
+		Flags        uint32
+		On           PtpClockTime
+	}
+	PtpPinDesc struct {
+		Name  [64]byte
+		Index uint32
+		Func  uint32
+		Chan  uint32
+		Rsv   [5]uint32
+	}
+	PtpSysOffset struct {
+		Samples uint32
+		Rsv     [3]uint32
+		Ts      [51]PtpClockTime
+	}
+	PtpSysOffsetExtended struct {
+		Samples uint32
+		Clockid int32
+		Rsv     [2]uint32
+		Ts      [25][3]PtpClockTime
+	}
+	PtpSysOffsetPrecise struct {
+		Device   PtpClockTime
+		Realtime PtpClockTime
+		Monoraw  PtpClockTime
+		Rsv      [4]uint32
+	}
+)
+
+const (
+	PTP_PF_NONE    = 0x0
+	PTP_PF_EXTTS   = 0x1
+	PTP_PF_PEROUT  = 0x2
+	PTP_PF_PHYSYNC = 0x3
+)
+
 type (
 	HIDRawReportDescriptor struct {
 		Size  uint32
@@ -4259,6 +4400,7 @@ const (
 type LandlockRulesetAttr struct {
 	Access_fs  uint64
 	Access_net uint64
+	Scoped     uint64
 }
 
 type LandlockPathBeneathAttr struct {
@@ -4605,7 +4747,7 @@ const (
 	NL80211_ATTR_MAC_HINT                                   = 0xc8
 	NL80211_ATTR_MAC_MASK                                   = 0xd7
 	NL80211_ATTR_MAX_AP_ASSOC_STA                           = 0xca
-	NL80211_ATTR_MAX                                        = 0x14a
+	NL80211_ATTR_MAX                                        = 0x14c
 	NL80211_ATTR_MAX_CRIT_PROT_DURATION                     = 0xb4
 	NL80211_ATTR_MAX_CSA_COUNTERS                           = 0xce
 	NL80211_ATTR_MAX_MATCH_SETS                             = 0x85
@@ -5209,7 +5351,7 @@ const (
 	NL80211_FREQUENCY_ATTR_GO_CONCURRENT                    = 0xf
 	NL80211_FREQUENCY_ATTR_INDOOR_ONLY                      = 0xe
 	NL80211_FREQUENCY_ATTR_IR_CONCURRENT                    = 0xf
-	NL80211_FREQUENCY_ATTR_MAX                              = 0x20
+	NL80211_FREQUENCY_ATTR_MAX                              = 0x21
 	NL80211_FREQUENCY_ATTR_MAX_TX_POWER                     = 0x6
 	NL80211_FREQUENCY_ATTR_NO_10MHZ                         = 0x11
 	NL80211_FREQUENCY_ATTR_NO_160MHZ                        = 0xc
diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
index 15adc041..ad05b51a 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go
@@ -727,6 +727,37 @@ const (
 	RISCV_HWPROBE_EXT_ZBA                = 0x8
 	RISCV_HWPROBE_EXT_ZBB                = 0x10
 	RISCV_HWPROBE_EXT_ZBS                = 0x20
+	RISCV_HWPROBE_EXT_ZICBOZ             = 0x40
+	RISCV_HWPROBE_EXT_ZBC                = 0x80
+	RISCV_HWPROBE_EXT_ZBKB               = 0x100
+	RISCV_HWPROBE_EXT_ZBKC               = 0x200
+	RISCV_HWPROBE_EXT_ZBKX               = 0x400
+	RISCV_HWPROBE_EXT_ZKND               = 0x800
+	RISCV_HWPROBE_EXT_ZKNE               = 0x1000
+	RISCV_HWPROBE_EXT_ZKNH               = 0x2000
+	RISCV_HWPROBE_EXT_ZKSED              = 0x4000
+	RISCV_HWPROBE_EXT_ZKSH               = 0x8000
+	RISCV_HWPROBE_EXT_ZKT                = 0x10000
+	RISCV_HWPROBE_EXT_ZVBB               = 0x20000
+	RISCV_HWPROBE_EXT_ZVBC               = 0x40000
+	RISCV_HWPROBE_EXT_ZVKB               = 0x80000
+	RISCV_HWPROBE_EXT_ZVKG               = 0x100000
+	RISCV_HWPROBE_EXT_ZVKNED             = 0x200000
+	RISCV_HWPROBE_EXT_ZVKNHA             = 0x400000
+	RISCV_HWPROBE_EXT_ZVKNHB             = 0x800000
+	RISCV_HWPROBE_EXT_ZVKSED             = 0x1000000
+	RISCV_HWPROBE_EXT_ZVKSH              = 0x2000000
+	RISCV_HWPROBE_EXT_ZVKT               = 0x4000000
+	RISCV_HWPROBE_EXT_ZFH                = 0x8000000
+	RISCV_HWPROBE_EXT_ZFHMIN             = 0x10000000
+	RISCV_HWPROBE_EXT_ZIHINTNTL          = 0x20000000
+	RISCV_HWPROBE_EXT_ZVFH               = 0x40000000
+	RISCV_HWPROBE_EXT_ZVFHMIN            = 0x80000000
+	RISCV_HWPROBE_EXT_ZFA                = 0x100000000
+	RISCV_HWPROBE_EXT_ZTSO               = 0x200000000
+	RISCV_HWPROBE_EXT_ZACAS              = 0x400000000
+	RISCV_HWPROBE_EXT_ZICOND             = 0x800000000
+	RISCV_HWPROBE_EXT_ZIHINTPAUSE        = 0x1000000000
 	RISCV_HWPROBE_KEY_CPUPERF_0          = 0x5
 	RISCV_HWPROBE_MISALIGNED_UNKNOWN     = 0x0
 	RISCV_HWPROBE_MISALIGNED_EMULATED    = 0x1
@@ -734,4 +765,6 @@ const (
 	RISCV_HWPROBE_MISALIGNED_FAST        = 0x3
 	RISCV_HWPROBE_MISALIGNED_UNSUPPORTED = 0x4
 	RISCV_HWPROBE_MISALIGNED_MASK        = 0x7
+	RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE  = 0x6
+	RISCV_HWPROBE_WHICH_CPUS             = 0x1
 )
diff --git a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
index d9a13af4..2e5d5a44 100644
--- a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
+++ b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go
@@ -377,6 +377,12 @@ type Flock_t struct {
 	Pid    int32
 }
 
+type F_cnvrt struct {
+	Cvtcmd int32
+	Pccsid int16
+	Fccsid int16
+}
+
 type Termios struct {
 	Cflag uint32
 	Iflag uint32
diff --git a/vendor/golang.org/x/sys/windows/dll_windows.go b/vendor/golang.org/x/sys/windows/dll_windows.go
index 115341fb..4e613cf6 100644
--- a/vendor/golang.org/x/sys/windows/dll_windows.go
+++ b/vendor/golang.org/x/sys/windows/dll_windows.go
@@ -65,7 +65,7 @@ func LoadDLL(name string) (dll *DLL, err error) {
 	return d, nil
 }
 
-// MustLoadDLL is like LoadDLL but panics if load operation failes.
+// MustLoadDLL is like LoadDLL but panics if load operation fails.
 func MustLoadDLL(name string) *DLL {
 	d, e := LoadDLL(name)
 	if e != nil {
diff --git a/vendor/golang.org/x/sys/windows/security_windows.go b/vendor/golang.org/x/sys/windows/security_windows.go
index 6f7d2ac7..b6e1ab76 100644
--- a/vendor/golang.org/x/sys/windows/security_windows.go
+++ b/vendor/golang.org/x/sys/windows/security_windows.go
@@ -894,7 +894,7 @@ type ACL struct {
 	aclRevision byte
 	sbz1        byte
 	aclSize     uint16
-	aceCount    uint16
+	AceCount    uint16
 	sbz2        uint16
 }
 
@@ -1087,6 +1087,27 @@ type EXPLICIT_ACCESS struct {
 	Trustee           TRUSTEE
 }
 
+// ACE_HEADER mirrors the Win32 ACE_HEADER structure. See https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header
+type ACE_HEADER struct {
+	AceType  uint8
+	AceFlags uint8
+	AceSize  uint16
+}
+
+// ACCESS_ALLOWED_ACE mirrors the Win32 ACCESS_ALLOWED_ACE structure. See https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-access_allowed_ace
+type ACCESS_ALLOWED_ACE struct {
+	Header   ACE_HEADER
+	Mask     ACCESS_MASK
+	SidStart uint32
+}
+
+const (
+	// Constants for AceType
+	// https://learn.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-ace_header
+	ACCESS_ALLOWED_ACE_TYPE = 0
+	ACCESS_DENIED_ACE_TYPE  = 1
+)
+
 // This type is the union inside of TRUSTEE and must be created using one of the TrusteeValueFrom* functions.
 type TrusteeValue uintptr
 
@@ -1158,6 +1179,7 @@ type OBJECTS_AND_NAME struct {
 //sys	makeSelfRelativeSD(absoluteSD *SECURITY_DESCRIPTOR, selfRelativeSD *SECURITY_DESCRIPTOR, selfRelativeSDSize *uint32) (err error) = advapi32.MakeSelfRelativeSD
 
 //sys	setEntriesInAcl(countExplicitEntries uint32, explicitEntries *EXPLICIT_ACCESS, oldACL *ACL, newACL **ACL) (ret error) = advapi32.SetEntriesInAclW
+//sys	GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (err error) = advapi32.GetAce
 
 // Control returns the security descriptor control bits.
 func (sd *SECURITY_DESCRIPTOR) Control() (control SECURITY_DESCRIPTOR_CONTROL, revision uint32, err error) {
diff --git a/vendor/golang.org/x/sys/windows/svc/mgr/config.go b/vendor/golang.org/x/sys/windows/svc/mgr/config.go
index a6d3e8a8..3c7ba08f 100644
--- a/vendor/golang.org/x/sys/windows/svc/mgr/config.go
+++ b/vendor/golang.org/x/sys/windows/svc/mgr/config.go
@@ -63,7 +63,7 @@ func toStringSlice(ps *uint16) []string {
 	return r
 }
 
-// Config retrieves service s configuration paramteres.
+// Config retrieves service s configuration parameters.
 func (s *Service) Config() (Config, error) {
 	var p *windows.QUERY_SERVICE_CONFIG
 	n := uint32(1024)
diff --git a/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go b/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
index cdf880e1..ef2a6878 100644
--- a/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
+++ b/vendor/golang.org/x/sys/windows/svc/mgr/recovery.go
@@ -137,7 +137,7 @@ func (s *Service) RecoveryCommand() (string, error) {
 // SetRecoveryActionsOnNonCrashFailures sets the failure actions flag. If the
 // flag is set to false, recovery actions will only be performed if the service
 // terminates without reporting a status of SERVICE_STOPPED. If the flag is set
-// to true, recovery actions are also perfomed if the service stops with a
+// to true, recovery actions are also performed if the service stops with a
 // nonzero exit code.
 func (s *Service) SetRecoveryActionsOnNonCrashFailures(flag bool) error {
 	var setting windows.SERVICE_FAILURE_ACTIONS_FLAG
@@ -151,7 +151,7 @@ func (s *Service) SetRecoveryActionsOnNonCrashFailures(flag bool) error {
 // actions flag. If the flag is set to false, recovery actions will only be
 // performed if the service terminates without reporting a status of
 // SERVICE_STOPPED. If the flag is set to true, recovery actions are also
-// perfomed if the service stops with a nonzero exit code.
+// performed if the service stops with a nonzero exit code.
 func (s *Service) RecoveryActionsOnNonCrashFailures() (bool, error) {
 	b, err := s.queryServiceConfig2(windows.SERVICE_CONFIG_FAILURE_ACTIONS_FLAG)
 	if err != nil {
diff --git a/vendor/golang.org/x/sys/windows/svc/service.go b/vendor/golang.org/x/sys/windows/svc/service.go
index c96932d9..c4f74924 100644
--- a/vendor/golang.org/x/sys/windows/svc/service.go
+++ b/vendor/golang.org/x/sys/windows/svc/service.go
@@ -199,9 +199,8 @@ var (
 )
 
 func ctlHandler(ctl, evtype, evdata, context uintptr) uintptr {
-	s := (*service)(unsafe.Pointer(context))
 	e := ctlEvent{cmd: Cmd(ctl), eventType: uint32(evtype), eventData: evdata, context: 123456} // Set context to 123456 to test issue #25660.
-	s.c <- e
+	theService.c <- e // deliver through the package-level service; the context argument is no longer dereferenced
 	return 0
 }
 
@@ -210,7 +209,7 @@ var theService service // This is, unfortunately, a global, which means only one
 // serviceMain is the entry point called by the service manager, registered earlier by
 // the call to StartServiceCtrlDispatcher.
 func serviceMain(argc uint32, argv **uint16) uintptr {
-	handle, err := windows.RegisterServiceCtrlHandlerEx(windows.StringToUTF16Ptr(theService.name), ctlHandlerCallback, uintptr(unsafe.Pointer(&theService)))
+	handle, err := windows.RegisterServiceCtrlHandlerEx(windows.StringToUTF16Ptr(theService.name), ctlHandlerCallback, 0)
 	if sysErr, ok := err.(windows.Errno); ok {
 		return uintptr(sysErr)
 	} else if err != nil {
diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go
index 6525c62f..4a325438 100644
--- a/vendor/golang.org/x/sys/windows/syscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/syscall_windows.go
@@ -17,8 +17,10 @@ import (
 	"unsafe"
 )
 
-type Handle uintptr
-type HWND uintptr
+type (
+	Handle uintptr
+	HWND   uintptr
+)
 
 const (
 	InvalidHandle = ^Handle(0)
@@ -166,6 +168,8 @@ func NewCallbackCDecl(fn interface{}) uintptr {
 //sys	CreateNamedPipe(name *uint16, flags uint32, pipeMode uint32, maxInstances uint32, outSize uint32, inSize uint32, defaultTimeout uint32, sa *SecurityAttributes) (handle Handle, err error)  [failretval==InvalidHandle] = CreateNamedPipeW
 //sys	ConnectNamedPipe(pipe Handle, overlapped *Overlapped) (err error)
 //sys	DisconnectNamedPipe(pipe Handle) (err error)
+//sys   GetNamedPipeClientProcessId(pipe Handle, clientProcessID *uint32) (err error)
+//sys   GetNamedPipeServerProcessId(pipe Handle, serverProcessID *uint32) (err error)
 //sys	GetNamedPipeInfo(pipe Handle, flags *uint32, outSize *uint32, inSize *uint32, maxInstances *uint32) (err error)
 //sys	GetNamedPipeHandleState(pipe Handle, state *uint32, curInstances *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32, userName *uint16, maxUserNameSize uint32) (err error) = GetNamedPipeHandleStateW
 //sys	SetNamedPipeHandleState(pipe Handle, state *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32) (err error) = SetNamedPipeHandleState
@@ -211,6 +215,10 @@ func NewCallbackCDecl(fn interface{}) uintptr {
 //sys	OpenProcess(desiredAccess uint32, inheritHandle bool, processId uint32) (handle Handle, err error)
 //sys	ShellExecute(hwnd Handle, verb *uint16, file *uint16, args *uint16, cwd *uint16, showCmd int32) (err error) [failretval<=32] = shell32.ShellExecuteW
 //sys	GetWindowThreadProcessId(hwnd HWND, pid *uint32) (tid uint32, err error) = user32.GetWindowThreadProcessId
+//sys	LoadKeyboardLayout(name *uint16, flags uint32) (hkl Handle, err error) [failretval==0] = user32.LoadKeyboardLayoutW
+//sys	UnloadKeyboardLayout(hkl Handle) (err error) = user32.UnloadKeyboardLayout
+//sys	GetKeyboardLayout(tid uint32) (hkl Handle) = user32.GetKeyboardLayout
+//sys	ToUnicodeEx(vkey uint32, scancode uint32, keystate *byte, pwszBuff *uint16, cchBuff int32, flags uint32, hkl Handle) (ret int32) = user32.ToUnicodeEx
 //sys	GetShellWindow() (shellWindow HWND) = user32.GetShellWindow
 //sys	MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret int32, err error) [failretval==0] = user32.MessageBoxW
 //sys	ExitWindowsEx(flags uint32, reason uint32) (err error) = user32.ExitWindowsEx
@@ -307,6 +315,10 @@ func NewCallbackCDecl(fn interface{}) uintptr {
 //sys	SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode
 //sys	GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo
 //sys	setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition
+//sys	GetConsoleCP() (cp uint32, err error) = kernel32.GetConsoleCP
+//sys	GetConsoleOutputCP() (cp uint32, err error) = kernel32.GetConsoleOutputCP
+//sys	SetConsoleCP(cp uint32) (err error) = kernel32.SetConsoleCP
+//sys	SetConsoleOutputCP(cp uint32) (err error) = kernel32.SetConsoleOutputCP
 //sys	WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW
 //sys	ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW
 //sys	resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole
@@ -715,20 +727,12 @@ func DurationSinceBoot() time.Duration {
 }
 
 func Ftruncate(fd Handle, length int64) (err error) {
-	curoffset, e := Seek(fd, 0, 1)
-	if e != nil {
-		return e
+	type _FILE_END_OF_FILE_INFO struct {
+		EndOfFile int64 // new end-of-file position; filled from the length argument below
 	}
-	defer Seek(fd, curoffset, 0)
-	_, e = Seek(fd, length, 0)
-	if e != nil {
-		return e
-	}
-	e = SetEndOfFile(fd)
-	if e != nil {
-		return e
-	}
-	return nil
+	var info _FILE_END_OF_FILE_INFO
+	info.EndOfFile = length
+	return SetFileInformationByHandle(fd, FileEndOfFileInfo, (*byte)(unsafe.Pointer(&info)), uint32(unsafe.Sizeof(info))) // single call; replaces the previous Seek / SetEndOfFile / Seek sequence
 }
 
 func Gettimeofday(tv *Timeval) (err error) {
@@ -884,6 +888,11 @@ const socket_error = uintptr(^uint32(0))
 //sys	GetACP() (acp uint32) = kernel32.GetACP
 //sys	MultiByteToWideChar(codePage uint32, dwFlags uint32, str *byte, nstr int32, wchar *uint16, nwchar int32) (nwrite int32, err error) = kernel32.MultiByteToWideChar
 //sys	getBestInterfaceEx(sockaddr unsafe.Pointer, pdwBestIfIndex *uint32) (errcode error) = iphlpapi.GetBestInterfaceEx
+//sys   GetIfEntry2Ex(level uint32, row *MibIfRow2) (errcode error) = iphlpapi.GetIfEntry2Ex
+//sys   GetUnicastIpAddressEntry(row *MibUnicastIpAddressRow) (errcode error) = iphlpapi.GetUnicastIpAddressEntry
+//sys   NotifyIpInterfaceChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) = iphlpapi.NotifyIpInterfaceChange
+//sys   NotifyUnicastIpAddressChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) = iphlpapi.NotifyUnicastIpAddressChange
+//sys   CancelMibChangeNotify2(notificationHandle Handle) (errcode error) = iphlpapi.CancelMibChangeNotify2
 
 // For testing: clients can set this flag to force
 // creation of IPv6 sockets to return EAFNOSUPPORT.
@@ -1368,9 +1377,11 @@ func SetsockoptLinger(fd Handle, level, opt int, l *Linger) (err error) {
 func SetsockoptInet4Addr(fd Handle, level, opt int, value [4]byte) (err error) {
 	return Setsockopt(fd, int32(level), int32(opt), (*byte)(unsafe.Pointer(&value[0])), 4)
 }
+
 func SetsockoptIPMreq(fd Handle, level, opt int, mreq *IPMreq) (err error) {
 	return Setsockopt(fd, int32(level), int32(opt), (*byte)(unsafe.Pointer(mreq)), int32(unsafe.Sizeof(*mreq)))
 }
+
 func SetsockoptIPv6Mreq(fd Handle, level, opt int, mreq *IPv6Mreq) (err error) {
 	return syscall.EWINDOWS
 }
@@ -1673,13 +1684,16 @@ func (s NTStatus) Error() string {
 // do not use NTUnicodeString, and instead UTF16PtrFromString should be used for
 // the more common *uint16 string type.
 func NewNTUnicodeString(s string) (*NTUnicodeString, error) {
-	var u NTUnicodeString
-	s16, err := UTF16PtrFromString(s)
+	s16, err := UTF16FromString(s)
 	if err != nil {
 		return nil, err
 	}
-	RtlInitUnicodeString(&u, s16)
-	return &u, nil
+	n := uint16(len(s16) * 2) // size of s16 in bytes; NOTE(review): the uint16 conversion wraps silently above 65535 bytes - confirm callers never pass strings that long
+	return &NTUnicodeString{
+		Length:        n - 2, // subtract 2 bytes for the NULL terminator
+		MaximumLength: n,
+		Buffer:        &s16[0],
+	}, nil
 }
 
 // Slice returns a uint16 slice that aliases the data in the NTUnicodeString.
diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go
index d8cb71db..9d138de5 100644
--- a/vendor/golang.org/x/sys/windows/types_windows.go
+++ b/vendor/golang.org/x/sys/windows/types_windows.go
@@ -176,6 +176,7 @@ const (
 	WAIT_FAILED    = 0xFFFFFFFF
 
 	// Access rights for process.
+	PROCESS_ALL_ACCESS                = 0xFFFF // NOTE(review): winnt.h defines this as STANDARD_RIGHTS_REQUIRED|SYNCHRONIZE|0xFFFF (0x1FFFFF); confirm the narrower value is intended
 	PROCESS_CREATE_PROCESS            = 0x0080
 	PROCESS_CREATE_THREAD             = 0x0002
 	PROCESS_DUP_HANDLE                = 0x0040
@@ -1060,6 +1061,7 @@ const (
 	SIO_GET_EXTENSION_FUNCTION_POINTER = IOC_INOUT | IOC_WS2 | 6
 	SIO_KEEPALIVE_VALS                 = IOC_IN | IOC_VENDOR | 4
 	SIO_UDP_CONNRESET                  = IOC_IN | IOC_VENDOR | 12
+	SIO_UDP_NETRESET                   = IOC_IN | IOC_VENDOR | 15
 
 	// cf. http://support.microsoft.com/default.aspx?scid=kb;en-us;257460
 
@@ -2003,7 +2005,21 @@ const (
 	MOVEFILE_FAIL_IF_NOT_TRACKABLE = 0x20
 )
 
-const GAA_FLAG_INCLUDE_PREFIX = 0x00000010
+// Flags for GetAdaptersAddresses, see
+// https://learn.microsoft.com/en-us/windows/win32/api/iphlpapi/nf-iphlpapi-getadaptersaddresses.
+const (
+	GAA_FLAG_SKIP_UNICAST                = 0x1
+	GAA_FLAG_SKIP_ANYCAST                = 0x2
+	GAA_FLAG_SKIP_MULTICAST              = 0x4
+	GAA_FLAG_SKIP_DNS_SERVER             = 0x8
+	GAA_FLAG_INCLUDE_PREFIX              = 0x10
+	GAA_FLAG_SKIP_FRIENDLY_NAME          = 0x20
+	GAA_FLAG_INCLUDE_WINS_INFO           = 0x40
+	GAA_FLAG_INCLUDE_GATEWAYS            = 0x80
+	GAA_FLAG_INCLUDE_ALL_INTERFACES      = 0x100
+	GAA_FLAG_INCLUDE_ALL_COMPARTMENTS    = 0x200
+	GAA_FLAG_INCLUDE_TUNNEL_BINDINGORDER = 0x400
+)
 
 const (
 	IF_TYPE_OTHER              = 1
@@ -2017,6 +2033,50 @@ const (
 	IF_TYPE_IEEE1394           = 144
 )
 
+// Enum NL_PREFIX_ORIGIN for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_prefix_origin
+const (
+	IpPrefixOriginOther               = 0
+	IpPrefixOriginManual              = 1
+	IpPrefixOriginWellKnown           = 2
+	IpPrefixOriginDhcp                = 3
+	IpPrefixOriginRouterAdvertisement = 4
+	IpPrefixOriginUnchanged           = 1 << 4
+)
+
+// Enum NL_SUFFIX_ORIGIN for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_suffix_origin
+const (
+	NlsoOther                      = 0
+	NlsoManual                     = 1
+	NlsoWellKnown                  = 2
+	NlsoDhcp                       = 3
+	NlsoLinkLayerAddress           = 4
+	NlsoRandom                     = 5
+	IpSuffixOriginOther            = 0
+	IpSuffixOriginManual           = 1
+	IpSuffixOriginWellKnown        = 2
+	IpSuffixOriginDhcp             = 3
+	IpSuffixOriginLinkLayerAddress = 4
+	IpSuffixOriginRandom           = 5
+	IpSuffixOriginUnchanged        = 1 << 4
+)
+
+// Enum NL_DAD_STATE for [IpAdapterUnicastAddress], see
+// https://learn.microsoft.com/en-us/windows/win32/api/nldef/ne-nldef-nl_dad_state
+const (
+	NldsInvalid          = 0
+	NldsTentative        = 1
+	NldsDuplicate        = 2
+	NldsDeprecated       = 3
+	NldsPreferred        = 4
+	IpDadStateInvalid    = 0
+	IpDadStateTentative  = 1
+	IpDadStateDuplicate  = 2
+	IpDadStateDeprecated = 3
+	IpDadStatePreferred  = 4
+)
+
 type SocketAddress struct {
 	Sockaddr       *syscall.RawSockaddrAny
 	SockaddrLength int32
@@ -2144,6 +2204,132 @@ const (
 	IfOperStatusLowerLayerDown = 7
 )
 
+const (
+	IF_MAX_PHYS_ADDRESS_LENGTH = 32
+	IF_MAX_STRING_SIZE         = 256
+)
+
+// MIB_IF_ENTRY_LEVEL enumeration from netioapi.h or
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/nf-netioapi-getifentry2ex.
+const (
+	MibIfEntryNormal                  = 0
+	MibIfEntryNormalWithoutStatistics = 2
+)
+
+// MIB_NOTIFICATION_TYPE enumeration from netioapi.h or
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ne-netioapi-mib_notification_type.
+const (
+	MibParameterNotification = 0
+	MibAddInstance           = 1
+	MibDeleteInstance        = 2
+	MibInitialNotification   = 3
+)
+
+// MibIfRow2 stores information about a particular interface. See
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_if_row2.
+type MibIfRow2 struct {
+	InterfaceLuid               uint64
+	InterfaceIndex              uint32
+	InterfaceGuid               GUID
+	Alias                       [IF_MAX_STRING_SIZE + 1]uint16
+	Description                 [IF_MAX_STRING_SIZE + 1]uint16
+	PhysicalAddressLength       uint32
+	PhysicalAddress             [IF_MAX_PHYS_ADDRESS_LENGTH]uint8
+	PermanentPhysicalAddress    [IF_MAX_PHYS_ADDRESS_LENGTH]uint8
+	Mtu                         uint32
+	Type                        uint32
+	TunnelType                  uint32
+	MediaType                   uint32
+	PhysicalMediumType          uint32
+	AccessType                  uint32
+	DirectionType               uint32
+	InterfaceAndOperStatusFlags uint8
+	OperStatus                  uint32
+	AdminStatus                 uint32
+	MediaConnectState           uint32
+	NetworkGuid                 GUID
+	ConnectionType              uint32
+	TransmitLinkSpeed           uint64
+	ReceiveLinkSpeed            uint64
+	InOctets                    uint64
+	InUcastPkts                 uint64
+	InNUcastPkts                uint64
+	InDiscards                  uint64
+	InErrors                    uint64
+	InUnknownProtos             uint64
+	InUcastOctets               uint64
+	InMulticastOctets           uint64
+	InBroadcastOctets           uint64
+	OutOctets                   uint64
+	OutUcastPkts                uint64
+	OutNUcastPkts               uint64
+	OutDiscards                 uint64
+	OutErrors                   uint64
+	OutUcastOctets              uint64
+	OutMulticastOctets          uint64
+	OutBroadcastOctets          uint64
+	OutQLen                     uint64
+}
+
+// MibUnicastIpAddressRow (MIB_UNICASTIPADDRESS_ROW) stores information about a unicast IP address. See
+// https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_unicastipaddress_row.
+type MibUnicastIpAddressRow struct {
+	Address            RawSockaddrInet6 // SOCKADDR_INET union
+	InterfaceLuid      uint64
+	InterfaceIndex     uint32
+	PrefixOrigin       uint32
+	SuffixOrigin       uint32
+	ValidLifetime      uint32
+	PreferredLifetime  uint32
+	OnLinkPrefixLength uint8
+	SkipAsSource       uint8
+	DadState           uint32
+	ScopeId            uint32
+	CreationTimeStamp  Filetime
+}
+
+const ScopeLevelCount = 16
+
+// MibIpInterfaceRow (MIB_IPINTERFACE_ROW) stores interface management information for a particular IP address family on a network interface.
+// See https://learn.microsoft.com/en-us/windows/win32/api/netioapi/ns-netioapi-mib_ipinterface_row.
+type MibIpInterfaceRow struct {
+	Family                               uint16
+	InterfaceLuid                        uint64
+	InterfaceIndex                       uint32
+	MaxReassemblySize                    uint32
+	InterfaceIdentifier                  uint64
+	MinRouterAdvertisementInterval       uint32
+	MaxRouterAdvertisementInterval       uint32
+	AdvertisingEnabled                   uint8
+	ForwardingEnabled                    uint8
+	WeakHostSend                         uint8
+	WeakHostReceive                      uint8
+	UseAutomaticMetric                   uint8
+	UseNeighborUnreachabilityDetection   uint8
+	ManagedAddressConfigurationSupported uint8
+	OtherStatefulConfigurationSupported  uint8
+	AdvertiseDefaultRoute                uint8
+	RouterDiscoveryBehavior              uint32
+	DadTransmits                         uint32
+	BaseReachableTime                    uint32
+	RetransmitTime                       uint32
+	PathMtuDiscoveryTimeout              uint32
+	LinkLocalAddressBehavior             uint32
+	LinkLocalAddressTimeout              uint32
+	ZoneIndices                          [ScopeLevelCount]uint32
+	SitePrefixLength                     uint32
+	Metric                               uint32
+	NlMtu                                uint32
+	Connected                            uint8
+	SupportsWakeUpPatterns               uint8
+	SupportsNeighborDiscovery            uint8
+	SupportsRouterDiscovery              uint8
+	ReachableTime                        uint32
+	TransmitOffload                      uint32
+	ReceiveOffload                       uint32
+	DisableDefaultRoutes                 uint8
+}
+
 // Console related constants used for the mode parameter to SetConsoleMode. See
 // https://docs.microsoft.com/en-us/windows/console/setconsolemode for details.
 
@@ -3404,3 +3590,14 @@ type DCB struct {
 	EvtChar    byte
 	wReserved1 uint16
 }
+
+// Keyboard Layout Flags.
+// See https://learn.microsoft.com/en-us/windows/win32/api/winuser/nf-winuser-loadkeyboardlayoutw
+const (
+	KLF_ACTIVATE      = 0x00000001
+	KLF_SUBSTITUTE_OK = 0x00000002
+	KLF_REORDER       = 0x00000008
+	KLF_REPLACELANG   = 0x00000010
+	KLF_NOTELLSHELL   = 0x00000080
+	KLF_SETFORPROCESS = 0x00000100
+)
diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
index 9f73df75..01c0716c 100644
--- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go
+++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go
@@ -91,6 +91,7 @@ var (
 	procEnumServicesStatusExW                                = modadvapi32.NewProc("EnumServicesStatusExW")
 	procEqualSid                                             = modadvapi32.NewProc("EqualSid")
 	procFreeSid                                              = modadvapi32.NewProc("FreeSid")
+	procGetAce                                               = modadvapi32.NewProc("GetAce")
 	procGetLengthSid                                         = modadvapi32.NewProc("GetLengthSid")
 	procGetNamedSecurityInfoW                                = modadvapi32.NewProc("GetNamedSecurityInfoW")
 	procGetSecurityDescriptorControl                         = modadvapi32.NewProc("GetSecurityDescriptorControl")
@@ -180,10 +181,15 @@ var (
 	procDnsRecordListFree                                    = moddnsapi.NewProc("DnsRecordListFree")
 	procDwmGetWindowAttribute                                = moddwmapi.NewProc("DwmGetWindowAttribute")
 	procDwmSetWindowAttribute                                = moddwmapi.NewProc("DwmSetWindowAttribute")
+	procCancelMibChangeNotify2                               = modiphlpapi.NewProc("CancelMibChangeNotify2")
 	procGetAdaptersAddresses                                 = modiphlpapi.NewProc("GetAdaptersAddresses")
 	procGetAdaptersInfo                                      = modiphlpapi.NewProc("GetAdaptersInfo")
 	procGetBestInterfaceEx                                   = modiphlpapi.NewProc("GetBestInterfaceEx")
 	procGetIfEntry                                           = modiphlpapi.NewProc("GetIfEntry")
+	procGetIfEntry2Ex                                        = modiphlpapi.NewProc("GetIfEntry2Ex")
+	procGetUnicastIpAddressEntry                             = modiphlpapi.NewProc("GetUnicastIpAddressEntry")
+	procNotifyIpInterfaceChange                              = modiphlpapi.NewProc("NotifyIpInterfaceChange")
+	procNotifyUnicastIpAddressChange                         = modiphlpapi.NewProc("NotifyUnicastIpAddressChange")
 	procAddDllDirectory                                      = modkernel32.NewProc("AddDllDirectory")
 	procAssignProcessToJobObject                             = modkernel32.NewProc("AssignProcessToJobObject")
 	procCancelIo                                             = modkernel32.NewProc("CancelIo")
@@ -246,7 +252,9 @@ var (
 	procGetCommandLineW                                      = modkernel32.NewProc("GetCommandLineW")
 	procGetComputerNameExW                                   = modkernel32.NewProc("GetComputerNameExW")
 	procGetComputerNameW                                     = modkernel32.NewProc("GetComputerNameW")
+	procGetConsoleCP                                         = modkernel32.NewProc("GetConsoleCP")
 	procGetConsoleMode                                       = modkernel32.NewProc("GetConsoleMode")
+	procGetConsoleOutputCP                                   = modkernel32.NewProc("GetConsoleOutputCP")
 	procGetConsoleScreenBufferInfo                           = modkernel32.NewProc("GetConsoleScreenBufferInfo")
 	procGetCurrentDirectoryW                                 = modkernel32.NewProc("GetCurrentDirectoryW")
 	procGetCurrentProcessId                                  = modkernel32.NewProc("GetCurrentProcessId")
@@ -272,8 +280,10 @@ var (
 	procGetMaximumProcessorCount                             = modkernel32.NewProc("GetMaximumProcessorCount")
 	procGetModuleFileNameW                                   = modkernel32.NewProc("GetModuleFileNameW")
 	procGetModuleHandleExW                                   = modkernel32.NewProc("GetModuleHandleExW")
+	procGetNamedPipeClientProcessId                          = modkernel32.NewProc("GetNamedPipeClientProcessId")
 	procGetNamedPipeHandleStateW                             = modkernel32.NewProc("GetNamedPipeHandleStateW")
 	procGetNamedPipeInfo                                     = modkernel32.NewProc("GetNamedPipeInfo")
+	procGetNamedPipeServerProcessId                          = modkernel32.NewProc("GetNamedPipeServerProcessId")
 	procGetOverlappedResult                                  = modkernel32.NewProc("GetOverlappedResult")
 	procGetPriorityClass                                     = modkernel32.NewProc("GetPriorityClass")
 	procGetProcAddress                                       = modkernel32.NewProc("GetProcAddress")
@@ -346,8 +356,10 @@ var (
 	procSetCommMask                                          = modkernel32.NewProc("SetCommMask")
 	procSetCommState                                         = modkernel32.NewProc("SetCommState")
 	procSetCommTimeouts                                      = modkernel32.NewProc("SetCommTimeouts")
+	procSetConsoleCP                                         = modkernel32.NewProc("SetConsoleCP")
 	procSetConsoleCursorPosition                             = modkernel32.NewProc("SetConsoleCursorPosition")
 	procSetConsoleMode                                       = modkernel32.NewProc("SetConsoleMode")
+	procSetConsoleOutputCP                                   = modkernel32.NewProc("SetConsoleOutputCP")
 	procSetCurrentDirectoryW                                 = modkernel32.NewProc("SetCurrentDirectoryW")
 	procSetDefaultDllDirectories                             = modkernel32.NewProc("SetDefaultDllDirectories")
 	procSetDllDirectoryW                                     = modkernel32.NewProc("SetDllDirectoryW")
@@ -477,12 +489,16 @@ var (
 	procGetDesktopWindow                                     = moduser32.NewProc("GetDesktopWindow")
 	procGetForegroundWindow                                  = moduser32.NewProc("GetForegroundWindow")
 	procGetGUIThreadInfo                                     = moduser32.NewProc("GetGUIThreadInfo")
+	procGetKeyboardLayout                                    = moduser32.NewProc("GetKeyboardLayout")
 	procGetShellWindow                                       = moduser32.NewProc("GetShellWindow")
 	procGetWindowThreadProcessId                             = moduser32.NewProc("GetWindowThreadProcessId")
 	procIsWindow                                             = moduser32.NewProc("IsWindow")
 	procIsWindowUnicode                                      = moduser32.NewProc("IsWindowUnicode")
 	procIsWindowVisible                                      = moduser32.NewProc("IsWindowVisible")
+	procLoadKeyboardLayoutW                                  = moduser32.NewProc("LoadKeyboardLayoutW")
 	procMessageBoxW                                          = moduser32.NewProc("MessageBoxW")
+	procToUnicodeEx                                          = moduser32.NewProc("ToUnicodeEx")
+	procUnloadKeyboardLayout                                 = moduser32.NewProc("UnloadKeyboardLayout")
 	procCreateEnvironmentBlock                               = moduserenv.NewProc("CreateEnvironmentBlock")
 	procDestroyEnvironmentBlock                              = moduserenv.NewProc("DestroyEnvironmentBlock")
 	procGetUserProfileDirectoryW                             = moduserenv.NewProc("GetUserProfileDirectoryW")
@@ -788,6 +804,14 @@ func FreeSid(sid *SID) (err error) {
 	return
 }
 
+func GetAce(acl *ACL, aceIndex uint32, pAce **ACCESS_ALLOWED_ACE) (err error) {
+	r1, _, e1 := syscall.Syscall(procGetAce.Addr(), 3, uintptr(unsafe.Pointer(acl)), uintptr(aceIndex), uintptr(unsafe.Pointer(pAce)))
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetLengthSid(sid *SID) (len uint32) {
 	r0, _, _ := syscall.Syscall(procGetLengthSid.Addr(), 1, uintptr(unsafe.Pointer(sid)), 0, 0)
 	len = uint32(r0)
@@ -1589,6 +1613,14 @@ func DwmSetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, si
 	return
 }
 
+func CancelMibChangeNotify2(notificationHandle Handle) (errcode error) {
+	r0, _, _ := syscall.Syscall(procCancelMibChangeNotify2.Addr(), 1, uintptr(notificationHandle), 0, 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
 func GetAdaptersAddresses(family uint32, flags uint32, reserved uintptr, adapterAddresses *IpAdapterAddresses, sizePointer *uint32) (errcode error) {
 	r0, _, _ := syscall.Syscall6(procGetAdaptersAddresses.Addr(), 5, uintptr(family), uintptr(flags), uintptr(reserved), uintptr(unsafe.Pointer(adapterAddresses)), uintptr(unsafe.Pointer(sizePointer)), 0)
 	if r0 != 0 {
@@ -1621,6 +1653,46 @@ func GetIfEntry(pIfRow *MibIfRow) (errcode error) {
 	return
 }
 
+func GetIfEntry2Ex(level uint32, row *MibIfRow2) (errcode error) {
+	r0, _, _ := syscall.Syscall(procGetIfEntry2Ex.Addr(), 2, uintptr(level), uintptr(unsafe.Pointer(row)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func GetUnicastIpAddressEntry(row *MibUnicastIpAddressRow) (errcode error) {
+	r0, _, _ := syscall.Syscall(procGetUnicastIpAddressEntry.Addr(), 1, uintptr(unsafe.Pointer(row)), 0, 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func NotifyIpInterfaceChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) {
+	var _p0 uint32
+	if initialNotification {
+		_p0 = 1
+	}
+	r0, _, _ := syscall.Syscall6(procNotifyIpInterfaceChange.Addr(), 5, uintptr(family), uintptr(callback), uintptr(callerContext), uintptr(_p0), uintptr(unsafe.Pointer(notificationHandle)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
+func NotifyUnicastIpAddressChange(family uint16, callback uintptr, callerContext unsafe.Pointer, initialNotification bool, notificationHandle *Handle) (errcode error) {
+	var _p0 uint32
+	if initialNotification {
+		_p0 = 1
+	}
+	r0, _, _ := syscall.Syscall6(procNotifyUnicastIpAddressChange.Addr(), 5, uintptr(family), uintptr(callback), uintptr(callerContext), uintptr(_p0), uintptr(unsafe.Pointer(notificationHandle)), 0)
+	if r0 != 0 {
+		errcode = syscall.Errno(r0)
+	}
+	return
+}
+
 func AddDllDirectory(path *uint16) (cookie uintptr, err error) {
 	r0, _, e1 := syscall.Syscall(procAddDllDirectory.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
 	cookie = uintptr(r0)
@@ -2149,6 +2221,15 @@ func GetComputerName(buf *uint16, n *uint32) (err error) {
 	return
 }
 
+func GetConsoleCP() (cp uint32, err error) {
+	r0, _, e1 := syscall.Syscall(procGetConsoleCP.Addr(), 0, 0, 0, 0)
+	cp = uint32(r0)
+	if cp == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetConsoleMode(console Handle, mode *uint32) (err error) {
 	r1, _, e1 := syscall.Syscall(procGetConsoleMode.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(mode)), 0)
 	if r1 == 0 {
@@ -2157,6 +2238,15 @@ func GetConsoleMode(console Handle, mode *uint32) (err error) {
 	return
 }
 
+func GetConsoleOutputCP() (cp uint32, err error) {
+	r0, _, e1 := syscall.Syscall(procGetConsoleOutputCP.Addr(), 0, 0, 0, 0)
+	cp = uint32(r0)
+	if cp == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) {
 	r1, _, e1 := syscall.Syscall(procGetConsoleScreenBufferInfo.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(info)), 0)
 	if r1 == 0 {
@@ -2358,6 +2448,14 @@ func GetModuleHandleEx(flags uint32, moduleName *uint16, module *Handle) (err er
 	return
 }
 
+func GetNamedPipeClientProcessId(pipe Handle, clientProcessID *uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procGetNamedPipeClientProcessId.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(clientProcessID)), 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetNamedPipeHandleState(pipe Handle, state *uint32, curInstances *uint32, maxCollectionCount *uint32, collectDataTimeout *uint32, userName *uint16, maxUserNameSize uint32) (err error) {
 	r1, _, e1 := syscall.Syscall9(procGetNamedPipeHandleStateW.Addr(), 7, uintptr(pipe), uintptr(unsafe.Pointer(state)), uintptr(unsafe.Pointer(curInstances)), uintptr(unsafe.Pointer(maxCollectionCount)), uintptr(unsafe.Pointer(collectDataTimeout)), uintptr(unsafe.Pointer(userName)), uintptr(maxUserNameSize), 0, 0)
 	if r1 == 0 {
@@ -2374,6 +2472,14 @@ func GetNamedPipeInfo(pipe Handle, flags *uint32, outSize *uint32, inSize *uint3
 	return
 }
 
+func GetNamedPipeServerProcessId(pipe Handle, serverProcessID *uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procGetNamedPipeServerProcessId.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(serverProcessID)), 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func GetOverlappedResult(handle Handle, overlapped *Overlapped, done *uint32, wait bool) (err error) {
 	var _p0 uint32
 	if wait {
@@ -3025,6 +3131,14 @@ func SetCommTimeouts(handle Handle, timeouts *CommTimeouts) (err error) {
 	return
 }
 
+func SetConsoleCP(cp uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procSetConsoleCP.Addr(), 1, uintptr(cp), 0, 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func setConsoleCursorPosition(console Handle, position uint32) (err error) {
 	r1, _, e1 := syscall.Syscall(procSetConsoleCursorPosition.Addr(), 2, uintptr(console), uintptr(position), 0)
 	if r1 == 0 {
@@ -3041,6 +3155,14 @@ func SetConsoleMode(console Handle, mode uint32) (err error) {
 	return
 }
 
+func SetConsoleOutputCP(cp uint32) (err error) {
+	r1, _, e1 := syscall.Syscall(procSetConsoleOutputCP.Addr(), 1, uintptr(cp), 0, 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func SetCurrentDirectory(path *uint16) (err error) {
 	r1, _, e1 := syscall.Syscall(procSetCurrentDirectoryW.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0)
 	if r1 == 0 {
@@ -4073,6 +4195,12 @@ func GetGUIThreadInfo(thread uint32, info *GUIThreadInfo) (err error) {
 	return
 }
 
+func GetKeyboardLayout(tid uint32) (hkl Handle) {
+	r0, _, _ := syscall.Syscall(procGetKeyboardLayout.Addr(), 1, uintptr(tid), 0, 0)
+	hkl = Handle(r0)
+	return
+}
+
 func GetShellWindow() (shellWindow HWND) {
 	r0, _, _ := syscall.Syscall(procGetShellWindow.Addr(), 0, 0, 0, 0)
 	shellWindow = HWND(r0)
@@ -4106,6 +4234,15 @@ func IsWindowVisible(hwnd HWND) (isVisible bool) {
 	return
 }
 
+func LoadKeyboardLayout(name *uint16, flags uint32) (hkl Handle, err error) {
+	r0, _, e1 := syscall.Syscall(procLoadKeyboardLayoutW.Addr(), 2, uintptr(unsafe.Pointer(name)), uintptr(flags), 0)
+	hkl = Handle(r0)
+	if hkl == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret int32, err error) {
 	r0, _, e1 := syscall.Syscall6(procMessageBoxW.Addr(), 4, uintptr(hwnd), uintptr(unsafe.Pointer(text)), uintptr(unsafe.Pointer(caption)), uintptr(boxtype), 0, 0)
 	ret = int32(r0)
@@ -4115,6 +4252,20 @@ func MessageBox(hwnd HWND, text *uint16, caption *uint16, boxtype uint32) (ret i
 	return
 }
 
+func ToUnicodeEx(vkey uint32, scancode uint32, keystate *byte, pwszBuff *uint16, cchBuff int32, flags uint32, hkl Handle) (ret int32) {
+	r0, _, _ := syscall.Syscall9(procToUnicodeEx.Addr(), 7, uintptr(vkey), uintptr(scancode), uintptr(unsafe.Pointer(keystate)), uintptr(unsafe.Pointer(pwszBuff)), uintptr(cchBuff), uintptr(flags), uintptr(hkl), 0, 0)
+	ret = int32(r0)
+	return
+}
+
+func UnloadKeyboardLayout(hkl Handle) (err error) {
+	r1, _, e1 := syscall.Syscall(procUnloadKeyboardLayout.Addr(), 1, uintptr(hkl), 0, 0)
+	if r1 == 0 {
+		err = errnoErr(e1)
+	}
+	return
+}
+
 func CreateEnvironmentBlock(block **uint16, token Token, inheritExisting bool) (err error) {
 	var _p0 uint32
 	if inheritExisting {
diff --git a/vendor/golang.org/x/term/LICENSE b/vendor/golang.org/x/term/LICENSE
index 6a66aea5..2a7cf70d 100644
--- a/vendor/golang.org/x/term/LICENSE
+++ b/vendor/golang.org/x/term/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
+Copyright 2009 The Go Authors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer.
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
-   * Neither the name of Google Inc. nor the names of its
+   * Neither the name of Google LLC nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/vendor/golang.org/x/term/README.md b/vendor/golang.org/x/term/README.md
index d03d0aef..05ff623f 100644
--- a/vendor/golang.org/x/term/README.md
+++ b/vendor/golang.org/x/term/README.md
@@ -4,16 +4,13 @@
 
 This repository provides Go terminal and console support packages.
 
-## Download/Install
-
-The easiest way to install is to run `go get -u golang.org/x/term`. You can
-also manually git clone the repository to `$GOPATH/src/golang.org/x/term`.
-
 ## Report Issues / Send Patches
 
 This repository uses Gerrit for code changes. To learn how to submit changes to
-this repository, see https://golang.org/doc/contribute.html.
+this repository, see https://go.dev/doc/contribute.
+
+The git repository is https://go.googlesource.com/term.
 
 The main issue tracker for the term repository is located at
-https://github.com/golang/go/issues. Prefix your issue with "x/term:" in the
+https://go.dev/issues. Prefix your issue with "x/term:" in the
 subject line, so it is easy to find.
diff --git a/vendor/golang.org/x/term/term_windows.go b/vendor/golang.org/x/term/term_windows.go
index 465f5606..df6bf948 100644
--- a/vendor/golang.org/x/term/term_windows.go
+++ b/vendor/golang.org/x/term/term_windows.go
@@ -26,6 +26,7 @@ func makeRaw(fd int) (*State, error) {
 		return nil, err
 	}
 	raw := st &^ (windows.ENABLE_ECHO_INPUT | windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT)
+	raw |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT
 	if err := windows.SetConsoleMode(windows.Handle(fd), raw); err != nil {
 		return nil, err
 	}
diff --git a/vendor/golang.org/x/text/LICENSE b/vendor/golang.org/x/text/LICENSE
index 6a66aea5..2a7cf70d 100644
--- a/vendor/golang.org/x/text/LICENSE
+++ b/vendor/golang.org/x/text/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2009 The Go Authors. All rights reserved.
+Copyright 2009 The Go Authors.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer.
 copyright notice, this list of conditions and the following disclaimer
 in the documentation and/or other materials provided with the
 distribution.
-   * Neither the name of Google Inc. nor the names of its
+   * Neither the name of Google LLC nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.
 
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 468e4361..6ae88d40 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -221,7 +221,7 @@ github.com/prometheus/common/model
 github.com/prometheus/procfs
 github.com/prometheus/procfs/internal/fs
 github.com/prometheus/procfs/internal/util
-# github.com/quic-go/quic-go v0.45.0
+# github.com/quic-go/quic-go v0.45.0 => github.com/chungthuang/quic-go v0.45.1-0.20250128102735-2687bd175910
 ## explicit; go 1.21
 github.com/quic-go/quic-go
 github.com/quic-go/quic-go/internal/ackhandler
@@ -309,14 +309,13 @@ go.uber.org/automaxprocs/maxprocs
 go.uber.org/mock/gomock
 go.uber.org/mock/mockgen
 go.uber.org/mock/mockgen/model
-# golang.org/x/crypto v0.24.0
-## explicit; go 1.18
+# golang.org/x/crypto v0.31.0
+## explicit; go 1.20
 golang.org/x/crypto/blake2b
 golang.org/x/crypto/blowfish
 golang.org/x/crypto/chacha20
 golang.org/x/crypto/chacha20poly1305
 golang.org/x/crypto/curve25519
-golang.org/x/crypto/curve25519/internal/field
 golang.org/x/crypto/hkdf
 golang.org/x/crypto/internal/alias
 golang.org/x/crypto/internal/poly1305
@@ -358,10 +357,10 @@ golang.org/x/net/websocket
 ## explicit; go 1.18
 golang.org/x/oauth2
 golang.org/x/oauth2/internal
-# golang.org/x/sync v0.7.0
+# golang.org/x/sync v0.10.0
 ## explicit; go 1.18
 golang.org/x/sync/errgroup
-# golang.org/x/sys v0.21.0
+# golang.org/x/sys v0.28.0
 ## explicit; go 1.18
 golang.org/x/sys/cpu
 golang.org/x/sys/execabs
@@ -372,10 +371,10 @@ golang.org/x/sys/windows/registry
 golang.org/x/sys/windows/svc
 golang.org/x/sys/windows/svc/eventlog
 golang.org/x/sys/windows/svc/mgr
-# golang.org/x/term v0.21.0
+# golang.org/x/term v0.27.0
 ## explicit; go 1.18
 golang.org/x/term
-# golang.org/x/text v0.16.0
+# golang.org/x/text v0.21.0
 ## explicit; go 1.18
 golang.org/x/text/cases
 golang.org/x/text/internal
@@ -557,3 +556,4 @@ zombiezen.com/go/capnproto2/std/capnp/rpc
 # github.com/urfave/cli/v2 => github.com/ipostelnik/cli/v2 v2.3.1-0.20210324024421-b6ea8234fe3d
 # github.com/prometheus/golang_client => github.com/prometheus/golang_client v1.12.1
 # gopkg.in/yaml.v3 => gopkg.in/yaml.v3 v3.0.1
+# github.com/quic-go/quic-go => github.com/chungthuang/quic-go v0.45.1-0.20250128102735-2687bd175910