TUN-3090: Upgrade crypto dep

2020-06-12 00:03:09 -05:00 · 2020-06-12 00:03:09 -05:00 · 6e761cb7ae
parent ae8d784e36
commit 6e761cb7ae
65 changed files with 5603 additions and 3850 deletions
--- a/go.mod
+++ b/go.mod
@ -58,7 +58,7 @@ require (
 	github.com/stretchr/testify v1.3.0
 	github.com/tinylib/msgp v1.1.0 // indirect
 	github.com/xo/dburl v0.0.0-20191005012637-293c3298d6c0
-	golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
+	golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9
 	golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582
 	golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 // indirect
 	golang.org/x/sync v0.0.0-20190423024810-112230192c58
--- a/go.sum
+++ b/go.sum
@ -204,6 +204,8 @@ golang.org/x/crypto v0.0.0-20190325154230-a5d413f7728c/go.mod h1:djNgcEr1/C05ACk
 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq69pTHfNouLtWZG7j9rPN8=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 h1:vEg9joUBmeBcK9iSJftGNf3coIG4HqZElCPehJsfAYM=
 golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
 golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
--- a/vendor/golang.org/x/crypto/blake2b/blake2b.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b.go
@ -0,0 +1,291 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package blake2b implements the BLAKE2b hash algorithm defined by RFC 7693
 // and the extendable output function (XOF) BLAKE2Xb.
 //
 // BLAKE2b is optimized for 64-bit platforms—including NEON-enabled ARMs—and
 // produces digests of any size between 1 and 64 bytes.
 // For a detailed specification of BLAKE2b see https://blake2.net/blake2.pdf
 // and for BLAKE2Xb see https://blake2.net/blake2x.pdf
 //
 // If you aren't sure which function you need, use BLAKE2b (Sum512 or New512).
 // If you need a secret-key MAC (message authentication code), use the New512
 // function with a non-nil key.
 //
 // BLAKE2X is a construction to compute hash values larger than 64 bytes. It
 // can produce hash values between 0 and 4 GiB.
 package blake2b
 import (
 	"encoding/binary"
 	"errors"
 	"hash"
 )
 const (
 	// BlockSize is the block size of BLAKE2b in bytes.
 	BlockSize = 128
 	// Size is the hash size of BLAKE2b-512 in bytes.
 	Size = 64
 	// Size384 is the hash size of BLAKE2b-384 in bytes.
 	Size384 = 48
 	// Size256 is the hash size of BLAKE2b-256 in bytes.
 	Size256 = 32
 )
 // CPU feature flags read by the amd64 hashBlocks dispatchers. They keep their
 // zero value (false) unless a platform-specific init function assigns them.
 var (
 	useAVX2 bool
 	useAVX  bool
 	useSSE4 bool
 )
 // Errors returned by newDigest when the requested hash size or the key length
 // is outside the supported range.
 var (
 	errKeySize  = errors.New("blake2b: invalid key size")
 	errHashSize = errors.New("blake2b: invalid hash size")
 )
 // iv holds the eight 64-bit words every hash state starts from; the parameter
 // word (digest size, key length, fanout, depth) is XORed into iv[0] by
 // checkSum and digest.Reset.
 var iv = [8]uint64{
 	0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
 	0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
 }
 // Sum512 computes the 64-byte BLAKE2b-512 checksum of data in a single call.
 func Sum512(data []byte) [Size]byte {
 	var out [Size]byte
 	checkSum(&out, Size, data)
 	return out
 }
 // Sum384 computes the 48-byte BLAKE2b-384 checksum of data in a single call.
 func Sum384(data []byte) [Size384]byte {
 	// checkSum always writes into a full-size buffer; only the leading
 	// Size384 bytes belong to the result.
 	var full [Size]byte
 	checkSum(&full, Size384, data)
 	var out [Size384]byte
 	copy(out[:], full[:])
 	return out
 }
 // Sum256 computes the 32-byte BLAKE2b-256 checksum of data in a single call.
 func Sum256(data []byte) [Size256]byte {
 	// checkSum always writes into a full-size buffer; only the leading
 	// Size256 bytes belong to the result.
 	var full [Size]byte
 	checkSum(&full, Size256, data)
 	var out [Size256]byte
 	copy(out[:], full[:])
 	return out
 }
 // New512 returns a new hash.Hash computing the BLAKE2b-512 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 //
 // When the key is rejected the returned hash.Hash is a true nil interface
 // value rather than an interface wrapping a nil *digest, so comparing it to
 // nil gives the expected answer.
 func New512(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size, key)
 	if err != nil {
 		return nil, err
 	}
 	return d, nil
 }
 // New384 returns a new hash.Hash computing the BLAKE2b-384 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 //
 // When the key is rejected the returned hash.Hash is a true nil interface
 // value rather than an interface wrapping a nil *digest, so comparing it to
 // nil gives the expected answer.
 func New384(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size384, key)
 	if err != nil {
 		return nil, err
 	}
 	return d, nil
 }
 // New256 returns a new hash.Hash computing the BLAKE2b-256 checksum. A non-nil
 // key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 //
 // When the key is rejected the returned hash.Hash is a true nil interface
 // value rather than an interface wrapping a nil *digest, so comparing it to
 // nil gives the expected answer.
 func New256(key []byte) (hash.Hash, error) {
 	d, err := newDigest(Size256, key)
 	if err != nil {
 		return nil, err
 	}
 	return d, nil
 }
 // New returns a new hash.Hash computing the BLAKE2b checksum with a custom length.
 // A non-nil key turns the hash into a MAC. The key must be between zero and 64 bytes long.
 // The hash size can be a value between 1 and 64 but it is highly recommended to use
 // values equal or greater than:
 // - 32 if BLAKE2b is used as a hash function (The key is zero bytes long).
 // - 16 if BLAKE2b is used as a MAC function (The key is at least 16 bytes long).
 // When the key is nil, the returned hash.Hash implements BinaryMarshaler
 // and BinaryUnmarshaler for state (de)serialization as documented by hash.Hash.
 //
 // When size or key is rejected the returned hash.Hash is a true nil interface
 // value rather than an interface wrapping a nil *digest, so comparing it to
 // nil gives the expected answer.
 func New(size int, key []byte) (hash.Hash, error) {
 	d, err := newDigest(size, key)
 	if err != nil {
 		return nil, err
 	}
 	return d, nil
 }
 // newDigest validates hashSize (1..Size) and the key length (at most Size
 // bytes), then builds a digest holding a private copy of the key and resets
 // it so it is ready to accept input. The hash size is checked before the key.
 func newDigest(hashSize int, key []byte) (*digest, error) {
 	switch {
 	case hashSize < 1 || hashSize > Size:
 		return nil, errHashSize
 	case len(key) > Size:
 		return nil, errKeySize
 	}
 	d := new(digest)
 	d.size = hashSize
 	d.keyLen = len(key)
 	copy(d.key[:], key)
 	d.Reset()
 	return d, nil
 }
 // checkSum hashes data in one shot without a key and stores the result in sum.
 // Only the first (hashSize+7)/8 64-bit state words are written to sum, in
 // little-endian order; the caller truncates to hashSize bytes.
 func checkSum(sum *[Size]byte, hashSize int, data []byte) {
 	state := iv
 	// Parameter word: digest length in the low byte; bits 16 and 24 set.
 	state[0] ^= uint64(hashSize) | (1 << 16) | (1 << 24)
 	var counter [2]uint64
 	if len(data) > BlockSize {
 		// Hash every whole block except the last one, which must go through
 		// the final-block call below even when the input is block-aligned.
 		whole := len(data) &^ (BlockSize - 1)
 		if whole == len(data) {
 			whole -= BlockSize
 		}
 		hashBlocks(&state, &counter, 0, data[:whole])
 		data = data[whole:]
 	}
 	var last [BlockSize]byte
 	pad := uint64(BlockSize - copy(last[:], data))
 	// Pre-subtract the padding from the 128-bit counter (with borrow) so the
 	// counter reflects only real input bytes once the block is processed.
 	if counter[0] < pad {
 		counter[1]--
 	}
 	counter[0] -= pad
 	hashBlocks(&state, &counter, 0xFFFFFFFFFFFFFFFF, last[:])
 	words := (hashSize + 7) / 8
 	for i, w := range state[:words] {
 		binary.LittleEndian.PutUint64(sum[8*i:], w)
 	}
 }
 // digest is the streaming BLAKE2b state returned (as a hash.Hash) by the
 // New* constructors.
 type digest struct {
 	// h is the chaining state; Reset seeds it from iv and the parameter word.
 	h      [8]uint64
 	// c is the 128-bit byte counter passed to hashBlocks (c[0] low, c[1] high).
 	c      [2]uint64
 	// size is the digest length in bytes.
 	size   int
 	// block buffers input that has not been compressed yet.
 	block  [BlockSize]byte
 	// offset is the number of valid bytes currently held in block.
 	offset int
 	// key is the MAC key, zero-padded to a full block; Reset copies it into block.
 	key    [BlockSize]byte
 	// keyLen is the length of the key in bytes; zero means unkeyed hashing.
 	keyLen int
 }
 const (
 	// magic prefixes every marshaled digest state.
 	magic         = "b2b"
 	// marshaledSize is the exact encoded length: magic, the eight state
 	// words, the two counter words, one size byte, the block buffer and one
 	// offset byte.
 	marshaledSize = len(magic) + 8*8 + 2*8 + 1 + BlockSize + 1
 )
 // MarshalBinary encodes the current hashing state as magic, the state words
 // and counter (big-endian), the digest size, the block buffer and the buffer
 // offset. Keyed digests (MACs) cannot be marshaled and yield an error.
 func (d *digest) MarshalBinary() ([]byte, error) {
 	if d.keyLen != 0 {
 		return nil, errors.New("crypto/blake2b: cannot marshal MACs")
 	}
 	out := append(make([]byte, 0, marshaledSize), magic...)
 	for _, word := range d.h {
 		out = appendUint64(out, word)
 	}
 	for _, word := range d.c {
 		out = appendUint64(out, word)
 	}
 	// size is at most 64 and therefore fits in a single byte.
 	out = append(out, byte(d.size))
 	out = append(out, d.block[:]...)
 	return append(out, byte(d.offset)), nil
 }
 // UnmarshalBinary restores a hashing state previously produced by
 // MarshalBinary. It returns an error, leaving d untouched, when b does not
 // start with the expected identifier, has the wrong length, or carries a
 // digest size outside 1..Size or a buffer offset larger than BlockSize.
 // Rejecting such states here prevents out-of-range slicing panics in later
 // Write and Sum calls.
 func (d *digest) UnmarshalBinary(b []byte) error {
 	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
 		return errors.New("crypto/blake2b: invalid hash state identifier")
 	}
 	if len(b) != marshaledSize {
 		return errors.New("crypto/blake2b: invalid hash state size")
 	}
 	// Decode into locals first so a rejected state never half-overwrites d.
 	b = b[len(magic):]
 	var h [8]uint64
 	for i := range h {
 		b, h[i] = consumeUint64(b)
 	}
 	var c [2]uint64
 	for i := range c {
 		b, c[i] = consumeUint64(b)
 	}
 	size := int(b[0])
 	b = b[1:]
 	block := b[:BlockSize]
 	offset := int(b[BlockSize])
 	if size < 1 || size > Size {
 		return errors.New("crypto/blake2b: invalid hash size in hash state")
 	}
 	if offset > BlockSize {
 		return errors.New("crypto/blake2b: invalid block offset in hash state")
 	}
 	d.h, d.c = h, c
 	d.size, d.offset = size, offset
 	copy(d.block[:], block)
 	return nil
 }
 // BlockSize reports the block size of the hash in bytes, the package constant BlockSize.
 func (d *digest) BlockSize() int {
 	return BlockSize
 }
 // Size reports the number of bytes Sum appends for this digest.
 func (d *digest) Size() int {
 	return d.size
 }
 // Reset returns the digest to its initial state: the chaining value is
 // re-seeded from iv and the parameter word, the counter is cleared, and for a
 // keyed digest the padded key is queued as the first pending block.
 func (d *digest) Reset() {
 	params := uint64(d.size) | (uint64(d.keyLen) << 8) | (1 << 16) | (1 << 24)
 	d.h = iv
 	d.h[0] ^= params
 	d.c = [2]uint64{}
 	if d.keyLen <= 0 {
 		d.offset = 0
 		return
 	}
 	// A full buffered block: it is compressed once more input (or Sum) arrives.
 	d.block = d.key
 	d.offset = BlockSize
 }
 // Write absorbs p into the hash state. It always reports len(p) bytes written
 // and a nil error. The final (possibly full) block of input is kept buffered
 // instead of being compressed, because the last block must be processed by
 // finalize.
 func (d *digest) Write(p []byte) (n int, err error) {
 	total := len(p)
 	if buffered := d.offset; buffered > 0 {
 		room := BlockSize - buffered
 		if total <= room {
 			// Everything fits into the pending block; nothing to compress yet.
 			d.offset = buffered + copy(d.block[buffered:], p)
 			return total, nil
 		}
 		// Top up the pending block, compress it, and continue with the rest.
 		copy(d.block[buffered:], p[:room])
 		hashBlocks(&d.h, &d.c, 0, d.block[:])
 		d.offset = 0
 		p = p[room:]
 	}
 	if rest := len(p); rest > BlockSize {
 		// Compress whole blocks directly from p, always holding back the
 		// last one even when p is block-aligned.
 		whole := rest &^ (BlockSize - 1)
 		if whole == rest {
 			whole -= BlockSize
 		}
 		hashBlocks(&d.h, &d.c, 0, p[:whole])
 		p = p[whole:]
 	}
 	// Copying an empty remainder is a no-op, so no length guard is needed.
 	d.offset += copy(d.block[:], p)
 	return total, nil
 }
 // Sum appends the digest of everything written so far to sum and returns the
 // extended slice. It works on a finalized copy, so more data may be written
 // afterwards.
 func (d *digest) Sum(sum []byte) []byte {
 	var full [Size]byte
 	d.finalize(&full)
 	truncated := full[:d.size]
 	return append(sum, truncated...)
 }
 // finalize compresses the buffered tail as the last block and writes all
 // eight resulting state words to hash in little-endian order. It operates on
 // copies of the chaining value and counter, leaving d itself unchanged.
 func (d *digest) finalize(hash *[Size]byte) {
 	var last [BlockSize]byte
 	pad := uint64(BlockSize - copy(last[:], d.block[:d.offset]))
 	counter := d.c
 	// Pre-subtract the zero padding from the 128-bit counter (with borrow) so
 	// it reflects only real input once the block has been processed.
 	if counter[0] < pad {
 		counter[1]--
 	}
 	counter[0] -= pad
 	state := d.h
 	hashBlocks(&state, &counter, 0xFFFFFFFFFFFFFFFF, last[:])
 	for i := range state {
 		binary.LittleEndian.PutUint64(hash[8*i:], state[i])
 	}
 }
 // appendUint64 appends x to b as eight big-endian bytes and returns the
 // extended slice.
 func appendUint64(b []byte, x uint64) []byte {
 	start := len(b)
 	b = append(b, 0, 0, 0, 0, 0, 0, 0, 0)
 	binary.BigEndian.PutUint64(b[start:], x)
 	return b
 }
 // appendUint32 appends x to b as four big-endian bytes and returns the
 // extended slice.
 func appendUint32(b []byte, x uint32) []byte {
 	start := len(b)
 	b = append(b, 0, 0, 0, 0)
 	binary.BigEndian.PutUint32(b[start:], x)
 	return b
 }
 // consumeUint64 decodes a big-endian uint64 from the first eight bytes of b
 // and returns the remainder of b together with the value. It panics if b
 // holds fewer than eight bytes.
 func consumeUint64(b []byte) ([]byte, uint64) {
 	_ = b[7] // fail up front on short input, before reading anything
 	var value uint64
 	for _, octet := range b[:8] {
 		value = value<<8 | uint64(octet)
 	}
 	return b[8:], value
 }
 // consumeUint32 decodes a big-endian uint32 from the first four bytes of b
 // and returns the remainder of b together with the value. It panics if b
 // holds fewer than four bytes.
 func consumeUint32(b []byte) ([]byte, uint32) {
 	_ = b[3] // fail up front on short input, before reading anything
 	var value uint32
 	for _, octet := range b[:4] {
 		value = value<<8 | uint32(octet)
 	}
 	return b[4:], value
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.go
@ -0,0 +1,37 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.7,amd64,!gccgo,!appengine
 package blake2b
 import "golang.org/x/sys/cpu"
 // init records which SIMD instruction sets the CPU reports so hashBlocks can
 // choose an implementation.
 func init() {
 	useAVX2, useAVX, useSSE4 = cpu.X86.HasAVX2, cpu.X86.HasAVX, cpu.X86.HasSSE41
 }
 // hashBlocksAVX2 is the AVX2 compression routine; its body is the assembly
 // TEXT symbol of the same name in blake2bAVX2_amd64.s.
 //go:noescape
 func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocksAVX is the AVX compression routine; its body is the assembly
 // TEXT symbol of the same name in blake2bAVX2_amd64.s.
 //go:noescape
 func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocksSSE4 is the SSE4.1 compression routine, declared here without a
 // body (presumably implemented in blake2b_amd64.s — confirm).
 //go:noescape
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocks compresses blocks into h using the fastest implementation the
 // CPU supports, preferring AVX2, then AVX, then SSE4.1, and falling back to
 // the generic Go code.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	if useAVX2 {
 		hashBlocksAVX2(h, c, flag, blocks)
 		return
 	}
 	if useAVX {
 		hashBlocksAVX(h, c, flag, blocks)
 		return
 	}
 	if useSSE4 {
 		hashBlocksSSE4(h, c, flag, blocks)
 		return
 	}
 	hashBlocksGeneric(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s
@ -0,0 +1,750 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.7,amd64,!gccgo,!appengine
 #include "textflag.h"
 DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
 DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
 GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
 DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
 DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32
 DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
 DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
 GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16
 DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16
 #define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
 #define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
 #define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
 #define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
 #define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39
 // ROUND_AVX2 performs one round on the working state held in Y0..Y3.
 // m0..m3 are the four message vectors for the round, t is a scratch register,
 // and c40/c48 are the VPSHUFB byte-shuffle masks.
 // The first half mixes with m0 and m1, then the VPERMQ macros rotate the
 // 64-bit lanes of Y1, Y2 and Y3; the second half mixes with m2 and m3 and the
 // closing VPERMQ macros apply the opposite lane rotation to Y1 and Y3.
 // The add/shift-right-63/xor triple is a rotate-left by one bit; VPSHUFD $-79
 // and the two VPSHUFB masks are presumably the 32-, 24- and 16-bit rotations
 // of the BLAKE2b G function (confirm against RFC 7693).
 #define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
 	VPADDQ  m0, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFD $-79, Y3, Y3; \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPSHUFB c40, Y1, Y1;  \
 	VPADDQ  m1, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFB c48, Y3, Y3;  \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPADDQ  Y1, Y1, t;    \
 	VPSRLQ  $63, Y1, Y1;  \
 	VPXOR   t, Y1, Y1;    \
 	VPERMQ_0x39_Y1_Y1;    \
 	VPERMQ_0x4E_Y2_Y2;    \
 	VPERMQ_0x93_Y3_Y3;    \
 	VPADDQ  m2, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFD $-79, Y3, Y3; \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPSHUFB c40, Y1, Y1;  \
 	VPADDQ  m3, Y0, Y0;   \
 	VPADDQ  Y1, Y0, Y0;   \
 	VPXOR   Y0, Y3, Y3;   \
 	VPSHUFB c48, Y3, Y3;  \
 	VPADDQ  Y3, Y2, Y2;   \
 	VPXOR   Y2, Y1, Y1;   \
 	VPADDQ  Y1, Y1, t;    \
 	VPSRLQ  $63, Y1, Y1;  \
 	VPXOR   t, Y1, Y1;    \
 	VPERMQ_0x39_Y3_Y3;    \
 	VPERMQ_0x4E_Y2_Y2;    \
 	VPERMQ_0x93_Y1_Y1
 #define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
 #define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
 #define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
 #define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
 #define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E
 #define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
 #define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
 #define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
 #define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
 #define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n
 #define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
 #define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
 #define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
 #define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
 #define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01
 #define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
 #define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01
 #define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
 #define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01
 // load msg: Y12 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
 	VMOVQ_SI_X12(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X12(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y12, Y12
 // load msg: Y13 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
 	VMOVQ_SI_X13(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X13(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y13, Y13
 // load msg: Y14 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
 	VMOVQ_SI_X14(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X14(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y14, Y14
 // load msg: Y15 = (i0, i1, i2, i3)
 // i0, i1, i2, i3 must not be 0
 #define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
 	VMOVQ_SI_X15(i0*8);           \
 	VMOVQ_SI_X11(i2*8);           \
 	VPINSRQ_1_SI_X15(i1*8);       \
 	VPINSRQ_1_SI_X11(i3*8);       \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
 	VMOVQ_SI_X12_0;                   \
 	VMOVQ_SI_X11(4*8);                \
 	VPINSRQ_1_SI_X12(2*8);            \
 	VPINSRQ_1_SI_X11(6*8);            \
 	VINSERTI128 $1, X11, Y12, Y12;    \
 	LOAD_MSG_AVX2_Y13(1, 3, 5, 7);    \
 	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
 	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)
 #define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
 	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
 	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
 	VMOVQ_SI_X11(11*8);              \
 	VPSHUFD     $0x4E, 0*8(SI), X14; \
 	VPINSRQ_1_SI_X11(5*8);           \
 	VINSERTI128 $1, X11, Y14, Y14;   \
 	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)
 #define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
 	VMOVQ_SI_X11(5*8);              \
 	VMOVDQU     11*8(SI), X12;      \
 	VPINSRQ_1_SI_X11(15*8);         \
 	VINSERTI128 $1, X11, Y12, Y12;  \
 	VMOVQ_SI_X13(8*8);              \
 	VMOVQ_SI_X11(2*8);              \
 	VPINSRQ_1_SI_X13_0;             \
 	VPINSRQ_1_SI_X11(13*8);         \
 	VINSERTI128 $1, X11, Y13, Y13;  \
 	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
 	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)
 #define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
 	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
 	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
 	LOAD_MSG_AVX2_Y14(2, 5, 4, 15);  \
 	VMOVQ_SI_X15(6*8);               \
 	VMOVQ_SI_X11_0;                  \
 	VPINSRQ_1_SI_X15(10*8);          \
 	VPINSRQ_1_SI_X11(8*8);           \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
 	LOAD_MSG_AVX2_Y12(9, 5, 2, 10);  \
 	VMOVQ_SI_X13_0;                  \
 	VMOVQ_SI_X11(4*8);               \
 	VPINSRQ_1_SI_X13(7*8);           \
 	VPINSRQ_1_SI_X11(15*8);          \
 	VINSERTI128 $1, X11, Y13, Y13;   \
 	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
 	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)
 #define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
 	VMOVQ_SI_X12(2*8);                \
 	VMOVQ_SI_X11_0;                   \
 	VPINSRQ_1_SI_X12(6*8);            \
 	VPINSRQ_1_SI_X11(8*8);            \
 	VINSERTI128 $1, X11, Y12, Y12;    \
 	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
 	LOAD_MSG_AVX2_Y14(4, 7, 15, 1);   \
 	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)
 #define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
 	LOAD_MSG_AVX2_Y12(12, 1, 14, 4);  \
 	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
 	VMOVQ_SI_X14_0;                   \
 	VPSHUFD     $0x4E, 8*8(SI), X11;  \
 	VPINSRQ_1_SI_X14(6*8);            \
 	VINSERTI128 $1, X11, Y14, Y14;    \
 	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)
 #define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
 	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
 	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
 	LOAD_MSG_AVX2_Y14(5, 15, 8, 2);  \
 	VMOVQ_SI_X15_0;                  \
 	VMOVQ_SI_X11(6*8);               \
 	VPINSRQ_1_SI_X15(4*8);           \
 	VPINSRQ_1_SI_X11(10*8);          \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
 	VMOVQ_SI_X12(6*8);              \
 	VMOVQ_SI_X11(11*8);             \
 	VPINSRQ_1_SI_X12(14*8);         \
 	VPINSRQ_1_SI_X11_0;             \
 	VINSERTI128 $1, X11, Y12, Y12;  \
 	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
 	VMOVQ_SI_X11(1*8);              \
 	VMOVDQU     12*8(SI), X14;      \
 	VPINSRQ_1_SI_X11(10*8);         \
 	VINSERTI128 $1, X11, Y14, Y14;  \
 	VMOVQ_SI_X15(2*8);              \
 	VMOVDQU     4*8(SI), X11;       \
 	VPINSRQ_1_SI_X15(7*8);          \
 	VINSERTI128 $1, X11, Y15, Y15
 #define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
 	LOAD_MSG_AVX2_Y12(10, 8, 7, 1);  \
 	VMOVQ_SI_X13(2*8);               \
 	VPSHUFD     $0x4E, 5*8(SI), X11; \
 	VPINSRQ_1_SI_X13(4*8);           \
 	VINSERTI128 $1, X11, Y13, Y13;   \
 	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
 	VMOVQ_SI_X15(11*8);              \
 	VMOVQ_SI_X11(12*8);              \
 	VPINSRQ_1_SI_X15(14*8);          \
 	VPINSRQ_1_SI_X11_0;              \
 	VINSERTI128 $1, X11, Y15, Y15
 // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //
 // Compresses len(blocks)/128 consecutive 128-byte blocks into h and advances
 // the 128-bit counter c by 128 per block. The loop exits only when the
 // remaining length reaches exactly zero, so blocks_len must be a non-zero
 // multiple of 128.
 //
 // Register use: AX = h, BX = c, SI = current block, DI = bytes left,
 // R8/R9 = counter low/high, Y8/Y9 = h[0..3]/h[4..7], Y6/Y7 = iv halves,
 // Y4/Y5 = c40/c48 shuffle masks, DX = caller's SP.
 // Aligned scratch: 0(SP) counter low, 8(SP) counter high, 16(SP) flag,
 // 24(SP) zero, 32..287(SP) saved message vectors of rounds 1 and 2.
 // The TEXT flag value 4 is NOSPLIT in textflag.h.
 TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Save SP in DX and round SP up to a 32-byte boundary for VMOVDQA.
 	MOVQ SP, DX
 	MOVQ SP, R9
 	ADDQ $31, R9
 	ANDQ $~31, R9
 	MOVQ R9, SP
 	// 16(SP) = flag, 24(SP) = 0: upper half of the vector XORed into iv1.
 	MOVQ CX, 16(SP)
 	XORQ CX, CX
 	MOVQ CX, 24(SP)
 	VMOVDQU ·AVX2_c40<>(SB), Y4
 	VMOVDQU ·AVX2_c48<>(SB), Y5
 	VMOVDQU 0(AX), Y8
 	VMOVDQU 32(AX), Y9
 	VMOVDQU ·AVX2_iv0<>(SB), Y6
 	VMOVDQU ·AVX2_iv1<>(SB), Y7
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 	MOVQ R9, 8(SP)
 loop:
 	// Advance the counter by one block; a low word below 128 after the add
 	// is treated as wrap-around and carries into the high word.
 	// NOTE(review): JGE is a signed comparison, so a low word >= 2^63 would
 	// also take the carry path — unreachable for realistic input sizes, confirm.
 	ADDQ $128, R8
 	MOVQ R8, 0(SP)
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
 	MOVQ R9, 8(SP)
 noinc:
 	// Working state: Y0/Y1 = h, Y2 = iv0, Y3 = iv1 ^ (counter, flag, 0).
 	VMOVDQA Y8, Y0
 	VMOVDQA Y9, Y1
 	VMOVDQA Y6, Y2
 	VPXOR   0(SP), Y7, Y3
 	// Rounds 1 and 2: keep the permuted message vectors on the stack so that
 	// rounds 11 and 12 can reuse them without reloading from the block.
 	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
 	VMOVDQA Y12, 32(SP)
 	VMOVDQA Y13, 64(SP)
 	VMOVDQA Y14, 96(SP)
 	VMOVDQA Y15, 128(SP)
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
 	VMOVDQA Y12, 160(SP)
 	VMOVDQA Y13, 192(SP)
 	VMOVDQA Y14, 224(SP)
 	VMOVDQA Y15, 256(SP)
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	// Rounds 3..10.
 	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
 	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)
 	// Rounds 11 and 12 repeat the message schedules saved from rounds 1 and 2.
 	ROUND_AVX2(32(SP), 64(SP), 96(SP), 128(SP), Y10, Y4, Y5)
 	ROUND_AVX2(160(SP), 192(SP), 224(SP), 256(SP), Y10, Y4, Y5)
 	// Feed-forward: h ^= (Y0, Y1) ^ (Y2, Y3).
 	VPXOR Y0, Y8, Y8
 	VPXOR Y1, Y9, Y9
 	VPXOR Y2, Y8, Y8
 	VPXOR Y3, Y9, Y9
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back the counter and the chaining value, then restore SP.
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	VMOVDQU Y8, 0(AX)
 	VMOVDQU Y9, 32(AX)
 	VZEROUPPER
 	MOVQ DX, SP
 	RET
 #define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
 #define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
 #define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
 #define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
 #define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE
 #define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
 #define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
 #define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
 #define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
 #define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
 #define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
 #define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
 #define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF
 // SHUFFLE_AVX rearranges the working rows held in X2..X7 between the two
 // half rounds of a round, using X13, X14 and X15 as scratch registers
 // (presumably the diagonalization step of BLAKE2b — confirm against RFC 7693).
 // SHUFFLE_AVX_INV is used after the second half round to undo it.
 // The last line deliberately carries no trailing backslash: a continuation
 // there would splice whatever line follows the macro into its body.
 #define SHUFFLE_AVX() \
 	VMOVDQA X6, X13;         \
 	VMOVDQA X2, X14;         \
 	VMOVDQA X4, X6;          \
 	VPUNPCKLQDQ_X13_X13_X15; \
 	VMOVDQA X5, X4;          \
 	VMOVDQA X6, X5;          \
 	VPUNPCKHQDQ_X15_X7_X6;   \
 	VPUNPCKLQDQ_X7_X7_X15;   \
 	VPUNPCKHQDQ_X15_X13_X7;  \
 	VPUNPCKLQDQ_X3_X3_X15;   \
 	VPUNPCKHQDQ_X15_X2_X2;   \
 	VPUNPCKLQDQ_X14_X14_X15; \
 	VPUNPCKHQDQ_X15_X3_X3
 // SHUFFLE_AVX_INV rearranges the working rows held in X2..X7 after the second
 // half round of a round, using X13, X14 and X15 as scratch registers
 // (presumably the inverse of the SHUFFLE_AVX diagonalization — confirm).
 // The last line deliberately carries no trailing backslash: a continuation
 // there would splice whatever line follows the macro into its body.
 #define SHUFFLE_AVX_INV() \
 	VMOVDQA X2, X13;         \
 	VMOVDQA X4, X14;         \
 	VPUNPCKLQDQ_X2_X2_X15;   \
 	VMOVDQA X5, X4;          \
 	VPUNPCKHQDQ_X15_X3_X2;   \
 	VMOVDQA X14, X5;         \
 	VPUNPCKLQDQ_X3_X3_X15;   \
 	VMOVDQA X6, X14;         \
 	VPUNPCKHQDQ_X15_X13_X3;  \
 	VPUNPCKLQDQ_X7_X7_X15;   \
 	VPUNPCKHQDQ_X15_X6_X6;   \
 	VPUNPCKLQDQ_X14_X14_X15; \
 	VPUNPCKHQDQ_X15_X7_X7
 // HALF_ROUND_AVX runs one half round on two register groups in parallel:
 // (v0, v2, v4, v6) mixed with m0 then m2, and (v1, v3, v5, v7) mixed with
 // m1 then m3. t0 is a scratch register and c40/c48 are the VPSHUFB
 // byte-shuffle masks.
 // The add/shift-right-63/xor triples are rotate-left by one bit; VPSHUFD $-79
 // and the two VPSHUFB masks are presumably the 32-, 24- and 16-bit rotations
 // of the BLAKE2b G function (confirm against RFC 7693).
 #define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
 	VPADDQ  m0, v0, v0;   \
 	VPADDQ  v2, v0, v0;   \
 	VPADDQ  m1, v1, v1;   \
 	VPADDQ  v3, v1, v1;   \
 	VPXOR   v0, v6, v6;   \
 	VPXOR   v1, v7, v7;   \
 	VPSHUFD $-79, v6, v6; \
 	VPSHUFD $-79, v7, v7; \
 	VPADDQ  v6, v4, v4;   \
 	VPADDQ  v7, v5, v5;   \
 	VPXOR   v4, v2, v2;   \
 	VPXOR   v5, v3, v3;   \
 	VPSHUFB c40, v2, v2;  \
 	VPSHUFB c40, v3, v3;  \
 	VPADDQ  m2, v0, v0;   \
 	VPADDQ  v2, v0, v0;   \
 	VPADDQ  m3, v1, v1;   \
 	VPADDQ  v3, v1, v1;   \
 	VPXOR   v0, v6, v6;   \
 	VPXOR   v1, v7, v7;   \
 	VPSHUFB c48, v6, v6;  \
 	VPSHUFB c48, v7, v7;  \
 	VPADDQ  v6, v4, v4;   \
 	VPADDQ  v7, v5, v5;   \
 	VPXOR   v4, v2, v2;   \
 	VPXOR   v5, v3, v3;   \
 	VPADDQ  v2, v2, t0;   \
 	VPSRLQ  $63, v2, v2;  \
 	VPXOR   t0, v2, v2;   \
 	VPADDQ  v3, v3, t0;   \
 	VPSRLQ  $63, v3, v3;  \
 	VPXOR   t0, v3, v3
 // load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
 // i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
 #define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
 	VMOVQ_SI_X12(i0*8);     \
 	VMOVQ_SI_X13(i2*8);     \
 	VMOVQ_SI_X14(i4*8);     \
 	VMOVQ_SI_X15(i6*8);     \
 	VPINSRQ_1_SI_X12(i1*8); \
 	VPINSRQ_1_SI_X13(i3*8); \
 	VPINSRQ_1_SI_X14(i5*8); \
 	VPINSRQ_1_SI_X15(i7*8)
 // load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
 #define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
 	VMOVQ_SI_X12_0;        \
 	VMOVQ_SI_X13(4*8);     \
 	VMOVQ_SI_X14(1*8);     \
 	VMOVQ_SI_X15(5*8);     \
 	VPINSRQ_1_SI_X12(2*8); \
 	VPINSRQ_1_SI_X13(6*8); \
 	VPINSRQ_1_SI_X14(3*8); \
 	VPINSRQ_1_SI_X15(7*8)
 // load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
 #define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
 	VPSHUFD $0x4E, 0*8(SI), X12; \
 	VMOVQ_SI_X13(11*8);          \
 	VMOVQ_SI_X14(12*8);          \
 	VMOVQ_SI_X15(7*8);           \
 	VPINSRQ_1_SI_X13(5*8);       \
 	VPINSRQ_1_SI_X14(2*8);       \
 	VPINSRQ_1_SI_X15(3*8)
 // load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
 #define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
 	VMOVDQU 11*8(SI), X12;  \
 	VMOVQ_SI_X13(5*8);      \
 	VMOVQ_SI_X14(8*8);      \
 	VMOVQ_SI_X15(2*8);      \
 	VPINSRQ_1_SI_X13(15*8); \
 	VPINSRQ_1_SI_X14_0;     \
 	VPINSRQ_1_SI_X15(13*8)
 // load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
 #define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
 	VMOVQ_SI_X12(2*8);      \
 	VMOVQ_SI_X13(4*8);      \
 	VMOVQ_SI_X14(6*8);      \
 	VMOVQ_SI_X15_0;         \
 	VPINSRQ_1_SI_X12(5*8);  \
 	VPINSRQ_1_SI_X13(15*8); \
 	VPINSRQ_1_SI_X14(10*8); \
 	VPINSRQ_1_SI_X15(8*8)
 // load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
 #define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
 	VMOVQ_SI_X12(9*8);      \
 	VMOVQ_SI_X13(2*8);      \
 	VMOVQ_SI_X14_0;         \
 	VMOVQ_SI_X15(4*8);      \
 	VPINSRQ_1_SI_X12(5*8);  \
 	VPINSRQ_1_SI_X13(10*8); \
 	VPINSRQ_1_SI_X14(7*8);  \
 	VPINSRQ_1_SI_X15(15*8)
 // load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
 #define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
 	VMOVQ_SI_X12(2*8);      \
 	VMOVQ_SI_X13_0;         \
 	VMOVQ_SI_X14(12*8);     \
 	VMOVQ_SI_X15(11*8);     \
 	VPINSRQ_1_SI_X12(6*8);  \
 	VPINSRQ_1_SI_X13(8*8);  \
 	VPINSRQ_1_SI_X14(10*8); \
 	VPINSRQ_1_SI_X15(3*8)
 // load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
 #define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
 	MOVQ    0*8(SI), X12;        \
 	VPSHUFD $0x4E, 8*8(SI), X13; \
 	MOVQ    7*8(SI), X14;        \
 	MOVQ    2*8(SI), X15;        \
 	VPINSRQ_1_SI_X12(6*8);       \
 	VPINSRQ_1_SI_X14(3*8);       \
 	VPINSRQ_1_SI_X15(11*8)
 // load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
 #define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
 	MOVQ 6*8(SI), X12;      \
 	MOVQ 11*8(SI), X13;     \
 	MOVQ 15*8(SI), X14;     \
 	MOVQ 3*8(SI), X15;      \
 	VPINSRQ_1_SI_X12(14*8); \
 	VPINSRQ_1_SI_X13_0;     \
 	VPINSRQ_1_SI_X14(9*8);  \
 	VPINSRQ_1_SI_X15(8*8)
 // load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
 #define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
 	MOVQ 5*8(SI), X12;      \
 	MOVQ 8*8(SI), X13;      \
 	MOVQ 0*8(SI), X14;      \
 	MOVQ 6*8(SI), X15;      \
 	VPINSRQ_1_SI_X12(15*8); \
 	VPINSRQ_1_SI_X13(2*8);  \
 	VPINSRQ_1_SI_X14(4*8);  \
 	VPINSRQ_1_SI_X15(10*8)
 // load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
 #define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
 	VMOVDQU 12*8(SI), X12;  \
 	MOVQ    1*8(SI), X13;   \
 	MOVQ    2*8(SI), X14;   \
 	VPINSRQ_1_SI_X13(10*8); \
 	VPINSRQ_1_SI_X14(7*8);  \
 	VMOVDQU 4*8(SI), X15
 // load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
 #define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
 	MOVQ 15*8(SI), X12;     \
 	MOVQ 3*8(SI), X13;      \
 	MOVQ 11*8(SI), X14;     \
 	MOVQ 12*8(SI), X15;     \
 	VPINSRQ_1_SI_X12(9*8);  \
 	VPINSRQ_1_SI_X13(13*8); \
 	VPINSRQ_1_SI_X14(14*8); \
 	VPINSRQ_1_SI_X15_0
 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //
 // Compresses len(blocks)/128 consecutive 128-byte blocks into h and advances
 // the 128-bit counter c by 128 per block. The loop exits only when the
 // remaining length reaches exactly zero, so blocks_len must be a non-zero
 // multiple of 128.
 //
 // Register use: AX = h, BX = c, SI = current block, DI = bytes left,
 // R8/R9 = counter low/high, X10/X11 = h[0..3], X2/X3 = h[4..7] on loop entry,
 // X8/X9 = c40/c48 shuffle masks, BP = caller's SP.
 // Aligned scratch: 0(SP) = iv3 with the flag XORed into its low word,
 // 16..271(SP) = saved message vectors of rounds 1 and 2.
 // The TEXT flag value 4 is NOSPLIT in textflag.h.
 // NOTE(review): BP is used as a scratch register to hold the original SP, so
 // it is not a valid frame pointer inside this function — confirm that this is
 // acceptable for the toolchain versions in use.
 TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Save SP in BP and round SP up to a 16-byte boundary for VMOVDQA.
 	MOVQ SP, BP
 	MOVQ SP, R9
 	ADDQ $15, R9
 	ANDQ $~15, R9
 	MOVQ R9, SP
 	VMOVDQU ·AVX_c40<>(SB), X0
 	VMOVDQU ·AVX_c48<>(SB), X1
 	VMOVDQA X0, X8
 	VMOVDQA X1, X9
 	VMOVDQU ·AVX_iv3<>(SB), X0
 	VMOVDQA X0, 0(SP)
 	XORQ    CX, 0(SP)          // 0(SP) = ·AVX_iv3 ^ (CX || 0)
 	VMOVDQU 0(AX), X10
 	VMOVDQU 16(AX), X11
 	VMOVDQU 32(AX), X2
 	VMOVDQU 48(AX), X3
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 loop:
 	// Advance the counter by one block; a low word below 128 after the add
 	// is treated as wrap-around and carries into the high word.
 	// NOTE(review): JGE is a signed comparison, so a low word >= 2^63 would
 	// also take the carry path — unreachable for realistic input sizes, confirm.
 	ADDQ $128, R8
 	CMPQ R8, $128
 	JGE  noinc
 	INCQ R9
 noinc:
 	// X15 = (counter low, counter high).
 	VMOVQ_R8_X15
 	VPINSRQ_1_R9_X15
 	// Working state: X0/X1 = h[0..3], X2/X3 = h[4..7], X4/X5 = iv0/iv1,
 	// X6 = iv2 ^ counter, X7 = iv3 ^ flag.
 	VMOVDQA X10, X0
 	VMOVDQA X11, X1
 	VMOVDQU ·AVX_iv0<>(SB), X4
 	VMOVDQU ·AVX_iv1<>(SB), X5
 	VMOVDQU ·AVX_iv2<>(SB), X6
 	VPXOR   X15, X6, X6
 	VMOVDQA 0(SP), X7
 	// Rounds 1 and 2: each half round's message vectors are also saved on the
 	// stack so that rounds 11 and 12 can reuse them.
 	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
 	VMOVDQA X12, 16(SP)
 	VMOVDQA X13, 32(SP)
 	VMOVDQA X14, 48(SP)
 	VMOVDQA X15, 64(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
 	VMOVDQA X12, 80(SP)
 	VMOVDQA X13, 96(SP)
 	VMOVDQA X14, 112(SP)
 	VMOVDQA X15, 128(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
 	VMOVDQA X12, 144(SP)
 	VMOVDQA X13, 160(SP)
 	VMOVDQA X14, 176(SP)
 	VMOVDQA X15, 192(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
 	VMOVDQA X12, 208(SP)
 	VMOVDQA X13, 224(SP)
 	VMOVDQA X14, 240(SP)
 	VMOVDQA X15, 256(SP)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Rounds 3..10: two half rounds each, with a shuffle after every half round.
 	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX()
 	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Rounds 11 and 12 repeat the message schedules saved from rounds 1 and 2.
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X15, X8, X9)
 	SHUFFLE_AVX()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X15, X8, X9)
 	SHUFFLE_AVX()
 	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X15, X8, X9)
 	SHUFFLE_AVX_INV()
 	// Feed-forward. h[4..7] is reloaded from memory because X2/X3 served as
 	// working rows; the new h[4..7] is written back every iteration and left
 	// in X2/X3 for the next block, while h[0..3] stays in X10/X11 until the
 	// loop ends.
 	VMOVDQU 32(AX), X14
 	VMOVDQU 48(AX), X15
 	VPXOR   X0, X10, X10
 	VPXOR   X1, X11, X11
 	VPXOR   X2, X14, X14
 	VPXOR   X3, X15, X15
 	VPXOR   X4, X10, X10
 	VPXOR   X5, X11, X11
 	VPXOR   X6, X14, X2
 	VPXOR   X7, X15, X3
 	VMOVDQU X2, 32(AX)
 	VMOVDQU X3, 48(AX)
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back h[0..3] and the counter, then restore SP.
 	VMOVDQU X10, 0(AX)
 	VMOVDQU X11, 16(AX)
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	VZEROUPPER
 	MOVQ BP, SP
 	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.go
@ -0,0 +1,24 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !go1.7,amd64,!gccgo,!appengine
 package blake2b
 import "golang.org/x/sys/cpu"
 // init records whether the CPU reports SSE4.1 support; hashBlocks consults
 // useSSE4 to choose between the assembly and the generic implementation.
 func init() {
 	useSSE4 = cpu.X86.HasSSE41
 }
 // hashBlocksSSE4 is the SSE4 block function. It has no Go body: the
 // implementation is the assembly routine of the same name in
 // blake2b_amd64.s. It updates h and c in place from the bytes in blocks.
 //
 //go:noescape
 func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 // hashBlocks compresses blocks into the state h and counter c, dispatching
 // to the SSE4 assembly when useSSE4 is set and to the portable Go
 // implementation otherwise.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	if !useSSE4 {
 		hashBlocksGeneric(h, c, flag, blocks)
 		return
 	}
 	hashBlocksSSE4(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s
@ -0,0 +1,281 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!gccgo,!appengine
 #include "textflag.h"
 // Initialization-vector constants, two 64-bit words per 16-byte symbol.
 // hashBlocksSSE4 loads iv0..iv2 into X4..X6 at the start of every block and
 // keeps iv3, XORed with the flag argument, in the stack slot at 0(SP).
 DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
 DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
 GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16
 DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
 DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
 GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16
 DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
 DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
 GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16
 DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
 DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
 GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16
 // PSHUFB byte-shuffle masks handed to HALF_ROUND. Applied to each 64-bit
 // lane, c40 rotates the lane right by 3 bytes (24 bits, i.e. left by 40) and
 // c48 rotates it right by 2 bytes (16 bits, i.e. left by 48).
 DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
 DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
 GLOBL ·c40<>(SB), (NOPTR+RODATA), $16
 DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
 DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
 GLOBL ·c48<>(SB), (NOPTR+RODATA), $16
 // SHUFFLE rearranges the 64-bit lanes of the state registers between the two
 // half-rounds of a round: it swaps v4 with v5 and permutes the lanes across
 // the v6/v7 pair and the v2/v3 pair using PUNPCKLQDQ/PUNPCKHQDQ. t1 and t2
 // are scratch registers and are clobbered.
 // NOTE(review): this looks like the step that lines lanes up for the diagonal
 // grouping (v0, v5, v10, v15), ... used by hashBlocksGeneric - confirm.
 #define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v6, t1; \
 	PUNPCKLQDQ v6, t2; \
 	PUNPCKHQDQ v7, v6; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ v7, t2; \
 	MOVO       t1, v7; \
 	MOVO       v2, t1; \
 	PUNPCKHQDQ t2, v7; \
 	PUNPCKLQDQ v3, t2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v3
 // SHUFFLE_INV has the same shape as SHUFFLE with the roles of the v2/v3 and
 // v6/v7 pairs exchanged: it swaps v4 with v5 and permutes the 64-bit lanes
 // across v2/v3 first and v6/v7 second. hashBlocksSSE4 always issues it after
 // the half-round that follows a SHUFFLE. t1 and t2 are scratch registers and
 // are clobbered.
 // NOTE(review): presumably this restores the lane order changed by SHUFFLE -
 // confirm.
 #define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
 	MOVO       v4, t1; \
 	MOVO       v5, v4; \
 	MOVO       t1, v5; \
 	MOVO       v2, t1; \
 	PUNPCKLQDQ v2, t2; \
 	PUNPCKHQDQ v3, v2; \
 	PUNPCKHQDQ t2, v2; \
 	PUNPCKLQDQ v3, t2; \
 	MOVO       t1, v3; \
 	MOVO       v6, t1; \
 	PUNPCKHQDQ t2, v3; \
 	PUNPCKLQDQ v7, t2; \
 	PUNPCKHQDQ t2, v6; \
 	PUNPCKLQDQ t1, t2; \
 	PUNPCKHQDQ t2, v7
 // HALF_ROUND applies one half of a round to the state held in v0..v7 (two
 // 64-bit lanes per register, so four mixing operations run in parallel).
 // m0/m1 are added to v0/v1 before the first pair of rotations and m2/m3
 // before the second pair; each m operand may be a register or a memory
 // reference. The four rotations of every 64-bit lane are done as follows:
 //   - PSHUFD $0xB1 swaps the 32-bit halves of each lane (rotate by 32);
 //   - PSHUFB with c40 rotates right by 24 bits;
 //   - PSHUFB with c48 rotates right by 16 bits;
 //   - (x + x) ^ (x >> 63), built with PADDQ/PSRLQ/PXOR through t0, rotates
 //     right by 63 bits.
 // These match the -32, -24, -16 and -63 rotations in hashBlocksGeneric.
 // t0 is a scratch register and is clobbered.
 #define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
 	PADDQ  m0, v0;        \
 	PADDQ  m1, v1;        \
 	PADDQ  v2, v0;        \
 	PADDQ  v3, v1;        \
 	PXOR   v0, v6;        \
 	PXOR   v1, v7;        \
 	PSHUFD $0xB1, v6, v6; \
 	PSHUFD $0xB1, v7, v7; \
 	PADDQ  v6, v4;        \
 	PADDQ  v7, v5;        \
 	PXOR   v4, v2;        \
 	PXOR   v5, v3;        \
 	PSHUFB c40, v2;       \
 	PSHUFB c40, v3;       \
 	PADDQ  m2, v0;        \
 	PADDQ  m3, v1;        \
 	PADDQ  v2, v0;        \
 	PADDQ  v3, v1;        \
 	PXOR   v0, v6;        \
 	PXOR   v1, v7;        \
 	PSHUFB c48, v6;       \
 	PSHUFB c48, v7;       \
 	PADDQ  v6, v4;        \
 	PADDQ  v7, v5;        \
 	PXOR   v4, v2;        \
 	PXOR   v5, v3;        \
 	MOVOU  v2, t0;        \
 	PADDQ  v2, t0;        \
 	PSRLQ  $63, v2;       \
 	PXOR   t0, v2;        \
 	MOVOU  v3, t0;        \
 	PADDQ  v3, t0;        \
 	PSRLQ  $63, v3;       \
 	PXOR   t0, v3
 // LOAD_MSG gathers eight 64-bit message words from the block at src into
 // four registers. i0..i7 are word indices (scaled by 8 to byte offsets):
 // m0 = (word i0, word i1), m1 = (i2, i3), m2 = (i4, i5), m3 = (i6, i7), with
 // the first word of each pair in the low lane (MOVQ) and the second inserted
 // into the high lane (PINSRQ $1).
 #define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \
 	MOVQ   i0*8(src), m0;     \
 	PINSRQ $1, i1*8(src), m0; \
 	MOVQ   i2*8(src), m1;     \
 	PINSRQ $1, i3*8(src), m1; \
 	MOVQ   i4*8(src), m2;     \
 	PINSRQ $1, i5*8(src), m2; \
 	MOVQ   i6*8(src), m3;     \
 	PINSRQ $1, i7*8(src), m3
 // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte)
 //
 // hashBlocksSSE4 compresses the 128-byte blocks in blocks into the state h and
 // advances the 128-bit byte counter c by 128 per block.
 //
 // Register use:
 //   AX = h, BX = c, CX = flag, SI = current block, DI = bytes left
 //   R8, R9   = low and high counter words (c[0], c[1])
 //   X12, X15 = h[0..1], h[2..3], kept in registers across blocks
 //   X13, X14 = the c40 and c48 PSHUFB masks
 //   X0..X7   = working state, X8..X11 = message words / scratch
 // h[4..7] are reloaded from and stored back to 32(AX)/48(AX) on every block.
 //
 // Stack use (after SP is rounded up to a 16-byte boundary):
 //   0(SP)         = iv3 ^ (flag || 0)
 //   16..256(SP)   = the message vectors of the first four half-rounds, saved
 //                   so that the last four half-rounds can reuse them (the
 //                   last two rows of the index schedule repeat the first two)
 //
 // The loop decrements DI by 128 and exits only when it reaches exactly zero,
 // so blocks_len is expected to be a non-zero multiple of 128.
 TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment
 	MOVQ h+0(FP), AX
 	MOVQ c+8(FP), BX
 	MOVQ flag+16(FP), CX
 	MOVQ blocks_base+24(FP), SI
 	MOVQ blocks_len+32(FP), DI
 	// Keep the original SP in BP and round SP up to a multiple of 16 so the
 	// aligned MOVO form can be used on the stack slots. SP is restored from
 	// BP before returning.
 	MOVQ SP, BP
 	MOVQ SP, R9
 	ADDQ $15, R9
 	ANDQ $~15, R9
 	MOVQ R9, SP
 	MOVOU ·iv3<>(SB), X0
 	MOVO  X0, 0(SP)
 	XORQ  CX, 0(SP)     // 0(SP) = ·iv3 ^ (CX || 0)
 	MOVOU ·c40<>(SB), X13
 	MOVOU ·c48<>(SB), X14
 	MOVOU 0(AX), X12
 	MOVOU 16(AX), X15
 	MOVQ 0(BX), R8
 	MOVQ 8(BX), R9
 loop:
 	// Advance the 128-bit counter by one block. The low word wrapped around
 	// exactly when the new value is below 128 as an UNSIGNED number, so the
 	// carry test must be unsigned (JCC, "unsigned >="), matching the
 	// `c0 < BlockSize` check in hashBlocksGeneric. A signed JGE would treat
 	// every low word >= 2^63 as negative and bump the high word on each block.
 	ADDQ $128, R8
 	CMPQ R8, $128
 	JCC  noinc
 	INCQ R9
 noinc:
 	// X8 = (c[0], c[1]); it is XORed into the iv2 row below.
 	MOVQ R8, X8
 	PINSRQ $1, R9, X8
 	// Build the working state: X0..X3 = h[0..7], X4..X6 = iv0..iv2 (iv2 mixed
 	// with the counter), X7 = iv3 mixed with the flag.
 	MOVO X12, X0
 	MOVO X15, X1
 	MOVOU 32(AX), X2
 	MOVOU 48(AX), X3
 	MOVOU ·iv0<>(SB), X4
 	MOVOU ·iv1<>(SB), X5
 	MOVOU ·iv2<>(SB), X6
 	PXOR X8, X6
 	MOVO 0(SP), X7
 	// Half-rounds 1-4: load the message words and also save them on the stack
 	// for reuse by the final four half-rounds.
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7)
 	MOVO X8, 16(SP)
 	MOVO X9, 32(SP)
 	MOVO X10, 48(SP)
 	MOVO X11, 64(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15)
 	MOVO X8, 80(SP)
 	MOVO X9, 96(SP)
 	MOVO X10, 112(SP)
 	MOVO X11, 128(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6)
 	MOVO X8, 144(SP)
 	MOVO X9, 160(SP)
 	MOVO X10, 176(SP)
 	MOVO X11, 192(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3)
 	MOVO X8, 208(SP)
 	MOVO X9, 224(SP)
 	MOVO X10, 240(SP)
 	MOVO X11, 256(SP)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Half-rounds 5-20: message words are loaded straight from the block.
 	LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Half-rounds 21-24: reuse the message vectors saved on the stack.
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(SP), 96(SP), 112(SP), 128(SP), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(SP), 160(SP), 176(SP), 192(SP), X11, X13, X14)
 	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
 	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(SP), 224(SP), 240(SP), 256(SP), X11, X13, X14)
 	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)
 	// Feed-forward: XOR both halves of the working state into h. h[0..3] stay
 	// in X12/X15; h[4..7] are written back to memory immediately.
 	MOVOU 32(AX), X10
 	MOVOU 48(AX), X11
 	PXOR  X0, X12
 	PXOR  X1, X15
 	PXOR  X2, X10
 	PXOR  X3, X11
 	PXOR  X4, X12
 	PXOR  X5, X15
 	PXOR  X6, X10
 	PXOR  X7, X11
 	MOVOU X10, 32(AX)
 	MOVOU X11, 48(AX)
 	// Advance to the next block; repeat until no bytes are left.
 	LEAQ 128(SI), SI
 	SUBQ $128, DI
 	JNE  loop
 	// Write back h[0..3] and the counter, then restore the caller's SP.
 	MOVOU X12, 0(AX)
 	MOVOU X15, 16(AX)
 	MOVQ R8, 0(BX)
 	MOVQ R9, 8(BX)
 	MOVQ BP, SP
 	RET
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_generic.go
@ -0,0 +1,182 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package blake2b
 import (
 	"encoding/binary"
 	"math/bits"
 )
 // precomputed is the message-word index schedule used by hashBlocksGeneric.
 // It holds 12 rows of 16 byte-sized indices, one row per round, calculated
 // from the sigma constants. Entry k of a row is the index into the message
 // array of the word added at the k-th addition step of that round. The last
 // two rows repeat the first two.
 var precomputed = [12][16]byte{
 	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
 	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
 	{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
 	{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
 	{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
 	{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
 	{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
 	{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
 	{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
 	{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
 	{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, // equal to the first
 	{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, // equal to the second
 }
 // hashBlocksGeneric is the portable Go block function. For every 128 bytes
 // of blocks (sixteen little-endian 64-bit words) it advances the two-word
 // counter c by BlockSize, runs the rounds described by precomputed over a
 // 16-word working vector, and XORs the result into the state h. The updated
 // counter is written back to c once, after the last block.
 //
 // flag is XORed into working word v14 for every block. Callers are expected
 // to pass a length that is a multiple of 128 bytes: a shorter tail makes
 // binary.LittleEndian.Uint64 panic while the message words are loaded.
 func hashBlocksGeneric(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	var m [16]uint64
 	c0, c1 := c[0], c[1]
 	// i is advanced by the message-loading loop below, 8 bytes per word.
 	for i := 0; i < len(blocks); {
 		// Advance the counter; an unsigned wrap of the low word carries
 		// into the high word.
 		c0 += BlockSize
 		if c0 < BlockSize {
 			c1++
 		}
 		// Working vector: v0..v7 start from the state, v8..v15 from iv,
 		// with the counter and flag mixed into v12..v14.
 		v0, v1, v2, v3, v4, v5, v6, v7 := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
 		v8, v9, v10, v11, v12, v13, v14, v15 := iv[0], iv[1], iv[2], iv[3], iv[4], iv[5], iv[6], iv[7]
 		v12 ^= c0
 		v13 ^= c1
 		v14 ^= flag
 		// Decode the block into sixteen little-endian message words.
 		for j := range m {
 			m[j] = binary.LittleEndian.Uint64(blocks[i:])
 			i += 8
 		}
 		// One iteration per round. RotateLeft64 with a negative count
 		// rotates right by that many bits.
 		for j := range precomputed {
 			s := &(precomputed[j])
 			// Column step, first half (rotations 32 and 24), on
 			// (v0,v4,v8,v12), (v1,v5,v9,v13), (v2,v6,v10,v14), (v3,v7,v11,v15).
 			v0 += m[s[0]]
 			v0 += v4
 			v12 ^= v0
 			v12 = bits.RotateLeft64(v12, -32)
 			v8 += v12
 			v4 ^= v8
 			v4 = bits.RotateLeft64(v4, -24)
 			v1 += m[s[1]]
 			v1 += v5
 			v13 ^= v1
 			v13 = bits.RotateLeft64(v13, -32)
 			v9 += v13
 			v5 ^= v9
 			v5 = bits.RotateLeft64(v5, -24)
 			v2 += m[s[2]]
 			v2 += v6
 			v14 ^= v2
 			v14 = bits.RotateLeft64(v14, -32)
 			v10 += v14
 			v6 ^= v10
 			v6 = bits.RotateLeft64(v6, -24)
 			v3 += m[s[3]]
 			v3 += v7
 			v15 ^= v3
 			v15 = bits.RotateLeft64(v15, -32)
 			v11 += v15
 			v7 ^= v11
 			v7 = bits.RotateLeft64(v7, -24)
 			// Column step, second half (rotations 16 and 63).
 			v0 += m[s[4]]
 			v0 += v4
 			v12 ^= v0
 			v12 = bits.RotateLeft64(v12, -16)
 			v8 += v12
 			v4 ^= v8
 			v4 = bits.RotateLeft64(v4, -63)
 			v1 += m[s[5]]
 			v1 += v5
 			v13 ^= v1
 			v13 = bits.RotateLeft64(v13, -16)
 			v9 += v13
 			v5 ^= v9
 			v5 = bits.RotateLeft64(v5, -63)
 			v2 += m[s[6]]
 			v2 += v6
 			v14 ^= v2
 			v14 = bits.RotateLeft64(v14, -16)
 			v10 += v14
 			v6 ^= v10
 			v6 = bits.RotateLeft64(v6, -63)
 			v3 += m[s[7]]
 			v3 += v7
 			v15 ^= v3
 			v15 = bits.RotateLeft64(v15, -16)
 			v11 += v15
 			v7 ^= v11
 			v7 = bits.RotateLeft64(v7, -63)
 			// Diagonal step, first half (rotations 32 and 24), on
 			// (v0,v5,v10,v15), (v1,v6,v11,v12), (v2,v7,v8,v13), (v3,v4,v9,v14).
 			v0 += m[s[8]]
 			v0 += v5
 			v15 ^= v0
 			v15 = bits.RotateLeft64(v15, -32)
 			v10 += v15
 			v5 ^= v10
 			v5 = bits.RotateLeft64(v5, -24)
 			v1 += m[s[9]]
 			v1 += v6
 			v12 ^= v1
 			v12 = bits.RotateLeft64(v12, -32)
 			v11 += v12
 			v6 ^= v11
 			v6 = bits.RotateLeft64(v6, -24)
 			v2 += m[s[10]]
 			v2 += v7
 			v13 ^= v2
 			v13 = bits.RotateLeft64(v13, -32)
 			v8 += v13
 			v7 ^= v8
 			v7 = bits.RotateLeft64(v7, -24)
 			v3 += m[s[11]]
 			v3 += v4
 			v14 ^= v3
 			v14 = bits.RotateLeft64(v14, -32)
 			v9 += v14
 			v4 ^= v9
 			v4 = bits.RotateLeft64(v4, -24)
 			// Diagonal step, second half (rotations 16 and 63).
 			v0 += m[s[12]]
 			v0 += v5
 			v15 ^= v0
 			v15 = bits.RotateLeft64(v15, -16)
 			v10 += v15
 			v5 ^= v10
 			v5 = bits.RotateLeft64(v5, -63)
 			v1 += m[s[13]]
 			v1 += v6
 			v12 ^= v1
 			v12 = bits.RotateLeft64(v12, -16)
 			v11 += v12
 			v6 ^= v11
 			v6 = bits.RotateLeft64(v6, -63)
 			v2 += m[s[14]]
 			v2 += v7
 			v13 ^= v2
 			v13 = bits.RotateLeft64(v13, -16)
 			v8 += v13
 			v7 ^= v8
 			v7 = bits.RotateLeft64(v7, -63)
 			v3 += m[s[15]]
 			v3 += v4
 			v14 ^= v3
 			v14 = bits.RotateLeft64(v14, -16)
 			v9 += v14
 			v4 ^= v9
 			v4 = bits.RotateLeft64(v4, -63)
 		}
 		// Feed-forward: fold both halves of the working vector into h.
 		h[0] ^= v0 ^ v8
 		h[1] ^= v1 ^ v9
 		h[2] ^= v2 ^ v10
 		h[3] ^= v3 ^ v11
 		h[4] ^= v4 ^ v12
 		h[5] ^= v5 ^ v13
 		h[6] ^= v6 ^ v14
 		h[7] ^= v7 ^ v15
 	}
 	c[0], c[1] = c0, c1
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2b_ref.go
@ -0,0 +1,11 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64 appengine gccgo
 package blake2b
 // hashBlocks is the block function for builds selected by this file's build
 // constraint (!amd64, appengine or gccgo); it always uses the portable Go
 // implementation.
 func hashBlocks(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) {
 	hashBlocksGeneric(h, c, flag, blocks)
 }
--- a/vendor/golang.org/x/crypto/blake2b/blake2x.go
+++ b/vendor/golang.org/x/crypto/blake2b/blake2x.go
@ -0,0 +1,177 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package blake2b
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
 // XOF defines the interface to hash functions that
 // support arbitrary-length output. Input is absorbed with Write and output
 // is then squeezed out with Read; NewXOF returns an implementation.
 type XOF interface {
 	// Write absorbs more data into the hash's state. It panics if called
 	// after Read.
 	io.Writer
 	// Read reads more output from the hash. It returns io.EOF if the limit
 	// has been reached.
 	io.Reader
 	// Clone returns a copy of the XOF in its current state.
 	Clone() XOF
 	// Reset resets the XOF to its initial state.
 	Reset()
 }
 // OutputLengthUnknown can be used as the size argument to NewXOF to indicate
 // the length of the output is not known in advance.
 const OutputLengthUnknown = 0
 // magicUnknownOutputLength is a magic value for the output size that indicates
 // an unknown number of output bytes. NewXOF substitutes it for
 // OutputLengthUnknown and rejects it when passed in directly as a size.
 const magicUnknownOutputLength = (1 << 32) - 1
 // maxOutputLength is the absolute maximum number of bytes to produce when the
 // number of output bytes is unknown: (1 << 32) * 64 bytes, i.e. 256 GiB.
 const maxOutputLength = (1 << 32) * 64
 // NewXOF creates a new variable-output-length hash. The hash produces either
 // a known number of bytes (1 <= size < 2**32-1) or an unknown number of bytes
 // (size == OutputLengthUnknown). In the latter case, an absolute limit of
 // 256GiB applies.
 //
 // A non-nil key turns the hash into a MAC. The key must be between zero and
 // Size bytes long; a longer key is rejected with an error, as is a size of
 // 2**32-1, which is reserved to mean "unknown length".
 func NewXOF(size uint32, key []byte) (XOF, error) {
 	if len(key) > Size {
 		return nil, errKeySize
 	}
 	switch size {
 	case magicUnknownOutputLength:
 		// 2^32-1 is the internal marker for "unknown length", so it cannot
 		// also be accepted as a real length.
 		return nil, errors.New("blake2b: XOF length too large")
 	case OutputLengthUnknown:
 		size = magicUnknownOutputLength
 	}
 	x := &xof{
 		d: digest{
 			size:   Size,
 			keyLen: len(key),
 		},
 		length: size,
 	}
 	copy(x.d.key[:], key)
 	x.Reset()
 	return x, nil
 }
 // xof is the XOF implementation returned by NewXOF.
 //
 // d absorbs the input and is later re-initialised for every output block.
 // length is the output length recorded at construction (or
 // magicUnknownOutputLength) and remaining is the number of output bytes
 // still available. cfg is the parameter block handed to d.initConfig for
 // each output block, root receives the digest of the absorbed input on the
 // first Read, and block holds the most recent output block. offset is the
 // read position inside block when it was only partly consumed, nodeOffset
 // numbers the next output block, and readMode is set by the first Read and
 // makes Write panic.
 type xof struct {
 	d                digest
 	length           uint32
 	remaining        uint64
 	cfg, root, block [Size]byte
 	offset           int
 	nodeOffset       uint32
 	readMode         bool
 }
 // Write absorbs p into the underlying digest. It panics once Read has been
 // called, because input can no longer be added after output has started.
 func (x *xof) Write(p []byte) (n int, err error) {
 	if !x.readMode {
 		return x.d.Write(p)
 	}
 	panic("blake2b: write to XOF after read")
 }
 // Clone returns an independent copy of x in its current state; the copy is
 // made by value, so later use of either XOF does not affect the other.
 func (x *xof) Clone() XOF {
 	dup := new(xof)
 	*dup = *x
 	return dup
 }
 // Reset returns x to the state it had right after NewXOF: the parameter
 // block is rebuilt, the digest is reset with the XOF length folded into its
 // state, the output budget is restored and write mode is re-enabled.
 func (x *xof) Reset() {
 	// Read/write bookkeeping.
 	x.readMode = false
 	x.offset = 0
 	x.nodeOffset = 0
 	// Parameter block used for the output blocks.
 	x.cfg[0] = byte(Size)
 	binary.LittleEndian.PutUint32(x.cfg[4:], uint32(Size)) // leaf length
 	binary.LittleEndian.PutUint32(x.cfg[12:], x.length)    // XOF length
 	x.cfg[17] = byte(Size)                                 // inner hash size
 	// Input digest, with the XOF length mixed into the upper half of h[1].
 	x.d.Reset()
 	x.d.h[1] ^= uint64(x.length) << 32
 	// Output budget: the explicit length, or the hard cap when unknown.
 	if x.length == magicUnknownOutputLength {
 		x.remaining = maxOutputLength
 	} else {
 		x.remaining = uint64(x.length)
 	}
 }
 // Read fills p with the next output bytes and returns how many were written.
 // The first call finalizes the absorbed input into x.root and switches the
 // XOF to read mode. Once the output budget is exhausted Read returns
 // (0, io.EOF); a request larger than the remaining budget is truncated to
 // it. Output is produced Size bytes at a time; each output block is the
 // digest of x.root under a parameter block carrying the block's node offset.
 func (x *xof) Read(p []byte) (n int, err error) {
 	if !x.readMode {
 		x.d.finalize(&x.root)
 		x.readMode = true
 	}
 	if x.remaining == 0 {
 		return 0, io.EOF
 	}
 	// Never hand out more than the remaining budget.
 	n = len(p)
 	if uint64(n) > x.remaining {
 		n = int(x.remaining)
 		p = p[:n]
 	}
 	// Serve bytes left over in x.block from a previous partial read first.
 	if x.offset > 0 {
 		blockRemaining := Size - x.offset
 		if n < blockRemaining {
 			// The request fits inside the buffered block.
 			x.offset += copy(p, x.block[x.offset:])
 			x.remaining -= uint64(n)
 			return
 		}
 		copy(p, x.block[x.offset:])
 		p = p[blockRemaining:]
 		x.offset = 0
 		x.remaining -= uint64(blockRemaining)
 	}
 	// Whole output blocks.
 	for len(p) >= Size {
 		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
 		x.nodeOffset++
 		x.d.initConfig(&x.cfg)
 		x.d.Write(x.root[:])
 		x.d.finalize(&x.block)
 		copy(p, x.block[:])
 		p = p[Size:]
 		x.remaining -= uint64(Size)
 	}
 	// Trailing partial block: generate it into x.block, copy out what was
 	// asked for and remember the position in x.offset for the next call.
 	if todo := len(p); todo > 0 {
 		// When fewer than Size bytes of output remain overall, the length
 		// byte of the parameter block is shortened to match.
 		if x.remaining < uint64(Size) {
 			x.cfg[0] = byte(x.remaining)
 		}
 		binary.LittleEndian.PutUint32(x.cfg[8:], x.nodeOffset)
 		x.nodeOffset++
 		x.d.initConfig(&x.cfg)
 		x.d.Write(x.root[:])
 		x.d.finalize(&x.block)
 		x.offset = copy(p, x.block[:todo])
 		x.remaining -= uint64(todo)
 	}
 	return
 }
 // initConfig re-initialises d from the parameter block cfg: the buffer
 // offset and both counter words are cleared, and each state word becomes the
 // matching iv word XORed with the corresponding little-endian 64-bit word of
 // cfg.
 func (d *digest) initConfig(cfg *[Size]byte) {
 	d.offset = 0
 	d.c[0], d.c[1] = 0, 0
 	for i := 0; i < len(d.h); i++ {
 		param := binary.LittleEndian.Uint64(cfg[8*i:])
 		d.h[i] = iv[i] ^ param
 	}
 }
--- a/vendor/golang.org/x/crypto/blake2b/register.go
+++ b/vendor/golang.org/x/crypto/blake2b/register.go
@ -0,0 +1,32 @@
 // Copyright 2017 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.9
 package blake2b
 import (
 	"crypto"
 	"hash"
 )
 // init registers the unkeyed BLAKE2b-256, BLAKE2b-384 and BLAKE2b-512
 // constructors with package crypto. The constructors are called with a nil
 // key and the error result of New256/New384/New512 is discarded.
 func init() {
 	crypto.RegisterHash(crypto.BLAKE2b_256, func() hash.Hash {
 		h, _ := New256(nil)
 		return h
 	})
 	crypto.RegisterHash(crypto.BLAKE2b_384, func() hash.Hash {
 		h, _ := New384(nil)
 		return h
 	})
 	crypto.RegisterHash(crypto.BLAKE2b_512, func() hash.Hash {
 		h, _ := New512(nil)
 		return h
 	})
 }
--- a/vendor/golang.org/x/crypto/blowfish/block.go
+++ b/vendor/golang.org/x/crypto/blowfish/block.go
@ -0,0 +1,159 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package blowfish
 // getNextWord assembles a big-endian uint32 from four consecutive bytes of b,
 // starting at *pos and wrapping around to the start of b when the end is
 // reached. On return *pos holds the position following the last byte read.
 func getNextWord(b []byte, pos *int) uint32 {
 	idx := *pos
 	var word uint32
 	for n := 4; n > 0; n-- {
 		word = word<<8 | uint32(b[idx])
 		if idx++; idx >= len(b) {
 			idx = 0
 		}
 	}
 	*pos = idx
 	return word
 }
 // ExpandKey runs the Blowfish key schedule on c using key, setting up the
 // pi array and the substitution tables of c for later calls to Encrypt. It
 // is used mainly by the bcrypt package, which reuses the key schedule during
 // its set up; most callers do not need to call it directly.
 func ExpandKey(key []byte, c *Cipher) {
 	// XOR the key into the pi array, reading it cyclically as big-endian
 	// 32-bit words. The word assembly is written out inline rather than
 	// calling getNextWord, for performance.
 	pos := 0
 	for i := 0; i < len(c.p); i++ {
 		var word uint32
 		for n := 0; n < 4; n++ {
 			word = word<<8 | uint32(key[pos])
 			if pos++; pos >= len(key) {
 				pos = 0
 			}
 		}
 		c.p[i] ^= word
 	}
 	// Overwrite the pi array and then each substitution table, two words at
 	// a time, with successive outputs of a chained encryption that starts
 	// from the all-zero block.
 	var left, right uint32
 	for i := 0; i < len(c.p); i += 2 {
 		left, right = encryptBlock(left, right, c)
 		c.p[i], c.p[i+1] = left, right
 	}
 	for i := 0; i < len(c.s0); i += 2 {
 		left, right = encryptBlock(left, right, c)
 		c.s0[i], c.s0[i+1] = left, right
 	}
 	for i := 0; i < len(c.s1); i += 2 {
 		left, right = encryptBlock(left, right, c)
 		c.s1[i], c.s1[i+1] = left, right
 	}
 	for i := 0; i < len(c.s2); i += 2 {
 		left, right = encryptBlock(left, right, c)
 		c.s2[i], c.s2[i+1] = left, right
 	}
 	for i := 0; i < len(c.s3); i += 2 {
 		left, right = encryptBlock(left, right, c)
 		c.s3[i], c.s3[i+1] = left, right
 	}
 }
 // expandKeyWithSalt is the salted variant of ExpandKey: before every chained
 // encryption the next two words of salt (read cyclically) are XORed into the
 // running block. ExpandKey behaves like this function with an all-zero salt,
 // but a separate specialised implementation is kept here because going
 // through the general path proved inefficient.
 func expandKeyWithSalt(key []byte, salt []byte, c *Cipher) {
 	// XOR the key, read cyclically, into the pi array.
 	keyPos := 0
 	for i := 0; i < len(c.p); i++ {
 		c.p[i] ^= getNextWord(key, &keyPos)
 	}
 	// Regenerate the pi array and the substitution tables from the salted
 	// encryption chain, two words at a time.
 	saltPos := 0
 	var left, right uint32
 	for i := 0; i < len(c.p); i += 2 {
 		left ^= getNextWord(salt, &saltPos)
 		right ^= getNextWord(salt, &saltPos)
 		left, right = encryptBlock(left, right, c)
 		c.p[i], c.p[i+1] = left, right
 	}
 	for i := 0; i < len(c.s0); i += 2 {
 		left ^= getNextWord(salt, &saltPos)
 		right ^= getNextWord(salt, &saltPos)
 		left, right = encryptBlock(left, right, c)
 		c.s0[i], c.s0[i+1] = left, right
 	}
 	for i := 0; i < len(c.s1); i += 2 {
 		left ^= getNextWord(salt, &saltPos)
 		right ^= getNextWord(salt, &saltPos)
 		left, right = encryptBlock(left, right, c)
 		c.s1[i], c.s1[i+1] = left, right
 	}
 	for i := 0; i < len(c.s2); i += 2 {
 		left ^= getNextWord(salt, &saltPos)
 		right ^= getNextWord(salt, &saltPos)
 		left, right = encryptBlock(left, right, c)
 		c.s2[i], c.s2[i+1] = left, right
 	}
 	for i := 0; i < len(c.s3); i += 2 {
 		left ^= getNextWord(salt, &saltPos)
 		right ^= getNextWord(salt, &saltPos)
 		left, right = encryptBlock(left, right, c)
 		c.s3[i], c.s3[i+1] = left, right
 	}
 }
 // encryptBlock encrypts the 64-bit block (l, r) with the tables in c and
 // returns the two output halves. The sixteen rounds are fully unrolled: each
 // line XORs into one half the round function of the other half combined with
 // the next pi word, using p[0] before the first round and p[17] after the
 // last. The halves are returned swapped (xr first).
 //
 // Note on precedence: in Go, + and ^ share one precedence level and
 // associate left to right, so each round expression evaluates as
 // ((((s0 + s1) ^ s2) + s3) ^ p[i]).
 func encryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
 	xl, xr := l, r
 	xl ^= c.p[0]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[1]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[2]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[3]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[4]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[5]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[6]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[7]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[8]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[9]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[10]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[11]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[12]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[13]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[14]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[15]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[16]
 	xr ^= c.p[17]
 	return xr, xl
 }
 // decryptBlock decrypts the 64-bit block (l, r) with the tables in c and
 // returns the two output halves. It has the same unrolled structure as
 // encryptBlock but walks the pi array in the opposite direction, from p[17]
 // down to p[0]. The halves are returned swapped (xr first).
 //
 // Note on precedence: in Go, + and ^ share one precedence level and
 // associate left to right, so each round expression evaluates as
 // ((((s0 + s1) ^ s2) + s3) ^ p[i]).
 func decryptBlock(l, r uint32, c *Cipher) (uint32, uint32) {
 	xl, xr := l, r
 	xl ^= c.p[17]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[16]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[15]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[14]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[13]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[12]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[11]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[10]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[9]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[8]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[7]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[6]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[5]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[4]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[3]
 	xr ^= ((c.s0[byte(xl>>24)] + c.s1[byte(xl>>16)]) ^ c.s2[byte(xl>>8)]) + c.s3[byte(xl)] ^ c.p[2]
 	xl ^= ((c.s0[byte(xr>>24)] + c.s1[byte(xr>>16)]) ^ c.s2[byte(xr>>8)]) + c.s3[byte(xr)] ^ c.p[1]
 	xr ^= c.p[0]
 	return xr, xl
 }
--- a/vendor/golang.org/x/crypto/blowfish/cipher.go
+++ b/vendor/golang.org/x/crypto/blowfish/cipher.go
@ -0,0 +1,99 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package blowfish implements Bruce Schneier's Blowfish encryption algorithm.
 //
 // Blowfish is a legacy cipher and its short block size makes it vulnerable to
 // birthday bound attacks (see https://sweet32.info). It should only be used
 // where compatibility with legacy systems, not security, is the goal.
 //
 // Deprecated: any new system should use AES (from crypto/aes, if necessary in
 // an AEAD mode like crypto/cipher.NewGCM) or XChaCha20-Poly1305 (from
 // golang.org/x/crypto/chacha20poly1305).
 package blowfish // import "golang.org/x/crypto/blowfish"
 // The code is a port of Bruce Schneier's C implementation.
 // See https://www.schneier.com/blowfish.html.
 import "strconv"
 // BlockSize is the Blowfish block size in bytes.
 const BlockSize = 8
 // A Cipher is an instance of Blowfish encryption using a particular key.
 // p is the 18-word pi array and s0..s3 are the four 256-entry substitution
 // tables; initCipher fills them with the package's startup values and the
 // key schedule (ExpandKey / expandKeyWithSalt) then rewrites them.
 type Cipher struct {
 	p              [18]uint32
 	s0, s1, s2, s3 [256]uint32
 }
 // KeySizeError is returned when a key of unsupported length is supplied; its
 // value is the offending key length in bytes.
 type KeySizeError int
 // Error implements the error interface, reporting the rejected key size.
 func (k KeySizeError) Error() string {
 	size := strconv.FormatInt(int64(k), 10)
 	return "crypto/blowfish: invalid key size " + size
 }
 // NewCipher creates and returns a Cipher keyed with key, which must be
 // between 1 and 56 bytes long. Any other length yields a KeySizeError
 // carrying that length.
 func NewCipher(key []byte) (*Cipher, error) {
 	n := len(key)
 	if n < 1 || n > 56 {
 		return nil, KeySizeError(n)
 	}
 	c := new(Cipher)
 	initCipher(c)
 	ExpandKey(key, c)
 	return c, nil
 }
 // NewSaltedCipher creates and returns a Cipher that folds a salt into its
 // key schedule. For most purposes NewCipher, rather than NewSaltedCipher, is
 // sufficient and desirable. For bcrypt compatibility the key may be longer
 // than 56 bytes, but it must not be empty. With an empty salt the call is
 // equivalent to NewCipher(key), including its 56-byte key limit.
 func NewSaltedCipher(key, salt []byte) (*Cipher, error) {
 	if len(salt) == 0 {
 		return NewCipher(key)
 	}
 	if len(key) < 1 {
 		return nil, KeySizeError(len(key))
 	}
 	c := new(Cipher)
 	initCipher(c)
 	expandKeyWithSalt(key, salt, c)
 	return c, nil
 }
 // BlockSize returns the Blowfish block size, 8 bytes (the BlockSize
 // constant). It is necessary to satisfy the Block interface in the
 // package "crypto/cipher".
 func (c *Cipher) BlockSize() int { return BlockSize }
 // Encrypt encrypts the 8-byte buffer src using the key schedule held in c
 // and stores the result in dst. Both halves of the block are read from src
 // and written to dst as big-endian 32-bit words. src and dst must each be at
 // least 8 bytes long; indexing a shorter slice panics.
 // Note that for amounts of data larger than a block,
 // it is not safe to just call Encrypt on successive blocks;
 // instead, use an encryption mode like CBC (see crypto/cipher/cbc.go).
 func (c *Cipher) Encrypt(dst, src []byte) {
 	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
 	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
 	l, r = encryptBlock(l, r, c)
 	dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
 	dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
 }
 // Decrypt decrypts the 8-byte buffer src using the key schedule held in c
 // and stores the result in dst. Both halves of the block are read from src
 // and written to dst as big-endian 32-bit words. src and dst must each be at
 // least 8 bytes long; indexing a shorter slice panics.
 func (c *Cipher) Decrypt(dst, src []byte) {
 	l := uint32(src[0])<<24 | uint32(src[1])<<16 | uint32(src[2])<<8 | uint32(src[3])
 	r := uint32(src[4])<<24 | uint32(src[5])<<16 | uint32(src[6])<<8 | uint32(src[7])
 	l, r = decryptBlock(l, r, c)
 	dst[0], dst[1], dst[2], dst[3] = byte(l>>24), byte(l>>16), byte(l>>8), byte(l)
 	dst[4], dst[5], dst[6], dst[7] = byte(r>>24), byte(r>>16), byte(r>>8), byte(r)
 }
 // initCipher loads the package-level startup tables (p and s0..s3) into c,
 // giving the key schedule a fresh copy to modify.
 func initCipher(c *Cipher) {
 	copy(c.p[:], p[:])
 	copy(c.s0[:], s0[:])
 	copy(c.s1[:], s1[:])
 	copy(c.s2[:], s2[:])
 	copy(c.s3[:], s3[:])
 }
--- a/vendor/golang.org/x/crypto/blowfish/const.go
+++ b/vendor/golang.org/x/crypto/blowfish/const.go
@ -0,0 +1,199 @@
 // Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // The startup permutation array and substitution boxes.
 // They are the hexadecimal digits of PI; see:
 // https://www.schneier.com/code/constants.txt.
 package blowfish
 var s0 = [256]uint32{
 	0xd1310ba6, 0x98dfb5ac, 0x2ffd72db, 0xd01adfb7, 0xb8e1afed, 0x6a267e96,
 	0xba7c9045, 0xf12c7f99, 0x24a19947, 0xb3916cf7, 0x0801f2e2, 0x858efc16,
 	0x636920d8, 0x71574e69, 0xa458fea3, 0xf4933d7e, 0x0d95748f, 0x728eb658,
 	0x718bcd58, 0x82154aee, 0x7b54a41d, 0xc25a59b5, 0x9c30d539, 0x2af26013,
 	0xc5d1b023, 0x286085f0, 0xca417918, 0xb8db38ef, 0x8e79dcb0, 0x603a180e,
 	0x6c9e0e8b, 0xb01e8a3e, 0xd71577c1, 0xbd314b27, 0x78af2fda, 0x55605c60,
 	0xe65525f3, 0xaa55ab94, 0x57489862, 0x63e81440, 0x55ca396a, 0x2aab10b6,
 	0xb4cc5c34, 0x1141e8ce, 0xa15486af, 0x7c72e993, 0xb3ee1411, 0x636fbc2a,
 	0x2ba9c55d, 0x741831f6, 0xce5c3e16, 0x9b87931e, 0xafd6ba33, 0x6c24cf5c,
 	0x7a325381, 0x28958677, 0x3b8f4898, 0x6b4bb9af, 0xc4bfe81b, 0x66282193,
 	0x61d809cc, 0xfb21a991, 0x487cac60, 0x5dec8032, 0xef845d5d, 0xe98575b1,
 	0xdc262302, 0xeb651b88, 0x23893e81, 0xd396acc5, 0x0f6d6ff3, 0x83f44239,
 	0x2e0b4482, 0xa4842004, 0x69c8f04a, 0x9e1f9b5e, 0x21c66842, 0xf6e96c9a,
 	0x670c9c61, 0xabd388f0, 0x6a51a0d2, 0xd8542f68, 0x960fa728, 0xab5133a3,
 	0x6eef0b6c, 0x137a3be4, 0xba3bf050, 0x7efb2a98, 0xa1f1651d, 0x39af0176,
 	0x66ca593e, 0x82430e88, 0x8cee8619, 0x456f9fb4, 0x7d84a5c3, 0x3b8b5ebe,
 	0xe06f75d8, 0x85c12073, 0x401a449f, 0x56c16aa6, 0x4ed3aa62, 0x363f7706,
 	0x1bfedf72, 0x429b023d, 0x37d0d724, 0xd00a1248, 0xdb0fead3, 0x49f1c09b,
 	0x075372c9, 0x80991b7b, 0x25d479d8, 0xf6e8def7, 0xe3fe501a, 0xb6794c3b,
 	0x976ce0bd, 0x04c006ba, 0xc1a94fb6, 0x409f60c4, 0x5e5c9ec2, 0x196a2463,
 	0x68fb6faf, 0x3e6c53b5, 0x1339b2eb, 0x3b52ec6f, 0x6dfc511f, 0x9b30952c,
 	0xcc814544, 0xaf5ebd09, 0xbee3d004, 0xde334afd, 0x660f2807, 0x192e4bb3,
 	0xc0cba857, 0x45c8740f, 0xd20b5f39, 0xb9d3fbdb, 0x5579c0bd, 0x1a60320a,
 	0xd6a100c6, 0x402c7279, 0x679f25fe, 0xfb1fa3cc, 0x8ea5e9f8, 0xdb3222f8,
 	0x3c7516df, 0xfd616b15, 0x2f501ec8, 0xad0552ab, 0x323db5fa, 0xfd238760,
 	0x53317b48, 0x3e00df82, 0x9e5c57bb, 0xca6f8ca0, 0x1a87562e, 0xdf1769db,
 	0xd542a8f6, 0x287effc3, 0xac6732c6, 0x8c4f5573, 0x695b27b0, 0xbbca58c8,
 	0xe1ffa35d, 0xb8f011a0, 0x10fa3d98, 0xfd2183b8, 0x4afcb56c, 0x2dd1d35b,
 	0x9a53e479, 0xb6f84565, 0xd28e49bc, 0x4bfb9790, 0xe1ddf2da, 0xa4cb7e33,
 	0x62fb1341, 0xcee4c6e8, 0xef20cada, 0x36774c01, 0xd07e9efe, 0x2bf11fb4,
 	0x95dbda4d, 0xae909198, 0xeaad8e71, 0x6b93d5a0, 0xd08ed1d0, 0xafc725e0,
 	0x8e3c5b2f, 0x8e7594b7, 0x8ff6e2fb, 0xf2122b64, 0x8888b812, 0x900df01c,
 	0x4fad5ea0, 0x688fc31c, 0xd1cff191, 0xb3a8c1ad, 0x2f2f2218, 0xbe0e1777,
 	0xea752dfe, 0x8b021fa1, 0xe5a0cc0f, 0xb56f74e8, 0x18acf3d6, 0xce89e299,
 	0xb4a84fe0, 0xfd13e0b7, 0x7cc43b81, 0xd2ada8d9, 0x165fa266, 0x80957705,
 	0x93cc7314, 0x211a1477, 0xe6ad2065, 0x77b5fa86, 0xc75442f5, 0xfb9d35cf,
 	0xebcdaf0c, 0x7b3e89a0, 0xd6411bd3, 0xae1e7e49, 0x00250e2d, 0x2071b35e,
 	0x226800bb, 0x57b8e0af, 0x2464369b, 0xf009b91e, 0x5563911d, 0x59dfa6aa,
 	0x78c14389, 0xd95a537f, 0x207d5ba2, 0x02e5b9c5, 0x83260376, 0x6295cfa9,
 	0x11c81968, 0x4e734a41, 0xb3472dca, 0x7b14a94a, 0x1b510052, 0x9a532915,
 	0xd60f573f, 0xbc9bc6e4, 0x2b60a476, 0x81e67400, 0x08ba6fb5, 0x571be91f,
 	0xf296ec6b, 0x2a0dd915, 0xb6636521, 0xe7b9f9b6, 0xff34052e, 0xc5855664,
 	0x53b02d5d, 0xa99f8fa1, 0x08ba4799, 0x6e85076a,
 }
 var s1 = [256]uint32{
 	0x4b7a70e9, 0xb5b32944, 0xdb75092e, 0xc4192623, 0xad6ea6b0, 0x49a7df7d,
 	0x9cee60b8, 0x8fedb266, 0xecaa8c71, 0x699a17ff, 0x5664526c, 0xc2b19ee1,
 	0x193602a5, 0x75094c29, 0xa0591340, 0xe4183a3e, 0x3f54989a, 0x5b429d65,
 	0x6b8fe4d6, 0x99f73fd6, 0xa1d29c07, 0xefe830f5, 0x4d2d38e6, 0xf0255dc1,
 	0x4cdd2086, 0x8470eb26, 0x6382e9c6, 0x021ecc5e, 0x09686b3f, 0x3ebaefc9,
 	0x3c971814, 0x6b6a70a1, 0x687f3584, 0x52a0e286, 0xb79c5305, 0xaa500737,
 	0x3e07841c, 0x7fdeae5c, 0x8e7d44ec, 0x5716f2b8, 0xb03ada37, 0xf0500c0d,
 	0xf01c1f04, 0x0200b3ff, 0xae0cf51a, 0x3cb574b2, 0x25837a58, 0xdc0921bd,
 	0xd19113f9, 0x7ca92ff6, 0x94324773, 0x22f54701, 0x3ae5e581, 0x37c2dadc,
 	0xc8b57634, 0x9af3dda7, 0xa9446146, 0x0fd0030e, 0xecc8c73e, 0xa4751e41,
 	0xe238cd99, 0x3bea0e2f, 0x3280bba1, 0x183eb331, 0x4e548b38, 0x4f6db908,
 	0x6f420d03, 0xf60a04bf, 0x2cb81290, 0x24977c79, 0x5679b072, 0xbcaf89af,
 	0xde9a771f, 0xd9930810, 0xb38bae12, 0xdccf3f2e, 0x5512721f, 0x2e6b7124,
 	0x501adde6, 0x9f84cd87, 0x7a584718, 0x7408da17, 0xbc9f9abc, 0xe94b7d8c,
 	0xec7aec3a, 0xdb851dfa, 0x63094366, 0xc464c3d2, 0xef1c1847, 0x3215d908,
 	0xdd433b37, 0x24c2ba16, 0x12a14d43, 0x2a65c451, 0x50940002, 0x133ae4dd,
 	0x71dff89e, 0x10314e55, 0x81ac77d6, 0x5f11199b, 0x043556f1, 0xd7a3c76b,
 	0x3c11183b, 0x5924a509, 0xf28fe6ed, 0x97f1fbfa, 0x9ebabf2c, 0x1e153c6e,
 	0x86e34570, 0xeae96fb1, 0x860e5e0a, 0x5a3e2ab3, 0x771fe71c, 0x4e3d06fa,
 	0x2965dcb9, 0x99e71d0f, 0x803e89d6, 0x5266c825, 0x2e4cc978, 0x9c10b36a,
 	0xc6150eba, 0x94e2ea78, 0xa5fc3c53, 0x1e0a2df4, 0xf2f74ea7, 0x361d2b3d,
 	0x1939260f, 0x19c27960, 0x5223a708, 0xf71312b6, 0xebadfe6e, 0xeac31f66,
 	0xe3bc4595, 0xa67bc883, 0xb17f37d1, 0x018cff28, 0xc332ddef, 0xbe6c5aa5,
 	0x65582185, 0x68ab9802, 0xeecea50f, 0xdb2f953b, 0x2aef7dad, 0x5b6e2f84,
 	0x1521b628, 0x29076170, 0xecdd4775, 0x619f1510, 0x13cca830, 0xeb61bd96,
 	0x0334fe1e, 0xaa0363cf, 0xb5735c90, 0x4c70a239, 0xd59e9e0b, 0xcbaade14,
 	0xeecc86bc, 0x60622ca7, 0x9cab5cab, 0xb2f3846e, 0x648b1eaf, 0x19bdf0ca,
 	0xa02369b9, 0x655abb50, 0x40685a32, 0x3c2ab4b3, 0x319ee9d5, 0xc021b8f7,
 	0x9b540b19, 0x875fa099, 0x95f7997e, 0x623d7da8, 0xf837889a, 0x97e32d77,
 	0x11ed935f, 0x16681281, 0x0e358829, 0xc7e61fd6, 0x96dedfa1, 0x7858ba99,
 	0x57f584a5, 0x1b227263, 0x9b83c3ff, 0x1ac24696, 0xcdb30aeb, 0x532e3054,
 	0x8fd948e4, 0x6dbc3128, 0x58ebf2ef, 0x34c6ffea, 0xfe28ed61, 0xee7c3c73,
 	0x5d4a14d9, 0xe864b7e3, 0x42105d14, 0x203e13e0, 0x45eee2b6, 0xa3aaabea,
 	0xdb6c4f15, 0xfacb4fd0, 0xc742f442, 0xef6abbb5, 0x654f3b1d, 0x41cd2105,
 	0xd81e799e, 0x86854dc7, 0xe44b476a, 0x3d816250, 0xcf62a1f2, 0x5b8d2646,
 	0xfc8883a0, 0xc1c7b6a3, 0x7f1524c3, 0x69cb7492, 0x47848a0b, 0x5692b285,
 	0x095bbf00, 0xad19489d, 0x1462b174, 0x23820e00, 0x58428d2a, 0x0c55f5ea,
 	0x1dadf43e, 0x233f7061, 0x3372f092, 0x8d937e41, 0xd65fecf1, 0x6c223bdb,
 	0x7cde3759, 0xcbee7460, 0x4085f2a7, 0xce77326e, 0xa6078084, 0x19f8509e,
 	0xe8efd855, 0x61d99735, 0xa969a7aa, 0xc50c06c2, 0x5a04abfc, 0x800bcadc,
 	0x9e447a2e, 0xc3453484, 0xfdd56705, 0x0e1e9ec9, 0xdb73dbd3, 0x105588cd,
 	0x675fda79, 0xe3674340, 0xc5c43465, 0x713e38d8, 0x3d28f89e, 0xf16dff20,
 	0x153e21e7, 0x8fb03d4a, 0xe6e39f2b, 0xdb83adf7,
 }
 var s2 = [256]uint32{
 	0xe93d5a68, 0x948140f7, 0xf64c261c, 0x94692934, 0x411520f7, 0x7602d4f7,
 	0xbcf46b2e, 0xd4a20068, 0xd4082471, 0x3320f46a, 0x43b7d4b7, 0x500061af,
 	0x1e39f62e, 0x97244546, 0x14214f74, 0xbf8b8840, 0x4d95fc1d, 0x96b591af,
 	0x70f4ddd3, 0x66a02f45, 0xbfbc09ec, 0x03bd9785, 0x7fac6dd0, 0x31cb8504,
 	0x96eb27b3, 0x55fd3941, 0xda2547e6, 0xabca0a9a, 0x28507825, 0x530429f4,
 	0x0a2c86da, 0xe9b66dfb, 0x68dc1462, 0xd7486900, 0x680ec0a4, 0x27a18dee,
 	0x4f3ffea2, 0xe887ad8c, 0xb58ce006, 0x7af4d6b6, 0xaace1e7c, 0xd3375fec,
 	0xce78a399, 0x406b2a42, 0x20fe9e35, 0xd9f385b9, 0xee39d7ab, 0x3b124e8b,
 	0x1dc9faf7, 0x4b6d1856, 0x26a36631, 0xeae397b2, 0x3a6efa74, 0xdd5b4332,
 	0x6841e7f7, 0xca7820fb, 0xfb0af54e, 0xd8feb397, 0x454056ac, 0xba489527,
 	0x55533a3a, 0x20838d87, 0xfe6ba9b7, 0xd096954b, 0x55a867bc, 0xa1159a58,
 	0xcca92963, 0x99e1db33, 0xa62a4a56, 0x3f3125f9, 0x5ef47e1c, 0x9029317c,
 	0xfdf8e802, 0x04272f70, 0x80bb155c, 0x05282ce3, 0x95c11548, 0xe4c66d22,
 	0x48c1133f, 0xc70f86dc, 0x07f9c9ee, 0x41041f0f, 0x404779a4, 0x5d886e17,
 	0x325f51eb, 0xd59bc0d1, 0xf2bcc18f, 0x41113564, 0x257b7834, 0x602a9c60,
 	0xdff8e8a3, 0x1f636c1b, 0x0e12b4c2, 0x02e1329e, 0xaf664fd1, 0xcad18115,
 	0x6b2395e0, 0x333e92e1, 0x3b240b62, 0xeebeb922, 0x85b2a20e, 0xe6ba0d99,
 	0xde720c8c, 0x2da2f728, 0xd0127845, 0x95b794fd, 0x647d0862, 0xe7ccf5f0,
 	0x5449a36f, 0x877d48fa, 0xc39dfd27, 0xf33e8d1e, 0x0a476341, 0x992eff74,
 	0x3a6f6eab, 0xf4f8fd37, 0xa812dc60, 0xa1ebddf8, 0x991be14c, 0xdb6e6b0d,
 	0xc67b5510, 0x6d672c37, 0x2765d43b, 0xdcd0e804, 0xf1290dc7, 0xcc00ffa3,
 	0xb5390f92, 0x690fed0b, 0x667b9ffb, 0xcedb7d9c, 0xa091cf0b, 0xd9155ea3,
 	0xbb132f88, 0x515bad24, 0x7b9479bf, 0x763bd6eb, 0x37392eb3, 0xcc115979,
 	0x8026e297, 0xf42e312d, 0x6842ada7, 0xc66a2b3b, 0x12754ccc, 0x782ef11c,
 	0x6a124237, 0xb79251e7, 0x06a1bbe6, 0x4bfb6350, 0x1a6b1018, 0x11caedfa,
 	0x3d25bdd8, 0xe2e1c3c9, 0x44421659, 0x0a121386, 0xd90cec6e, 0xd5abea2a,
 	0x64af674e, 0xda86a85f, 0xbebfe988, 0x64e4c3fe, 0x9dbc8057, 0xf0f7c086,
 	0x60787bf8, 0x6003604d, 0xd1fd8346, 0xf6381fb0, 0x7745ae04, 0xd736fccc,
 	0x83426b33, 0xf01eab71, 0xb0804187, 0x3c005e5f, 0x77a057be, 0xbde8ae24,
 	0x55464299, 0xbf582e61, 0x4e58f48f, 0xf2ddfda2, 0xf474ef38, 0x8789bdc2,
 	0x5366f9c3, 0xc8b38e74, 0xb475f255, 0x46fcd9b9, 0x7aeb2661, 0x8b1ddf84,
 	0x846a0e79, 0x915f95e2, 0x466e598e, 0x20b45770, 0x8cd55591, 0xc902de4c,
 	0xb90bace1, 0xbb8205d0, 0x11a86248, 0x7574a99e, 0xb77f19b6, 0xe0a9dc09,
 	0x662d09a1, 0xc4324633, 0xe85a1f02, 0x09f0be8c, 0x4a99a025, 0x1d6efe10,
 	0x1ab93d1d, 0x0ba5a4df, 0xa186f20f, 0x2868f169, 0xdcb7da83, 0x573906fe,
 	0xa1e2ce9b, 0x4fcd7f52, 0x50115e01, 0xa70683fa, 0xa002b5c4, 0x0de6d027,
 	0x9af88c27, 0x773f8641, 0xc3604c06, 0x61a806b5, 0xf0177a28, 0xc0f586e0,
 	0x006058aa, 0x30dc7d62, 0x11e69ed7, 0x2338ea63, 0x53c2dd94, 0xc2c21634,
 	0xbbcbee56, 0x90bcb6de, 0xebfc7da1, 0xce591d76, 0x6f05e409, 0x4b7c0188,
 	0x39720a3d, 0x7c927c24, 0x86e3725f, 0x724d9db9, 0x1ac15bb4, 0xd39eb8fc,
 	0xed545578, 0x08fca5b5, 0xd83d7cd3, 0x4dad0fc4, 0x1e50ef5e, 0xb161e6f8,
 	0xa28514d9, 0x6c51133c, 0x6fd5c7e7, 0x56e14ec4, 0x362abfce, 0xddc6c837,
 	0xd79a3234, 0x92638212, 0x670efa8e, 0x406000e0,
 }
 var s3 = [256]uint32{
 	0x3a39ce37, 0xd3faf5cf, 0xabc27737, 0x5ac52d1b, 0x5cb0679e, 0x4fa33742,
 	0xd3822740, 0x99bc9bbe, 0xd5118e9d, 0xbf0f7315, 0xd62d1c7e, 0xc700c47b,
 	0xb78c1b6b, 0x21a19045, 0xb26eb1be, 0x6a366eb4, 0x5748ab2f, 0xbc946e79,
 	0xc6a376d2, 0x6549c2c8, 0x530ff8ee, 0x468dde7d, 0xd5730a1d, 0x4cd04dc6,
 	0x2939bbdb, 0xa9ba4650, 0xac9526e8, 0xbe5ee304, 0xa1fad5f0, 0x6a2d519a,
 	0x63ef8ce2, 0x9a86ee22, 0xc089c2b8, 0x43242ef6, 0xa51e03aa, 0x9cf2d0a4,
 	0x83c061ba, 0x9be96a4d, 0x8fe51550, 0xba645bd6, 0x2826a2f9, 0xa73a3ae1,
 	0x4ba99586, 0xef5562e9, 0xc72fefd3, 0xf752f7da, 0x3f046f69, 0x77fa0a59,
 	0x80e4a915, 0x87b08601, 0x9b09e6ad, 0x3b3ee593, 0xe990fd5a, 0x9e34d797,
 	0x2cf0b7d9, 0x022b8b51, 0x96d5ac3a, 0x017da67d, 0xd1cf3ed6, 0x7c7d2d28,
 	0x1f9f25cf, 0xadf2b89b, 0x5ad6b472, 0x5a88f54c, 0xe029ac71, 0xe019a5e6,
 	0x47b0acfd, 0xed93fa9b, 0xe8d3c48d, 0x283b57cc, 0xf8d56629, 0x79132e28,
 	0x785f0191, 0xed756055, 0xf7960e44, 0xe3d35e8c, 0x15056dd4, 0x88f46dba,
 	0x03a16125, 0x0564f0bd, 0xc3eb9e15, 0x3c9057a2, 0x97271aec, 0xa93a072a,
 	0x1b3f6d9b, 0x1e6321f5, 0xf59c66fb, 0x26dcf319, 0x7533d928, 0xb155fdf5,
 	0x03563482, 0x8aba3cbb, 0x28517711, 0xc20ad9f8, 0xabcc5167, 0xccad925f,
 	0x4de81751, 0x3830dc8e, 0x379d5862, 0x9320f991, 0xea7a90c2, 0xfb3e7bce,
 	0x5121ce64, 0x774fbe32, 0xa8b6e37e, 0xc3293d46, 0x48de5369, 0x6413e680,
 	0xa2ae0810, 0xdd6db224, 0x69852dfd, 0x09072166, 0xb39a460a, 0x6445c0dd,
 	0x586cdecf, 0x1c20c8ae, 0x5bbef7dd, 0x1b588d40, 0xccd2017f, 0x6bb4e3bb,
 	0xdda26a7e, 0x3a59ff45, 0x3e350a44, 0xbcb4cdd5, 0x72eacea8, 0xfa6484bb,
 	0x8d6612ae, 0xbf3c6f47, 0xd29be463, 0x542f5d9e, 0xaec2771b, 0xf64e6370,
 	0x740e0d8d, 0xe75b1357, 0xf8721671, 0xaf537d5d, 0x4040cb08, 0x4eb4e2cc,
 	0x34d2466a, 0x0115af84, 0xe1b00428, 0x95983a1d, 0x06b89fb4, 0xce6ea048,
 	0x6f3f3b82, 0x3520ab82, 0x011a1d4b, 0x277227f8, 0x611560b1, 0xe7933fdc,
 	0xbb3a792b, 0x344525bd, 0xa08839e1, 0x51ce794b, 0x2f32c9b7, 0xa01fbac9,
 	0xe01cc87e, 0xbcc7d1f6, 0xcf0111c3, 0xa1e8aac7, 0x1a908749, 0xd44fbd9a,
 	0xd0dadecb, 0xd50ada38, 0x0339c32a, 0xc6913667, 0x8df9317c, 0xe0b12b4f,
 	0xf79e59b7, 0x43f5bb3a, 0xf2d519ff, 0x27d9459c, 0xbf97222c, 0x15e6fc2a,
 	0x0f91fc71, 0x9b941525, 0xfae59361, 0xceb69ceb, 0xc2a86459, 0x12baa8d1,
 	0xb6c1075e, 0xe3056a0c, 0x10d25065, 0xcb03a442, 0xe0ec6e0e, 0x1698db3b,
 	0x4c98a0be, 0x3278e964, 0x9f1f9532, 0xe0d392df, 0xd3a0342b, 0x8971f21e,
 	0x1b0a7441, 0x4ba3348c, 0xc5be7120, 0xc37632d8, 0xdf359f8d, 0x9b992f2e,
 	0xe60b6f47, 0x0fe3f11d, 0xe54cda54, 0x1edad891, 0xce6279cf, 0xcd3e7e6f,
 	0x1618b166, 0xfd2c1d05, 0x848fd2c5, 0xf6fb2299, 0xf523f357, 0xa6327623,
 	0x93a83531, 0x56cccd02, 0xacf08162, 0x5a75ebb5, 0x6e163697, 0x88d273cc,
 	0xde966292, 0x81b949d0, 0x4c50901b, 0x71c65614, 0xe6c6c7bd, 0x327a140a,
 	0x45e1d006, 0xc3f27b9a, 0xc9aa53fd, 0x62a80f00, 0xbb25bfe2, 0x35bdd2f6,
 	0x71126905, 0xb2040222, 0xb6cbcf7c, 0xcd769c2b, 0x53113ec0, 0x1640e3d3,
 	0x38abbd60, 0x2547adf0, 0xba38209c, 0xf746ce76, 0x77afa1c5, 0x20756060,
 	0x85cbfe4e, 0x8ae88dd8, 0x7aaaf9b0, 0x4cf9aa7e, 0x1948c25c, 0x02fb8a8c,
 	0x01c36ae4, 0xd6ebe1f9, 0x90d4f869, 0xa65cdea0, 0x3f09252d, 0xc208e69f,
 	0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6,
 }
 // p is the startup permutation array described in the header comment of this
 // file. It is the template copied into a Cipher's p field by initCipher and
 // is not meant to be modified.
 var p = [18]uint32{
 	0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, 0xa4093822, 0x299f31d0,
 	0x082efa98, 0xec4e6c89, 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,
 	0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917, 0x9216d5d9, 0x8979fb1b,
 }
--- a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go
@ -0,0 +1,16 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.11,!gccgo,!purego
 package chacha20
 // bufSize is the length in bytes of Cipher.buf. XORKeyStream only hands
 // xorKeyStreamBlocks inputs whose length is a multiple of bufSize.
 const bufSize = 256
 // xorKeyStreamVX is declared without a Go body; its implementation is
 // provided outside this file (presumably the package's arm64 assembly —
 // confirm). xorKeyStreamBlocks calls it with pointers to the cipher's key,
 // nonce and counter.
 //go:noescape
 func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 // xorKeyStreamBlocks forwards dst and src to xorKeyStreamVX together with
 // pointers to this cipher's key, nonce and counter.
 func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
 	key, nonce, counter := &c.key, &c.nonce, &c.counter
 	xorKeyStreamVX(dst, src, key, nonce, counter)
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/asm_arm64.s
+++ b/vendor/golang.org/x/crypto/internal/chacha20/asm_arm64.s
@ -2,8 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build go1.11
+// +build go1.11,!gccgo,!purego
 // +build !gccgo,!appengine
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
@ -0,0 +1,398 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package chacha20 implements the ChaCha20 and XChaCha20 encryption algorithms
 // as specified in RFC 8439 and draft-irtf-cfrg-xchacha-01.
 package chacha20
 import (
 	"crypto/cipher"
 	"encoding/binary"
 	"errors"
 	"math/bits"
 	"golang.org/x/crypto/internal/subtle"
 )
 // Key and nonce lengths accepted by NewUnauthenticatedCipher.
 const (
 	// KeySize is the size of the key used by this cipher, in bytes.
 	KeySize = 32
 	// NonceSize is the size of the nonce used with the standard variant of this
 	// cipher, in bytes.
 	//
 	// Note that this is too short to be safely generated at random if the same
 	// key is reused more than 2³² times.
 	NonceSize = 12
 	// NonceSizeX is the size of the nonce used with the XChaCha20 variant of
 	// this cipher, in bytes. Passing a nonce of this length to
 	// NewUnauthenticatedCipher selects the XChaCha20 construction.
 	NonceSizeX = 24
 )
 // Cipher is a stateful instance of ChaCha20 or XChaCha20 using a particular key
 // and nonce. A *Cipher implements the cipher.Stream interface.
 //
 // NewUnauthenticatedCipher fills in key and nonce; every other field starts
 // at its zero value and is updated by XORKeyStream and SetCounter.
 type Cipher struct {
 	// The ChaCha20 state is 16 words: 4 constant, 8 of key, 1 of counter
 	// (incremented after each block), and 3 of nonce.
 	key     [8]uint32
 	counter uint32
 	nonce   [3]uint32
 	// The last len bytes of buf are leftover key stream bytes from the previous
 	// XORKeyStream invocation. The size of buf depends on how many blocks are
 	// computed at a time by xorKeyStreamBlocks.
 	buf [bufSize]byte
 	len int
 	// overflow is set when the counter overflowed, no more blocks can be
 	// generated, and the next XORKeyStream call should panic.
 	overflow bool
 	// The counter-independent results of the first round are cached after they
 	// are computed the first time. xorKeyStreamBlocksGeneric fills these in and
 	// sets precompDone.
 	precompDone      bool
 	p1, p5, p9, p13  uint32
 	p2, p6, p10, p14 uint32
 	p3, p7, p11, p15 uint32
 }
 // Compile-time assertion that *Cipher satisfies cipher.Stream.
 var _ cipher.Stream = (*Cipher)(nil)
 // NewUnauthenticatedCipher returns a ChaCha20 stream cipher for the given
 // 32-byte key and a nonce of 12 or 24 bytes. A 24-byte nonce selects the
 // XChaCha20 construction. Any other key or nonce length yields an error.
 //
 // Note that ChaCha20, like all stream ciphers, is not authenticated and allows
 // attackers to silently tamper with the plaintext. For this reason, it is more
 // appropriate as a building block than as a standalone encryption mechanism.
 // Instead, consider using package golang.org/x/crypto/chacha20poly1305.
 func NewUnauthenticatedCipher(key, nonce []byte) (*Cipher, error) {
 	// Keep this wrapper trivial: the Cipher is allocated here, rather than in
 	// newUnauthenticatedCipher, so that the allocation can be inlined into the
 	// caller and, depending on how the result is used, need not escape to the
 	// heap.
 	return newUnauthenticatedCipher(&Cipher{}, key, nonce)
 }
 // newUnauthenticatedCipher validates key and nonce and, on success, loads them
 // into c.key and c.nonce as little-endian 32-bit words and returns c. On a
 // length error it returns nil and leaves c untouched.
 func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
 	if len(key) != KeySize {
 		return nil, errors.New("chacha20: wrong key size")
 	}
 	switch len(nonce) {
 	case NonceSize:
 		// Plain ChaCha20: key and nonce are used as given.
 	case NonceSizeX:
 		// XChaCha20 uses the ChaCha20 core to mix 16 bytes of the nonce into a
 		// derived key, allowing it to operate on a nonce of 24 bytes. See
 		// draft-irtf-cfrg-xchacha-01, Section 2.3.
 		//
 		// The error is ignored because HChaCha20 only rejects wrong lengths,
 		// and both lengths are already known to be correct here.
 		key, _ = HChaCha20(key, nonce[0:16])
 		shortNonce := make([]byte, NonceSize)
 		copy(shortNonce[4:12], nonce[16:24])
 		nonce = shortNonce
 	default:
 		return nil, errors.New("chacha20: wrong nonce size")
 	}
 	key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
 	for i := range c.key {
 		c.key[i] = binary.LittleEndian.Uint32(key[4*i : 4*i+4])
 	}
 	for i := range c.nonce {
 		c.nonce[i] = binary.LittleEndian.Uint32(nonce[4*i : 4*i+4])
 	}
 	return c, nil
 }
 // The constant first 4 words of the ChaCha20 state. Read as little-endian
 // bytes, the four words spell the ASCII string "expand 32-byte k"; the
 // trailing comment on each line shows that word's four characters.
 const (
 	j0 uint32 = 0x61707865 // expa
 	j1 uint32 = 0x3320646e // nd 3
 	j2 uint32 = 0x79622d32 // 2-by
 	j3 uint32 = 0x6b206574 // te k
 )
 // blockSize is the number of key stream bytes produced per ChaCha20 block
 // (the 16-word state, 4 bytes per word). The block counter advances by one
 // for every blockSize bytes.
 const blockSize = 64
 // quarterRound is the basic ChaCha20 mixing step applied to 4 state words.
 // Each of the 20 ChaCha20 rounds runs it 4 times so that all 16 words are
 // covered, grouping the words either by column or by diagonal.
 //
 // It returns the updated values of a, b, c and d, in that order.
 func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
 	// Four add / xor / rotate steps with rotation amounts 16, 12, 8 and 7.
 	a += b
 	d = bits.RotateLeft32(d^a, 16)
 	c += d
 	b = bits.RotateLeft32(b^c, 12)
 	a += b
 	d = bits.RotateLeft32(d^a, 8)
 	c += d
 	b = bits.RotateLeft32(b^c, 7)
 	return a, b, c, d
 }
 // SetCounter moves the Cipher to the given block counter. After the call,
 // XORKeyStream behaves as if (64 * counter) bytes had already been encrypted.
 //
 // To prevent accidental counter reuse, SetCounter panics if counter is less
 // than the current value.
 //
 // Note that the execution time of XORKeyStream is not independent of the
 // counter value.
 func (s *Cipher) SetCounter(counter uint32) {
 	// s.counter counts blocks that have been generated, but some of them may
 	// still be sitting unused in the buffer. Subtract the whole buffered
 	// blocks to find the counter of the next block that will be output.
 	bufferedBlocks := uint32(s.len) / blockSize
 	if s.overflow || counter < s.counter-bufferedBlocks {
 		panic("chacha20: SetCounter attempted to rollback counter")
 	}
 	// General case: adopt the new counter and drop the buffer, so the next
 	// XORKeyStream call regenerates key stream from there.
 	if counter >= s.counter {
 		s.counter = counter
 		s.len = 0
 		return
 	}
 	// Otherwise the target lies inside the blocks already buffered, so it is
 	// enough to shrink the unread part of the buffer.
 	s.len = int(s.counter-counter) * blockSize
 }
 // XORKeyStream XORs each byte in the given slice with a byte from the
 // cipher's key stream. Dst and src must overlap entirely or not at all.
 //
 // If len(dst) < len(src), XORKeyStream will panic. It is acceptable
 // to pass a dst bigger than src, and in that case, XORKeyStream will
 // only update dst[:len(src)] and will not touch the rest of dst.
 //
 // Multiple calls to XORKeyStream behave as if the concatenation of
 // the src buffers was passed in a single run. That is, Cipher
 // maintains state and does not reset at each XORKeyStream call.
 //
 // XORKeyStream also panics if dst and src overlap inexactly, or if producing
 // the requested output would require the block counter to overflow.
 func (s *Cipher) XORKeyStream(dst, src []byte) {
 	if len(src) == 0 {
 		return
 	}
 	if len(dst) < len(src) {
 		panic("chacha20: output smaller than input")
 	}
 	dst = dst[:len(src)]
 	if subtle.InexactOverlap(dst, src) {
 		panic("chacha20: invalid buffer overlap")
 	}
 	// First, drain any remaining key stream from a previous XORKeyStream.
 	if s.len != 0 {
 		// The unused key stream occupies the last s.len bytes of s.buf.
 		keyStream := s.buf[bufSize-s.len:]
 		if len(src) < len(keyStream) {
 			keyStream = keyStream[:len(src)]
 		}
 		_ = src[len(keyStream)-1] // bounds check elimination hint
 		for i, b := range keyStream {
 			dst[i] = src[i] ^ b
 		}
 		s.len -= len(keyStream)
 		dst, src = dst[len(keyStream):], src[len(keyStream):]
 	}
 	if len(src) == 0 {
 		return
 	}
 	// If we'd need to let the counter overflow and keep generating output,
 	// panic immediately. If instead we'd only reach the last block, remember
 	// not to generate any more output after the buffer is drained.
 	numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
 	if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
 		panic("chacha20: counter overflow")
 	} else if uint64(s.counter)+numBlocks == 1<<32 {
 		s.overflow = true
 	}
 	// xorKeyStreamBlocks implementations expect input lengths that are a
 	// multiple of bufSize. Platform-specific ones process multiple blocks at a
 	// time, so have bufSizes that are a multiple of blockSize.
 	full := len(src) - len(src)%bufSize
 	if full > 0 {
 		s.xorKeyStreamBlocks(dst[:full], src[:full])
 	}
 	dst, src = dst[full:], src[full:]
 	// If using a multi-block xorKeyStreamBlocks would overflow, use the generic
 	// one that does one block at a time.
 	const blocksPerBuf = bufSize / blockSize
 	if uint64(s.counter)+blocksPerBuf > 1<<32 {
 		// Zero the buffer so that the padding bytes past the end of src XOR to
 		// plain key stream.
 		s.buf = [bufSize]byte{}
 		numBlocks := (len(src) + blockSize - 1) / blockSize
 		// Work at the tail of s.buf so leftovers end up where the drain step
 		// above expects them.
 		buf := s.buf[bufSize-numBlocks*blockSize:]
 		copy(buf, src)
 		s.xorKeyStreamBlocksGeneric(buf, buf)
 		// Copy out the requested bytes; whatever was not copied is leftover
 		// key stream for the next call.
 		s.len = len(buf) - copy(dst, buf)
 		return
 	}
 	// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
 	// keep the leftover keystream for the next XORKeyStream invocation.
 	if len(src) > 0 {
 		s.buf = [bufSize]byte{}
 		copy(s.buf[:], src)
 		s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
 		s.len = bufSize - copy(dst, s.buf[:])
 	}
 }
 // xorKeyStreamBlocksGeneric is the pure Go block routine. It walks src one
 // 64-byte block at a time, writes the corresponding output to dst, and
 // increments s.counter once per block. It panics unless dst and src have the
 // same length and that length is a multiple of blockSize.
 func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
 	if len(dst) != len(src) || len(dst)%blockSize != 0 {
 		panic("chacha20: internal error: wrong dst and/or src length")
 	}
 	// To generate each block of key stream, the initial cipher state
 	// (represented below) is passed through 20 rounds of shuffling,
 	// alternatively applying quarterRounds by columns (like 1, 5, 9, 13)
 	// or by diagonals (like 1, 6, 11, 12).
 	//
 	//      0:cccccccc   1:cccccccc   2:cccccccc   3:cccccccc
 	//      4:kkkkkkkk   5:kkkkkkkk   6:kkkkkkkk   7:kkkkkkkk
 	//      8:kkkkkkkk   9:kkkkkkkk  10:kkkkkkkk  11:kkkkkkkk
 	//     12:bbbbbbbb  13:nnnnnnnn  14:nnnnnnnn  15:nnnnnnnn
 	//
 	//            c=constant k=key b=blockcount n=nonce
 	//
 	// Word 12 (the block counter) changes per block, so it is read from
 	// s.counter inside the loop rather than captured here.
 	var (
 		c0, c1, c2, c3   = j0, j1, j2, j3
 		c4, c5, c6, c7   = s.key[0], s.key[1], s.key[2], s.key[3]
 		c8, c9, c10, c11 = s.key[4], s.key[5], s.key[6], s.key[7]
 		_, c13, c14, c15 = s.counter, s.nonce[0], s.nonce[1], s.nonce[2]
 	)
 	// Three quarters of the first round don't depend on the counter, so we can
 	// calculate them here, and reuse them for multiple blocks in the loop, and
 	// for future XORKeyStream invocations.
 	if !s.precompDone {
 		s.p1, s.p5, s.p9, s.p13 = quarterRound(c1, c5, c9, c13)
 		s.p2, s.p6, s.p10, s.p14 = quarterRound(c2, c6, c10, c14)
 		s.p3, s.p7, s.p11, s.p15 = quarterRound(c3, c7, c11, c15)
 		s.precompDone = true
 	}
 	// A condition of len(src) > 0 would be sufficient, but this also
 	// acts as a bounds check elimination hint.
 	for len(src) >= 64 && len(dst) >= 64 {
 		// The remainder of the first column round.
 		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
 		// The second round, which is a diagonal round.
 		x0, x5, x10, x15 := quarterRound(fcr0, s.p5, s.p10, s.p15)
 		x1, x6, x11, x12 := quarterRound(s.p1, s.p6, s.p11, fcr12)
 		x2, x7, x8, x13 := quarterRound(s.p2, s.p7, fcr8, s.p13)
 		x3, x4, x9, x14 := quarterRound(s.p3, fcr4, s.p9, s.p14)
 		// The remaining 18 rounds.
 		for i := 0; i < 9; i++ {
 			// Column round.
 			x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
 			x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
 			x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
 			x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
 			// Diagonal round.
 			x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
 			x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
 			x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
 		// Add back the initial state to generate the key stream, then
 		// XOR the key stream with the source and write out the result.
 		addXor(dst[0:4], src[0:4], x0, c0)
 		addXor(dst[4:8], src[4:8], x1, c1)
 		addXor(dst[8:12], src[8:12], x2, c2)
 		addXor(dst[12:16], src[12:16], x3, c3)
 		addXor(dst[16:20], src[16:20], x4, c4)
 		addXor(dst[20:24], src[20:24], x5, c5)
 		addXor(dst[24:28], src[24:28], x6, c6)
 		addXor(dst[28:32], src[28:32], x7, c7)
 		addXor(dst[32:36], src[32:36], x8, c8)
 		addXor(dst[36:40], src[36:40], x9, c9)
 		addXor(dst[40:44], src[40:44], x10, c10)
 		addXor(dst[44:48], src[44:48], x11, c11)
 		addXor(dst[48:52], src[48:52], x12, s.counter)
 		addXor(dst[52:56], src[52:56], x13, c13)
 		addXor(dst[56:60], src[56:60], x14, c14)
 		addXor(dst[60:64], src[60:64], x15, c15)
 		// Advance to the next block.
 		s.counter += 1
 		src, dst = src[blockSize:], dst[blockSize:]
 	}
 }
 // HChaCha20 derives a 32-byte key from a 32-byte key and a 16-byte nonce
 // using the ChaCha20 core. Any other key or nonce length yields an error.
 // It is used as part of the XChaCha20 construction.
 func HChaCha20(key, nonce []byte) ([]byte, error) {
 	// Keep this wrapper trivial: the output slice is allocated here, rather
 	// than in hChaCha20, so that the allocation can be inlined into the caller
 	// and, depending on how the result is used, need not escape to the heap.
 	return hChaCha20(make([]byte, 32), key, nonce)
 }
 // hChaCha20 is the implementation behind HChaCha20. It writes the derived key
 // into the first 32 bytes of the caller-supplied out and returns out; out must
 // be at least 32 bytes long. It returns an error, without touching out, if key
 // is not KeySize bytes or nonce is not 16 bytes.
 func hChaCha20(out, key, nonce []byte) ([]byte, error) {
 	if len(key) != KeySize {
 		return nil, errors.New("chacha20: wrong HChaCha20 key size")
 	}
 	if len(nonce) != 16 {
 		return nil, errors.New("chacha20: wrong HChaCha20 nonce size")
 	}
 	// Build the 16-word state: 4 constants, 8 key words, and the 16-byte nonce
 	// filling the last 4 words, all read as little-endian.
 	x0, x1, x2, x3 := j0, j1, j2, j3
 	x4 := binary.LittleEndian.Uint32(key[0:4])
 	x5 := binary.LittleEndian.Uint32(key[4:8])
 	x6 := binary.LittleEndian.Uint32(key[8:12])
 	x7 := binary.LittleEndian.Uint32(key[12:16])
 	x8 := binary.LittleEndian.Uint32(key[16:20])
 	x9 := binary.LittleEndian.Uint32(key[20:24])
 	x10 := binary.LittleEndian.Uint32(key[24:28])
 	x11 := binary.LittleEndian.Uint32(key[28:32])
 	x12 := binary.LittleEndian.Uint32(nonce[0:4])
 	x13 := binary.LittleEndian.Uint32(nonce[4:8])
 	x14 := binary.LittleEndian.Uint32(nonce[8:12])
 	x15 := binary.LittleEndian.Uint32(nonce[12:16])
 	// 10 iterations of a column round followed by a diagonal round: 20 rounds.
 	for i := 0; i < 10; i++ {
 		// Column round.
 		x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
 		x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
 		x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
 		x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
 		// Diagonal round.
 		x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
 		x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
 		x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
 		x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 	}
 	// The output is words 0-3 and 12-15 of the final state; the initial state
 	// is not added back here.
 	_ = out[31] // bounds check elimination hint
 	binary.LittleEndian.PutUint32(out[0:4], x0)
 	binary.LittleEndian.PutUint32(out[4:8], x1)
 	binary.LittleEndian.PutUint32(out[8:12], x2)
 	binary.LittleEndian.PutUint32(out[12:16], x3)
 	binary.LittleEndian.PutUint32(out[16:20], x12)
 	binary.LittleEndian.PutUint32(out[20:24], x13)
 	binary.LittleEndian.PutUint32(out[24:28], x14)
 	binary.LittleEndian.PutUint32(out[28:32], x15)
 	return out, nil
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_noasm.go
@ -2,15 +2,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo appengine
+// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo purego
 package chacha20
-const (
+const bufSize = blockSize
 	bufSize = 64
 	haveAsm = false
 )
-func (*Cipher) xorKeyStreamAsm(dst, src []byte) {
+func (s *Cipher) xorKeyStreamBlocks(dst, src []byte) {
-	panic("not implemented")
+	s.xorKeyStreamBlocksGeneric(dst, src)
 }
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
@ -0,0 +1,16 @@
 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !gccgo,!purego
 package chacha20
 // bufSize is the length in bytes of Cipher.buf. XORKeyStream only hands
 // xorKeyStreamBlocks inputs whose length is a multiple of bufSize.
 const bufSize = 256
 // chaCha20_ctr32_vsx is declared without a Go body; its implementation is
 // provided outside this file (presumably the package's ppc64le assembly —
 // confirm). xorKeyStreamBlocks calls it with pointers to the first bytes of
 // dst and src, the length of src, and pointers to the cipher's key and
 // counter.
 //go:noescape
 func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 // xorKeyStreamBlocks forwards to chaCha20_ctr32_vsx, passing pointers to the
 // first bytes of dst and src, the length of src, and pointers to this
 // cipher's key and counter. Both slices must be non-empty.
 //
 // NOTE(review): the nonce is not passed explicitly; presumably the callee
 // reads it from the memory that follows counter in Cipher — confirm.
 func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
 	out, inp := &dst[0], &src[0]
 	chaCha20_ctr32_vsx(out, inp, len(src), &c.key, &c.counter)
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/asm_ppc64le.s
+++ b/vendor/golang.org/x/crypto/internal/chacha20/asm_ppc64le.s
@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
@ -31,24 +31,7 @@
 #define TMP  R15
 #define CONSTBASE  R16
-
+#define BLOCKS R17
 #define X0   R11
 #define X1   R12
 #define X2   R14
 #define X3   R15
 #define X4   R16
 #define X5   R17
 #define X6   R18
 #define X7   R19
 #define X8   R20
 #define X9   R21
 #define X10  R22
 #define X11  R23
 #define X12  R24
 #define X13  R25
 #define X14  R26
 #define X15  R27
 DATA consts<>+0x00(SB)/8, $0x3320646e61707865
 DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
@ -72,13 +55,13 @@ DATA consts<>+0x90(SB)/8, $0x0000000100000000
 DATA consts<>+0x98(SB)/8, $0x0000000300000002
 GLOBL consts<>(SB), RODATA, $0xa0
-//func chaCha20_ctr32_vsx(out, inp []byte, len int, key *[32]byte, counter *[16]byte)
+//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	MOVD out+0(FP), OUT
 	MOVD inp+8(FP), INP
 	MOVD len+16(FP), LEN
 	MOVD key+24(FP), KEY
-	MOVD cnt+32(FP), CNT
+	MOVD counter+32(FP), CNT
 	// Addressing for constants
 	MOVD $consts<>+0x00(SB), CONSTBASE
@ -86,6 +69,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	MOVD $32, R9
 	MOVD $48, R10
 	MOVD $64, R11
 	SRD $6, LEN, BLOCKS
 	// V16
 	LXVW4X (CONSTBASE)(R0), VS48
 	ADD $80,CONSTBASE
@ -429,9 +413,9 @@ loop_vsx:
 	BNE  loop_outer_vsx
 done_vsx:
-	// Increment counter by 4
+	// Increment counter by number of 64 byte blocks
 	MOVD (CNT), R14
-	ADD  $4, R14
+	ADD  BLOCKS, R14
 	MOVD R14, (CNT)
 	RET
--- a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go
@ -0,0 +1,26 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !gccgo,!purego
 package chacha20
 import "golang.org/x/sys/cpu"
 var haveAsm = cpu.S390X.HasVX
 const bufSize = 256
 // xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
 // be called when the vector facility is available. Implementation in asm_s390x.s.
 //go:noescape
 func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 // xorKeyStreamBlocks uses the vector-facility assembly when the CPU reports
 // it and otherwise falls back to the generic Go implementation.
 func (c *Cipher) xorKeyStreamBlocks(dst, src []byte) {
 	if !cpu.S390X.HasVX {
 		// No vector facility: xorKeyStreamVX must not be called.
 		c.xorKeyStreamBlocksGeneric(dst, src)
 		return
 	}
 	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build s390x,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "go_asm.h"
 #include "textflag.h"
@ -24,15 +24,6 @@ DATA ·constants<>+0x14(SB)/4, $0x3320646e
 DATA ·constants<>+0x18(SB)/4, $0x79622d32
 DATA ·constants<>+0x1c(SB)/4, $0x6b206574
 // EXRL targets:
 // mvcSrcToBuf holds a single MVC that is run through EXRL (with R12 as the
 // EXRL register operand) rather than being called as a function.
 // NOTE(review): presumably moves the trailing source bytes addressed by R1
 // into the buffer addressed by R8 -- confirm MVC operand order.
 TEXT ·mvcSrcToBuf(SB), NOFRAME|NOSPLIT, $0
 	MVC $1, (R1), (R8)
 	RET
 // mvcBufToDst holds a single MVC that is run through EXRL (with R12 as the
 // EXRL register operand) rather than being called as a function.
 // NOTE(review): presumably moves bytes from the buffer addressed by R8 to
 // the destination addressed by R9 -- confirm MVC operand order.
 TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
 	MVC $1, (R8), (R9)
 	RET
 #define BSWAP V5
 #define J0    V6
 #define KEY0  V7
@ -144,7 +135,7 @@ TEXT ·mvcBufToDst(SB), NOFRAME|NOSPLIT, $0
 	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
 	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
-// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
+// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
 	MOVD $·constants<>(SB), R1
 	MOVD dst+0(FP), R2         // R2=&dst[0]
@ -152,25 +143,10 @@ TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
 	MOVD key+48(FP), R5        // R5=key
 	MOVD nonce+56(FP), R6      // R6=nonce
 	MOVD counter+64(FP), R7    // R7=counter
 	MOVD buf+72(FP), R8        // R8=buf
 	MOVD len+80(FP), R9        // R9=len
 	// load BSWAP and J0
 	VLM (R1), BSWAP, J0
 	// set up tail buffer
 	ADD     $-1, R4, R12
 	MOVBZ   R12, R12
 	CMPUBEQ R12, $255, aligned
 	MOVD    R4, R1
 	AND     $~255, R1
 	MOVD    $(R3)(R1*1), R1
 	EXRL    $·mvcSrcToBuf(SB), R12
 	MOVD    $255, R0
 	SUB     R12, R0
 	MOVD    R0, (R9)               // update len
 aligned:
 	// setup
 	MOVD  $95, R0
 	VLM   (R5), KEY0, KEY1
@ -217,9 +193,7 @@ loop:
 	// decrement length
 	ADD $-256, R4
 	BLT tail
 continue:
 	// rearrange vectors
 	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
 	ADDV(J0, X0, X1, X2, X3)
@ -245,16 +219,6 @@ continue:
 	MOVD $256(R3), R3
 	CMPBNE  R4, $0, chacha
 	CMPUBEQ R12, $255, return
 	EXRL    $·mvcBufToDst(SB), R12 // len was updated during setup
 return:
 	VSTEF $0, CTR, (R7)
 	RET
 tail:
 	MOVD R2, R9
 	MOVD R8, R2
 	MOVD R8, R3
 	MOVD $0, R4
 	JMP  continue
--- a/vendor/golang.org/x/crypto/internal/chacha20/xor.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/xor.go
@ -4,9 +4,7 @@
 package chacha20
-import (
+import "runtime"
 	"runtime"
 )
 // Platforms that have fast unaligned 32-bit little endian accesses.
 const unaligned = runtime.GOARCH == "386" ||
@ -15,10 +13,10 @@ const unaligned = runtime.GOARCH == "386" ||
 	runtime.GOARCH == "ppc64le" ||
 	runtime.GOARCH == "s390x"
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
 // places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
+func addXor(dst, src []byte, a, b uint32) {
-	_, _ = src[3], dst[3] // eliminate bounds checks
+	_, _ = src[3], dst[3] // bounds check elimination hint
 	if unaligned {
 		// The compiler should optimize this code into
 		// 32-bit unaligned little endian loads and stores.
@ -29,15 +27,16 @@ func xor(dst, src []byte, u uint32) {
 		v |= uint32(src[1]) << 8
 		v |= uint32(src[2]) << 16
 		v |= uint32(src[3]) << 24
-		v ^= u
+		v ^= a + b
 		dst[0] = byte(v)
 		dst[1] = byte(v >> 8)
 		dst[2] = byte(v >> 16)
 		dst[3] = byte(v >> 24)
 	} else {
-		dst[0] = src[0] ^ byte(u)
+		a += b
-		dst[1] = src[1] ^ byte(u>>8)
+		dst[0] = src[0] ^ byte(a)
-		dst[2] = src[2] ^ byte(u>>16)
+		dst[1] = src[1] ^ byte(a>>8)
-		dst[3] = src[3] ^ byte(u>>24)
+		dst[2] = src[2] ^ byte(a>>16)
 		dst[3] = src[3] ^ byte(a>>24)
 	}
 }
--- a/vendor/golang.org/x/crypto/curve25519/const_amd64.h
+++ b/vendor/golang.org/x/crypto/curve25519/const_amd64.h
@ -1,8 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 #define REDMASK51     0x0007FFFFFFFFFFFF
--- a/vendor/golang.org/x/crypto/curve25519/const_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/const_amd64.s
@ -1,20 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 // +build amd64,!gccgo,!appengine
 // These constants cannot be encoded in non-MOVQ immediates.
 // We access them directly from memory instead.
 DATA ·_121666_213(SB)/8, $996687872
 GLOBL ·_121666_213(SB), 8, $8
 DATA ·_2P0(SB)/8, $0xFFFFFFFFFFFDA
 GLOBL ·_2P0(SB), 8, $8
 DATA ·_2P1234(SB)/8, $0xFFFFFFFFFFFFE
 GLOBL ·_2P1234(SB), 8, $8
--- a/vendor/golang.org/x/crypto/curve25519/cswap_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/cswap_amd64.s
@ -1,65 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build amd64,!gccgo,!appengine
 // func cswap(inout *[4][5]uint64, v uint64)
 // cswap exchanges the first 80 bytes of inout with the following 80 bytes
 // when v == 1 and leaves the array unchanged when v == 0. The same
 // instructions run in both cases; only the mask value differs.
 TEXT ·cswap(SB),7,$0
 	// DI = address of the 160-byte inout array, SI = v.
 	MOVQ inout+0(FP),DI
 	MOVQ v+8(FP),SI
 	// Build the mask ^(v-1): all ones for v == 1, zero for v == 0.
 	SUBQ $1, SI
 	NOTQ SI
 	// Place the 64-bit mask in X15 and duplicate it into both quadwords.
 	MOVQ SI, X15
 	PSHUFD $0x44, X15, X15
 	// Load bytes 0..79 into the even-numbered registers.
 	MOVOU 0(DI), X0
 	MOVOU 16(DI), X2
 	MOVOU 32(DI), X4
 	MOVOU 48(DI), X6
 	MOVOU 64(DI), X8
 	// Load bytes 80..159 into the odd-numbered registers.
 	MOVOU 80(DI), X1
 	MOVOU 96(DI), X3
 	MOVOU 112(DI), X5
 	MOVOU 128(DI), X7
 	MOVOU 144(DI), X9
 	// X10..X14 = second half, about to become (first half XOR second half).
 	MOVO X1, X10
 	MOVO X3, X11
 	MOVO X5, X12
 	MOVO X7, X13
 	MOVO X9, X14
 	PXOR X0, X10
 	PXOR X2, X11
 	PXOR X4, X12
 	PXOR X6, X13
 	PXOR X8, X14
 	// Keep the difference only when the mask is all ones.
 	PAND X15, X10
 	PAND X15, X11
 	PAND X15, X12
 	PAND X15, X13
 	PAND X15, X14
 	// XORing the masked difference into both halves swaps them (or is a
 	// no-op when the difference was masked to zero).
 	PXOR X10, X0
 	PXOR X10, X1
 	PXOR X11, X2
 	PXOR X11, X3
 	PXOR X12, X4
 	PXOR X12, X5
 	PXOR X13, X6
 	PXOR X13, X7
 	PXOR X14, X8
 	PXOR X14, X9
 	// Store both halves back to their original offsets.
 	MOVOU X0, 0(DI)
 	MOVOU X2, 16(DI)
 	MOVOU X4, 32(DI)
 	MOVOU X6, 48(DI)
 	MOVOU X8, 64(DI)
 	MOVOU X1, 80(DI)
 	MOVOU X3, 96(DI)
 	MOVOU X5, 112(DI)
 	MOVOU X7, 128(DI)
 	MOVOU X9, 144(DI)
 	RET
--- a/vendor/golang.org/x/crypto/curve25519/curve25519.go
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519.go
@ -1,834 +1,95 @@
-// Copyright 2013 The Go Authors. All rights reserved.
+// Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// We have an implementation in amd64 assembly so this code is only run on
+// Package curve25519 provides an implementation of the X25519 function, which
-// non-amd64 platforms. The amd64 assembly does not support gccgo.
+// performs scalar multiplication on the elliptic curve known as Curve25519.
-// +build !amd64 gccgo appengine
+// See RFC 7748.
-
+package curve25519 // import "golang.org/x/crypto/curve25519"
 package curve25519
 import (
-	"encoding/binary"
+	"crypto/subtle"
 	"fmt"
 )
-// This code is a port of the public domain, "ref10" implementation of
+// ScalarMult sets dst to the product scalar * point.
-// curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
+//
 // Deprecated: when provided a low-order point, ScalarMult will set dst to all
 // zeroes, irrespective of the scalar. Instead, use the X25519 function, which
 // will return an error.
 func ScalarMult(dst, scalar, point *[32]byte) {
 	// Thin exported wrapper: all work happens in the unexported scalarMult.
 	scalarMult(dst, scalar, point)
 }
-// fieldElement represents an element of the field GF(2^255 - 19). An element
+// ScalarBaseMult sets dst to the product scalar * base where base is the
-// t, entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
+// standard generator.
-// t[3]+2^102 t[4]+...+2^230 t[9]. Bounds on each t[i] vary depending on
+//
-// context.
+// It is recommended to use the X25519 function with Basepoint instead, as
-type fieldElement [10]int32
+// copying into fixed size arrays can lead to unexpected bugs.
 func ScalarBaseMult(dst, scalar *[32]byte) {
 	// basePoint is the package-level 32-byte array holding 9 followed by
 	// zeros; multiply the scalar by that fixed point.
 	ScalarMult(dst, scalar, &basePoint)
 }
-func feZero(fe *fieldElement) {
+const (
-	for i := range fe {
+	// ScalarSize is the size of the scalar input to X25519.
-		fe[i] = 0
+	ScalarSize = 32
 	// PointSize is the size of the point input to X25519.
 	PointSize = 32
 )
 // Basepoint is the canonical Curve25519 generator.
 var Basepoint []byte
 var basePoint = [32]byte{9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 func init() { Basepoint = basePoint[:] }
 // checkBasepoint panics if the exported Basepoint slice no longer holds the
 // expected 32 bytes (a single 0x09 followed by 31 zero bytes).
 func checkBasepoint() {
 	// Only element 0 is set explicitly; the remaining 31 bytes are zero.
 	want := [32]byte{0x09}
 	if subtle.ConstantTimeCompare(Basepoint, want[:]) == 1 {
 		return
 	}
 	panic("curve25519: global Basepoint value was modified")
 }
-func feOne(fe *fieldElement) {
+// X25519 returns the result of the scalar multiplication (scalar * point),
-	feZero(fe)
+// according to RFC 7748, Section 5. scalar, point and the return value are
-	fe[0] = 1
+// slices of 32 bytes.
 //
 // scalar can be generated at random, for example with crypto/rand. point should
 // be either Basepoint or the output of another X25519 call.
 //
 // If point is Basepoint (but not if it's a different slice with the same
 // contents) a precomputed implementation might be used for performance.
 func X25519(scalar, point []byte) ([]byte, error) {
 	// The real work lives in x25519 so that this wrapper stays small enough
 	// to be inlined into the caller; the result buffer declared here can
 	// then possibly stay off the heap.
 	var dst [32]byte
 	// x25519 checks the input lengths and returns dst[:] on success.
 	return x25519(&dst, scalar, point)
 }
-func feAdd(dst, a, b *fieldElement) {
+func x25519(dst *[32]byte, scalar, point []byte) ([]byte, error) {
-	for i := range dst {
+	var in [32]byte
-		dst[i] = a[i] + b[i]
+	if l := len(scalar); l != 32 {
 		return nil, fmt.Errorf("bad scalar length: %d, expected %d", l, 32)
 	}
-}
+	if l := len(point); l != 32 {
-
+		return nil, fmt.Errorf("bad point length: %d, expected %d", l, 32)
-func feSub(dst, a, b *fieldElement) {
+	}
-	for i := range dst {
+	copy(in[:], scalar)
-		dst[i] = a[i] - b[i]
+	if &point[0] == &Basepoint[0] {
-	}
+		checkBasepoint()
-}
+		ScalarBaseMult(dst, &in)
-
+	} else {
-func feCopy(dst, src *fieldElement) {
+		var base, zero [32]byte
-	for i := range dst {
+		copy(base[:], point)
-		dst[i] = src[i]
+		ScalarMult(dst, &in, &base)
-	}
+		if subtle.ConstantTimeCompare(dst[:], zero[:]) == 1 {
-}
+			return nil, fmt.Errorf("bad input point: low order point")
-
+		}
-// feCSwap replaces (f,g) with (g,f) if b == 1; replaces (f,g) with (f,g) if b == 0.
+	}
-//
+	return dst[:], nil
 // Preconditions: b in {0,1}.
 func feCSwap(f, g *fieldElement, b int32) {
 	// -1 (all bits set) when b == 1, 0 when b == 0.
 	mask := -b
 	for i := 0; i < len(f); i++ {
 		// delta is f[i]^g[i] when swapping and 0 otherwise, so the two
 		// XORs below either exchange the limbs or leave them untouched.
 		delta := mask & (f[i] ^ g[i])
 		f[i] ^= delta
 		g[i] ^= delta
 	}
 }
 // load3 decodes the first three bytes of in as a little-endian 24-bit
 // unsigned value and returns it as an int64.
 func load3(in []byte) int64 {
 	return int64(in[0]) | int64(in[1])<<8 | int64(in[2])<<16
 }
 // load4 decodes the first four bytes of in as a little-endian 32-bit
 // unsigned value and widens it to int64 (the result is never negative).
 func load4(in []byte) int64 {
 	word := binary.LittleEndian.Uint32(in)
 	return int64(word)
 }
 // feFromBytes unpacks the 32-byte little-endian value in src into the ten
 // limbs of dst. The top bit of src[31] is masked off.
 func feFromBytes(dst *fieldElement, src *[32]byte) {
 	// Raw limbs, each shifted so that it lines up with its limb boundary.
 	limbs := [10]int64{
 		load4(src[:]),
 		load3(src[4:]) << 6,
 		load3(src[7:]) << 5,
 		load3(src[10:]) << 3,
 		load3(src[13:]) << 2,
 		load4(src[16:]),
 		load3(src[20:]) << 7,
 		load3(src[23:]) << 5,
 		load3(src[26:]) << 4,
 		(load3(src[29:]) & 0x7fffff) << 2,
 	}
 	// The carry out of the top limb wraps around into limb 0 times 19.
 	wrap := (limbs[9] + (1 << 24)) >> 25
 	limbs[0] += wrap * 19
 	limbs[9] -= wrap << 25
 	// Remaining odd limbs (1, 3, 5, 7) are reduced to 25 bits.
 	for i := 1; i < 9; i += 2 {
 		c := (limbs[i] + (1 << 24)) >> 25
 		limbs[i+1] += c
 		limbs[i] -= c << 25
 	}
 	// Even limbs (0, 2, 4, 6, 8) are reduced to 26 bits.
 	for i := 0; i < 9; i += 2 {
 		c := (limbs[i] + (1 << 25)) >> 26
 		limbs[i+1] += c
 		limbs[i] -= c << 26
 	}
 	for i, v := range limbs {
 		dst[i] = int32(v)
 	}
 }
 // feToBytes marshals h to s.
 // Preconditions:
 //   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 //
 // Write p=2^255-19; q=floor(h/p).
 // Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
 //
 // Proof:
 //   Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
 //   Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
 //
 //   Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
 //   Then 0<y<1.
 //
 //   Write r=h-pq.
 //   Have 0<=r<=p-1=2^255-20.
 //   Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
 //
 //   Write x=r+19(2^-255)r+y.
 //   Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
 //
 //   Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
 //   so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
 func feToBytes(s *[32]byte, h *fieldElement) {
 	// Even-indexed limbs are 26 bits wide, odd-indexed limbs 25 bits wide.
 	// Compute q by rippling the rounding term through every limb.
 	q := (19*h[9] + (1 << 24)) >> 25
 	for i := 0; i < 10; i++ {
 		q = (h[i] + q) >> uint(26-(i&1))
 	}
 	// Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
 	h[0] += 19 * q
 	// Goal: Output h-2^255 q, which is between 0 and 2^255-20.
 	// Note that h itself is normalized in place by the carries below.
 	for i := 0; i < 9; i++ {
 		width := uint(26 - (i & 1))
 		c := h[i] >> width
 		h[i+1] += c
 		h[i] -= c << width
 	}
 	// The carry out of the top limb is dropped (it cancels 2^255 q).
 	top := h[9] >> 25
 	h[9] -= top << 25
 	// Pack h[0]+...+2^230 h[9] into 32 little-endian bytes.
 	s[0] = byte(h[0] >> 0)
 	s[1] = byte(h[0] >> 8)
 	s[2] = byte(h[0] >> 16)
 	s[3] = byte((h[0] >> 24) | (h[1] << 2))
 	s[4] = byte(h[1] >> 6)
 	s[5] = byte(h[1] >> 14)
 	s[6] = byte((h[1] >> 22) | (h[2] << 3))
 	s[7] = byte(h[2] >> 5)
 	s[8] = byte(h[2] >> 13)
 	s[9] = byte((h[2] >> 21) | (h[3] << 5))
 	s[10] = byte(h[3] >> 3)
 	s[11] = byte(h[3] >> 11)
 	s[12] = byte((h[3] >> 19) | (h[4] << 6))
 	s[13] = byte(h[4] >> 2)
 	s[14] = byte(h[4] >> 10)
 	s[15] = byte(h[4] >> 18)
 	s[16] = byte(h[5] >> 0)
 	s[17] = byte(h[5] >> 8)
 	s[18] = byte(h[5] >> 16)
 	s[19] = byte((h[5] >> 24) | (h[6] << 1))
 	s[20] = byte(h[6] >> 7)
 	s[21] = byte(h[6] >> 15)
 	s[22] = byte((h[6] >> 23) | (h[7] << 3))
 	s[23] = byte(h[7] >> 5)
 	s[24] = byte(h[7] >> 13)
 	s[25] = byte((h[7] >> 21) | (h[8] << 4))
 	s[26] = byte(h[8] >> 4)
 	s[27] = byte(h[8] >> 12)
 	s[28] = byte((h[8] >> 20) | (h[9] << 6))
 	s[29] = byte(h[9] >> 2)
 	s[30] = byte(h[9] >> 10)
 	s[31] = byte(h[9] >> 18)
 }
 // feMul calculates h = f * g
 // Can overlap h with f or g.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //    |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 //
 // Notes on implementation strategy:
 //
 // Using schoolbook multiplication.
 // Karatsuba would save a little in some cost models.
 //
 // Most multiplications by 2 and 19 are 32-bit precomputations;
 // cheaper than 64-bit postcomputations.
 //
 // There is one remaining multiplication by 19 in the carry chain;
 // one *19 precomputation can be merged into this,
 // but the resulting data flow is considerably less clean.
 //
 // There are 12 carries below.
 // 10 of them are 2-way parallelizable and vectorizable.
 // Can get away with 11 carries, but then data flow is much deeper.
 //
 // With tighter constraints on inputs can squeeze carries into int32.
 func feMul(h, f, g *fieldElement) {
 	// Copy every limb into a local first; h is only written at the very
 	// end, which is why h may alias f or g.
 	f0 := f[0]
 	f1 := f[1]
 	f2 := f[2]
 	f3 := f[3]
 	f4 := f[4]
 	f5 := f[5]
 	f6 := f[6]
 	f7 := f[7]
 	f8 := f[8]
 	f9 := f[9]
 	g0 := g[0]
 	g1 := g[1]
 	g2 := g[2]
 	g3 := g[3]
 	g4 := g[4]
 	g5 := g[5]
 	g6 := g[6]
 	g7 := g[7]
 	g8 := g[8]
 	g9 := g[9]
 	// 32-bit precomputations: g limbs times 19 and odd f limbs times 2.
 	g1_19 := 19 * g1 // 1.4*2^29
 	g2_19 := 19 * g2 // 1.4*2^30; still ok
 	g3_19 := 19 * g3
 	g4_19 := 19 * g4
 	g5_19 := 19 * g5
 	g6_19 := 19 * g6
 	g7_19 := 19 * g7
 	g8_19 := 19 * g8
 	g9_19 := 19 * g9
 	f1_2 := 2 * f1
 	f3_2 := 2 * f3
 	f5_2 := 2 * f5
 	f7_2 := 2 * f7
 	f9_2 := 2 * f9
 	// All 100 partial products, computed in 64 bits. The suffix of each
 	// name records the extra factor folded in: _2, _19 or _38.
 	f0g0 := int64(f0) * int64(g0)
 	f0g1 := int64(f0) * int64(g1)
 	f0g2 := int64(f0) * int64(g2)
 	f0g3 := int64(f0) * int64(g3)
 	f0g4 := int64(f0) * int64(g4)
 	f0g5 := int64(f0) * int64(g5)
 	f0g6 := int64(f0) * int64(g6)
 	f0g7 := int64(f0) * int64(g7)
 	f0g8 := int64(f0) * int64(g8)
 	f0g9 := int64(f0) * int64(g9)
 	f1g0 := int64(f1) * int64(g0)
 	f1g1_2 := int64(f1_2) * int64(g1)
 	f1g2 := int64(f1) * int64(g2)
 	f1g3_2 := int64(f1_2) * int64(g3)
 	f1g4 := int64(f1) * int64(g4)
 	f1g5_2 := int64(f1_2) * int64(g5)
 	f1g6 := int64(f1) * int64(g6)
 	f1g7_2 := int64(f1_2) * int64(g7)
 	f1g8 := int64(f1) * int64(g8)
 	f1g9_38 := int64(f1_2) * int64(g9_19)
 	f2g0 := int64(f2) * int64(g0)
 	f2g1 := int64(f2) * int64(g1)
 	f2g2 := int64(f2) * int64(g2)
 	f2g3 := int64(f2) * int64(g3)
 	f2g4 := int64(f2) * int64(g4)
 	f2g5 := int64(f2) * int64(g5)
 	f2g6 := int64(f2) * int64(g6)
 	f2g7 := int64(f2) * int64(g7)
 	f2g8_19 := int64(f2) * int64(g8_19)
 	f2g9_19 := int64(f2) * int64(g9_19)
 	f3g0 := int64(f3) * int64(g0)
 	f3g1_2 := int64(f3_2) * int64(g1)
 	f3g2 := int64(f3) * int64(g2)
 	f3g3_2 := int64(f3_2) * int64(g3)
 	f3g4 := int64(f3) * int64(g4)
 	f3g5_2 := int64(f3_2) * int64(g5)
 	f3g6 := int64(f3) * int64(g6)
 	f3g7_38 := int64(f3_2) * int64(g7_19)
 	f3g8_19 := int64(f3) * int64(g8_19)
 	f3g9_38 := int64(f3_2) * int64(g9_19)
 	f4g0 := int64(f4) * int64(g0)
 	f4g1 := int64(f4) * int64(g1)
 	f4g2 := int64(f4) * int64(g2)
 	f4g3 := int64(f4) * int64(g3)
 	f4g4 := int64(f4) * int64(g4)
 	f4g5 := int64(f4) * int64(g5)
 	f4g6_19 := int64(f4) * int64(g6_19)
 	f4g7_19 := int64(f4) * int64(g7_19)
 	f4g8_19 := int64(f4) * int64(g8_19)
 	f4g9_19 := int64(f4) * int64(g9_19)
 	f5g0 := int64(f5) * int64(g0)
 	f5g1_2 := int64(f5_2) * int64(g1)
 	f5g2 := int64(f5) * int64(g2)
 	f5g3_2 := int64(f5_2) * int64(g3)
 	f5g4 := int64(f5) * int64(g4)
 	f5g5_38 := int64(f5_2) * int64(g5_19)
 	f5g6_19 := int64(f5) * int64(g6_19)
 	f5g7_38 := int64(f5_2) * int64(g7_19)
 	f5g8_19 := int64(f5) * int64(g8_19)
 	f5g9_38 := int64(f5_2) * int64(g9_19)
 	f6g0 := int64(f6) * int64(g0)
 	f6g1 := int64(f6) * int64(g1)
 	f6g2 := int64(f6) * int64(g2)
 	f6g3 := int64(f6) * int64(g3)
 	f6g4_19 := int64(f6) * int64(g4_19)
 	f6g5_19 := int64(f6) * int64(g5_19)
 	f6g6_19 := int64(f6) * int64(g6_19)
 	f6g7_19 := int64(f6) * int64(g7_19)
 	f6g8_19 := int64(f6) * int64(g8_19)
 	f6g9_19 := int64(f6) * int64(g9_19)
 	f7g0 := int64(f7) * int64(g0)
 	f7g1_2 := int64(f7_2) * int64(g1)
 	f7g2 := int64(f7) * int64(g2)
 	f7g3_38 := int64(f7_2) * int64(g3_19)
 	f7g4_19 := int64(f7) * int64(g4_19)
 	f7g5_38 := int64(f7_2) * int64(g5_19)
 	f7g6_19 := int64(f7) * int64(g6_19)
 	f7g7_38 := int64(f7_2) * int64(g7_19)
 	f7g8_19 := int64(f7) * int64(g8_19)
 	f7g9_38 := int64(f7_2) * int64(g9_19)
 	f8g0 := int64(f8) * int64(g0)
 	f8g1 := int64(f8) * int64(g1)
 	f8g2_19 := int64(f8) * int64(g2_19)
 	f8g3_19 := int64(f8) * int64(g3_19)
 	f8g4_19 := int64(f8) * int64(g4_19)
 	f8g5_19 := int64(f8) * int64(g5_19)
 	f8g6_19 := int64(f8) * int64(g6_19)
 	f8g7_19 := int64(f8) * int64(g7_19)
 	f8g8_19 := int64(f8) * int64(g8_19)
 	f8g9_19 := int64(f8) * int64(g9_19)
 	f9g0 := int64(f9) * int64(g0)
 	f9g1_38 := int64(f9_2) * int64(g1_19)
 	f9g2_19 := int64(f9) * int64(g2_19)
 	f9g3_38 := int64(f9_2) * int64(g3_19)
 	f9g4_19 := int64(f9) * int64(g4_19)
 	f9g5_38 := int64(f9_2) * int64(g5_19)
 	f9g6_19 := int64(f9) * int64(g6_19)
 	f9g7_38 := int64(f9_2) * int64(g7_19)
 	f9g8_19 := int64(f9) * int64(g8_19)
 	f9g9_38 := int64(f9_2) * int64(g9_19)
 	// Output limb k sums the products f_i*g_j with i+j == k or i+j == k+10;
 	// the wrapped terms (i+j >= 10) are the ones carrying the factor 19.
 	h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
 	h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
 	h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
 	h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
 	h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
 	h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
 	h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
 	h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
 	h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
 	h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
 	// Carry chain: even limbs keep 26 bits, odd limbs keep 25 bits, and the
 	// carry out of h9 re-enters h0 multiplied by 19.
 	var carry [10]int64
 	// |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
 	//   i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
 	// |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
 	//   i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	// |h0| <= 2^25
 	// |h4| <= 2^25
 	// |h1| <= 1.51*2^58
 	// |h5| <= 1.51*2^58
 	carry[1] = (h1 + (1 << 24)) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[5] = (h5 + (1 << 24)) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	// |h1| <= 2^24; from now on fits into int32
 	// |h5| <= 2^24; from now on fits into int32
 	// |h2| <= 1.21*2^59
 	// |h6| <= 1.21*2^59
 	carry[2] = (h2 + (1 << 25)) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[6] = (h6 + (1 << 25)) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	// |h2| <= 2^25; from now on fits into int32 unchanged
 	// |h6| <= 2^25; from now on fits into int32 unchanged
 	// |h3| <= 1.51*2^58
 	// |h7| <= 1.51*2^58
 	carry[3] = (h3 + (1 << 24)) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[7] = (h7 + (1 << 24)) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	// |h3| <= 2^24; from now on fits into int32 unchanged
 	// |h7| <= 2^24; from now on fits into int32 unchanged
 	// |h4| <= 1.52*2^33
 	// |h8| <= 1.52*2^33
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[8] = (h8 + (1 << 25)) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	// |h4| <= 2^25; from now on fits into int32 unchanged
 	// |h8| <= 2^25; from now on fits into int32 unchanged
 	// |h5| <= 1.01*2^24
 	// |h9| <= 1.51*2^58
 	carry[9] = (h9 + (1 << 24)) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	// |h9| <= 2^24; from now on fits into int32 unchanged
 	// |h0| <= 1.8*2^37
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	// |h0| <= 2^25; from now on fits into int32 unchanged
 	// |h1| <= 1.01*2^24
 	// Narrow the reduced limbs back to int32 and store the result.
 	h[0] = int32(h0)
 	h[1] = int32(h1)
 	h[2] = int32(h2)
 	h[3] = int32(h3)
 	h[4] = int32(h4)
 	h[5] = int32(h5)
 	h[6] = int32(h6)
 	h[7] = int32(h7)
 	h[8] = int32(h8)
 	h[9] = int32(h9)
 }
 // feSquare calculates h = f*f. Can overlap h with f.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 func feSquare(h, f *fieldElement) {
 	// Copy the limbs into locals; h is only written at the end, so h may
 	// alias f.
 	f0 := f[0]
 	f1 := f[1]
 	f2 := f[2]
 	f3 := f[3]
 	f4 := f[4]
 	f5 := f[5]
 	f6 := f[6]
 	f7 := f[7]
 	f8 := f[8]
 	f9 := f[9]
 	// 32-bit precomputations of the doubled and 19x/38x scaled limbs.
 	f0_2 := 2 * f0
 	f1_2 := 2 * f1
 	f2_2 := 2 * f2
 	f3_2 := 2 * f3
 	f4_2 := 2 * f4
 	f5_2 := 2 * f5
 	f6_2 := 2 * f6
 	f7_2 := 2 * f7
 	f5_38 := 38 * f5 // 1.31*2^30
 	f6_19 := 19 * f6 // 1.31*2^30
 	f7_38 := 38 * f7 // 1.31*2^30
 	f8_19 := 19 * f8 // 1.31*2^30
 	f9_38 := 38 * f9 // 1.31*2^30
 	// Partial products f_i*f_j for i <= j only; the name suffix records the
 	// total constant factor folded into each product.
 	f0f0 := int64(f0) * int64(f0)
 	f0f1_2 := int64(f0_2) * int64(f1)
 	f0f2_2 := int64(f0_2) * int64(f2)
 	f0f3_2 := int64(f0_2) * int64(f3)
 	f0f4_2 := int64(f0_2) * int64(f4)
 	f0f5_2 := int64(f0_2) * int64(f5)
 	f0f6_2 := int64(f0_2) * int64(f6)
 	f0f7_2 := int64(f0_2) * int64(f7)
 	f0f8_2 := int64(f0_2) * int64(f8)
 	f0f9_2 := int64(f0_2) * int64(f9)
 	f1f1_2 := int64(f1_2) * int64(f1)
 	f1f2_2 := int64(f1_2) * int64(f2)
 	f1f3_4 := int64(f1_2) * int64(f3_2)
 	f1f4_2 := int64(f1_2) * int64(f4)
 	f1f5_4 := int64(f1_2) * int64(f5_2)
 	f1f6_2 := int64(f1_2) * int64(f6)
 	f1f7_4 := int64(f1_2) * int64(f7_2)
 	f1f8_2 := int64(f1_2) * int64(f8)
 	f1f9_76 := int64(f1_2) * int64(f9_38)
 	f2f2 := int64(f2) * int64(f2)
 	f2f3_2 := int64(f2_2) * int64(f3)
 	f2f4_2 := int64(f2_2) * int64(f4)
 	f2f5_2 := int64(f2_2) * int64(f5)
 	f2f6_2 := int64(f2_2) * int64(f6)
 	f2f7_2 := int64(f2_2) * int64(f7)
 	f2f8_38 := int64(f2_2) * int64(f8_19)
 	f2f9_38 := int64(f2) * int64(f9_38)
 	f3f3_2 := int64(f3_2) * int64(f3)
 	f3f4_2 := int64(f3_2) * int64(f4)
 	f3f5_4 := int64(f3_2) * int64(f5_2)
 	f3f6_2 := int64(f3_2) * int64(f6)
 	f3f7_76 := int64(f3_2) * int64(f7_38)
 	f3f8_38 := int64(f3_2) * int64(f8_19)
 	f3f9_76 := int64(f3_2) * int64(f9_38)
 	f4f4 := int64(f4) * int64(f4)
 	f4f5_2 := int64(f4_2) * int64(f5)
 	f4f6_38 := int64(f4_2) * int64(f6_19)
 	f4f7_38 := int64(f4) * int64(f7_38)
 	f4f8_38 := int64(f4_2) * int64(f8_19)
 	f4f9_38 := int64(f4) * int64(f9_38)
 	f5f5_38 := int64(f5) * int64(f5_38)
 	f5f6_38 := int64(f5_2) * int64(f6_19)
 	f5f7_76 := int64(f5_2) * int64(f7_38)
 	f5f8_38 := int64(f5_2) * int64(f8_19)
 	f5f9_76 := int64(f5_2) * int64(f9_38)
 	f6f6_19 := int64(f6) * int64(f6_19)
 	f6f7_38 := int64(f6) * int64(f7_38)
 	f6f8_38 := int64(f6_2) * int64(f8_19)
 	f6f9_38 := int64(f6) * int64(f9_38)
 	f7f7_38 := int64(f7) * int64(f7_38)
 	f7f8_38 := int64(f7_2) * int64(f8_19)
 	f7f9_76 := int64(f7_2) * int64(f9_38)
 	f8f8_19 := int64(f8) * int64(f8_19)
 	f8f9_38 := int64(f8) * int64(f9_38)
 	f9f9_38 := int64(f9) * int64(f9_38)
 	// Output limb k sums the products whose indices add up to k or k+10.
 	h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
 	h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
 	h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
 	h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
 	h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
 	h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
 	h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
 	h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
 	h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
 	h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
 	// Carry chain: even limbs keep 26 bits, odd limbs keep 25 bits, and the
 	// carry out of h9 re-enters h0 multiplied by 19.
 	var carry [10]int64
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[1] = (h1 + (1 << 24)) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[5] = (h5 + (1 << 24)) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	carry[2] = (h2 + (1 << 25)) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[6] = (h6 + (1 << 25)) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	carry[3] = (h3 + (1 << 24)) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[7] = (h7 + (1 << 24)) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[8] = (h8 + (1 << 25)) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	carry[9] = (h9 + (1 << 24)) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	// Narrow the reduced limbs back to int32 and store the result.
 	h[0] = int32(h0)
 	h[1] = int32(h1)
 	h[2] = int32(h2)
 	h[3] = int32(h3)
 	h[4] = int32(h4)
 	h[5] = int32(h5)
 	h[6] = int32(h6)
 	h[7] = int32(h7)
 	h[8] = int32(h8)
 	h[9] = int32(h9)
 }
 // feMul121666 calculates h = f * 121666. Can overlap h with f.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 func feMul121666(h, f *fieldElement) {
 	// Scale every limb in 64 bits; f is fully read before h is written, so
 	// the two may alias.
 	var t [10]int64
 	for i := range t {
 		t[i] = int64(f[i]) * 121666
 	}
 	// Odd limbs first. The carry out of limb 9 wraps into limb 0 times 19.
 	wrap := (t[9] + (1 << 24)) >> 25
 	t[0] += wrap * 19
 	t[9] -= wrap << 25
 	for i := 1; i < 9; i += 2 {
 		c := (t[i] + (1 << 24)) >> 25
 		t[i+1] += c
 		t[i] -= c << 25
 	}
 	// Then the even limbs, each reduced to 26 bits.
 	for i := 0; i < 9; i += 2 {
 		c := (t[i] + (1 << 25)) >> 26
 		t[i+1] += c
 		t[i] -= c << 26
 	}
 	for i := range h {
 		h[i] = int32(t[i])
 	}
 }
 // feInvert sets out = z^-1, computed with a fixed sequence of squarings
 // and multiplications that does not depend on the value of z.
 func feInvert(out, z *fieldElement) {
 	var t0, t1, t2, t3 fieldElement
 	// sq stores src squared n times in a row (n >= 1) into dst.
 	sq := func(dst, src *fieldElement, n int) {
 		feSquare(dst, src)
 		for k := 1; k < n; k++ {
 			feSquare(dst, dst)
 		}
 	}
 	sq(&t0, z, 1)
 	sq(&t1, &t0, 2)
 	feMul(&t1, z, &t1)
 	feMul(&t0, &t0, &t1)
 	sq(&t2, &t0, 1)
 	feMul(&t1, &t1, &t2)
 	sq(&t2, &t1, 5)
 	feMul(&t1, &t2, &t1)
 	sq(&t2, &t1, 10)
 	feMul(&t2, &t2, &t1)
 	sq(&t3, &t2, 20)
 	feMul(&t2, &t3, &t2)
 	sq(&t2, &t2, 10)
 	feMul(&t1, &t2, &t1)
 	sq(&t2, &t1, 50)
 	feMul(&t2, &t2, &t1)
 	sq(&t3, &t2, 100)
 	feMul(&t2, &t3, &t2)
 	sq(&t2, &t2, 50)
 	feMul(&t1, &t2, &t1)
 	sq(&t1, &t1, 5)
 	feMul(out, &t1, &t0)
 }
 // scalarMult writes to out the 32-byte encoding of x2/z2 obtained after
 // walking the bits of the scalar in (bit 254 down to bit 0) against the
 // point encoded in base. in itself is not modified; a masked copy is used.
 func scalarMult(out, in, base *[32]byte) {
 	var e [32]byte
 	copy(e[:], in[:])
 	// Mask the scalar copy: clear the low three bits, clear the top bit and
 	// set bit 254.
 	e[0] &= 248
 	e[31] &= 127
 	e[31] |= 64
 	// Initial state: (x2, z2) = (1, 0) since z2 keeps its zero value, and
 	// (x3, z3) = (x1, 1).
 	var x1, x2, z2, x3, z3, tmp0, tmp1 fieldElement
 	feFromBytes(&x1, base)
 	feOne(&x2)
 	feCopy(&x3, &x1)
 	feOne(&z3)
 	swap := int32(0)
 	for pos := 254; pos >= 0; pos-- {
 		// b is bit pos of the masked scalar.
 		b := e[pos/8] >> uint(pos&7)
 		b &= 1
 		// Swap the two working points only when this bit differs from the
 		// previous one; feCSwap does so without branching on the bit.
 		swap ^= int32(b)
 		feCSwap(&x2, &x3, swap)
 		feCSwap(&z2, &z3, swap)
 		swap = int32(b)
 		// One step on (x2:z2), (x3:z3); the statement order matters because
 		// several operands are overwritten in place.
 		feSub(&tmp0, &x3, &z3)
 		feSub(&tmp1, &x2, &z2)
 		feAdd(&x2, &x2, &z2)
 		feAdd(&z2, &x3, &z3)
 		feMul(&z3, &tmp0, &x2)
 		feMul(&z2, &z2, &tmp1)
 		feSquare(&tmp0, &tmp1)
 		feSquare(&tmp1, &x2)
 		feAdd(&x3, &z3, &z2)
 		feSub(&z2, &z3, &z2)
 		feMul(&x2, &tmp1, &tmp0)
 		feSub(&tmp1, &tmp1, &tmp0)
 		feSquare(&z2, &z2)
 		feMul121666(&z3, &tmp1)
 		feSquare(&x3, &x3)
 		feAdd(&tmp0, &tmp0, &z3)
 		feMul(&z3, &x1, &z2)
 		feMul(&z2, &tmp1, &tmp0)
 	}
 	// Apply the swap left pending by the last iteration.
 	feCSwap(&x2, &x3, swap)
 	feCSwap(&z2, &z3, swap)
 	// Result is x2 * z2^-1, serialized to bytes.
 	feInvert(&z2, &z2)
 	feMul(&x2, &x2, &z2)
 	feToBytes(out, &x2)
 }
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_amd64.go
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519_amd64.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+// +build amd64,!gccgo,!appengine,!purego
 package curve25519
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519_amd64.s
@ -5,9 +5,84 @@
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
-// +build amd64,!gccgo,!appengine
+// +build amd64,!gccgo,!appengine,!purego
-#include "const_amd64.h"
+#define REDMASK51     0x0007FFFFFFFFFFFF
 // These constants cannot be encoded in non-MOVQ immediates.
 // We access them directly from memory instead.
 // Each one is an 8-byte global; the GLOBL flag value 8 is RODATA in Go's
 // textflag.h.
 // _121666_213 holds 996687872, which equals 121666 * 2^13.
 DATA ·_121666_213(SB)/8, $996687872
 GLOBL ·_121666_213(SB), 8, $8
 // _2P0 holds 0xFFFFFFFFFFFDA, which equals 2 * (2^51 - 19).
 DATA ·_2P0(SB)/8, $0xFFFFFFFFFFFDA
 GLOBL ·_2P0(SB), 8, $8
 // _2P1234 holds 0xFFFFFFFFFFFFE, which equals 2 * (2^51 - 1).
 DATA ·_2P1234(SB)/8, $0xFFFFFFFFFFFFE
 GLOBL ·_2P1234(SB), 8, $8
 // func freeze(inout *[5]uint64)
 //
 // freeze rewrites the five 64-bit limbs at inout in place: three passes of
 // carry propagation in radix 2^51, followed by a branch-free conditional
 // subtraction of 2^255-19.
 TEXT ·freeze(SB),7,$0-8
 	MOVQ inout+0(FP), DI
 	// Load limbs 0..4 into SI, DX, CX, R8, R9.
 	MOVQ 0(DI),SI
 	MOVQ 8(DI),DX
 	MOVQ 16(DI),CX
 	MOVQ 24(DI),R8
 	MOVQ 32(DI),R9
 	// AX = 2^51-1 (limb mask), R10 = 2^51-19, R11 = pass counter (3).
 	MOVQ $REDMASK51,AX
 	MOVQ AX,R10
 	SUBQ $18,R10
 	MOVQ $3,R11
 REDUCELOOP:
 	// Move everything above bit 50 of each limb into the next limb.
 	MOVQ SI,R12
 	SHRQ $51,R12
 	ANDQ AX,SI
 	ADDQ R12,DX
 	MOVQ DX,R12
 	SHRQ $51,R12
 	ANDQ AX,DX
 	ADDQ R12,CX
 	MOVQ CX,R12
 	SHRQ $51,R12
 	ANDQ AX,CX
 	ADDQ R12,R8
 	MOVQ R8,R12
 	SHRQ $51,R12
 	ANDQ AX,R8
 	ADDQ R12,R9
 	MOVQ R9,R12
 	SHRQ $51,R12
 	ANDQ AX,R9
 	// The carry out of limb 4 is multiplied by 19 and folded into limb 0.
 	IMUL3Q $19,R12,R12
 	ADDQ R12,SI
 	// JA loops while the decremented counter is still non-zero, so the body
 	// runs three times and R11 is 0 afterwards.
 	SUBQ $1,R11
 	JA REDUCELOOP
 	// R12 starts at 1 and is replaced by R11 (0 at this point) whenever one
 	// of the conditional moves below fires.
 	// NOTE(review): the checks read as "limb 0 is below 2^51-19" and "limbs
 	// 1..4 differ from 2^51-1"; confirm the CMPQ operand order against the Go
 	// assembler convention before relying on that reading.
 	MOVQ $1,R12
 	CMPQ R10,SI
 	CMOVQLT R11,R12
 	CMPQ AX,DX
 	CMOVQNE R11,R12
 	CMPQ AX,CX
 	CMOVQNE R11,R12
 	CMPQ AX,R8
 	CMOVQNE R11,R12
 	CMPQ AX,R9
 	CMOVQNE R11,R12
 	// NEGQ turns R12 == 1 into an all-ones mask and leaves 0 unchanged. The
 	// masked constants 2^51-19 (limb 0) and 2^51-1 (limbs 1..4) together are
 	// 2^255-19 in radix 2^51, so the subtraction is either of p or of zero.
 	NEGQ R12
 	ANDQ R12,AX
 	ANDQ R12,R10
 	SUBQ R10,SI
 	SUBQ AX,DX
 	SUBQ AX,CX
 	SUBQ AX,R8
 	SUBQ AX,R9
 	// Store the five limbs back over the input.
 	MOVQ SI,0(DI)
 	MOVQ DX,8(DI)
 	MOVQ CX,16(DI)
 	MOVQ R8,24(DI)
 	MOVQ R9,32(DI)
 	RET
 // func ladderstep(inout *[5][5]uint64)
 TEXT ·ladderstep(SB),0,$296-8
@ -1375,3 +1450,344 @@ TEXT ·ladderstep(SB),0,$296-8
 	MOVQ AX,104(DI)
 	MOVQ R10,112(DI)
 	RET
 // func cswap(inout *[4][5]uint64, v uint64)
 //
 // cswap exchanges the first 80 bytes at inout (elements 0 and 1) with the
 // following 80 bytes (elements 2 and 3) when v is 1 and leaves them as they
 // are when v is 0. The same loads, logic operations and stores run for both
 // values of v; there is no branch.
 TEXT ·cswap(SB),7,$0
 	MOVQ inout+0(FP),DI
 	MOVQ v+8(FP),SI
 	// SI = ^(v-1): zero for v == 0, all ones for v == 1.
 	SUBQ $1, SI
 	NOTQ SI
 	// Copy the 64-bit mask into both halves of X15.
 	MOVQ SI, X15
 	PSHUFD $0x44, X15, X15
 	// First 80 bytes go to the even registers X0..X8.
 	MOVOU 0(DI), X0
 	MOVOU 16(DI), X2
 	MOVOU 32(DI), X4
 	MOVOU 48(DI), X6
 	MOVOU 64(DI), X8
 	// Second 80 bytes go to the odd registers X1..X9.
 	MOVOU 80(DI), X1
 	MOVOU 96(DI), X3
 	MOVOU 112(DI), X5
 	MOVOU 128(DI), X7
 	MOVOU 144(DI), X9
 	// X10..X14 = (second ^ first) & mask.
 	MOVO X1, X10
 	MOVO X3, X11
 	MOVO X5, X12
 	MOVO X7, X13
 	MOVO X9, X14
 	PXOR X0, X10
 	PXOR X2, X11
 	PXOR X4, X12
 	PXOR X6, X13
 	PXOR X8, X14
 	PAND X15, X10
 	PAND X15, X11
 	PAND X15, X12
 	PAND X15, X13
 	PAND X15, X14
 	// XORing the masked difference into both sides swaps them when the mask
 	// is all ones and is a no-op when it is zero.
 	PXOR X10, X0
 	PXOR X10, X1
 	PXOR X11, X2
 	PXOR X11, X3
 	PXOR X12, X4
 	PXOR X12, X5
 	PXOR X13, X6
 	PXOR X13, X7
 	PXOR X14, X8
 	PXOR X14, X9
 	// Write both halves back to their original offsets.
 	MOVOU X0, 0(DI)
 	MOVOU X2, 16(DI)
 	MOVOU X4, 32(DI)
 	MOVOU X6, 48(DI)
 	MOVOU X8, 64(DI)
 	MOVOU X1, 80(DI)
 	MOVOU X3, 96(DI)
 	MOVOU X5, 112(DI)
 	MOVOU X7, 128(DI)
 	MOVOU X9, 144(DI)
 	RET
 // func mul(dest, a, b *[5]uint64)
 //
 // mul stores in dest the product of the five-limb values a and b, reduced
 // with the wrap-around factor 19 and carried back to limbs of about 51 bits.
 //
 // Register use: CX points at b (DX is clobbered by MULQ), SI points at a.
 // The five 128-bit column sums live in R9:R8 (r0), R11:R10 (r1), R13:R12
 // (r2), R15:R14 (r3) and BP:BX (r4). The two stack slots cache 19*a[3] at
 // 0(SP) and 19*a[4] at 8(SP).
 TEXT ·mul(SB),0,$16-24
 	MOVQ dest+0(FP), DI
 	MOVQ a+8(FP), SI
 	MOVQ b+16(FP), DX
 	MOVQ DX,CX
 	// r0 = 19*a[3]*b[2]
 	MOVQ 24(SI),DX
 	IMUL3Q $19,DX,AX
 	MOVQ AX,0(SP)
 	MULQ 16(CX)
 	MOVQ AX,R8
 	MOVQ DX,R9
 	// r0 += 19*a[4]*b[1]
 	MOVQ 32(SI),DX
 	IMUL3Q $19,DX,AX
 	MOVQ AX,8(SP)
 	MULQ 8(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	// r0 += a[0]*b[0]
 	MOVQ 0(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	// r1 = a[0]*b[1]
 	MOVQ 0(SI),AX
 	MULQ 8(CX)
 	MOVQ AX,R10
 	MOVQ DX,R11
 	// r2 = a[0]*b[2]
 	MOVQ 0(SI),AX
 	MULQ 16(CX)
 	MOVQ AX,R12
 	MOVQ DX,R13
 	// r3 = a[0]*b[3]
 	MOVQ 0(SI),AX
 	MULQ 24(CX)
 	MOVQ AX,R14
 	MOVQ DX,R15
 	// r4 = a[0]*b[4]
 	MOVQ 0(SI),AX
 	MULQ 32(CX)
 	MOVQ AX,BX
 	MOVQ DX,BP
 	// r1 += a[1]*b[0]
 	MOVQ 8(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	// r2 += a[1]*b[1]
 	MOVQ 8(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	// r3 += a[1]*b[2]
 	MOVQ 8(SI),AX
 	MULQ 16(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	// r4 += a[1]*b[3]
 	MOVQ 8(SI),AX
 	MULQ 24(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	// r0 += 19*a[1]*b[4]
 	MOVQ 8(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	// r2 += a[2]*b[0]
 	MOVQ 16(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	// r3 += a[2]*b[1]
 	MOVQ 16(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	// r4 += a[2]*b[2]
 	MOVQ 16(SI),AX
 	MULQ 16(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	// r0 += 19*a[2]*b[3]
 	MOVQ 16(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 24(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	// r1 += 19*a[2]*b[4]
 	MOVQ 16(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	// r3 += a[3]*b[0]
 	MOVQ 24(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	// r4 += a[3]*b[1]
 	MOVQ 24(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	// r1 += 19*a[3]*b[3]
 	MOVQ 0(SP),AX
 	MULQ 24(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	// r2 += 19*a[3]*b[4]
 	MOVQ 0(SP),AX
 	MULQ 32(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	// r4 += a[4]*b[0]
 	MOVQ 32(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	// r1 += 19*a[4]*b[2]
 	MOVQ 8(SP),AX
 	MULQ 16(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	// r2 += 19*a[4]*b[3]
 	MOVQ 8(SP),AX
 	MULQ 24(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	// r3 += 19*a[4]*b[4]
 	MOVQ 8(SP),AX
 	MULQ 32(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	// Split every 128-bit sum at bit 51: the double-register shift leaves
 	// the bits from 51 upwards in the high register, the mask keeps the low
 	// 51 bits, and each high part is added to the next column.
 	MOVQ $REDMASK51,SI
 	SHLQ $13,R8,R9
 	ANDQ SI,R8
 	SHLQ $13,R10,R11
 	ANDQ SI,R10
 	ADDQ R9,R10
 	SHLQ $13,R12,R13
 	ANDQ SI,R12
 	ADDQ R11,R12
 	SHLQ $13,R14,R15
 	ANDQ SI,R14
 	ADDQ R13,R14
 	SHLQ $13,BX,BP
 	ANDQ SI,BX
 	ADDQ R15,BX
 	// The high part of r4 wraps around into r0 with a factor of 19.
 	IMUL3Q $19,BP,DX
 	ADDQ DX,R8
 	// One sequential carry pass; the output limbs end up in R8, CX, R9, AX
 	// and R10.
 	MOVQ R8,DX
 	SHRQ $51,DX
 	ADDQ R10,DX
 	MOVQ DX,CX
 	SHRQ $51,DX
 	ANDQ SI,R8
 	ADDQ R12,DX
 	MOVQ DX,R9
 	SHRQ $51,DX
 	ANDQ SI,CX
 	ADDQ R14,DX
 	MOVQ DX,AX
 	SHRQ $51,DX
 	ANDQ SI,R9
 	ADDQ BX,DX
 	MOVQ DX,R10
 	SHRQ $51,DX
 	ANDQ SI,AX
 	// The last carry is folded into limb 0 (times 19) without another pass.
 	IMUL3Q $19,DX,DX
 	ADDQ DX,R8
 	ANDQ SI,R10
 	MOVQ R8,0(DI)
 	MOVQ CX,8(DI)
 	MOVQ R9,16(DI)
 	MOVQ AX,24(DI)
 	MOVQ R10,32(DI)
 	RET
 // func square(out, in *[5]uint64)
 //
 // square stores in out the square of the five-limb value at in, reduced with
 // the wrap-around factor 19 and carried back to limbs of about 51 bits.
 //
 // The five 128-bit column sums live in R8:CX (r0), R10:R9 (r1), R12:R11
 // (r2), R14:R13 (r3) and BX:R15 (r4). Symmetric cross products are computed
 // once with a doubled operand; products that wrap past limb 4 carry a
 // factor of 19 (38 when also doubled).
 TEXT ·square(SB),7,$0-16
 	MOVQ out+0(FP), DI
 	MOVQ in+8(FP), SI
 	// r0 = in[0]*in[0]
 	MOVQ 0(SI),AX
 	MULQ 0(SI)
 	MOVQ AX,CX
 	MOVQ DX,R8
 	// r1 = 2*in[0]*in[1]
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 8(SI)
 	MOVQ AX,R9
 	MOVQ DX,R10
 	// r2 = 2*in[0]*in[2]
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 16(SI)
 	MOVQ AX,R11
 	MOVQ DX,R12
 	// r3 = 2*in[0]*in[3]
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 24(SI)
 	MOVQ AX,R13
 	MOVQ DX,R14
 	// r4 = 2*in[0]*in[4]
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 32(SI)
 	MOVQ AX,R15
 	MOVQ DX,BX
 	// r2 += in[1]*in[1]
 	MOVQ 8(SI),AX
 	MULQ 8(SI)
 	ADDQ AX,R11
 	ADCQ DX,R12
 	// r3 += 2*in[1]*in[2]
 	MOVQ 8(SI),AX
 	SHLQ $1,AX
 	MULQ 16(SI)
 	ADDQ AX,R13
 	ADCQ DX,R14
 	// r4 += 2*in[1]*in[3]
 	MOVQ 8(SI),AX
 	SHLQ $1,AX
 	MULQ 24(SI)
 	ADDQ AX,R15
 	ADCQ DX,BX
 	// r0 += 38*in[1]*in[4]
 	MOVQ 8(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,CX
 	ADCQ DX,R8
 	// r4 += in[2]*in[2]
 	MOVQ 16(SI),AX
 	MULQ 16(SI)
 	ADDQ AX,R15
 	ADCQ DX,BX
 	// r0 += 38*in[2]*in[3]
 	MOVQ 16(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 24(SI)
 	ADDQ AX,CX
 	ADCQ DX,R8
 	// r1 += 38*in[2]*in[4]
 	MOVQ 16(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R9
 	ADCQ DX,R10
 	// r1 += 19*in[3]*in[3]
 	MOVQ 24(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 24(SI)
 	ADDQ AX,R9
 	ADCQ DX,R10
 	// r2 += 38*in[3]*in[4]
 	MOVQ 24(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R11
 	ADCQ DX,R12
 	// r3 += 19*in[4]*in[4]
 	MOVQ 32(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R13
 	ADCQ DX,R14
 	// Split every 128-bit sum at bit 51: the double-register shift leaves
 	// the bits from 51 upwards in the high register, the mask keeps the low
 	// 51 bits, and each high part is added to the next column.
 	MOVQ $REDMASK51,SI
 	SHLQ $13,CX,R8
 	ANDQ SI,CX
 	SHLQ $13,R9,R10
 	ANDQ SI,R9
 	ADDQ R8,R9
 	SHLQ $13,R11,R12
 	ANDQ SI,R11
 	ADDQ R10,R11
 	SHLQ $13,R13,R14
 	ANDQ SI,R13
 	ADDQ R12,R13
 	SHLQ $13,R15,BX
 	ANDQ SI,R15
 	ADDQ R14,R15
 	// The high part of r4 wraps around into r0 with a factor of 19.
 	IMUL3Q $19,BX,DX
 	ADDQ DX,CX
 	// One sequential carry pass; the output limbs end up in CX, R8, R9, AX
 	// and R10.
 	MOVQ CX,DX
 	SHRQ $51,DX
 	ADDQ R9,DX
 	ANDQ SI,CX
 	MOVQ DX,R8
 	SHRQ $51,DX
 	ADDQ R11,DX
 	ANDQ SI,R8
 	MOVQ DX,R9
 	SHRQ $51,DX
 	ADDQ R13,DX
 	ANDQ SI,R9
 	MOVQ DX,AX
 	SHRQ $51,DX
 	ADDQ R15,DX
 	ANDQ SI,AX
 	MOVQ DX,R10
 	SHRQ $51,DX
 	// The last carry is folded into limb 0 (times 19) without another pass.
 	IMUL3Q $19,DX,DX
 	ADDQ DX,CX
 	ANDQ SI,R10
 	MOVQ CX,0(DI)
 	MOVQ R8,8(DI)
 	MOVQ R9,16(DI)
 	MOVQ AX,24(DI)
 	MOVQ R10,32(DI)
 	RET
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_generic.go
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519_generic.go
@ -0,0 +1,828 @@
 // Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package curve25519
 import "encoding/binary"
 // This code is a port of the public domain, "ref10" implementation of
 // curve25519 from SUPERCOP 20130419 by D. J. Bernstein.
 // fieldElement represents an element of the field GF(2^255 - 19). An element
 // t, entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
 // t[3]+2^102 t[4]+...+2^230 t[9]. Bounds on each t[i] vary depending on
 // context.
 //
 // The limb weights therefore advance by 26 and 25 bits alternately. Limbs are
 // signed, so intermediate values may be negative.
 type fieldElement [10]int32
 // feZero sets every limb of fe to zero.
 func feZero(fe *fieldElement) {
 	*fe = fieldElement{}
 }
 // feOne sets fe to the field element one: limb 0 is 1 and all other limbs
 // are 0.
 func feOne(fe *fieldElement) {
 	*fe = fieldElement{1}
 }
 // feAdd sets dst = a + b limb by limb, without carrying. dst may alias a or
 // b.
 func feAdd(dst, a, b *fieldElement) {
 	for limb := 0; limb < len(dst); limb++ {
 		sum := a[limb] + b[limb]
 		dst[limb] = sum
 	}
 }
 // feSub sets dst = a - b limb by limb, without carrying. dst may alias a or
 // b.
 func feSub(dst, a, b *fieldElement) {
 	for limb := 0; limb < len(dst); limb++ {
 		diff := a[limb] - b[limb]
 		dst[limb] = diff
 	}
 }
 // feCopy copies all limbs of src into dst.
 func feCopy(dst, src *fieldElement) {
 	*dst = *src
 }
 // feCSwap exchanges f and g when b == 1 and leaves both untouched when
 // b == 0. The same instructions run in either case; the choice is made with
 // a mask, not a branch.
 //
 // Preconditions: b in {0,1}.
 func feCSwap(f, g *fieldElement, b int32) {
 	// mask is 0 for b == 0 and all ones for b == 1.
 	mask := -b
 	for limb := 0; limb < len(f); limb++ {
 		diff := (f[limb] ^ g[limb]) & mask
 		f[limb] ^= diff
 		g[limb] ^= diff
 	}
 }
 // load3 decodes the first three bytes of in as an unsigned 24-bit
 // little-endian integer. Any further bytes of in are ignored.
 func load3(in []byte) int64 {
 	return int64(in[0]) | int64(in[1])<<8 | int64(in[2])<<16
 }
 // load4 decodes the first four bytes of in as an unsigned 32-bit
 // little-endian integer and widens it to int64, so the result is never
 // negative.
 func load4(in []byte) int64 {
 	word := binary.LittleEndian.Uint32(in)
 	return int64(word)
 }
 // feFromBytes decodes the 32-byte little-endian value in src into the ten
 // limbs of dst. Bit 255 (the top bit of src[31]) is discarded by the
 // 0x7fffff mask on the last load.
 func feFromBytes(dst *fieldElement, src *[32]byte) {
 	// Read overlapping 3- and 4-byte windows. Each shift moves the window so
 	// that the bits belonging to the limb sit at bit 0 of its weight; the
 	// surplus high bits are handed to the next limb by the carries below.
 	h0 := load4(src[:])
 	h1 := load3(src[4:]) << 6
 	h2 := load3(src[7:]) << 5
 	h3 := load3(src[10:]) << 3
 	h4 := load3(src[13:]) << 2
 	h5 := load4(src[16:])
 	h6 := load3(src[20:]) << 7
 	h7 := load3(src[23:]) << 5
 	h8 := load3(src[26:]) << 4
 	h9 := (load3(src[29:]) & 0x7fffff) << 2
 	var carry [10]int64
 	// First the odd (25-bit) limbs are carried; the carry out of h9 wraps
 	// around into h0 with a factor of 19. Adding half the limb size before
 	// shifting rounds to nearest, so the limbs end up signed.
 	carry[9] = (h9 + 1<<24) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	carry[1] = (h1 + 1<<24) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[3] = (h3 + 1<<24) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[5] = (h5 + 1<<24) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	carry[7] = (h7 + 1<<24) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	// Then the even (26-bit) limbs are carried into their odd neighbours.
 	carry[0] = (h0 + 1<<25) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[2] = (h2 + 1<<25) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[4] = (h4 + 1<<25) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[6] = (h6 + 1<<25) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	carry[8] = (h8 + 1<<25) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	dst[0] = int32(h0)
 	dst[1] = int32(h1)
 	dst[2] = int32(h2)
 	dst[3] = int32(h3)
 	dst[4] = int32(h4)
 	dst[5] = int32(h5)
 	dst[6] = int32(h6)
 	dst[7] = int32(h7)
 	dst[8] = int32(h8)
 	dst[9] = int32(h9)
 }
 // feToBytes marshals h to s as 32 little-endian bytes.
 //
 // Note that h is modified in place: the reduction and the carry chain below
 // write back into its limbs.
 //
 // Preconditions:
 //   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 //
 // Write p=2^255-19; q=floor(h/p).
 // Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
 //
 // Proof:
 //   Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
 //   Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
 //
 //   Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
 //   Then 0<y<1.
 //
 //   Write r=h-pq.
 //   Have 0<=r<=p-1=2^255-20.
 //   Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
 //
 //   Write x=r+19(2^-255)r+y.
 //   Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
 //
 //   Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
 //   so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
 func feToBytes(s *[32]byte, h *fieldElement) {
 	var carry [10]int32
 	// Compute q by running the rounded term 19*h9*2^-25 + 2^-1 through the
 	// alternating 26/25-bit limb widths; only the final value of q is used.
 	q := (19*h[9] + (1 << 24)) >> 25
 	q = (h[0] + q) >> 26
 	q = (h[1] + q) >> 25
 	q = (h[2] + q) >> 26
 	q = (h[3] + q) >> 25
 	q = (h[4] + q) >> 26
 	q = (h[5] + q) >> 25
 	q = (h[6] + q) >> 26
 	q = (h[7] + q) >> 25
 	q = (h[8] + q) >> 26
 	q = (h[9] + q) >> 25
 	// Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20.
 	h[0] += 19 * q
 	// Goal: Output h-2^255 q, which is between 0 and 2^255-20.
 	// Plain (truncating) carries this time; the final carry out of h[9] is
 	// dropped, which is the subtraction of 2^255 q.
 	carry[0] = h[0] >> 26
 	h[1] += carry[0]
 	h[0] -= carry[0] << 26
 	carry[1] = h[1] >> 25
 	h[2] += carry[1]
 	h[1] -= carry[1] << 25
 	carry[2] = h[2] >> 26
 	h[3] += carry[2]
 	h[2] -= carry[2] << 26
 	carry[3] = h[3] >> 25
 	h[4] += carry[3]
 	h[3] -= carry[3] << 25
 	carry[4] = h[4] >> 26
 	h[5] += carry[4]
 	h[4] -= carry[4] << 26
 	carry[5] = h[5] >> 25
 	h[6] += carry[5]
 	h[5] -= carry[5] << 25
 	carry[6] = h[6] >> 26
 	h[7] += carry[6]
 	h[6] -= carry[6] << 26
 	carry[7] = h[7] >> 25
 	h[8] += carry[7]
 	h[7] -= carry[7] << 25
 	carry[8] = h[8] >> 26
 	h[9] += carry[8]
 	h[8] -= carry[8] << 26
 	carry[9] = h[9] >> 25
 	h[9] -= carry[9] << 25
 	// h10 = carry9
 	// Goal: Output h[0]+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
 	// Have h[0]+...+2^230 h[9] between 0 and 2^255-1;
 	// evidently 2^255 h10-2^255 q = 0.
 	// Goal: Output h[0]+...+2^230 h[9].
 	// Pack the limbs into bytes; a byte that straddles two limbs ORs the
 	// top bits of one with the shifted low bits of the next.
 	s[0] = byte(h[0] >> 0)
 	s[1] = byte(h[0] >> 8)
 	s[2] = byte(h[0] >> 16)
 	s[3] = byte((h[0] >> 24) | (h[1] << 2))
 	s[4] = byte(h[1] >> 6)
 	s[5] = byte(h[1] >> 14)
 	s[6] = byte((h[1] >> 22) | (h[2] << 3))
 	s[7] = byte(h[2] >> 5)
 	s[8] = byte(h[2] >> 13)
 	s[9] = byte((h[2] >> 21) | (h[3] << 5))
 	s[10] = byte(h[3] >> 3)
 	s[11] = byte(h[3] >> 11)
 	s[12] = byte((h[3] >> 19) | (h[4] << 6))
 	s[13] = byte(h[4] >> 2)
 	s[14] = byte(h[4] >> 10)
 	s[15] = byte(h[4] >> 18)
 	s[16] = byte(h[5] >> 0)
 	s[17] = byte(h[5] >> 8)
 	s[18] = byte(h[5] >> 16)
 	s[19] = byte((h[5] >> 24) | (h[6] << 1))
 	s[20] = byte(h[6] >> 7)
 	s[21] = byte(h[6] >> 15)
 	s[22] = byte((h[6] >> 23) | (h[7] << 3))
 	s[23] = byte(h[7] >> 5)
 	s[24] = byte(h[7] >> 13)
 	s[25] = byte((h[7] >> 21) | (h[8] << 4))
 	s[26] = byte(h[8] >> 4)
 	s[27] = byte(h[8] >> 12)
 	s[28] = byte((h[8] >> 20) | (h[9] << 6))
 	s[29] = byte(h[9] >> 2)
 	s[30] = byte(h[9] >> 10)
 	s[31] = byte(h[9] >> 18)
 }
 // feMul calculates h = f * g
 // Can overlap h with f or g.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //    |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 //
 // Notes on implementation strategy:
 //
 // Using schoolbook multiplication.
 // Karatsuba would save a little in some cost models.
 //
 // Most multiplications by 2 and 19 are 32-bit precomputations;
 // cheaper than 64-bit postcomputations.
 //
 // There is one remaining multiplication by 19 in the carry chain;
 // one *19 precomputation can be merged into this,
 // but the resulting data flow is considerably less clean.
 //
 // There are 12 carries below.
 // 10 of them are 2-way parallelizable and vectorizable.
 // Can get away with 11 carries, but then data flow is much deeper.
 //
 // With tighter constraints on inputs can squeeze carries into int32.
 func feMul(h, f, g *fieldElement) {
 	// All input limbs are copied into locals before h is written, which is
 	// what makes it safe for h to alias f or g.
 	f0 := f[0]
 	f1 := f[1]
 	f2 := f[2]
 	f3 := f[3]
 	f4 := f[4]
 	f5 := f[5]
 	f6 := f[6]
 	f7 := f[7]
 	f8 := f[8]
 	f9 := f[9]
 	g0 := g[0]
 	g1 := g[1]
 	g2 := g[2]
 	g3 := g[3]
 	g4 := g[4]
 	g5 := g[5]
 	g6 := g[6]
 	g7 := g[7]
 	g8 := g[8]
 	g9 := g[9]
 	// 32-bit precomputations: 19*g[i] for products that wrap past limb 9,
 	// and 2*f[i] for odd i, needed when both factors are odd limbs.
 	g1_19 := 19 * g1 // 1.4*2^29
 	g2_19 := 19 * g2 // 1.4*2^30; still ok
 	g3_19 := 19 * g3
 	g4_19 := 19 * g4
 	g5_19 := 19 * g5
 	g6_19 := 19 * g6
 	g7_19 := 19 * g7
 	g8_19 := 19 * g8
 	g9_19 := 19 * g9
 	f1_2 := 2 * f1
 	f3_2 := 2 * f3
 	f5_2 := 2 * f5
 	f7_2 := 2 * f7
 	f9_2 := 2 * f9
 	// The 100 partial products, each widened to 64 bits. The suffix of a
 	// name records the extra factor folded into it (_2, _19 or _38).
 	f0g0 := int64(f0) * int64(g0)
 	f0g1 := int64(f0) * int64(g1)
 	f0g2 := int64(f0) * int64(g2)
 	f0g3 := int64(f0) * int64(g3)
 	f0g4 := int64(f0) * int64(g4)
 	f0g5 := int64(f0) * int64(g5)
 	f0g6 := int64(f0) * int64(g6)
 	f0g7 := int64(f0) * int64(g7)
 	f0g8 := int64(f0) * int64(g8)
 	f0g9 := int64(f0) * int64(g9)
 	f1g0 := int64(f1) * int64(g0)
 	f1g1_2 := int64(f1_2) * int64(g1)
 	f1g2 := int64(f1) * int64(g2)
 	f1g3_2 := int64(f1_2) * int64(g3)
 	f1g4 := int64(f1) * int64(g4)
 	f1g5_2 := int64(f1_2) * int64(g5)
 	f1g6 := int64(f1) * int64(g6)
 	f1g7_2 := int64(f1_2) * int64(g7)
 	f1g8 := int64(f1) * int64(g8)
 	f1g9_38 := int64(f1_2) * int64(g9_19)
 	f2g0 := int64(f2) * int64(g0)
 	f2g1 := int64(f2) * int64(g1)
 	f2g2 := int64(f2) * int64(g2)
 	f2g3 := int64(f2) * int64(g3)
 	f2g4 := int64(f2) * int64(g4)
 	f2g5 := int64(f2) * int64(g5)
 	f2g6 := int64(f2) * int64(g6)
 	f2g7 := int64(f2) * int64(g7)
 	f2g8_19 := int64(f2) * int64(g8_19)
 	f2g9_19 := int64(f2) * int64(g9_19)
 	f3g0 := int64(f3) * int64(g0)
 	f3g1_2 := int64(f3_2) * int64(g1)
 	f3g2 := int64(f3) * int64(g2)
 	f3g3_2 := int64(f3_2) * int64(g3)
 	f3g4 := int64(f3) * int64(g4)
 	f3g5_2 := int64(f3_2) * int64(g5)
 	f3g6 := int64(f3) * int64(g6)
 	f3g7_38 := int64(f3_2) * int64(g7_19)
 	f3g8_19 := int64(f3) * int64(g8_19)
 	f3g9_38 := int64(f3_2) * int64(g9_19)
 	f4g0 := int64(f4) * int64(g0)
 	f4g1 := int64(f4) * int64(g1)
 	f4g2 := int64(f4) * int64(g2)
 	f4g3 := int64(f4) * int64(g3)
 	f4g4 := int64(f4) * int64(g4)
 	f4g5 := int64(f4) * int64(g5)
 	f4g6_19 := int64(f4) * int64(g6_19)
 	f4g7_19 := int64(f4) * int64(g7_19)
 	f4g8_19 := int64(f4) * int64(g8_19)
 	f4g9_19 := int64(f4) * int64(g9_19)
 	f5g0 := int64(f5) * int64(g0)
 	f5g1_2 := int64(f5_2) * int64(g1)
 	f5g2 := int64(f5) * int64(g2)
 	f5g3_2 := int64(f5_2) * int64(g3)
 	f5g4 := int64(f5) * int64(g4)
 	f5g5_38 := int64(f5_2) * int64(g5_19)
 	f5g6_19 := int64(f5) * int64(g6_19)
 	f5g7_38 := int64(f5_2) * int64(g7_19)
 	f5g8_19 := int64(f5) * int64(g8_19)
 	f5g9_38 := int64(f5_2) * int64(g9_19)
 	f6g0 := int64(f6) * int64(g0)
 	f6g1 := int64(f6) * int64(g1)
 	f6g2 := int64(f6) * int64(g2)
 	f6g3 := int64(f6) * int64(g3)
 	f6g4_19 := int64(f6) * int64(g4_19)
 	f6g5_19 := int64(f6) * int64(g5_19)
 	f6g6_19 := int64(f6) * int64(g6_19)
 	f6g7_19 := int64(f6) * int64(g7_19)
 	f6g8_19 := int64(f6) * int64(g8_19)
 	f6g9_19 := int64(f6) * int64(g9_19)
 	f7g0 := int64(f7) * int64(g0)
 	f7g1_2 := int64(f7_2) * int64(g1)
 	f7g2 := int64(f7) * int64(g2)
 	f7g3_38 := int64(f7_2) * int64(g3_19)
 	f7g4_19 := int64(f7) * int64(g4_19)
 	f7g5_38 := int64(f7_2) * int64(g5_19)
 	f7g6_19 := int64(f7) * int64(g6_19)
 	f7g7_38 := int64(f7_2) * int64(g7_19)
 	f7g8_19 := int64(f7) * int64(g8_19)
 	f7g9_38 := int64(f7_2) * int64(g9_19)
 	f8g0 := int64(f8) * int64(g0)
 	f8g1 := int64(f8) * int64(g1)
 	f8g2_19 := int64(f8) * int64(g2_19)
 	f8g3_19 := int64(f8) * int64(g3_19)
 	f8g4_19 := int64(f8) * int64(g4_19)
 	f8g5_19 := int64(f8) * int64(g5_19)
 	f8g6_19 := int64(f8) * int64(g6_19)
 	f8g7_19 := int64(f8) * int64(g7_19)
 	f8g8_19 := int64(f8) * int64(g8_19)
 	f8g9_19 := int64(f8) * int64(g9_19)
 	f9g0 := int64(f9) * int64(g0)
 	f9g1_38 := int64(f9_2) * int64(g1_19)
 	f9g2_19 := int64(f9) * int64(g2_19)
 	f9g3_38 := int64(f9_2) * int64(g3_19)
 	f9g4_19 := int64(f9) * int64(g4_19)
 	f9g5_38 := int64(f9_2) * int64(g5_19)
 	f9g6_19 := int64(f9) * int64(g6_19)
 	f9g7_38 := int64(f9_2) * int64(g7_19)
 	f9g8_19 := int64(f9) * int64(g8_19)
 	f9g9_38 := int64(f9_2) * int64(g9_19)
 	// Column sums: hk collects every product f[i]*g[j] with i+j == k, plus
 	// the wrapped products with i+j == k+10.
 	h0 := f0g0 + f1g9_38 + f2g8_19 + f3g7_38 + f4g6_19 + f5g5_38 + f6g4_19 + f7g3_38 + f8g2_19 + f9g1_38
 	h1 := f0g1 + f1g0 + f2g9_19 + f3g8_19 + f4g7_19 + f5g6_19 + f6g5_19 + f7g4_19 + f8g3_19 + f9g2_19
 	h2 := f0g2 + f1g1_2 + f2g0 + f3g9_38 + f4g8_19 + f5g7_38 + f6g6_19 + f7g5_38 + f8g4_19 + f9g3_38
 	h3 := f0g3 + f1g2 + f2g1 + f3g0 + f4g9_19 + f5g8_19 + f6g7_19 + f7g6_19 + f8g5_19 + f9g4_19
 	h4 := f0g4 + f1g3_2 + f2g2 + f3g1_2 + f4g0 + f5g9_38 + f6g8_19 + f7g7_38 + f8g6_19 + f9g5_38
 	h5 := f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + f6g9_19 + f7g8_19 + f8g7_19 + f9g6_19
 	h6 := f0g6 + f1g5_2 + f2g4 + f3g3_2 + f4g2 + f5g1_2 + f6g0 + f7g9_38 + f8g8_19 + f9g7_38
 	h7 := f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + f8g9_19 + f9g8_19
 	h8 := f0g8 + f1g7_2 + f2g6 + f3g5_2 + f4g4 + f5g3_2 + f6g2 + f7g1_2 + f8g0 + f9g9_38
 	h9 := f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
 	var carry [10]int64
 	// |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
 	//   i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
 	// |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
 	//   i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
 	// Carries run as two interleaved chains, one starting at h0 and one at
 	// h4. Each carry rounds to nearest, so the limbs remain signed.
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	// |h0| <= 2^25
 	// |h4| <= 2^25
 	// |h1| <= 1.51*2^58
 	// |h5| <= 1.51*2^58
 	carry[1] = (h1 + (1 << 24)) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[5] = (h5 + (1 << 24)) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	// |h1| <= 2^24; from now on fits into int32
 	// |h5| <= 2^24; from now on fits into int32
 	// |h2| <= 1.21*2^59
 	// |h6| <= 1.21*2^59
 	carry[2] = (h2 + (1 << 25)) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[6] = (h6 + (1 << 25)) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	// |h2| <= 2^25; from now on fits into int32 unchanged
 	// |h6| <= 2^25; from now on fits into int32 unchanged
 	// |h3| <= 1.51*2^58
 	// |h7| <= 1.51*2^58
 	carry[3] = (h3 + (1 << 24)) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[7] = (h7 + (1 << 24)) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	// |h3| <= 2^24; from now on fits into int32 unchanged
 	// |h7| <= 2^24; from now on fits into int32 unchanged
 	// |h4| <= 1.52*2^33
 	// |h8| <= 1.52*2^33
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[8] = (h8 + (1 << 25)) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	// |h4| <= 2^25; from now on fits into int32 unchanged
 	// |h8| <= 2^25; from now on fits into int32 unchanged
 	// |h5| <= 1.01*2^24
 	// |h9| <= 1.51*2^58
 	// The carry out of h9 wraps around into h0 with a factor of 19.
 	carry[9] = (h9 + (1 << 24)) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	// |h9| <= 2^24; from now on fits into int32 unchanged
 	// |h0| <= 1.8*2^37
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	// |h0| <= 2^25; from now on fits into int32 unchanged
 	// |h1| <= 1.01*2^24
 	h[0] = int32(h0)
 	h[1] = int32(h1)
 	h[2] = int32(h2)
 	h[3] = int32(h3)
 	h[4] = int32(h4)
 	h[5] = int32(h5)
 	h[6] = int32(h6)
 	h[7] = int32(h7)
 	h[8] = int32(h8)
 	h[9] = int32(h9)
 }
 // feSquare calculates h = f*f. Can overlap h with f.
 //
 // It follows the same column-sum and carry scheme as feMul, but computes
 // each symmetric cross product only once, with a doubled factor.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 func feSquare(h, f *fieldElement) {
 	// Copy the limbs first so that h may alias f.
 	f0 := f[0]
 	f1 := f[1]
 	f2 := f[2]
 	f3 := f[3]
 	f4 := f[4]
 	f5 := f[5]
 	f6 := f[6]
 	f7 := f[7]
 	f8 := f[8]
 	f9 := f[9]
 	// 32-bit precomputations of the doubled limbs and of the limbs scaled by
 	// the wrap-around factor 19 (38 where a doubling is folded in as well).
 	f0_2 := 2 * f0
 	f1_2 := 2 * f1
 	f2_2 := 2 * f2
 	f3_2 := 2 * f3
 	f4_2 := 2 * f4
 	f5_2 := 2 * f5
 	f6_2 := 2 * f6
 	f7_2 := 2 * f7
 	f5_38 := 38 * f5 // 1.31*2^30
 	f6_19 := 19 * f6 // 1.31*2^30
 	f7_38 := 38 * f7 // 1.31*2^30
 	f8_19 := 19 * f8 // 1.31*2^30
 	f9_38 := 38 * f9 // 1.31*2^30
 	// The 55 distinct partial products, widened to 64 bits. The suffix of a
 	// name records the total extra factor folded into it.
 	f0f0 := int64(f0) * int64(f0)
 	f0f1_2 := int64(f0_2) * int64(f1)
 	f0f2_2 := int64(f0_2) * int64(f2)
 	f0f3_2 := int64(f0_2) * int64(f3)
 	f0f4_2 := int64(f0_2) * int64(f4)
 	f0f5_2 := int64(f0_2) * int64(f5)
 	f0f6_2 := int64(f0_2) * int64(f6)
 	f0f7_2 := int64(f0_2) * int64(f7)
 	f0f8_2 := int64(f0_2) * int64(f8)
 	f0f9_2 := int64(f0_2) * int64(f9)
 	f1f1_2 := int64(f1_2) * int64(f1)
 	f1f2_2 := int64(f1_2) * int64(f2)
 	f1f3_4 := int64(f1_2) * int64(f3_2)
 	f1f4_2 := int64(f1_2) * int64(f4)
 	f1f5_4 := int64(f1_2) * int64(f5_2)
 	f1f6_2 := int64(f1_2) * int64(f6)
 	f1f7_4 := int64(f1_2) * int64(f7_2)
 	f1f8_2 := int64(f1_2) * int64(f8)
 	f1f9_76 := int64(f1_2) * int64(f9_38)
 	f2f2 := int64(f2) * int64(f2)
 	f2f3_2 := int64(f2_2) * int64(f3)
 	f2f4_2 := int64(f2_2) * int64(f4)
 	f2f5_2 := int64(f2_2) * int64(f5)
 	f2f6_2 := int64(f2_2) * int64(f6)
 	f2f7_2 := int64(f2_2) * int64(f7)
 	f2f8_38 := int64(f2_2) * int64(f8_19)
 	f2f9_38 := int64(f2) * int64(f9_38)
 	f3f3_2 := int64(f3_2) * int64(f3)
 	f3f4_2 := int64(f3_2) * int64(f4)
 	f3f5_4 := int64(f3_2) * int64(f5_2)
 	f3f6_2 := int64(f3_2) * int64(f6)
 	f3f7_76 := int64(f3_2) * int64(f7_38)
 	f3f8_38 := int64(f3_2) * int64(f8_19)
 	f3f9_76 := int64(f3_2) * int64(f9_38)
 	f4f4 := int64(f4) * int64(f4)
 	f4f5_2 := int64(f4_2) * int64(f5)
 	f4f6_38 := int64(f4_2) * int64(f6_19)
 	f4f7_38 := int64(f4) * int64(f7_38)
 	f4f8_38 := int64(f4_2) * int64(f8_19)
 	f4f9_38 := int64(f4) * int64(f9_38)
 	f5f5_38 := int64(f5) * int64(f5_38)
 	f5f6_38 := int64(f5_2) * int64(f6_19)
 	f5f7_76 := int64(f5_2) * int64(f7_38)
 	f5f8_38 := int64(f5_2) * int64(f8_19)
 	f5f9_76 := int64(f5_2) * int64(f9_38)
 	f6f6_19 := int64(f6) * int64(f6_19)
 	f6f7_38 := int64(f6) * int64(f7_38)
 	f6f8_38 := int64(f6_2) * int64(f8_19)
 	f6f9_38 := int64(f6) * int64(f9_38)
 	f7f7_38 := int64(f7) * int64(f7_38)
 	f7f8_38 := int64(f7_2) * int64(f8_19)
 	f7f9_76 := int64(f7_2) * int64(f9_38)
 	f8f8_19 := int64(f8) * int64(f8_19)
 	f8f9_38 := int64(f8) * int64(f9_38)
 	f9f9_38 := int64(f9) * int64(f9_38)
 	// Column sums: hk collects the products whose limb indices add up to k,
 	// plus the wrapped products whose indices add up to k+10.
 	h0 := f0f0 + f1f9_76 + f2f8_38 + f3f7_76 + f4f6_38 + f5f5_38
 	h1 := f0f1_2 + f2f9_38 + f3f8_38 + f4f7_38 + f5f6_38
 	h2 := f0f2_2 + f1f1_2 + f3f9_76 + f4f8_38 + f5f7_76 + f6f6_19
 	h3 := f0f3_2 + f1f2_2 + f4f9_38 + f5f8_38 + f6f7_38
 	h4 := f0f4_2 + f1f3_4 + f2f2 + f5f9_76 + f6f8_38 + f7f7_38
 	h5 := f0f5_2 + f1f4_2 + f2f3_2 + f6f9_38 + f7f8_38
 	h6 := f0f6_2 + f1f5_4 + f2f4_2 + f3f3_2 + f7f9_76 + f8f8_19
 	h7 := f0f7_2 + f1f6_2 + f2f5_2 + f3f4_2 + f8f9_38
 	h8 := f0f8_2 + f1f7_4 + f2f6_2 + f3f5_4 + f4f4 + f9f9_38
 	h9 := f0f9_2 + f1f8_2 + f2f7_2 + f3f6_2 + f4f5_2
 	var carry [10]int64
 	// Same carry schedule as feMul: two interleaved chains starting at h0
 	// and h4, with rounding carries.
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[1] = (h1 + (1 << 24)) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[5] = (h5 + (1 << 24)) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	carry[2] = (h2 + (1 << 25)) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[6] = (h6 + (1 << 25)) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	carry[3] = (h3 + (1 << 24)) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[7] = (h7 + (1 << 24)) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[8] = (h8 + (1 << 25)) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	// The carry out of h9 wraps around into h0 with a factor of 19.
 	carry[9] = (h9 + (1 << 24)) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	h[0] = int32(h0)
 	h[1] = int32(h1)
 	h[2] = int32(h2)
 	h[3] = int32(h3)
 	h[4] = int32(h4)
 	h[5] = int32(h5)
 	h[6] = int32(h6)
 	h[7] = int32(h7)
 	h[8] = int32(h8)
 	h[9] = int32(h9)
 }
 // feMul121666 calculates h = f * 121666. Can overlap h with f.
 //
 // Preconditions:
 //    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
 //
 // Postconditions:
 //    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
 func feMul121666(h, f *fieldElement) {
 	// Scale every limb in 64-bit arithmetic; all of f is read before any
 	// limb of h is written, so h may alias f.
 	h0 := int64(f[0]) * 121666
 	h1 := int64(f[1]) * 121666
 	h2 := int64(f[2]) * 121666
 	h3 := int64(f[3]) * 121666
 	h4 := int64(f[4]) * 121666
 	h5 := int64(f[5]) * 121666
 	h6 := int64(f[6]) * 121666
 	h7 := int64(f[7]) * 121666
 	h8 := int64(f[8]) * 121666
 	h9 := int64(f[9]) * 121666
 	var carry [10]int64
 	// Carry the odd (25-bit) limbs first; the carry out of h9 wraps around
 	// into h0 with a factor of 19.
 	carry[9] = (h9 + (1 << 24)) >> 25
 	h0 += carry[9] * 19
 	h9 -= carry[9] << 25
 	carry[1] = (h1 + (1 << 24)) >> 25
 	h2 += carry[1]
 	h1 -= carry[1] << 25
 	carry[3] = (h3 + (1 << 24)) >> 25
 	h4 += carry[3]
 	h3 -= carry[3] << 25
 	carry[5] = (h5 + (1 << 24)) >> 25
 	h6 += carry[5]
 	h5 -= carry[5] << 25
 	carry[7] = (h7 + (1 << 24)) >> 25
 	h8 += carry[7]
 	h7 -= carry[7] << 25
 	// Then carry the even (26-bit) limbs into their odd neighbours.
 	carry[0] = (h0 + (1 << 25)) >> 26
 	h1 += carry[0]
 	h0 -= carry[0] << 26
 	carry[2] = (h2 + (1 << 25)) >> 26
 	h3 += carry[2]
 	h2 -= carry[2] << 26
 	carry[4] = (h4 + (1 << 25)) >> 26
 	h5 += carry[4]
 	h4 -= carry[4] << 26
 	carry[6] = (h6 + (1 << 25)) >> 26
 	h7 += carry[6]
 	h6 -= carry[6] << 26
 	carry[8] = (h8 + (1 << 25)) >> 26
 	h9 += carry[8]
 	h8 -= carry[8] << 26
 	h[0] = int32(h0)
 	h[1] = int32(h1)
 	h[2] = int32(h2)
 	h[3] = int32(h3)
 	h[4] = int32(h4)
 	h[5] = int32(h5)
 	h[6] = int32(h6)
 	h[7] = int32(h7)
 	h[8] = int32(h8)
 	h[9] = int32(h9)
 }
 // feInvert sets out = z^-1.
 //
 // The inverse is computed as z^(2^255-21), i.e. z^(p-2) for p = 2^255-19,
 // using a fixed chain of squarings and multiplications. The sequence of
 // operations does not depend on the value of z. out may alias z: z is last
 // read before out is written.
 //
 // Each "feSquare; for i = 1; i < n" group performs n squarings in total; the
 // loops with bound 1 execute zero times.
 func feInvert(out, z *fieldElement) {
 	var t0, t1, t2, t3 fieldElement
 	var i int
 	// t0 = z^2
 	feSquare(&t0, z)
 	for i = 1; i < 1; i++ {
 		feSquare(&t0, &t0)
 	}
 	// t1 = z^8
 	feSquare(&t1, &t0)
 	for i = 1; i < 2; i++ {
 		feSquare(&t1, &t1)
 	}
 	// t1 = z^9
 	feMul(&t1, z, &t1)
 	// t0 = z^11
 	feMul(&t0, &t0, &t1)
 	// t2 = z^22
 	feSquare(&t2, &t0)
 	for i = 1; i < 1; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t1 = z^31 = z^(2^5 - 1)
 	feMul(&t1, &t1, &t2)
 	// t2 = z^(2^10 - 2^5)
 	feSquare(&t2, &t1)
 	for i = 1; i < 5; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t1 = z^(2^10 - 1)
 	feMul(&t1, &t2, &t1)
 	// t2 = z^(2^20 - 2^10)
 	feSquare(&t2, &t1)
 	for i = 1; i < 10; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t2 = z^(2^20 - 1)
 	feMul(&t2, &t2, &t1)
 	// t3 = z^(2^40 - 2^20)
 	feSquare(&t3, &t2)
 	for i = 1; i < 20; i++ {
 		feSquare(&t3, &t3)
 	}
 	// t2 = z^(2^40 - 1)
 	feMul(&t2, &t3, &t2)
 	// t2 = z^(2^50 - 2^10)
 	feSquare(&t2, &t2)
 	for i = 1; i < 10; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t1 = z^(2^50 - 1)
 	feMul(&t1, &t2, &t1)
 	// t2 = z^(2^100 - 2^50)
 	feSquare(&t2, &t1)
 	for i = 1; i < 50; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t2 = z^(2^100 - 1)
 	feMul(&t2, &t2, &t1)
 	// t3 = z^(2^200 - 2^100)
 	feSquare(&t3, &t2)
 	for i = 1; i < 100; i++ {
 		feSquare(&t3, &t3)
 	}
 	// t2 = z^(2^200 - 1)
 	feMul(&t2, &t3, &t2)
 	// t2 = z^(2^250 - 2^50)
 	feSquare(&t2, &t2)
 	for i = 1; i < 50; i++ {
 		feSquare(&t2, &t2)
 	}
 	// t1 = z^(2^250 - 1)
 	feMul(&t1, &t2, &t1)
 	// t1 = z^(2^255 - 2^5)
 	feSquare(&t1, &t1)
 	for i = 1; i < 5; i++ {
 		feSquare(&t1, &t1)
 	}
 	// out = z^(2^255 - 32 + 11) = z^(2^255 - 21)
 	feMul(out, &t1, &t0)
 }
 // scalarMultGeneric sets out to the x-coordinate of in*base, computed in
 // pure Go with a Montgomery ladder over the ten-limb fieldElement
 // representation.
 //
 // in is not modified: it is copied into e and only the copy is clamped.
 func scalarMultGeneric(out, in, base *[32]byte) {
 	var e [32]byte
 	copy(e[:], in[:])
 	// Clamp the scalar copy: clear the three low bits, clear bit 255 and set
 	// bit 254.
 	e[0] &= 248
 	e[31] &= 127
 	e[31] |= 64
 	// Ladder state: (x2:z2) starts as (1:0) because z2 keeps its zero value,
 	// and (x3:z3) starts as (x1:1) where x1 is the decoded base coordinate.
 	var x1, x2, z2, x3, z3, tmp0, tmp1 fieldElement
 	feFromBytes(&x1, base)
 	feOne(&x2)
 	feCopy(&x3, &x1)
 	feOne(&z3)
 	// swap remembers the previously processed bit, so the two pairs are only
 	// exchanged when the current bit differs from the previous one.
 	swap := int32(0)
 	for pos := 254; pos >= 0; pos-- {
 		// b is bit number pos of the clamped scalar.
 		b := e[pos/8] >> uint(pos&7)
 		b &= 1
 		swap ^= int32(b)
 		feCSwap(&x2, &x3, swap)
 		feCSwap(&z2, &z3, swap)
 		swap = int32(b)
 		// One ladder step on (x2:z2) and (x3:z3). The statements reuse
 		// their operands as scratch space, so their order must not change.
 		feSub(&tmp0, &x3, &z3)
 		feSub(&tmp1, &x2, &z2)
 		feAdd(&x2, &x2, &z2)
 		feAdd(&z2, &x3, &z3)
 		feMul(&z3, &tmp0, &x2)
 		feMul(&z2, &z2, &tmp1)
 		feSquare(&tmp0, &tmp1)
 		feSquare(&tmp1, &x2)
 		feAdd(&x3, &z3, &z2)
 		feSub(&z2, &z3, &z2)
 		feMul(&x2, &tmp1, &tmp0)
 		feSub(&tmp1, &tmp1, &tmp0)
 		feSquare(&z2, &z2)
 		feMul121666(&z3, &tmp1)
 		feSquare(&x3, &x3)
 		feAdd(&tmp0, &tmp0, &z3)
 		feMul(&z3, &x1, &z2)
 		feMul(&z2, &tmp1, &tmp0)
 	}
 	// Undo the swap left pending by the last processed bit.
 	feCSwap(&x2, &x3, swap)
 	feCSwap(&z2, &z3, swap)
 	// Convert the projective result to affine form: out = x2 * z2^-1.
 	feInvert(&z2, &z2)
 	feMul(&x2, &x2, &z2)
 	feToBytes(out, &x2)
 }
--- a/vendor/golang.org/x/crypto/curve25519/curve25519_noasm.go
+++ b/vendor/golang.org/x/crypto/curve25519/curve25519_noasm.go
@ -0,0 +1,11 @@
 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !amd64 gccgo appengine purego
 package curve25519
 // scalarMult is the implementation selected by this file's build constraint
 // (non-amd64 targets, gccgo, appengine or the purego tag). It forwards its
 // arguments unchanged to scalarMultGeneric.
 func scalarMult(out, in, base *[32]byte) {
 	scalarMultGeneric(out, in, base)
 }
--- a/vendor/golang.org/x/crypto/curve25519/doc.go
+++ b/vendor/golang.org/x/crypto/curve25519/doc.go
@ -1,23 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package curve25519 provides an implementation of scalar multiplication on
 // the elliptic curve known as curve25519. See https://cr.yp.to/ecdh.html
 package curve25519 // import "golang.org/x/crypto/curve25519"
 // basePoint is the x coordinate of the generator of the curve: the value 9
 // encoded as 32 little-endian bytes.
 var basePoint = [32]byte{9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 // ScalarMult sets dst to the product in*base where dst and base are the x
 // coordinates of group points and all values are in little-endian form.
 //
 // in is the scalar. The call is forwarded unchanged to the package-internal
 // scalarMult.
 func ScalarMult(dst, in, base *[32]byte) {
 	scalarMult(dst, in, base)
 }
 // ScalarBaseMult sets dst to the product in*base where dst and base are the x
 // coordinates of group points, base is the standard generator and all values
 // are in little-endian form.
 //
 // It is ScalarMult with base fixed to the package-level basePoint.
 func ScalarBaseMult(dst, in *[32]byte) {
 	ScalarMult(dst, in, &basePoint)
 }
--- a/vendor/golang.org/x/crypto/curve25519/freeze_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/freeze_amd64.s
@ -1,73 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 // +build amd64,!gccgo,!appengine
 #include "const_amd64.h"
 // func freeze(inout *[5]uint64)
 //
 // freeze carries and conditionally reduces the five-limb value at inout in
 // place. Each limb is split at bit 51 (SHRQ $51 extracts the carry).
 // REDMASK51 comes from const_amd64.h and is presumably 2^51-1 - confirm there.
 TEXT ·freeze(SB),7,$0-8
 	MOVQ inout+0(FP), DI
 	// Load the five limbs into SI, DX, CX, R8, R9.
 	MOVQ 0(DI),SI
 	MOVQ 8(DI),DX
 	MOVQ 16(DI),CX
 	MOVQ 24(DI),R8
 	MOVQ 32(DI),R9
 	// AX = REDMASK51, R10 = REDMASK51-18, R11 = pass counter.
 	MOVQ $REDMASK51,AX
 	MOVQ AX,R10
 	SUBQ $18,R10
 	MOVQ $3,R11
 	// Three carry passes: the bits of each limb above bit 50 are added to the
 	// next limb; the carry out of the top limb is multiplied by 19 and added
 	// back into limb 0.
 REDUCELOOP:
 	MOVQ SI,R12
 	SHRQ $51,R12
 	ANDQ AX,SI
 	ADDQ R12,DX
 	MOVQ DX,R12
 	SHRQ $51,R12
 	ANDQ AX,DX
 	ADDQ R12,CX
 	MOVQ CX,R12
 	SHRQ $51,R12
 	ANDQ AX,CX
 	ADDQ R12,R8
 	MOVQ R8,R12
 	SHRQ $51,R12
 	ANDQ AX,R8
 	ADDQ R12,R9
 	MOVQ R9,R12
 	SHRQ $51,R12
 	ANDQ AX,R9
 	IMUL3Q $19,R12,R12
 	ADDQ R12,SI
 	SUBQ $1,R11
 	JA REDUCELOOP
 	// R11 is zero here. R12 starts as 1 and is cleared by the conditional
 	// moves unless limb 0 passes the comparison against R10 and limbs 1-4
 	// all equal REDMASK51.
 	MOVQ $1,R12
 	CMPQ R10,SI
 	CMOVQLT R11,R12
 	CMPQ AX,DX
 	CMOVQNE R11,R12
 	CMPQ AX,CX
 	CMOVQNE R11,R12
 	CMPQ AX,R8
 	CMOVQNE R11,R12
 	CMPQ AX,R9
 	CMOVQNE R11,R12
 	// Turn R12 into an all-ones or all-zero mask and use it to subtract
 	// (or not) R10 from limb 0 and REDMASK51 from limbs 1-4 without branching.
 	NEGQ R12
 	ANDQ R12,AX
 	ANDQ R12,R10
 	SUBQ R10,SI
 	SUBQ AX,DX
 	SUBQ AX,CX
 	SUBQ AX,R8
 	SUBQ AX,R9
 	// Write the limbs back.
 	MOVQ SI,0(DI)
 	MOVQ DX,8(DI)
 	MOVQ CX,16(DI)
 	MOVQ R8,24(DI)
 	MOVQ R9,32(DI)
 	RET
--- a/vendor/golang.org/x/crypto/curve25519/mul_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/mul_amd64.s
@ -1,169 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 // +build amd64,!gccgo,!appengine
 #include "const_amd64.h"
 // func mul(dest, a, b *[5]uint64)
 //
 // mul multiplies the five-limb values at a and b and stores the five-limb
 // result at dest. Products whose limb indices sum to 5 or more are scaled by
 // 19 and wrapped around to the low limbs. Each output limb is accumulated as
 // a 128-bit value in a register pair:
 //   limb 0: R9:R8   limb 1: R11:R10   limb 2: R13:R12
 //   limb 3: R15:R14 limb 4: BP:BX
 // REDMASK51 comes from const_amd64.h (presumably 2^51-1 - confirm there).
 TEXT ·mul(SB),0,$16-24
 	MOVQ dest+0(FP), DI
 	MOVQ a+8(FP), SI
 	MOVQ b+16(FP), DX
 	// Keep the b pointer in CX: MULQ writes its high result word to DX.
 	MOVQ DX,CX
 	// 0(SP) = 19*a[3] and 8(SP) = 19*a[4], reused by later products.
 	MOVQ 24(SI),DX
 	IMUL3Q $19,DX,AX
 	MOVQ AX,0(SP)
 	MULQ 16(CX)
 	MOVQ AX,R8
 	MOVQ DX,R9
 	MOVQ 32(SI),DX
 	IMUL3Q $19,DX,AX
 	MOVQ AX,8(SP)
 	MULQ 8(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	MOVQ 0(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	MOVQ 0(SI),AX
 	MULQ 8(CX)
 	MOVQ AX,R10
 	MOVQ DX,R11
 	MOVQ 0(SI),AX
 	MULQ 16(CX)
 	MOVQ AX,R12
 	MOVQ DX,R13
 	MOVQ 0(SI),AX
 	MULQ 24(CX)
 	MOVQ AX,R14
 	MOVQ DX,R15
 	MOVQ 0(SI),AX
 	MULQ 32(CX)
 	MOVQ AX,BX
 	MOVQ DX,BP
 	MOVQ 8(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	MOVQ 8(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	MOVQ 8(SI),AX
 	MULQ 16(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	MOVQ 8(SI),AX
 	MULQ 24(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	MOVQ 8(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	MOVQ 16(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	MOVQ 16(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	MOVQ 16(SI),AX
 	MULQ 16(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	MOVQ 16(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 24(CX)
 	ADDQ AX,R8
 	ADCQ DX,R9
 	MOVQ 16(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	MOVQ 24(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	MOVQ 24(SI),AX
 	MULQ 8(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	MOVQ 0(SP),AX
 	MULQ 24(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	MOVQ 0(SP),AX
 	MULQ 32(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	MOVQ 32(SI),AX
 	MULQ 0(CX)
 	ADDQ AX,BX
 	ADCQ DX,BP
 	MOVQ 8(SP),AX
 	MULQ 16(CX)
 	ADDQ AX,R10
 	ADCQ DX,R11
 	MOVQ 8(SP),AX
 	MULQ 24(CX)
 	ADDQ AX,R12
 	ADCQ DX,R13
 	MOVQ 8(SP),AX
 	MULQ 32(CX)
 	ADDQ AX,R14
 	ADCQ DX,R15
 	// Split every 128-bit accumulator at bit 51: the three-operand SHLQ $13
 	// shifts the high word left while pulling in the top 13 bits of the low
 	// word, so the high word ends up holding bits 51 and above. The low word
 	// is masked with REDMASK51 and the previous limb's high part is added in.
 	MOVQ $REDMASK51,SI
 	SHLQ $13,R8,R9
 	ANDQ SI,R8
 	SHLQ $13,R10,R11
 	ANDQ SI,R10
 	ADDQ R9,R10
 	SHLQ $13,R12,R13
 	ANDQ SI,R12
 	ADDQ R11,R12
 	SHLQ $13,R14,R15
 	ANDQ SI,R14
 	ADDQ R13,R14
 	SHLQ $13,BX,BP
 	ANDQ SI,BX
 	ADDQ R15,BX
 	// The overflow of limb 4 wraps to limb 0 scaled by 19.
 	IMUL3Q $19,BP,DX
 	ADDQ DX,R8
 	// One sequential carry pass over the limbs; the final carry is again
 	// multiplied by 19 and added to limb 0.
 	MOVQ R8,DX
 	SHRQ $51,DX
 	ADDQ R10,DX
 	MOVQ DX,CX
 	SHRQ $51,DX
 	ANDQ SI,R8
 	ADDQ R12,DX
 	MOVQ DX,R9
 	SHRQ $51,DX
 	ANDQ SI,CX
 	ADDQ R14,DX
 	MOVQ DX,AX
 	SHRQ $51,DX
 	ANDQ SI,R9
 	ADDQ BX,DX
 	MOVQ DX,R10
 	SHRQ $51,DX
 	ANDQ SI,AX
 	IMUL3Q $19,DX,DX
 	ADDQ DX,R8
 	ANDQ SI,R10
 	// Store limbs 0-4 (R8, CX, R9, AX, R10) to dest.
 	MOVQ R8,0(DI)
 	MOVQ CX,8(DI)
 	MOVQ R9,16(DI)
 	MOVQ AX,24(DI)
 	MOVQ R10,32(DI)
 	RET
--- a/vendor/golang.org/x/crypto/curve25519/square_amd64.s
+++ b/vendor/golang.org/x/crypto/curve25519/square_amd64.s
@ -1,132 +0,0 @@
 // Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This code was translated into a form compatible with 6a from the public
 // domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html
 // +build amd64,!gccgo,!appengine
 #include "const_amd64.h"
 // func square(out, in *[5]uint64)
 //
 // square multiplies the five-limb value at in by itself and stores the
 // five-limb result at out. Cross products are doubled (SHLQ $1, or folded
 // into the constant 38 = 2*19) and products whose limb indices sum to 5 or
 // more are scaled by 19 and wrapped to the low limbs. Each output limb is
 // accumulated as a 128-bit value in a register pair:
 //   limb 0: R8:CX    limb 1: R10:R9   limb 2: R12:R11
 //   limb 3: R14:R13  limb 4: BX:R15
 // REDMASK51 comes from const_amd64.h (presumably 2^51-1 - confirm there).
 TEXT ·square(SB),7,$0-16
 	MOVQ out+0(FP), DI
 	MOVQ in+8(FP), SI
 	MOVQ 0(SI),AX
 	MULQ 0(SI)
 	MOVQ AX,CX
 	MOVQ DX,R8
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 8(SI)
 	MOVQ AX,R9
 	MOVQ DX,R10
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 16(SI)
 	MOVQ AX,R11
 	MOVQ DX,R12
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 24(SI)
 	MOVQ AX,R13
 	MOVQ DX,R14
 	MOVQ 0(SI),AX
 	SHLQ $1,AX
 	MULQ 32(SI)
 	MOVQ AX,R15
 	MOVQ DX,BX
 	MOVQ 8(SI),AX
 	MULQ 8(SI)
 	ADDQ AX,R11
 	ADCQ DX,R12
 	MOVQ 8(SI),AX
 	SHLQ $1,AX
 	MULQ 16(SI)
 	ADDQ AX,R13
 	ADCQ DX,R14
 	MOVQ 8(SI),AX
 	SHLQ $1,AX
 	MULQ 24(SI)
 	ADDQ AX,R15
 	ADCQ DX,BX
 	MOVQ 8(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,CX
 	ADCQ DX,R8
 	MOVQ 16(SI),AX
 	MULQ 16(SI)
 	ADDQ AX,R15
 	ADCQ DX,BX
 	MOVQ 16(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 24(SI)
 	ADDQ AX,CX
 	ADCQ DX,R8
 	MOVQ 16(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R9
 	ADCQ DX,R10
 	MOVQ 24(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 24(SI)
 	ADDQ AX,R9
 	ADCQ DX,R10
 	MOVQ 24(SI),DX
 	IMUL3Q $38,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R11
 	ADCQ DX,R12
 	MOVQ 32(SI),DX
 	IMUL3Q $19,DX,AX
 	MULQ 32(SI)
 	ADDQ AX,R13
 	ADCQ DX,R14
 	// Split every 128-bit accumulator at bit 51: the three-operand SHLQ $13
 	// leaves bits 51 and above in the high word; the low word is masked with
 	// REDMASK51 and the previous limb's high part is added in.
 	MOVQ $REDMASK51,SI
 	SHLQ $13,CX,R8
 	ANDQ SI,CX
 	SHLQ $13,R9,R10
 	ANDQ SI,R9
 	ADDQ R8,R9
 	SHLQ $13,R11,R12
 	ANDQ SI,R11
 	ADDQ R10,R11
 	SHLQ $13,R13,R14
 	ANDQ SI,R13
 	ADDQ R12,R13
 	SHLQ $13,R15,BX
 	ANDQ SI,R15
 	ADDQ R14,R15
 	// The overflow of limb 4 wraps to limb 0 scaled by 19.
 	IMUL3Q $19,BX,DX
 	ADDQ DX,CX
 	// One sequential carry pass over the limbs; the final carry is again
 	// multiplied by 19 and added to limb 0.
 	MOVQ CX,DX
 	SHRQ $51,DX
 	ADDQ R9,DX
 	ANDQ SI,CX
 	MOVQ DX,R8
 	SHRQ $51,DX
 	ADDQ R11,DX
 	ANDQ SI,R8
 	MOVQ DX,R9
 	SHRQ $51,DX
 	ADDQ R13,DX
 	ANDQ SI,R9
 	MOVQ DX,AX
 	SHRQ $51,DX
 	ADDQ R15,DX
 	ANDQ SI,AX
 	MOVQ DX,R10
 	SHRQ $51,DX
 	IMUL3Q $19,DX,DX
 	ADDQ DX,CX
 	ANDQ SI,R10
 	// Store limbs 0-4 (CX, R8, R9, AX, R10) to out.
 	MOVQ CX,0(DI)
 	MOVQ R8,8(DI)
 	MOVQ R9,16(DI)
 	MOVQ AX,24(DI)
 	MOVQ R10,32(DI)
 	RET
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_arm64.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_arm64.go
@ -1,31 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.11
 // +build !gccgo
 package chacha20
 const (
 	haveAsm = true
 	bufSize = 256
 )
 //go:noescape
 func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
 // xorKeyStreamAsm XORs src into dst using the assembly routine
 // xorKeyStreamVX. The routine is called on dst/src directly when src holds at
 // least bufSize bytes. A trailing partial chunk (len(src) not a multiple of
 // bufSize) is copied into the zeroed c.buf, processed there, and the
 // corresponding bytes are copied out to dst; c.len records how many bytes of
 // c.buf were not handed out.
 func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
 	tail := len(src) % bufSize
 	if len(src) >= bufSize {
 		xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter)
 	}
 	if tail == 0 {
 		return
 	}
 	head := len(src) - tail
 	c.buf = [bufSize]byte{}
 	copy(c.buf[:], src[head:])
 	xorKeyStreamVX(c.buf[:], c.buf[:], &c.key, &c.nonce, &c.counter)
 	emitted := copy(dst[head:], c.buf[:tail])
 	c.len = bufSize - emitted
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_generic.go
@ -1,264 +0,0 @@
 // Copyright 2016 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package chacha20 implements the core ChaCha20 function as specified
 // in https://tools.ietf.org/html/rfc7539#section-2.3.
 package chacha20
 import (
 	"crypto/cipher"
 	"encoding/binary"
 	"golang.org/x/crypto/internal/subtle"
 )
 // assert that *Cipher implements cipher.Stream
 var _ cipher.Stream = (*Cipher)(nil)
 // Cipher is a stateful instance of ChaCha20 using a particular key
 // and nonce. A *Cipher implements the cipher.Stream interface.
 //
 // key and nonce are held as 32-bit words. Keystream that has been generated
 // but not yet consumed is kept at the end of buf, with len counting those
 // bytes.
 type Cipher struct {
 	key     [8]uint32
 	counter uint32 // incremented after each block
 	nonce   [3]uint32
 	buf     [bufSize]byte // buffer for unused keystream bytes
 	len     int           // number of unused keystream bytes at end of buf
 }
 // New creates a new ChaCha20 stream cipher with the given key and nonce.
 // The initial counter value is set to 0.
 //
 // key and nonce are taken as 32-bit words and copied into the returned
 // Cipher; all other fields start at their zero value, so no keystream is
 // buffered yet.
 func New(key [8]uint32, nonce [3]uint32) *Cipher {
 	return &Cipher{key: key, nonce: nonce}
 }
 // ChaCha20 constants spelling "expand 32-byte k"
 const (
 	j0 uint32 = 0x61707865
 	j1 uint32 = 0x3320646e
 	j2 uint32 = 0x79622d32
 	j3 uint32 = 0x6b206574
 )
 // quarterRound applies one ChaCha quarter round to the words a, b, c, d and
 // returns the updated words in the same order. It consists of four
 // add / xor / rotate-left steps with rotation amounts 16, 12, 8 and 7.
 func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
 	var t uint32
 	// a += b; d = rotl(d^a, 16)
 	a += b
 	t = d ^ a
 	d = t<<16 | t>>16
 	// c += d; b = rotl(b^c, 12)
 	c += d
 	t = b ^ c
 	b = t<<12 | t>>20
 	// a += b; d = rotl(d^a, 8)
 	a += b
 	t = d ^ a
 	d = t<<8 | t>>24
 	// c += d; b = rotl(b^c, 7)
 	c += d
 	t = b ^ c
 	b = t<<7 | t>>25
 	return a, b, c, d
 }
 // XORKeyStream XORs each byte in the given slice with a byte from the
 // cipher's key stream. Dst and src must overlap entirely or not at all.
 //
 // If len(dst) < len(src), XORKeyStream will panic. It is acceptable
 // to pass a dst bigger than src, and in that case, XORKeyStream will
 // only update dst[:len(src)] and will not touch the rest of dst.
 //
 // Multiple calls to XORKeyStream behave as if the concatenation of
 // the src buffers was passed in a single run. That is, Cipher
 // maintains state and does not reset at each XORKeyStream call.
 //
 // It also panics with "chacha20: counter overflow" when the 32-bit block
 // counter would run out.
 func (s *Cipher) XORKeyStream(dst, src []byte) {
 	if len(dst) < len(src) {
 		panic("chacha20: output smaller than input")
 	}
 	if subtle.InexactOverlap(dst[:len(src)], src) {
 		panic("chacha20: invalid buffer overlap")
 	}
 	// xor src with buffered keystream first
 	if s.len != 0 {
 		// the unused keystream is the last s.len bytes of s.buf
 		buf := s.buf[len(s.buf)-s.len:]
 		if len(src) < len(buf) {
 			buf = buf[:len(src)]
 		}
 		td, ts := dst[:len(buf)], src[:len(buf)] // BCE hint
 		for i, b := range buf {
 			td[i] = ts[i] ^ b
 		}
 		s.len -= len(buf)
 		if s.len != 0 {
 			// src was fully consumed by buffered keystream
 			return
 		}
 		s.buf = [len(s.buf)]byte{} // zero the empty buffer
 		src = src[len(buf):]
 		dst = dst[len(buf):]
 	}
 	if len(src) == 0 {
 		return
 	}
 	if haveAsm {
 		// 1<<38 bytes is 1<<32 blocks of 64 bytes, i.e. the whole range of
 		// the 32-bit block counter; refuse input that would reach it.
 		if uint64(len(src))+uint64(s.counter)*64 > (1<<38)-64 {
 			panic("chacha20: counter overflow")
 		}
 		s.xorKeyStreamAsm(dst, src)
 		return
 	}
 	// set up a 64-byte buffer to pad out the final block if needed
 	// (hoisted out of the main loop to avoid spills)
 	rem := len(src) % 64  // length of final block
 	fin := len(src) - rem // index of final block
 	if rem > 0 {
 		copy(s.buf[len(s.buf)-64:], src[fin:])
 	}
 	// pre-calculate most of the first round
 	// (these three column quarter rounds do not involve the counter, so
 	// they are the same for every block of this call)
 	s1, s5, s9, s13 := quarterRound(j1, s.key[1], s.key[5], s.nonce[0])
 	s2, s6, s10, s14 := quarterRound(j2, s.key[2], s.key[6], s.nonce[1])
 	s3, s7, s11, s15 := quarterRound(j3, s.key[3], s.key[7], s.nonce[2])
 	n := len(src)
 	src, dst = src[:n:n], dst[:n:n] // BCE hint
 	for i := 0; i < n; i += 64 {
 		// calculate the remainder of the first round
 		s0, s4, s8, s12 := quarterRound(j0, s.key[0], s.key[4], s.counter)
 		// execute the second round
 		x0, x5, x10, x15 := quarterRound(s0, s5, s10, s15)
 		x1, x6, x11, x12 := quarterRound(s1, s6, s11, s12)
 		x2, x7, x8, x13 := quarterRound(s2, s7, s8, s13)
 		x3, x4, x9, x14 := quarterRound(s3, s4, s9, s14)
 		// execute the remaining 18 rounds
 		// (this inner i shadows the block offset i of the outer loop)
 		for i := 0; i < 9; i++ {
 			x0, x4, x8, x12 = quarterRound(x0, x4, x8, x12)
 			x1, x5, x9, x13 = quarterRound(x1, x5, x9, x13)
 			x2, x6, x10, x14 = quarterRound(x2, x6, x10, x14)
 			x3, x7, x11, x15 = quarterRound(x3, x7, x11, x15)
 			x0, x5, x10, x15 = quarterRound(x0, x5, x10, x15)
 			x1, x6, x11, x12 = quarterRound(x1, x6, x11, x12)
 			x2, x7, x8, x13 = quarterRound(x2, x7, x8, x13)
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
 		// add the initial state words to the permuted state
 		x0 += j0
 		x1 += j1
 		x2 += j2
 		x3 += j3
 		x4 += s.key[0]
 		x5 += s.key[1]
 		x6 += s.key[2]
 		x7 += s.key[3]
 		x8 += s.key[4]
 		x9 += s.key[5]
 		x10 += s.key[6]
 		x11 += s.key[7]
 		x12 += s.counter
 		x13 += s.nonce[0]
 		x14 += s.nonce[1]
 		x15 += s.nonce[2]
 		// increment the counter
 		s.counter += 1
 		if s.counter == 0 {
 			panic("chacha20: counter overflow")
 		}
 		// pad to 64 bytes if needed
 		in, out := src[i:], dst[i:]
 		if i == fin {
 			// src[fin:] has already been copied into s.buf before
 			// the main loop
 			in, out = s.buf[len(s.buf)-64:], s.buf[len(s.buf)-64:]
 		}
 		in, out = in[:64], out[:64] // BCE hint
 		// XOR the key stream with the source and write out the result
 		xor(out[0:], in[0:], x0)
 		xor(out[4:], in[4:], x1)
 		xor(out[8:], in[8:], x2)
 		xor(out[12:], in[12:], x3)
 		xor(out[16:], in[16:], x4)
 		xor(out[20:], in[20:], x5)
 		xor(out[24:], in[24:], x6)
 		xor(out[28:], in[28:], x7)
 		xor(out[32:], in[32:], x8)
 		xor(out[36:], in[36:], x9)
 		xor(out[40:], in[40:], x10)
 		xor(out[44:], in[44:], x11)
 		xor(out[48:], in[48:], x12)
 		xor(out[52:], in[52:], x13)
 		xor(out[56:], in[56:], x14)
 		xor(out[60:], in[60:], x15)
 	}
 	// copy any trailing bytes out of the buffer and into dst
 	if rem != 0 {
 		// the last 64-rem bytes of the final block stay buffered for the
 		// next call; copy writes only the rem bytes left in dst[fin:]
 		s.len = 64 - rem
 		copy(dst[fin:], s.buf[len(s.buf)-64:])
 	}
 }
 // Advance drops buffered key stream bytes up to the next 64 byte block
 // boundary. When the stream already sits on a block boundary nothing is
 // discarded. If no buffered bytes remain afterwards, the buffer is wiped.
 func (s *Cipher) Advance() {
 	remaining := s.len - s.len%64
 	s.len = remaining
 	if remaining != 0 {
 		return
 	}
 	// Nothing is buffered any more, so clear the stale keystream.
 	s.buf = [len(s.buf)]byte{}
 }
 // XORKeyStream crypts bytes from in to out using the given key and counters.
 // In and out must overlap entirely or not at all. Counter contains the raw
 // ChaCha20 counter bytes (i.e. block counter followed by nonce).
 //
 // Key and counter are decoded as little-endian 32-bit words: counter[0:4] is
 // the block counter and counter[4:16] the three nonce words.
 func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
 	var c Cipher
 	for i := range c.key {
 		c.key[i] = binary.LittleEndian.Uint32(key[4*i : 4*i+4])
 	}
 	c.counter = binary.LittleEndian.Uint32(counter[0:4])
 	for i := range c.nonce {
 		c.nonce[i] = binary.LittleEndian.Uint32(counter[4*i+4 : 4*i+8])
 	}
 	c.XORKeyStream(out, in)
 }
 // HChaCha20 uses the ChaCha20 core to generate a derived key from a key and a
 // nonce. It should only be used as part of the XChaCha20 construction.
 //
 // The state is the four constants, the eight key words and the four nonce
 // words. After ten double rounds the first and last four state words are
 // returned; no feed-forward addition is applied.
 func HChaCha20(key *[8]uint32, nonce *[4]uint32) [8]uint32 {
 	st := [16]uint32{
 		j0, j1, j2, j3,
 		key[0], key[1], key[2], key[3],
 		key[4], key[5], key[6], key[7],
 		nonce[0], nonce[1], nonce[2], nonce[3],
 	}
 	for round := 0; round < 10; round++ {
 		// column round
 		st[0], st[4], st[8], st[12] = quarterRound(st[0], st[4], st[8], st[12])
 		st[1], st[5], st[9], st[13] = quarterRound(st[1], st[5], st[9], st[13])
 		st[2], st[6], st[10], st[14] = quarterRound(st[2], st[6], st[10], st[14])
 		st[3], st[7], st[11], st[15] = quarterRound(st[3], st[7], st[11], st[15])
 		// diagonal round
 		st[0], st[5], st[10], st[15] = quarterRound(st[0], st[5], st[10], st[15])
 		st[1], st[6], st[11], st[12] = quarterRound(st[1], st[6], st[11], st[12])
 		st[2], st[7], st[8], st[13] = quarterRound(st[2], st[7], st[8], st[13])
 		st[3], st[4], st[9], st[14] = quarterRound(st[3], st[4], st[9], st[14])
 	}
 	return [8]uint32{
 		st[0], st[1], st[2], st[3],
 		st[12], st[13], st[14], st[15],
 	}
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_ppc64le.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_ppc64le.go
@ -1,53 +0,0 @@
 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build ppc64le,!gccgo,!appengine
 package chacha20
 import (
 	"encoding/binary"
 )
 var haveAsm = true
 const bufSize = 256
 //go:noescape
 func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 // xorKeyStreamAsm XORs src into dst using the assembly routine
 // chaCha20_ctr32_vsx.
 //
 // Inputs of at least bufSize bytes are passed to the routine whole. Shorter
 // inputs are served from c.buf: the routine is run over c.buf in place, src
 // is XORed with the front of c.buf, and c.len records the bytes left over.
 //
 // NOTE(review): in the first branch c.len is not updated even when len(src)
 // is not a multiple of bufSize. Whether follow-up calls then continue the
 // key stream correctly depends on how the assembly advances c.counter -
 // confirm against the .s file.
 func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
 	// This implementation can handle buffers that aren't multiples of
 	// 256.
 	if len(src) >= bufSize {
 		chaCha20_ctr32_vsx(&dst[0], &src[0], len(src), &c.key, &c.counter)
 	} else if len(src)%bufSize != 0 {
 		// Here len(src) < bufSize, so this is every non-empty short input.
 		// The routine transforms c.buf in place. Presumably c.buf is all
 		// zero on entry so the result is raw key stream - the generic
 		// XORKeyStream zeroes it once drained; TODO confirm.
 		chaCha20_ctr32_vsx(&c.buf[0], &c.buf[0], bufSize, &c.key, &c.counter)
 		// start is always 0 in this branch because len(src) < bufSize.
 		start := len(src) - len(src)%bufSize
 		ts, td, tb := src[start:], dst[start:], c.buf[:]
 		// Unroll loop to XOR 32 bytes per iteration.
 		// Note that i grows by 32 while ts shrinks by 32 on every pass, so
 		// the loop stops early; the remaining bytes are handled one at a
 		// time by the loop that follows.
 		for i := 0; i < len(ts)-32; i += 32 {
 			td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
 			s0 := binary.LittleEndian.Uint64(ts[0:8])
 			s1 := binary.LittleEndian.Uint64(ts[8:16])
 			s2 := binary.LittleEndian.Uint64(ts[16:24])
 			s3 := binary.LittleEndian.Uint64(ts[24:32])
 			b0 := binary.LittleEndian.Uint64(tb[0:8])
 			b1 := binary.LittleEndian.Uint64(tb[8:16])
 			b2 := binary.LittleEndian.Uint64(tb[16:24])
 			b3 := binary.LittleEndian.Uint64(tb[24:32])
 			binary.LittleEndian.PutUint64(td[0:8], s0^b0)
 			binary.LittleEndian.PutUint64(td[8:16], s1^b1)
 			binary.LittleEndian.PutUint64(td[16:24], s2^b2)
 			binary.LittleEndian.PutUint64(td[24:32], s3^b3)
 			ts, td, tb = ts[32:], td[32:], tb[32:]
 		}
 		td, tb = td[:len(ts)], tb[:len(ts)] // bounds check elimination
 		for i, v := range ts {
 			td[i] = tb[i] ^ v
 		}
 		// The unused part of c.buf stays available for the next call.
 		c.len = bufSize - (len(src) % bufSize)
 	}
 }
--- a/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go
+++ b/vendor/golang.org/x/crypto/internal/chacha20/chacha_s390x.go
@ -1,29 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build s390x,!gccgo,!appengine
 package chacha20
 import (
 	"golang.org/x/sys/cpu"
 )
 var haveAsm = cpu.S390X.HasVX
 const bufSize = 256
 // xorKeyStreamVX is an assembly implementation of XORKeyStream. It must only
 // be called when the vector facility is available.
 // Implementation in asm_s390x.s.
 //go:noescape
 func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32, buf *[256]byte, len *int)
 // xorKeyStreamAsm hands dst and src to the assembly routine xorKeyStreamVX
 // together with pointers to the cipher's key, nonce, counter, keystream
 // buffer and buffered-byte count, so the routine can update that state
 // itself.
 func (c *Cipher) xorKeyStreamAsm(dst, src []byte) {
 	xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter, &c.buf, &c.len)
 }
 // EXRL targets, DO NOT CALL!
 func mvcSrcToBuf()
 func mvcBufToDst()
--- a/vendor/golang.org/x/crypto/nacl/box/box.go
+++ b/vendor/golang.org/x/crypto/nacl/box/box.go
@ -31,19 +31,30 @@ Thus large amounts of data should be chunked so that each message is small.
 chunk size.
 This package is interoperable with NaCl: https://nacl.cr.yp.to/box.html.
 Anonymous sealing/opening is an extension of NaCl defined by and interoperable
 with libsodium:
 https://libsodium.gitbook.io/doc/public-key_cryptography/sealed_boxes.
 */
 package box // import "golang.org/x/crypto/nacl/box"
 import (
 	cryptorand "crypto/rand"
 	"io"
 	"golang.org/x/crypto/blake2b"
 	"golang.org/x/crypto/curve25519"
 	"golang.org/x/crypto/nacl/secretbox"
 	"golang.org/x/crypto/salsa20/salsa"
 )
-// Overhead is the number of bytes of overhead when boxing a message.
+const (
-const Overhead = secretbox.Overhead
+	// Overhead is the number of bytes of overhead when boxing a message.
 	Overhead = secretbox.Overhead
 	// AnonymousOverhead is the number of bytes of overhead when using anonymous
 	// sealed boxes.
 	AnonymousOverhead = Overhead + 32
 )
 // GenerateKey generates a new public/private key pair suitable for use with
 // Seal and Open.
@ -101,3 +112,71 @@ func Open(out, box []byte, nonce *[24]byte, peersPublicKey, privateKey *[32]byte
 func OpenAfterPrecomputation(out, box []byte, nonce *[24]byte, sharedKey *[32]byte) ([]byte, bool) {
 	return secretbox.Open(out, box, nonce, sharedKey)
 }
 // SealAnonymous appends an encrypted and authenticated copy of message to out,
 // which will be AnonymousOverhead bytes longer than the original and must not
 // overlap it. This differs from Seal in that the sender is not required to
 // provide a private key.
 //
 // A fresh ephemeral key pair is generated from rand (crypto/rand.Reader when
 // rand is nil). The ephemeral public key is written in front of the box and
 // the nonce is derived from it and the recipient's key via sealNonce.
 func SealAnonymous(out, message []byte, recipient *[32]byte, rand io.Reader) ([]byte, error) {
 	if rand == nil {
 		rand = cryptorand.Reader
 	}
 	pub, priv, err := GenerateKey(rand)
 	if err != nil {
 		return nil, err
 	}
 	var nonce [24]byte
 	if err = sealNonce(pub, recipient, &nonce); err != nil {
 		return nil, err
 	}
 	// Grow out once up front so the appends below do not reallocate.
 	need := len(out) + AnonymousOverhead + len(message)
 	if cap(out) < need {
 		grown := make([]byte, len(out), need)
 		copy(grown, out)
 		out = grown
 	}
 	out = append(out, pub[:]...)
 	sealed := Seal(out, message, &nonce, recipient, priv)
 	return sealed, nil
 }
 // OpenAnonymous authenticates and decrypts a box produced by SealAnonymous and
 // appends the message to out, which must not overlap box. The output will be
 // AnonymousOverhead bytes smaller than box.
 //
 // It returns (nil, false) when box is shorter than AnonymousOverhead or the
 // nonce cannot be derived; otherwise the result is whatever Open reports.
 func OpenAnonymous(out, box []byte, publicKey, privateKey *[32]byte) (message []byte, ok bool) {
 	if len(box) < AnonymousOverhead {
 		return nil, false
 	}
 	// The first 32 bytes carry the sender's ephemeral public key.
 	prefix, sealed := box[:32], box[32:]
 	var (
 		senderKey [32]byte
 		nonce     [24]byte
 	)
 	copy(senderKey[:], prefix)
 	if sealNonce(&senderKey, publicKey, &nonce) != nil {
 		return nil, false
 	}
 	return Open(out, sealed, &nonce, &senderKey, privateKey)
 }
 // sealNonce fills nonce with the 24 byte blake2b digest of the ephemeral
 // public key followed by the receiver's public key. Any error from creating
 // or feeding the hash is returned.
 func sealNonce(ephemeralPub, peersPublicKey *[32]byte, nonce *[24]byte) error {
 	digest, err := blake2b.New(24, nil)
 	if err != nil {
 		return err
 	}
 	// Order matters: ephemeral key first, then the peer's key.
 	for _, key := range []*[32]byte{ephemeralPub, peersPublicKey} {
 		if _, err := digest.Write(key[:]); err != nil {
 			return err
 		}
 	}
 	digest.Sum(nonce[:0])
 	return nil
 }
--- a/vendor/golang.org/x/crypto/poly1305/bits_compat.go
+++ b/vendor/golang.org/x/crypto/poly1305/bits_compat.go
@ -0,0 +1,39 @@
 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build !go1.13
 package poly1305
 // Generic fallbacks for the math/bits intrinsics, copied from
 // src/math/bits/bits.go. They were added in Go 1.12, but Add64 and Sum64 had
 // variable time fallbacks until Go 1.13.
 // bitsAdd64 returns x + y + carry as sum together with the carry out of
 // bit 63 (0 or 1). The carry is computed without branches.
 func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
 	total := x + y + carry
 	// Bit 63 carries out when both top bits are set, or when at least one
 	// is set and the top bit of the sum is clear.
 	both := x & y
 	either := x | y
 	return total, (both | either&^total) >> 63
 }
 // bitsSub64 returns x - y - borrow as diff together with the borrow out of
 // bit 63 (0 or 1). The borrow is computed without branches.
 func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
 	d := x - y - borrow
 	// A borrow occurs when x's top bit is clear and y's is set, or when the
 	// top bits agree and the top bit of the difference is set.
 	topBorrow := ^x & y
 	sameTop := ^(x ^ y)
 	return d, (topBorrow | sameTop&d) >> 63
 }
 // bitsMul64 returns the 128-bit product of x and y as its upper half hi and
 // lower half lo.
 func bitsMul64(x, y uint64) (hi, lo uint64) {
 	const lowHalf = 1<<32 - 1
 	xLo, xHi := x&lowHalf, x>>32
 	yLo, yHi := y&lowHalf, y>>32
 	// Schoolbook multiplication on 32-bit halves: combine the partial
 	// products and track what spills over into the upper 64 bits.
 	lowProd := xLo * yLo
 	cross := xHi*yLo + lowProd>>32
 	mid := cross&lowHalf + xLo*yHi
 	hi = xHi*yHi + cross>>32 + mid>>32
 	lo = x * y
 	return hi, lo
 }
--- a/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go
+++ b/vendor/golang.org/x/crypto/poly1305/bits_go1.13.go
@ -0,0 +1,21 @@
 // Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build go1.13
 package poly1305
 import "math/bits"
 // bitsAdd64 forwards to math/bits.Add64. This variant is selected by the
 // go1.13 build constraint of this file.
 func bitsAdd64(x, y, carry uint64) (sum, carryOut uint64) {
 	return bits.Add64(x, y, carry)
 }
 // bitsSub64 forwards to math/bits.Sub64. This variant is selected by the
 // go1.13 build constraint of this file.
 func bitsSub64(x, y, borrow uint64) (diff, borrowOut uint64) {
 	return bits.Sub64(x, y, borrow)
 }
 // bitsMul64 forwards to math/bits.Mul64. This variant is selected by the
 // go1.13 build constraint of this file.
 func bitsMul64(x, y uint64) (hi, lo uint64) {
 	return bits.Mul64(x, y)
 }
--- a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
@ -2,10 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !amd64,!ppc64le gccgo appengine
+// +build !amd64,!ppc64le,!s390x gccgo purego
 package poly1305
 type mac struct{ macGeneric }
 func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
--- a/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go
@ -22,8 +22,16 @@ import "crypto/subtle"
 // TagSize is the size, in bytes, of a poly1305 authenticator.
 const TagSize = 16
-// Verify returns true if mac is a valid authenticator for m with the given
+// Sum generates an authenticator for msg using a one-time key and puts the
-// key.
+// 16-byte result into out. Authenticating two different messages with the same
 // key allows an attacker to forge messages at will.
 func Sum(out *[16]byte, m []byte, key *[32]byte) {
 	h := New(key)
 	h.Write(m)
 	h.Sum(out[:0])
 }
 // Verify returns true if mac is a valid authenticator for m with the given key.
 func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
 	var tmp [16]byte
 	Sum(&tmp, m, key)
@ -40,10 +48,9 @@ func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
 // two different messages with the same key allows an attacker
 // to forge messages at will.
 func New(key *[32]byte) *MAC {
-	return &MAC{
+	m := &MAC{}
-		mac:       newMAC(key),
+	initialize(key, &m.macState)
-		finalized: false,
+	return m
 	}
 }
 // MAC is an io.Writer computing an authentication tag
@ -52,7 +59,7 @@ func New(key *[32]byte) *MAC {
 // MAC cannot be used like common hash.Hash implementations,
 // because using a poly1305 key twice breaks its security.
 // Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
 type MAC struct {
 	mac // platform-dependent implementation
@ -65,10 +72,10 @@ func (h *MAC) Size() int { return TagSize }
 // Write adds more data to the running message authentication code.
 // It never returns an error.
 //
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
 func (h *MAC) Write(p []byte) (n int, err error) {
 	if h.finalized {
-		panic("poly1305: write to MAC after Sum")
+		panic("poly1305: write to MAC after Sum or Verify")
 	}
 	return h.mac.Write(p)
 }
@ -81,3 +88,12 @@ func (h *MAC) Sum(b []byte) []byte {
 	h.finalized = true
 	return append(b, mac[:]...)
 }
 // Verify reports whether the authenticator of all data written to the
 // message authentication code matches expected. The comparison is done in
 // constant time, and the MAC is marked as finalized afterwards.
 func (h *MAC) Verify(expected []byte) bool {
 	var tag [TagSize]byte
 	h.mac.Sum(&tag)
 	h.finalized = true
 	match := subtle.ConstantTimeCompare(expected, tag[:])
 	return match == 1
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@ -2,67 +2,46 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
 //go:noescape
-func initialize(state *[7]uint64, key *[32]byte)
+func update(state *macState, msg []byte)
-//go:noescape
+// mac is a wrapper for macGeneric that redirects calls that would have gone to
-func update(state *[7]uint64, msg []byte)
+// updateGeneric to update.
 //
 // Its Write and Sum methods are otherwise identical to the macGeneric ones, but
 // using function pointers would carry a major performance cost.
 type mac struct{ macGeneric }
-//go:noescape
+func (h *mac) Write(p []byte) (int, error) {
-func finalize(tag *[TagSize]byte, state *[7]uint64)
+	nn := len(p)
 // Sum generates an authenticator for m using a one-time key and puts the
 // 16-byte result into out. Authenticating two different messages with the same
 // key allows an attacker to forge messages at will.
 func Sum(out *[16]byte, m []byte, key *[32]byte) {
 	h := newMAC(key)
 	h.Write(m)
 	h.Sum(out)
 }
 func newMAC(key *[32]byte) (h mac) {
 	initialize(&h.state, key)
 	return
 }
 type mac struct {
 	state [7]uint64 // := uint64{ h0, h1, h2, r0, r1, pad0, pad1 }
 	buffer [TagSize]byte
 	offset int
 }
 func (h *mac) Write(p []byte) (n int, err error) {
 	n = len(p)
 	if h.offset > 0 {
-		remaining := TagSize - h.offset
+		n := copy(h.buffer[h.offset:], p)
-		if n < remaining {
+		if h.offset+n < TagSize {
-			h.offset += copy(h.buffer[h.offset:], p)
+			h.offset += n
-			return n, nil
+			return nn, nil
 		}
-		copy(h.buffer[h.offset:], p[:remaining])
+		p = p[n:]
 		p = p[remaining:]
 		h.offset = 0
-		update(&h.state, h.buffer[:])
+		update(&h.macState, h.buffer[:])
 	}
-	if nn := len(p) - (len(p) % TagSize); nn > 0 {
+	if n := len(p) - (len(p) % TagSize); n > 0 {
-		update(&h.state, p[:nn])
+		update(&h.macState, p[:n])
-		p = p[nn:]
+		p = p[n:]
 	}
 	if len(p) > 0 {
 		h.offset += copy(h.buffer[h.offset:], p)
 	}
-	return n, nil
+	return nn, nil
 }
 func (h *mac) Sum(out *[16]byte) {
-	state := h.state
+	state := h.macState
 	if h.offset > 0 {
 		update(&state, h.buffer[:h.offset])
 	}
-	finalize(out, &state)
+	finalize(out, &state.h, &state.s)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
@ -54,10 +54,6 @@
 	ADCQ  t3, h1;                  \
 	ADCQ  $0, h2
 DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
 DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
 GLOBL ·poly1305Mask<>(SB), RODATA, $16
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVQ state+0(FP), DI
@ -110,39 +106,3 @@ done:
 	MOVQ R9, 8(DI)
 	MOVQ R10, 16(DI)
 	RET
 // func initialize(state *[7]uint64, key *[32]byte)
 //
 // initialize fills the upper part of state from key: the first 16 key bytes
 // are ANDed with poly1305Mask and stored at byte offset 24 (state[3..4]), the
 // second 16 key bytes are stored unchanged at byte offset 40 (state[5..6]).
 // state[0..2] are not written here.
 TEXT ·initialize(SB), $0-16
 	MOVQ state+0(FP), DI
 	MOVQ key+8(FP), SI
 	// state[0...7] is initialized with zero
 	MOVOU 0(SI), X0
 	MOVOU 16(SI), X1
 	MOVOU ·poly1305Mask<>(SB), X2
 	PAND  X2, X0
 	MOVOU X0, 24(DI)
 	MOVOU X1, 40(DI)
 	RET
 // func finalize(tag *[TagSize]byte, state *[7]uint64)
 //
 // finalize produces the 16-byte tag from state: the three accumulator words
 // state[0..2] are reduced by one conditional subtraction of 2^130-5, then
 // state[5..6] is added to the low 128 bits and the result is written to tag.
 TEXT ·finalize(SB), $0-16
 	MOVQ tag+0(FP), DI
 	MOVQ state+8(FP), SI
 	MOVQ    0(SI), AX
 	MOVQ    8(SI), BX
 	MOVQ    16(SI), CX
 	// Keep a copy of the low two words in case the subtraction borrows.
 	MOVQ    AX, R8
 	MOVQ    BX, R9
 	// Subtract 2^130-5, i.e. the words (0x...FFFB, 0x...FFFF, 3).
 	SUBQ    $0xFFFFFFFFFFFFFFFB, AX
 	SBBQ    $0xFFFFFFFFFFFFFFFF, BX
 	SBBQ    $3, CX
 	// On borrow the value was already below 2^130-5: restore the originals.
 	CMOVQCS R8, AX
 	CMOVQCS R9, BX
 	// Add state[5..6] with carry between the two words.
 	ADDQ    40(SI), AX
 	ADCQ    48(SI), BX
 	MOVQ AX, 0(DI)
 	MOVQ BX, 8(DI)
 	RET
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
@ -1,22 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build arm,!gccgo,!appengine,!nacl
 package poly1305
 // This function is implemented in sum_arm.s
 //go:noescape
 func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
 // Sum generates an authenticator for m using a one-time key and puts the
 // 16-byte result into out. Authenticating two different messages with the same
 // key allows an attacker to forge messages at will.
 func Sum(out *[16]byte, m []byte, key *[32]byte) {
 	// An empty message has no first element to point at, so the assembly
 	// routine receives a nil pointer together with a zero length.
 	var first *byte
 	if n := len(m); n != 0 {
 		first = &m[0]
 	}
 	poly1305_auth_armv6(out, first, uint32(len(m)), key)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
@ -1,427 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build arm,!gccgo,!appengine,!nacl
 #include "textflag.h"
 // This code was translated into a form compatible with 5a from the public
 // domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
 // poly1305_init_constants_armv6 holds five 32-bit masks, one per limb, that
 // poly1305_init_ext_armv6 ANDs with the limbs it derives from the first 16
 // bytes of the key.
 // NOTE(review): the flag value 8 in GLOBL is presumably RODATA from
 // textflag.h - confirm.
 DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
 DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
 DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
 DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
 DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
 GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
 // poly1305_init_ext_armv6 fills in the context structure pointed to by R0
 // from the 32-byte key pointed to by R1: the first 16 key bytes are split into
 // five masked 26-bit limbs stored at offset 0, the five words at offset 20 are
 // zeroed, and the last 16 key bytes are copied unchanged to offset 40,
 // followed by one zero word at offset 56.
 //
 // Warning: the linker may use R11 to synthesize certain instructions. Please
 // take care and verify that no synthetic instructions use it.
 TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
 	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
 	// might look like it's only 60 bytes of space but the final four bytes
 	// will be written by another function.) We need to skip over four
 	// bytes of stack because that's saving the value of 'g'.
 	ADD       $4, R13, R8
 	MOVM.IB   [R4-R7], (R8)
 	// Load the first 16 key bytes into R2-R5 and advance R1 past them.
 	MOVM.IA.W (R1), [R2-R5]
 	MOVW      $·poly1305_init_constants_armv6<>(SB), R7
 	// Regroup the four 32-bit words into five limbs (R8, R9, g, R11, R12)
 	// taken at bit offsets 0, 26, 52, 78 and 104.
 	MOVW      R2, R8
 	MOVW      R2>>26, R9
 	MOVW      R3>>20, g
 	MOVW      R4>>14, R11
 	MOVW      R5>>8, R12
 	ORR       R3<<6, R9, R9
 	ORR       R4<<12, g, g
 	ORR       R5<<18, R11, R11
 	// Load the five masks and apply them to the limbs.
 	MOVM.IA   (R7), [R2-R6]
 	AND       R8, R2, R2
 	AND       R9, R3, R3
 	AND       g, R4, R4
 	AND       R11, R5, R5
 	AND       R12, R6, R6
 	// Store the masked limbs at offset 0 of the context.
 	MOVM.IA.W [R2-R6], (R0)
 	// Zero the next five words of the context (offsets 20-36).
 	EOR       R2, R2, R2
 	EOR       R3, R3, R3
 	EOR       R4, R4, R4
 	EOR       R5, R5, R5
 	EOR       R6, R6, R6
 	MOVM.IA.W [R2-R6], (R0)
 	// Copy the last 16 key bytes to offsets 40-52; R6 is still zero, so the
 	// word at offset 56 is cleared as well.
 	MOVM.IA.W (R1), [R2-R5]
 	MOVM.IA   [R2-R6], (R0)
 	// Restore R4-R7 from the stack slots written on entry.
 	ADD       $20, R13, R0
 	MOVM.DA   (R0), [R4-R7]
 	RET
 // MOVW_UNALIGNED copies the four bytes at offset(Rsrc) to offset(Rdst) one
 // byte at a time through Rtmp, so neither address has to be word aligned.
 // Rsrc and Rdst are left unchanged.
 #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
 	MOVBU (offset+0)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+0)(Rdst); \
 	MOVBU (offset+1)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+1)(Rdst); \
 	MOVBU (offset+2)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+2)(Rdst); \
 	MOVBU (offset+3)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+3)(Rdst)
 // poly1305_blocks_armv6 absorbs 16-byte blocks into the accumulator. On entry
 // R0 points at the context structure, R1 at the input and R2 holds the number
 // of input bytes. The five accumulator limbs are read from offsets 20-36 of
 // the context and written back there before returning.
 TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
 	// Needs 24 bytes of stack for saved registers and then 88 bytes of
 	// scratch space after that. We assume that 24 bytes at (R13) have
 	// already been used: four bytes for the link register saved in the
 	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
 	// in that function and 16 bytes of scratch space used around
 	// poly1305_finish_ext_armv6_skip1.
 	ADD     $24, R13, R12
 	MOVM.IB [R4-R8, R14], (R12)
 	// Spill the three arguments; R14 and R12 become the working input
 	// pointer and remaining byte count.
 	MOVW    R0, 88(R13)
 	MOVW    R1, 92(R13)
 	MOVW    R2, 96(R13)
 	MOVW    R1, R14
 	MOVW    R2, R12
 	// The word at offset 56 of the context selects the value ORed into the
 	// top limb of every block: 1<<24 when that word is zero, 0 otherwise.
 	// The chosen value is kept at 84(R13).
 	MOVW    56(R0), R8
 	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
 	EOR     R6, R6, R6
 	MOVW.EQ $(1<<24), R6
 	MOVW    R6, 84(R13)
 	// Load the first ten context words: R0-R4 are the multiplier limbs, which
 	// are parked at 116(R13), and R5-R9 are the accumulator limbs.
 	ADD     $116, R13, g
 	MOVM.IA (R0), [R0-R9]
 	MOVM.IA [R0-R4], (g)
 	CMP     $16, R12
 	BLO     poly1305_blocks_armv6_done
 poly1305_blocks_armv6_mainloop:
 	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
 	BEQ     poly1305_blocks_armv6_mainloop_aligned
 	// Unaligned input: copy the block bytewise to the scratch area at
 	// 100(R13) and load the four words from there.
 	ADD     $100, R13, g
 	MOVW_UNALIGNED(R14, g, R0, 0)
 	MOVW_UNALIGNED(R14, g, R0, 4)
 	MOVW_UNALIGNED(R14, g, R0, 8)
 	MOVW_UNALIGNED(R14, g, R0, 12)
 	MOVM.IA (g), [R0-R3]
 	ADD     $16, R14
 	B       poly1305_blocks_armv6_mainloop_loaded
 poly1305_blocks_armv6_mainloop_aligned:
 	MOVM.IA.W (R14), [R0-R3]
 poly1305_blocks_armv6_mainloop_loaded:
 	// Split the 16 input bytes in R0-R3 into 26-bit limbs, OR the value
 	// saved at 84(R13) into the top limb, and add the limbs to the
 	// accumulator in R5-R9. The advanced input pointer is saved at 92(R13).
 	MOVW    R0>>26, g
 	MOVW    R1>>20, R11
 	MOVW    R2>>14, R12
 	MOVW    R14, 92(R13)
 	MOVW    R3>>8, R4
 	ORR     R1<<6, g, g
 	ORR     R2<<12, R11, R11
 	ORR     R3<<18, R12, R12
 	BIC     $0xfc000000, R0, R0
 	BIC     $0xfc000000, g, g
 	MOVW    84(R13), R3
 	BIC     $0xfc000000, R11, R11
 	BIC     $0xfc000000, R12, R12
 	ADD     R0, R5, R5
 	ADD     g, R6, R6
 	ORR     R3, R4, R4
 	ADD     R11, R7, R7
 	ADD     $116, R13, R14
 	ADD     R12, R8, R8
 	ADD     R4, R9, R9
 	// Reload the multiplier limbs into R0-R4 and multiply. The 64-bit
 	// column sums are accumulated in (R11, g) and (R14, R12); limbs are
 	// replaced by five times their value (x + x<<2) part way through.
 	MOVM.IA (R14), [R0-R4]
 	MULLU   R4, R5, (R11, g)
 	MULLU   R3, R5, (R14, R12)
 	MULALU  R3, R6, (R11, g)
 	MULALU  R2, R6, (R14, R12)
 	MULALU  R2, R7, (R11, g)
 	MULALU  R1, R7, (R14, R12)
 	ADD     R4<<2, R4, R4
 	ADD     R3<<2, R3, R3
 	MULALU  R1, R8, (R11, g)
 	MULALU  R0, R8, (R14, R12)
 	MULALU  R0, R9, (R11, g)
 	MULALU  R4, R9, (R14, R12)
 	// Spill the two finished column sums to 68-80(R13).
 	MOVW    g, 76(R13)
 	MOVW    R11, 80(R13)
 	MOVW    R12, 68(R13)
 	MOVW    R14, 72(R13)
 	MULLU   R2, R5, (R11, g)
 	MULLU   R1, R5, (R14, R12)
 	MULALU  R1, R6, (R11, g)
 	MULALU  R0, R6, (R14, R12)
 	MULALU  R0, R7, (R11, g)
 	MULALU  R4, R7, (R14, R12)
 	ADD     R2<<2, R2, R2
 	ADD     R1<<2, R1, R1
 	MULALU  R4, R8, (R11, g)
 	MULALU  R3, R8, (R14, R12)
 	MULALU  R3, R9, (R11, g)
 	MULALU  R2, R9, (R14, R12)
 	// Spill the next two column sums to 52-64(R13).
 	MOVW    g, 60(R13)
 	MOVW    R11, 64(R13)
 	MOVW    R12, 52(R13)
 	MOVW    R14, 56(R13)
 	MULLU   R0, R5, (R11, g)
 	MULALU  R4, R6, (R11, g)
 	MULALU  R3, R7, (R11, g)
 	MULALU  R2, R8, (R11, g)
 	MULALU  R1, R9, (R11, g)
 	// Reload the four spilled column sums into R0-R7 and propagate carries
 	// in 26-bit steps. The carry out of the top column is multiplied by five
 	// (R14 + R14<<2) and added to the lowest limb.
 	ADD     $52, R13, R0
 	MOVM.IA (R0), [R0-R7]
 	MOVW    g>>26, R12
 	MOVW    R4>>26, R14
 	ORR     R11<<6, R12, R12
 	ORR     R5<<6, R14, R14
 	BIC     $0xfc000000, g, g
 	BIC     $0xfc000000, R4, R4
 	ADD.S   R12, R0, R0
 	ADC     $0, R1, R1
 	ADD.S   R14, R6, R6
 	ADC     $0, R7, R7
 	MOVW    R0>>26, R12
 	MOVW    R6>>26, R14
 	ORR     R1<<6, R12, R12
 	ORR     R7<<6, R14, R14
 	BIC     $0xfc000000, R0, R0
 	BIC     $0xfc000000, R6, R6
 	ADD     R14<<2, R14, R14
 	ADD.S   R12, R2, R2
 	ADC     $0, R3, R3
 	ADD     R14, g, g
 	MOVW    R2>>26, R12
 	MOVW    g>>26, R14
 	ORR     R3<<6, R12, R12
 	BIC     $0xfc000000, g, R5
 	BIC     $0xfc000000, R2, R7
 	ADD     R12, R4, R4
 	ADD     R14, R0, R0
 	MOVW    R4>>26, R12
 	BIC     $0xfc000000, R4, R8
 	ADD     R12, R6, R9
 	// The new accumulator limbs are now in R5-R9 (R6 is set just below).
 	// Reload the remaining byte count and input pointer, and loop while the
 	// count was at least 32 before 16 is subtracted from it.
 	MOVW    96(R13), R12
 	MOVW    92(R13), R14
 	MOVW    R0, R6
 	CMP     $32, R12
 	SUB     $16, R12, R12
 	MOVW    R12, 96(R13)
 	BHS     poly1305_blocks_armv6_mainloop
 poly1305_blocks_armv6_done:
 	// Write the accumulator limbs back to the context and restore the
 	// registers saved on entry.
 	MOVW    88(R13), R12
 	MOVW    R5, 20(R12)
 	MOVW    R6, 24(R12)
 	MOVW    R7, 28(R12)
 	MOVW    R8, 32(R12)
 	MOVW    R9, 36(R12)
 	ADD     $48, R13, R0
 	MOVM.DA (R0), [R4-R8, R14]
 	RET
 // MOVHUP_UNALIGNED copies two bytes from (Rsrc) to (Rdst) one byte at a time
 // through Rtmp, post-incrementing both pointers by one after each byte.
 #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
 	MOVBU.P 1(Rsrc), Rtmp; \
 	MOVBU.P Rtmp, 1(Rdst); \
 	MOVBU.P 1(Rsrc), Rtmp; \
 	MOVBU.P Rtmp, 1(Rdst)
 // MOVWP_UNALIGNED copies four bytes from (Rsrc) to (Rdst) by expanding
 // MOVHUP_UNALIGNED twice; both pointers end up advanced by four.
 #define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
 // func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
 //
 // poly1305_auth_armv6 builds a context on the stack from key, feeds the whole
 // 16-byte blocks of m to poly1305_blocks_armv6, pads and feeds any remaining
 // bytes as one final block, then reduces the accumulator, adds the second
 // half of the key and writes the 16-byte result to out. The context is
 // overwritten with zeros before returning.
 TEXT ·poly1305_auth_armv6(SB), $196-16
 	// The value 196, just above, is the sum of 64 (the size of the context
 	// structure) and 132 (the amount of stack needed).
 	//
 	// At this point, the stack pointer (R13) has been moved down. It
 	// points to the saved link register and there's 196 bytes of free
 	// space above it.
 	//
 	// The stack for this function looks like:
 	//
 	// +---------------------
 	// |
 	// | 64 bytes of context structure
 	// |
 	// +---------------------
 	// |
 	// | 112 bytes for poly1305_blocks_armv6
 	// |
 	// +---------------------
 	// | 16 bytes of final block, constructed at
 	// | poly1305_finish_ext_armv6_skip8
 	// +---------------------
 	// | four bytes of saved 'g'
 	// +---------------------
 	// | lr, saved by prelude    <- R13 points here
 	// +---------------------
 	// Save g, then load the arguments: R4 = out, R5 = m, R6 = mlen, R7 = key.
 	MOVW g, 4(R13)
 	MOVW out+0(FP), R4
 	MOVW m+4(FP), R5
 	MOVW mlen+8(FP), R6
 	MOVW key+12(FP), R7
 	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
 	MOVW R7, R1
 	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
 	// that's ok because none of the other values have been written yet.
 	BL    poly1305_init_ext_armv6<>(SB)
 	// R2 = mlen rounded down to a multiple of 16. If it is non-zero, process
 	// that many bytes and advance R5 / reduce R6 to describe what is left.
 	BIC.S $15, R6, R2
 	BEQ   poly1305_auth_armv6_noblocks
 	ADD   $136, R13, R0
 	MOVW  R5, R1
 	ADD   R2, R5, R5
 	SUB   R2, R6, R6
 	BL    poly1305_blocks_armv6<>(SB)
 poly1305_auth_armv6_noblocks:
 	// R0 = context, R1 = remaining input, R2 = remaining length, R3 = out;
 	// copies are kept in R5-R8.
 	ADD  $136, R13, R0
 	MOVW R5, R1
 	MOVW R6, R2
 	MOVW R4, R3
 	MOVW  R0, R5
 	MOVW  R1, R6
 	MOVW  R2, R7
 	MOVW  R3, R8
 	AND.S R2, R2, R2
 	BEQ   poly1305_finish_ext_armv6_noremaining
 	// Zero the 16-byte scratch block, then copy the remaining bytes into it
 	// in pieces of 8, 4, 2 and 1 bytes according to the bits of R2.
 	EOR   R0, R0
 	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
 	MOVW  R0, (R9)
 	MOVW  R0, 4(R9)
 	MOVW  R0, 8(R9)
 	MOVW  R0, 12(R9)
 	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
 	BEQ   poly1305_finish_ext_armv6_aligned
 	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
 	BEQ   poly1305_finish_ext_armv6_skip8
 	MOVWP_UNALIGNED(R1, R9, g)
 	MOVWP_UNALIGNED(R1, R9, g)
 poly1305_finish_ext_armv6_skip8:
 	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
 	BEQ  poly1305_finish_ext_armv6_skip4
 	MOVWP_UNALIGNED(R1, R9, g)
 poly1305_finish_ext_armv6_skip4:
 	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
 	BEQ  poly1305_finish_ext_armv6_skip2
 	MOVHUP_UNALIGNED(R1, R9, g)
 	B    poly1305_finish_ext_armv6_skip2
 poly1305_finish_ext_armv6_aligned:
 	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
 	BEQ       poly1305_finish_ext_armv6_skip8_aligned
 	MOVM.IA.W (R1), [g-R11]
 	MOVM.IA.W [g-R11], (R9)
 poly1305_finish_ext_armv6_skip8_aligned:
 	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
 	BEQ    poly1305_finish_ext_armv6_skip4_aligned
 	MOVW.P 4(R1), g
 	MOVW.P g, 4(R9)
 poly1305_finish_ext_armv6_skip4_aligned:
 	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
 	BEQ     poly1305_finish_ext_armv6_skip2
 	MOVHU.P 2(R1), g
 	MOVH.P  g, 2(R9)
 poly1305_finish_ext_armv6_skip2:
 	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
 	BEQ     poly1305_finish_ext_armv6_skip1
 	MOVBU.P 1(R1), g
 	MOVBU.P g, 1(R9)
 poly1305_finish_ext_armv6_skip1:
 	// Append a single 1 byte after the copied data, set the word at offset
 	// 56 of the context to 1, and process the scratch block as one 16-byte
 	// block.
 	MOVW  $1, R11
 	MOVBU R11, 0(R9)
 	MOVW  R11, 56(R5)
 	MOVW  R5, R0
 	ADD   $8, R13, R1
 	MOVW  $16, R2
 	BL    poly1305_blocks_armv6<>(SB)
 poly1305_finish_ext_armv6_noremaining:
 	// Load the five accumulator limbs and run a full carry chain; the carry
 	// out of the top limb is multiplied by five and added to the lowest.
 	MOVW      20(R5), R0
 	MOVW      24(R5), R1
 	MOVW      28(R5), R2
 	MOVW      32(R5), R3
 	MOVW      36(R5), R4
 	MOVW      R4>>26, R12
 	BIC       $0xfc000000, R4, R4
 	ADD       R12<<2, R12, R12
 	ADD       R12, R0, R0
 	MOVW      R0>>26, R12
 	BIC       $0xfc000000, R0, R0
 	ADD       R12, R1, R1
 	MOVW      R1>>26, R12
 	BIC       $0xfc000000, R1, R1
 	ADD       R12, R2, R2
 	MOVW      R2>>26, R12
 	BIC       $0xfc000000, R2, R2
 	ADD       R12, R3, R3
 	MOVW      R3>>26, R12
 	BIC       $0xfc000000, R3, R3
 	ADD       R12, R4, R4
 	// Compute the accumulator plus 5 minus 1<<26 in the top limb into
 	// (R6, R7, g, R11, R9).
 	ADD       $5, R0, R6
 	MOVW      R6>>26, R12
 	BIC       $0xfc000000, R6, R6
 	ADD       R12, R1, R7
 	MOVW      R7>>26, R12
 	BIC       $0xfc000000, R7, R7
 	ADD       R12, R2, g
 	MOVW      g>>26, R12
 	BIC       $0xfc000000, g, g
 	ADD       R12, R3, R11
 	MOVW      $-(1<<26), R12
 	ADD       R11>>26, R12, R12
 	BIC       $0xfc000000, R11, R11
 	ADD       R12, R4, R9
 	// R12 = (sign bit of R9) - 1: all ones when R9 is non-negative, zero
 	// otherwise. Use it as a mask to pick the second set of limbs in the
 	// first case and the original limbs in the second, without branching.
 	MOVW      R9>>31, R12
 	SUB       $1, R12
 	AND       R12, R6, R6
 	AND       R12, R7, R7
 	AND       R12, g, g
 	AND       R12, R11, R11
 	AND       R12, R9, R9
 	MVN       R12, R12
 	AND       R12, R0, R0
 	AND       R12, R1, R1
 	AND       R12, R2, R2
 	AND       R12, R3, R3
 	AND       R12, R4, R4
 	ORR       R6, R0, R0
 	ORR       R7, R1, R1
 	ORR       g, R2, R2
 	ORR       R11, R3, R3
 	ORR       R9, R4, R4
 	// Pack the five 26-bit limbs into four 32-bit words R0-R3.
 	ORR       R1<<26, R0, R0
 	MOVW      R1>>6, R1
 	ORR       R2<<20, R1, R1
 	MOVW      R2>>12, R2
 	ORR       R3<<14, R2, R2
 	MOVW      R3>>18, R3
 	ORR       R4<<8, R3, R3
 	// Add the four words stored at offsets 40-52 of the context, with carry,
 	// and write the result to out (R8).
 	MOVW      40(R5), R6
 	MOVW      44(R5), R7
 	MOVW      48(R5), g
 	MOVW      52(R5), R11
 	ADD.S     R6, R0, R0
 	ADC.S     R7, R1, R1
 	ADC.S     g, R2, R2
 	ADC.S     R11, R3, R3
 	MOVM.IA   [R0-R3], (R8)
 	// Overwrite the 64-byte context with zeros (two stores of eight words).
 	MOVW      R5, R12
 	EOR       R0, R0, R0
 	EOR       R1, R1, R1
 	EOR       R2, R2, R2
 	EOR       R3, R3, R3
 	EOR       R4, R4, R4
 	EOR       R5, R5, R5
 	EOR       R6, R6, R6
 	EOR       R7, R7, R7
 	MOVM.IA.W [R0-R7], (R12)
 	MOVM.IA   [R0-R7], (R12)
 	// Restore g, which was used as a scratch register above.
 	MOVW      4(R13), g
 	RET
--- a/vendor/golang.org/x/crypto/poly1305/sum_generic.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_generic.go
@ -2,171 +2,309 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // This file provides the generic implementation of Sum and MAC. Other files
 // might provide optimized assembly implementations of some of this code.
 package poly1305
 import "encoding/binary"
-const (
+// Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
-	msgBlock   = uint32(1 << 24)
+// for a 64 bytes message is approximately
-	finalBlock = uint32(0)
+//
-)
+//     s + m[0:16] * r⁴ + m[16:32] * r³ + m[32:48] * r² + m[48:64] * r  mod  2¹³⁰ - 5
 //
 // for some secret r and s. It can be computed sequentially like
 //
 //     for len(msg) > 0:
 //         h += read(msg, 16)
 //         h *= r
 //         h %= 2¹³⁰ - 5
 //     return h + s
 //
 // All the complexity is about doing performant constant-time math on numbers
 // larger than any available numeric type.
 // sumGeneric computes the authenticator of msg under a one-time key and
 // stores the 16-byte tag in out. It is the generic implementation of Sum, to
 // be called when no assembly implementation is available.
 func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	generic := newMACGeneric(key)
 	// Write's results are deliberately discarded.
 	_, _ = generic.Write(msg)
 	generic.Sum(out)
 }
-func newMACGeneric(key *[32]byte) (h macGeneric) {
+func newMACGeneric(key *[32]byte) macGeneric {
-	h.r[0] = binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff
+	m := macGeneric{}
-	h.r[1] = (binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03
+	initialize(key, &m.macState)
-	h.r[2] = (binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff
+	return m
-	h.r[3] = (binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff
+}
 	h.r[4] = (binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff
-	h.s[0] = binary.LittleEndian.Uint32(key[16:])
+// macState holds numbers in saturated 64-bit little-endian limbs. That is,
-	h.s[1] = binary.LittleEndian.Uint32(key[20:])
+// the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
-	h.s[2] = binary.LittleEndian.Uint32(key[24:])
+type macState struct {
-	h.s[3] = binary.LittleEndian.Uint32(key[28:])
+	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
-	return
+	// can grow larger during and after rounds. It must, however, remain below
 	// 2 * (2¹³⁰ - 5).
 	h [3]uint64
 	// r and s are the private key components.
 	r [2]uint64
 	s [2]uint64
 }
 type macGeneric struct {
-	h, r [5]uint32
+	macState
 	s    [4]uint32
 	buffer [TagSize]byte
 	offset int
 }
-func (h *macGeneric) Write(p []byte) (n int, err error) {
+// Write splits the incoming message into TagSize chunks, and passes them to
-	n = len(p)
+// update. It buffers incomplete chunks.
 func (h *macGeneric) Write(p []byte) (int, error) {
 	nn := len(p)
 	if h.offset > 0 {
-		remaining := TagSize - h.offset
+		n := copy(h.buffer[h.offset:], p)
-		if n < remaining {
+		if h.offset+n < TagSize {
-			h.offset += copy(h.buffer[h.offset:], p)
+			h.offset += n
-			return n, nil
+			return nn, nil
 		}
-		copy(h.buffer[h.offset:], p[:remaining])
+		p = p[n:]
 		p = p[remaining:]
 		h.offset = 0
-		updateGeneric(h.buffer[:], msgBlock, &(h.h), &(h.r))
+		updateGeneric(&h.macState, h.buffer[:])
 	}
-	if nn := len(p) - (len(p) % TagSize); nn > 0 {
+	if n := len(p) - (len(p) % TagSize); n > 0 {
-		updateGeneric(p, msgBlock, &(h.h), &(h.r))
+		updateGeneric(&h.macState, p[:n])
-		p = p[nn:]
+		p = p[n:]
 	}
 	if len(p) > 0 {
 		h.offset += copy(h.buffer[h.offset:], p)
 	}
-	return n, nil
+	return nn, nil
 }
-func (h *macGeneric) Sum(out *[16]byte) {
+// Sum flushes the last incomplete chunk from the buffer, if any, and generates
-	H, R := h.h, h.r
+// the MAC output. It does not modify its state, in order to allow for multiple
 // calls to Sum, even if no Write is allowed after Sum.
 func (h *macGeneric) Sum(out *[TagSize]byte) {
 	state := h.macState
 	if h.offset > 0 {
-		var buffer [TagSize]byte
+		updateGeneric(&state, h.buffer[:h.offset])
 		copy(buffer[:], h.buffer[:h.offset])
 		buffer[h.offset] = 1 // invariant: h.offset < TagSize
 		updateGeneric(buffer[:], finalBlock, &H, &R)
 	}
-	finalizeGeneric(out, &H, &(h.s))
+	finalize(out, &state.h, &state.s)
 }
-func updateGeneric(msg []byte, flag uint32, h, r *[5]uint32) {
+// [rMask0, rMask1] is the specified Poly1305 clamping mask in little-endian. It
-	h0, h1, h2, h3, h4 := h[0], h[1], h[2], h[3], h[4]
+// clears some bits of the secret coefficient to make it possible to implement
-	r0, r1, r2, r3, r4 := uint64(r[0]), uint64(r[1]), uint64(r[2]), uint64(r[3]), uint64(r[4])
+// multiplication more efficiently.
-	R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5
+const (
 	rMask0 = 0x0FFFFFFC0FFFFFFF
 	rMask1 = 0x0FFFFFFC0FFFFFFC
 )
-	for len(msg) >= TagSize {
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
-		// h += msg
+func initialize(key *[32]byte, m *macState) {
-		h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff
+	m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
-		h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff
+	m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
-		h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff
+	m.s[0] = binary.LittleEndian.Uint64(key[16:24])
-		h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff
+	m.s[1] = binary.LittleEndian.Uint64(key[24:32])
-		h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | flag
+}
-		// h *= r
+// uint128 holds a 128-bit number as two 64-bit limbs, for use with the
-		d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
+// bits.Mul64 and bits.Add64 intrinsics.
-		d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
+type uint128 struct {
-		d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
+	lo, hi uint64
-		d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
+}
 		d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
-		// h %= p
+func mul64(a, b uint64) uint128 {
-		h0 = uint32(d0) & 0x3ffffff
+	hi, lo := bitsMul64(a, b)
-		h1 = uint32(d1) & 0x3ffffff
+	return uint128{lo, hi}
-		h2 = uint32(d2) & 0x3ffffff
+}
 		h3 = uint32(d3) & 0x3ffffff
 		h4 = uint32(d4) & 0x3ffffff
-		h0 += uint32(d4>>26) * 5
+func add128(a, b uint128) uint128 {
-		h1 += h0 >> 26
+	lo, c := bitsAdd64(a.lo, b.lo, 0)
-		h0 = h0 & 0x3ffffff
+	hi, c := bitsAdd64(a.hi, b.hi, c)
 	if c != 0 {
 		panic("poly1305: unexpected overflow")
 	}
 	return uint128{lo, hi}
 }
 // shiftRightBy2 returns a shifted right by two bits across both limbs: the two
 // low bits of a.hi become the two top bits of the result's lo limb.
 func shiftRightBy2(a uint128) uint128 {
 	carriedDown := (a.hi & 3) << 62
 	return uint128{
 		lo: a.lo>>2 | carriedDown,
 		hi: a.hi >> 2,
 	}
 }
 // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
 // 128 bits of message, it computes
 //
 //     h₊ = (h + m) * r  mod  2¹³⁰ - 5
 //
 // If the msg length is not a multiple of TagSize, it assumes the last
 // incomplete chunk is the final one.
 func updateGeneric(state *macState, msg []byte) {
 	h0, h1, h2 := state.h[0], state.h[1], state.h[2]
 	r0, r1 := state.r[0], state.r[1]
 	for len(msg) > 0 {
 		var c uint64
 		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
 		// The resulting value of h might exceed 2¹³⁰ - 5, but will be partially
 		// reduced at the end of the multiplication below.
 		//
 		// The spec requires us to set a bit just above the message size, not to
 		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
 		// add 1 to the most significant (2¹²⁸) limb, h2.
 		if len(msg) >= TagSize {
 			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
 			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
 			h2 += c + 1
 			msg = msg[TagSize:]
 		} else {
 			var buf [TagSize]byte
 			copy(buf[:], msg)
 			buf[len(msg)] = 1
 			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(buf[0:8]), 0)
 			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(buf[8:16]), c)
 			h2 += c
 			msg = nil
 		}
-	h[0], h[1], h[2], h[3], h[4] = h0, h1, h2, h3, h4
+		// Multiplication of big number limbs is similar to elementary school
 		// columnar multiplication. Instead of digits, there are 64-bit limbs.
 		//
 		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
 		//
 		//                        h2    h1    h0  x
 		//                              r1    r0  =
 		//                       ----------------
 		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
 		//            +   h2r1  h1r1  h0r1
 		//               ------------------------
 		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
 		//               ------------------------
 		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
 		//     +         m3.lo m2.lo m1.lo m0.lo
 		//        -------------------------------
 		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
 		//
 		// The main difference from pen-and-paper multiplication is that we do
 		// carry propagation in a separate step, as if we wrote two digit sums
 		// at first (the 128-bit limbs), and then carried the tens all at once.
 		h0r0 := mul64(h0, r0)
 		h1r0 := mul64(h1, r0)
 		h2r0 := mul64(h2, r0)
 		h0r1 := mul64(h0, r1)
 		h1r1 := mul64(h1, r1)
 		h2r1 := mul64(h2, r1)
 		// Since h2 is known to be at most 7 (5 + 1 + 1), and r0 and r1 have their
 		// top 4 bits cleared by rMask{0,1}, we know that their product is not going
 		// to overflow 64 bits, so we can ignore the high part of the products.
 		//
 		// This also means that the product doesn't have a fifth limb (t4).
 		if h2r0.hi != 0 {
 			panic("poly1305: unexpected overflow")
 		}
 		if h2r1.hi != 0 {
 			panic("poly1305: unexpected overflow")
 		}
 		m0 := h0r0
 		m1 := add128(h1r0, h0r1) // These two additions don't overflow thanks again
 		m2 := add128(h2r0, h1r1) // to the 4 masked bits at the top of r0 and r1.
 		m3 := h2r1
 		t0 := m0.lo
 		t1, c := bitsAdd64(m1.lo, m0.hi, 0)
 		t2, c := bitsAdd64(m2.lo, m1.hi, c)
 		t3, _ := bitsAdd64(m3.lo, m2.hi, c)
 		// Now we have the result as 4 64-bit limbs, and we need to reduce it
 		// modulo 2¹³⁰ - 5. The special shape of this Crandall prime lets us do
 		// a cheap partial reduction according to the reduction identity
 		//
 		//     c * 2¹³⁰ + n  =  c * 5 + n  mod  2¹³⁰ - 5
 		//
 		// because 2¹³⁰ = 5 mod 2¹³⁰ - 5. Partial reduction since the result is
 		// likely to be larger than 2¹³⁰ - 5, but still small enough to fit the
 		// assumptions we make about h in the rest of the code.
 		//
 		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
 		// We split the final result at the 2¹³⁰ mark into h and cc, the carry.
 		// Note that the carry bits are effectively shifted left by 2, in other
 		// words, cc = c * 4 for the c in the reduction identity.
 		h0, h1, h2 = t0, t1, t2&maskLow2Bits
 		cc := uint128{t2 & maskNotLow2Bits, t3}
 		// To add c * 5 to h, we first add cc = c * 4, and then add (cc >> 2) = c.
 		h0, c = bitsAdd64(h0, cc.lo, 0)
 		h1, c = bitsAdd64(h1, cc.hi, c)
 		h2 += c
 		cc = shiftRightBy2(cc)
 		h0, c = bitsAdd64(h0, cc.lo, 0)
 		h1, c = bitsAdd64(h1, cc.hi, c)
 		h2 += c
 		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
 		//
 		//     5 * 2¹²⁸ + (2¹²⁸ - 1) = 6 * 2¹²⁸ - 1
 	}
 	state.h[0], state.h[1], state.h[2] = h0, h1, h2
 }
-func finalizeGeneric(out *[TagSize]byte, h *[5]uint32, s *[4]uint32) {
+const (
-	h0, h1, h2, h3, h4 := h[0], h[1], h[2], h[3], h[4]
+	maskLow2Bits    uint64 = 0x0000000000000003
 	maskNotLow2Bits uint64 = ^maskLow2Bits
 )
-	// h %= p reduction
+// select64 returns x if v == 1 and y if v == 0, in constant time.
-	h2 += h1 >> 26
+func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
 	h1 &= 0x3ffffff
 	h3 += h2 >> 26
 	h2 &= 0x3ffffff
 	h4 += h3 >> 26
 	h3 &= 0x3ffffff
 	h0 += 5 * (h4 >> 26)
 	h4 &= 0x3ffffff
 	h1 += h0 >> 26
 	h0 &= 0x3ffffff
-	// h - p
+// [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
-	t0 := h0 + 5
+const (
-	t1 := h1 + (t0 >> 26)
+	p0 = 0xFFFFFFFFFFFFFFFB
-	t2 := h2 + (t1 >> 26)
+	p1 = 0xFFFFFFFFFFFFFFFF
-	t3 := h3 + (t2 >> 26)
+	p2 = 0x0000000000000003
-	t4 := h4 + (t3 >> 26) - (1 << 26)
+)
 	t0 &= 0x3ffffff
 	t1 &= 0x3ffffff
 	t2 &= 0x3ffffff
 	t3 &= 0x3ffffff
-	// select h if h < p else h - p
+// finalize completes the modular reduction of h and computes
-	t_mask := (t4 >> 31) - 1
+//
-	h_mask := ^t_mask
+//     out = h + s  mod  2¹²⁸
-	h0 = (h0 & h_mask) | (t0 & t_mask)
+//
-	h1 = (h1 & h_mask) | (t1 & t_mask)
+func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
-	h2 = (h2 & h_mask) | (t2 & t_mask)
+	h0, h1, h2 := h[0], h[1], h[2]
 	h3 = (h3 & h_mask) | (t3 & t_mask)
 	h4 = (h4 & h_mask) | (t4 & t_mask)
-	// h %= 2^128
+	// After the partial reduction in updateGeneric, h might be more than
-	h0 |= h1 << 26
+	// 2¹³⁰ - 5, but will be less than 2 * (2¹³⁰ - 5). To complete the reduction
-	h1 = ((h1 >> 6) | (h2 << 20))
+	// in constant time, we compute t = h - (2¹³⁰ - 5), and select h as the
-	h2 = ((h2 >> 12) | (h3 << 14))
+	// result if the subtraction underflows, and t otherwise.
 	h3 = ((h3 >> 18) | (h4 << 8))
-	// s: the s part of the key
+	hMinusP0, b := bitsSub64(h0, p0, 0)
-	// tag = (h + s) % (2^128)
+	hMinusP1, b := bitsSub64(h1, p1, b)
-	t := uint64(h0) + uint64(s[0])
+	_, b = bitsSub64(h2, p2, b)
 	h0 = uint32(t)
 	t = uint64(h1) + uint64(s[1]) + (t >> 32)
 	h1 = uint32(t)
 	t = uint64(h2) + uint64(s[2]) + (t >> 32)
 	h2 = uint32(t)
 	t = uint64(h3) + uint64(s[3]) + (t >> 32)
 	h3 = uint32(t)
-	binary.LittleEndian.PutUint32(out[0:], h0)
+	// h = h if h < p else h - p
-	binary.LittleEndian.PutUint32(out[4:], h1)
+	h0 = select64(b, h0, hMinusP0)
-	binary.LittleEndian.PutUint32(out[8:], h2)
+	h1 = select64(b, h1, hMinusP1)
-	binary.LittleEndian.PutUint32(out[12:], h3)
+
 	// Finally, we compute the last Poly1305 step
 	//
 	//     tag = h + s  mod  2¹²⁸
 	//
 	// by just doing a wide addition with the 128 low bits of h and discarding
 	// the overflow.
 	h0, c := bitsAdd64(h0, s[0], 0)
 	h1, _ = bitsAdd64(h1, s[1], c)
 	binary.LittleEndian.PutUint64(out[0:8], h0)
 	binary.LittleEndian.PutUint64(out[8:16], h1)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
@ -1,16 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build s390x,!go1.11 !arm,!amd64,!s390x,!ppc64le gccgo appengine nacl
 package poly1305
 // Sum computes the authenticator of msg under the one-time key and writes the
 // 16-byte tag to out. A key must never be reused: authenticating two different
 // messages with the same key allows an attacker to forge messages at will.
 func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	authenticator := newMAC(key)
 	// Write's results are deliberately discarded.
 	_, _ = authenticator.Write(msg)
 	authenticator.Sum(out)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
@ -2,67 +2,46 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
 //go:noescape
-func initialize(state *[7]uint64, key *[32]byte)
+func update(state *macState, msg []byte)
-//go:noescape
+// mac is a wrapper for macGeneric that redirects calls that would have gone to
-func update(state *[7]uint64, msg []byte)
+// updateGeneric to update.
 //
 // Its Write and Sum methods are otherwise identical to the macGeneric ones, but
 // using function pointers would carry a major performance cost.
 type mac struct{ macGeneric }
-//go:noescape
+func (h *mac) Write(p []byte) (int, error) {
-func finalize(tag *[TagSize]byte, state *[7]uint64)
+	nn := len(p)
 // Sum computes the authenticator of m under the one-time key and writes the
 // 16-byte tag to out. A key must never be reused: authenticating two different
 // messages with the same key allows an attacker to forge messages at will.
 func Sum(out *[16]byte, m []byte, key *[32]byte) {
 	authenticator := newMAC(key)
 	// Write's results are deliberately discarded.
 	_, _ = authenticator.Write(m)
 	authenticator.Sum(out)
 }
 // newMAC returns a mac whose state has been set up from key by initialize.
 // The buffer and offset fields are left at their zero values.
 func newMAC(key *[32]byte) (h mac) {
 	initialize(&h.state, key)
 	return h
 }
 // mac carries an in-progress computation: the state words handed to the
 // assembly routines, plus a buffer for input that has been written but not
 // yet passed to update.
 type mac struct {
 	state [7]uint64 // layout: h0, h1, h2, r0, r1, pad0, pad1
 	// buffer holds pending input; offset is the number of bytes of buffer
 	// currently in use.
 	buffer [TagSize]byte
 	offset int
 }
 func (h *mac) Write(p []byte) (n int, err error) {
 	n = len(p)
 	if h.offset > 0 {
-		remaining := TagSize - h.offset
+		n := copy(h.buffer[h.offset:], p)
-		if n < remaining {
+		if h.offset+n < TagSize {
-			h.offset += copy(h.buffer[h.offset:], p)
+			h.offset += n
-			return n, nil
+			return nn, nil
 		}
-		copy(h.buffer[h.offset:], p[:remaining])
+		p = p[n:]
 		p = p[remaining:]
 		h.offset = 0
-		update(&h.state, h.buffer[:])
+		update(&h.macState, h.buffer[:])
 	}
-	if nn := len(p) - (len(p) % TagSize); nn > 0 {
+	if n := len(p) - (len(p) % TagSize); n > 0 {
-		update(&h.state, p[:nn])
+		update(&h.macState, p[:n])
-		p = p[nn:]
+		p = p[n:]
 	}
 	if len(p) > 0 {
 		h.offset += copy(h.buffer[h.offset:], p)
 	}
-	return n, nil
+	return nn, nil
 }
 func (h *mac) Sum(out *[16]byte) {
-	state := h.state
+	state := h.macState
 	if h.offset > 0 {
 		update(&state, h.buffer[:h.offset])
 	}
-	finalize(out, &state)
+	finalize(out, &state.h, &state.s)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
@ -58,7 +58,6 @@ DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
 GLOBL ·poly1305Mask<>(SB), RODATA, $16
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVD state+0(FP), R3
 	MOVD msg_base+8(FP), R4
@ -180,68 +179,3 @@ done:
 	MOVD R9, 8(R3)
 	MOVD R10, 16(R3)
 	RET
 // func initialize(state *[7]uint64, key *[32]byte)
 //
 // initialize loads the 32-byte key into the state: key[0:16] ANDed with
 // poly1305Mask goes to byte offsets 24 and 32 (state[3], state[4]) and
 // key[16:32] goes unchanged to byte offsets 40 and 48 (state[5], state[6]).
 TEXT ·initialize(SB), $0-16
 	MOVD state+0(FP), R3
 	MOVD key+8(FP), R4
 	// state[0..6] is expected to be zero on entry; only state[3..6] is
 	// written below.
 	// Load key
 	MOVD 0(R4), R5
 	MOVD 8(R4), R6
 	MOVD 16(R4), R7
 	MOVD 24(R4), R8
 	// Address of key mask
 	MOVD $·poly1305Mask<>(SB), R9
 	// Save the second, unmasked half of the key in state[5], state[6]
 	MOVD R7, 40(R3)
 	MOVD R8, 48(R3)
 	// Get mask (R7 and R8 are free again after the stores above)
 	MOVD (R9), R7
 	MOVD 8(R9), R8
 	// AND the mask with the first half of the key
 	AND R5, R7, R5
 	AND R6, R8, R6
 	// Save masked key in state[3], state[4]
 	MOVD R5, 24(R3)
 	MOVD R6, 32(R3)
 	RET
 // func finalize(tag *[TagSize]byte, state *[7]uint64)
 //
 // finalize reads h0, h1, h2 from state[0..2], performs a three-word
 // subtraction using the constants -5, -1 and 3, selects between the result
 // and the saved original h0, h1 based on the outcome, then adds state[5] and
 // state[6] and stores the two resulting words to tag.
 TEXT ·finalize(SB), $0-16
 	MOVD tag+0(FP), R3
 	MOVD state+8(FP), R4
 	// Get h0, h1, h2 from state
 	MOVD 0(R4), R5
 	MOVD 8(R4), R6
 	MOVD 16(R4), R7
 	// Save h0, h1
 	MOVD  R5, R8
 	MOVD  R6, R9
 	// Subtract with carry chaining across the three words; R20 and R21 hold
 	// the constants for the upper two words.
 	MOVD  $3, R20
 	MOVD  $-1, R21
 	SUBC  $-5, R5
 	SUBE  R21, R6
 	SUBE  R20, R7
 	// Fold the final carry state into R21 so it can be tested below.
 	MOVD  $0, R21
 	SUBZE R21
 	// Check for carry
 	// NOTE(review): ISEL presumably keeps the subtracted words when R21 == 0
 	// and the saved h0, h1 otherwise - confirm operand order.
 	CMP  $0, R21
 	ISEL $2, R5, R8, R5
 	ISEL $2, R6, R9, R6
 	// Add state[5], state[6] with carry between the two words.
 	MOVD 40(R4), R8
 	MOVD 48(R4), R9
 	ADDC R8, R5
 	ADDE R9, R6
 	MOVD R5, 0(R3)
 	MOVD R6, 8(R3)
 	RET
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build s390x,go1.11,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
@ -10,33 +10,66 @@ import (
 	"golang.org/x/sys/cpu"
 )
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// updateVX is an assembly implementation of Poly1305 that uses vector
 // instructions. It must only be called if the vector facility (vx) is
 // available.
 //go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+func updateVX(state *macState, msg []byte)
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
+// calls that would have gone to updateGeneric to updateVX if the vector
-// available and if VMSL is supported.
+// facility is installed.
-//go:noescape
+//
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+// A larger buffer is required for good performance because the vector
 // implementation has a higher fixed cost per call than the generic
 // implementation.
 type mac struct {
 	macState
-// Sum generates an authenticator for m using a one-time key and puts the
+	buffer [16 * TagSize]byte // size must be a multiple of block size (16)
-// 16-byte result into out. Authenticating two different messages with the same
+	offset int
-// key allows an attacker to forge messages at will.
+}
-func Sum(out *[16]byte, m []byte, key *[32]byte) {
+
-	if cpu.S390X.HasVX {
+func (h *mac) Write(p []byte) (int, error) {
-		var mPtr *byte
+	nn := len(p)
-		if len(m) > 0 {
+	if h.offset > 0 {
-			mPtr = &m[0]
+		n := copy(h.buffer[h.offset:], p)
-		}
+		if h.offset+n < len(h.buffer) {
-		if cpu.S390X.HasVXE && len(m) > 256 {
+			h.offset += n
-			poly1305vmsl(out, mPtr, uint64(len(m)), key)
+			return nn, nil
-		} else {
+		}
-			poly1305vx(out, mPtr, uint64(len(m)), key)
+		p = p[n:]
-		}
+		h.offset = 0
-	} else {
+		if cpu.S390X.HasVX {
-		sumGeneric(out, m, key)
+			updateVX(&h.macState, h.buffer[:])
-	}
+		} else {
 			updateGeneric(&h.macState, h.buffer[:])
 		}
 	}
 	tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
 	body := len(p) - tail          // number of bytes to process now
 	if body > 0 {
 		if cpu.S390X.HasVX {
 			updateVX(&h.macState, p[:body])
 		} else {
 			updateGeneric(&h.macState, p[:body])
 		}
 	}
 	h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
 	return nn, nil
 }
 // Sum folds any bytes still held in h.buffer into a copy of the running
 // state and passes that copy to finalize, which receives out as its
 // destination. Because the updates are applied to the copy, h.macState,
 // h.buffer and h.offset are left unchanged by this method.
 func (h *mac) Sum(out *[TagSize]byte) {
 	// Copy the state so the buffered remainder is not absorbed into h.
 	state := h.macState
 	// Only the first h.offset bytes of the buffer hold pending input.
 	remainder := h.buffer[:h.offset]
 	// Use the generic implementation if we have 2 or fewer blocks left
 	// to sum. The vector implementation has a higher startup time.
 	if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
 		updateVX(&state, remainder)
 	} else if len(remainder) > 0 {
 		// Nothing is absorbed at all when the buffer is empty.
 		updateGeneric(&state, remainder)
 	}
 	finalize(out, &state.h, &state.s)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@ -2,115 +2,187 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build s390x,go1.11,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
-// Implementation of Poly1305 using the vector facility (vx).
+// This implementation of Poly1305 uses the vector facility (vx)
 // to process up to 2 blocks (32 bytes) per iteration using an
 // algorithm based on the one described in:
 //
 // NEON crypto, Daniel J. Bernstein & Peter Schwabe
 // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 //
 // This algorithm uses 5 26-bit limbs to represent a 130-bit
 // value. These limbs are, for the most part, zero extended and
 // placed into 64-bit vector register elements. Each vector
 // register is 128-bits wide and so holds 2 of these elements.
 // Using 26-bit limbs allows us plenty of headroom to accommodate
 // accumulations before and after multiplication without
 // overflowing either 32-bits (before multiplication) or 64-bits
 // (after multiplication).
 //
 // In order to parallelise the operations required to calculate
 // the sum we use two separate accumulators and then sum those
 // in an extra final step. For compatibility with the generic
 // implementation we perform this summation at the end of every
 // updateVX call.
 //
 // To use two accumulators we must multiply the message blocks
 // by r² rather than r. Only the final message block should be
 // multiplied by r.
 //
 // Example:
 //
 // We want to calculate the sum (h) for a 64 byte message (m):
 //
 //   h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
 //
 // To do this we split the calculation into the even indices
 // and odd indices of the message. These form our SIMD 'lanes':
 //
 //   h = m[ 0:16]r⁴ + m[32:48]r² +   <- lane 0
 //       m[16:32]r³ + m[48:64]r      <- lane 1
 //
 // To calculate this iteratively we refactor so that both lanes
 // are written in terms of r² and r:
 //
 //   h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
 //       (m[16:32]r² + m[48:64])r    <- lane 1
 //                ^             ^
 //                |             coefficients for second iteration
 //                coefficients for first iteration
 //
 // So in this case we would have two iterations. In the first
 // both lanes are multiplied by r². In the second only the
 // first lane is multiplied by r² and the second lane is
 // instead multiplied by r. This gives us the odd and even
 // powers of r that we need from the original equation.
 //
 // Notation:
 //
 //   h - accumulator
 //   r - key
 //   m - message
 //
 //   [a, b]       - SIMD register holding two 64-bit values
 //   [a, b, c, d] - SIMD register holding four 32-bit values
 //   xᵢ[n]        - limb n of variable x with bit width i
 //
 // Limbs are expressed in little endian order, so for 26-bit
 // limbs x₂₆[4] will be the most significant limb and x₂₆[0]
 // will be the least significant limb.
-// constants
+// masking constants
-#define MOD26 V0
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
-#define EX0   V1
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
 #define EX1   V2
 #define EX2   V3
-// temporaries
+// expansion constants (see EXPAND macro)
-#define T_0 V4
+#define EX0 V2
-#define T_1 V5
+#define EX1 V3
-#define T_2 V6
+#define EX2 V4
 #define T_3 V7
 #define T_4 V8
-// key (r)
+// key (r², r or 1 depending on context)
-#define R_0  V9
+#define R_0 V5
-#define R_1  V10
+#define R_1 V6
-#define R_2  V11
+#define R_2 V7
-#define R_3  V12
+#define R_3 V8
-#define R_4  V13
+#define R_4 V9
 #define R5_1 V14
 #define R5_2 V15
 #define R5_3 V16
 #define R5_4 V17
 #define RSAVE_0 R5
 #define RSAVE_1 R6
 #define RSAVE_2 R7
 #define RSAVE_3 R8
 #define RSAVE_4 R9
 #define R5SAVE_1 V28
 #define R5SAVE_2 V29
 #define R5SAVE_3 V30
 #define R5SAVE_4 V31
-// message block
+// precalculated coefficients (5r², 5r or 0 depending on context)
-#define F_0 V18
+#define R5_1 V10
-#define F_1 V19
+#define R5_2 V11
-#define F_2 V20
+#define R5_3 V12
-#define F_3 V21
+#define R5_4 V13
 #define F_4 V22
-// accumulator
+// message block (m)
-#define H_0 V23
+#define M_0 V14
-#define H_1 V24
+#define M_1 V15
-#define H_2 V25
+#define M_2 V16
-#define H_3 V26
+#define M_3 V17
-#define H_4 V27
+#define M_4 V18
-GLOBL ·keyMask<>(SB), RODATA, $16
+// accumulator (h)
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
+#define H_0 V19
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
+#define H_1 V20
 #define H_2 V21
 #define H_3 V22
 #define H_4 V23
-GLOBL ·bswapMask<>(SB), RODATA, $16
+// temporary registers (for short-lived values)
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
+#define T_0 V24
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
+#define T_1 V25
 #define T_2 V26
 #define T_3 V27
 #define T_4 V28
-GLOBL ·constants<>(SB), RODATA, $64
+GLOBL ·constants<>(SB), RODATA, $0x30
 // MOD26
 DATA ·constants<>+0(SB)/8, $0x3ffffff
 DATA ·constants<>+8(SB)/8, $0x3ffffff
 // EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
 // EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
 // EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
-// h = (f*g) % (2**130-5) [partial reduction]
+// MULTIPLY multiplies each lane of f and g, partially reduced
 // modulo 2¹³⁰ - 5. The result, h, consists of partial products
 // in each lane that need to be reduced further to produce the
 // final result.
 //
 //   h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
 //
 // Note that the multiplication by 5 of the high bits is
 // achieved by precalculating the multiplication of four of the
 // g coefficients by 5. These are g51-g54.
 #define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
 	VMLOF  f0, g0, h0        \
 	VMLOF  f0, g1, h1        \
 	VMLOF  f0, g2, h2        \
 	VMLOF  f0, g3, h3        \
 	VMLOF  f0, g1, h1        \
 	VMLOF  f0, g4, h4        \
 	VMLOF  f0, g2, h2        \
 	VMLOF  f1, g54, T_0      \
 	VMLOF  f1, g0, T_1       \
 	VMLOF  f1, g1, T_2       \
 	VMLOF  f1, g2, T_3       \
 	VMLOF  f1, g0, T_1       \
 	VMLOF  f1, g3, T_4       \
 	VMLOF  f1, g1, T_2       \
 	VMALOF f2, g53, h0, h0   \
 	VMALOF f2, g54, h1, h1   \
 	VMALOF f2, g0, h2, h2    \
 	VMALOF f2, g1, h3, h3    \
 	VMALOF f2, g54, h1, h1   \
 	VMALOF f2, g2, h4, h4    \
 	VMALOF f2, g0, h2, h2    \
 	VMALOF f3, g52, T_0, T_0 \
 	VMALOF f3, g53, T_1, T_1 \
 	VMALOF f3, g54, T_2, T_2 \
 	VMALOF f3, g0, T_3, T_3  \
 	VMALOF f3, g53, T_1, T_1 \
 	VMALOF f3, g1, T_4, T_4  \
 	VMALOF f3, g54, T_2, T_2 \
 	VMALOF f4, g51, h0, h0   \
 	VMALOF f4, g52, h1, h1   \
 	VMALOF f4, g53, h2, h2   \
 	VMALOF f4, g54, h3, h3   \
 	VMALOF f4, g52, h1, h1   \
 	VMALOF f4, g0, h4, h4    \
 	VMALOF f4, g53, h2, h2   \
 	VAG    T_0, h0, h0       \
 	VAG    T_1, h1, h1       \
 	VAG    T_2, h2, h2       \
 	VAG    T_3, h3, h3       \
-	VAG    T_4, h4, h4
+	VAG    T_1, h1, h1       \
 	VAG    T_4, h4, h4       \
 	VAG    T_2, h2, h2
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
+// REDUCE performs the following carry operations in four
 // stages, as specified in Bernstein & Schwabe:
 //
 //   1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4]
 //   2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0]
 //   3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3]
 //   4: h₂₆[3]->h₂₆[4]
 //
 // The result is that all of the limbs are limited to 26-bits
 // except for h₂₆[1] and h₂₆[4] which are limited to 27-bits.
 //
 // Note that although each limb is aligned at 26-bit intervals
 // they may contain values that exceed 2²⁶ - 1, hence the need
 // to carry the excess bits in each limb.
 #define REDUCE(h0, h1, h2, h3, h4) \
 	VESRLG $26, h0, T_0  \
 	VESRLG $26, h3, T_1  \
@ -136,144 +208,155 @@ DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
 	VN     MOD26, h3, h3 \
 	VAG    T_2, h4, h4
-// expand in0 into d[0] and in1 into d[1]
+// EXPAND splits the 128-bit little-endian values in0 and in1
 // into 26-bit big-endian limbs and places the results into
 // the first and second lane of d₂₆[0:4] respectively.
 //
 // The EX0, EX1 and EX2 constants are arrays of byte indices
 // for permutation. The permutation both reverses the bytes
 // in the input and ensures the bytes are copied into the
 // destination limb ready to be shifted into their final
 // position.
 #define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
 	VGBM   $0x0707, d1       \ // d1=tmp
 	VPERM  in0, in1, EX2, d4 \
 	VPERM  in0, in1, EX0, d0 \
 	VPERM  in0, in1, EX1, d2 \
-	VN     d1, d4, d4        \
+	VPERM  in0, in1, EX2, d4 \
 	VESRLG $26, d0, d1       \
 	VESRLG $30, d2, d3       \
 	VESRLG $4, d2, d2        \
-	VN     MOD26, d0, d0     \
+	VN     MOD26, d0, d0     \ // [in0₂₆[0], in1₂₆[0]]
-	VN     MOD26, d1, d1     \
+	VN     MOD26, d3, d3     \ // [in0₂₆[3], in1₂₆[3]]
-	VN     MOD26, d2, d2     \
+	VN     MOD26, d1, d1     \ // [in0₂₆[1], in1₂₆[1]]
-	VN     MOD26, d3, d3
+	VN     MOD24, d4, d4     \ // [in0₂₆[4], in1₂₆[4]]
 	VN     MOD26, d2, d2     // [in0₂₆[2], in1₂₆[2]]
-// pack h4:h0 into h1:h0 (no carry)
+// func updateVX(state *macState, msg []byte)
-#define PACK(h0, h1, h2, h3, h4) \
+TEXT ·updateVX(SB), NOSPLIT, $0
-	VESLG $26, h1, h1  \
+	MOVD state+0(FP), R1
-	VESLG $26, h3, h3  \
+	LMG  msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
 	VO    h0, h1, h0   \
 	VO    h2, h3, h2   \
 	VESLG $4, h2, h2   \
 	VLEIB $7, $48, h1  \
 	VSLB  h1, h2, h2   \
 	VO    h0, h2, h0   \
 	VLEIB $7, $104, h1 \
 	VSLB  h1, h4, h3   \
 	VO    h3, h0, h0   \
 	VLEIB $7, $24, h1  \
 	VSRLB h1, h4, h1
-// if h > 2**130-5 then h -= 2**130-5
+	// load EX0, EX1 and EX2
 #define MOD(h0, h1, t0, t1, t2) \
 	VZERO t0          \
 	VLEIG $1, $5, t0  \
 	VACCQ h0, t0, t1  \
 	VAQ   h0, t0, t0  \
 	VONE  t2          \
 	VLEIG $1, $-4, t2 \
 	VAQ   t2, t1, t1  \
 	VACCQ h1, t1, t1  \
 	VONE  t2          \
 	VAQ   t2, t1, t1  \
 	VN    h0, t1, t2  \
 	VNC   t0, t1, t1  \
 	VO    t1, t2, h0
 // func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
 TEXT ·poly1305vx(SB), $0-32
 	// This code processes up to 2 blocks (32 bytes) per iteration
 	// using the algorithm described in:
 	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
 	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
 	// load MOD26, EX0, EX1 and EX2
 	MOVD $·constants<>(SB), R5
-	VLM  (R5), MOD26, EX2
+	VLM  (R5), EX0, EX2
-	// setup r
+	// generate masks
-	VL   (R4), T_0
+	VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
-	MOVD $·keyMask<>(SB), R6
+	VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
 	VL   (R6), T_1
 	VN   T_0, T_1, T_0
 	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-	// setup r*5
+	// load h (accumulator) and r (key) from state
-	VLEIG $0, $5, T_0
+	VZERO T_1               // [0, 0]
-	VLEIG $1, $5, T_0
+	VL    0(R1), T_0        // [h₆₄[0], h₆₄[1]]
 	VLEG  $0, 16(R1), T_1   // [h₆₄[2], 0]
 	VL    24(R1), T_2       // [r₆₄[0], r₆₄[1]]
 	VPDI  $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]]
 	VPDI  $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]]
-	// store r (for final block)
+	// unpack h and r into 26-bit limbs
-	VMLOF T_0, R_1, R5SAVE_1
+	// note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value
-	VMLOF T_0, R_2, R5SAVE_2
+	VN     MOD26, T_3, H_0            // [h₂₆[0], r₂₆[0]]
-	VMLOF T_0, R_3, R5SAVE_3
+	VZERO  H_1                        // [0, 0]
-	VMLOF T_0, R_4, R5SAVE_4
+	VZERO  H_3                        // [0, 0]
-	VLGVG $0, R_0, RSAVE_0
+	VGMG   $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
-	VLGVG $0, R_1, RSAVE_1
+	VESLG  $24, T_1, T_1              // [h₆₄[2]<<24, 0]
-	VLGVG $0, R_2, RSAVE_2
+	VERIMG $-26&63, T_3, MOD26, H_1   // [h₂₆[1], r₂₆[1]]
-	VLGVG $0, R_3, RSAVE_3
+	VESRLG $+52&63, T_3, H_2          // [h₂₆[2], r₂₆[2]] - low 12 bits only
-	VLGVG $0, R_4, RSAVE_4
+	VERIMG $-14&63, T_4, MOD26, H_3   // [h₂₆[3], r₂₆[3]]
 	VESRLG $40, T_4, H_4              // [h₂₆[4], r₂₆[4]] - low 24 bits only
 	VERIMG $+12&63, T_4, T_0, H_2     // [h₂₆[2], r₂₆[2]] - complete
 	VO     T_1, H_4, H_4              // [h₂₆[4], r₂₆[4]] - complete
-	// skip r**2 calculation
+	// replicate r across all 4 vector elements
 	VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]]
 	VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]]
 	VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]]
 	VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]]
 	VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]]
 	// zero out lane 1 of h
 	VLEIG $1, $0, H_0 // [h₂₆[0], 0]
 	VLEIG $1, $0, H_1 // [h₂₆[1], 0]
 	VLEIG $1, $0, H_2 // [h₂₆[2], 0]
 	VLEIG $1, $0, H_3 // [h₂₆[3], 0]
 	VLEIG $1, $0, H_4 // [h₂₆[4], 0]
 	// calculate 5r (ignore least significant limb)
 	VREPIF $5, T_0
 	VMLF   T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]]
 	VMLF   T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]]
 	VMLF   T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]]
 	VMLF   T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]]
 	// skip r² calculation if we are only calculating one block
 	CMPBLE R3, $16, skip
-	// calculate r**2
+	// calculate r²
-	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
+	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
-	REDUCE(H_0, H_1, H_2, H_3, H_4)
+	REDUCE(M_0, M_1, M_2, M_3, M_4)
-	VLEIG $0, $5, T_0
+	VGBM   $0x0f0f, T_0
-	VLEIG $1, $5, T_0
+	VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]]
-	VMLOF T_0, H_1, R5_1
+	VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]]
-	VMLOF T_0, H_2, R5_2
+	VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]]
-	VMLOF T_0, H_3, R5_3
+	VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]]
-	VMLOF T_0, H_4, R5_4
+	VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]]
 	VLR   H_0, R_0
 	VLR   H_1, R_1
 	VLR   H_2, R_2
 	VLR   H_3, R_3
 	VLR   H_4, R_4
-	// initialize h
+	// calculate 5r² (ignore least significant limb)
-	VZERO H_0
+	VREPIF $5, T_0
-	VZERO H_1
+	VMLF   T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]]
-	VZERO H_2
+	VMLF   T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]]
-	VZERO H_3
+	VMLF   T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]]
-	VZERO H_4
+	VMLF   T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]]
 loop:
-	CMPBLE R3, $32, b2
+	CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
 	// load next 2 blocks from message
 	VLM (R2), T_0, T_1
 	// update message slice
 	SUB  $32, R3
 	MOVD $32(R2), R2
-	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+
-	VLEIB  $4, $1, F_4
+	// unpack message blocks into 26-bit big-endian limbs
-	VLEIB  $12, $1, F_4
+	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
 	// add 2¹²⁸ to each message block value
 	VLEIB $4, $1, M_4
 	VLEIB $12, $1, M_4
 multiply:
-	VAG    H_0, F_0, F_0
+	// accumulate the incoming message
-	VAG    H_1, F_1, F_1
+	VAG H_0, M_0, M_0
-	VAG    H_2, F_2, F_2
+	VAG H_3, M_3, M_3
-	VAG    H_3, F_3, F_3
+	VAG H_1, M_1, M_1
-	VAG    H_4, F_4, F_4
+	VAG H_4, M_4, M_4
-	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+	VAG H_2, M_2, M_2
 	// multiply the accumulator by the key coefficient
 	MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
 	// carry and partially reduce the partial products
 	REDUCE(H_0, H_1, H_2, H_3, H_4)
 	CMPBNE R3, $0, loop
 finish:
-	// sum vectors
+	// sum lane 0 and lane 1 and put the result in lane 1
 	VZERO  T_0
 	VSUMQG H_0, T_0, H_0
 	VSUMQG H_1, T_0, H_1
 	VSUMQG H_2, T_0, H_2
 	VSUMQG H_3, T_0, H_3
 	VSUMQG H_1, T_0, H_1
 	VSUMQG H_4, T_0, H_4
 	VSUMQG H_2, T_0, H_2
-	// h may be >= 2*(2**130-5) so we need to reduce it again
+	// reduce again after summation
 	// TODO(mundaym): there might be a more efficient way to do this
 	// now that we only have 1 active lane. For example, we could
 	// simultaneously pack the values as we reduce them.
 	REDUCE(H_0, H_1, H_2, H_3, H_4)
-	// carry h1->h4
+	// carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
 	// TODO(mundaym): in testing this final carry was unnecessary.
 	// Needs a proof before it can be removed though.
 	VESRLG $26, H_1, T_1
 	VN     MOD26, H_1, H_1
 	VAQ    T_1, H_2, H_2
@ -284,95 +367,137 @@ finish:
 	VN     MOD26, H_3, H_3
 	VAQ    T_3, H_4, H_4
-	// h is now < 2*(2**130-5)
+	// h is now < 2(2¹³⁰ - 5)
-	// pack h into h1 (hi) and h0 (lo)
+	// Pack each lane in h₂₆[0:4] into h₁₂₈[0:1].
-	PACK(H_0, H_1, H_2, H_3, H_4)
+	VESLG $26, H_1, H_1
-
+	VESLG $26, H_3, H_3
-	// if h > 2**130-5 then h -= 2**130-5
+	VO    H_0, H_1, H_0
-	MOD(H_0, H_1, T_0, T_1, T_2)
+	VO    H_2, H_3, H_2
-
+	VESLG $4, H_2, H_2
-	// h += s
+	VLEIB $7, $48, H_1
-	MOVD  $·bswapMask<>(SB), R5
+	VSLB  H_1, H_2, H_2
-	VL    (R5), T_1
+	VO    H_0, H_2, H_0
-	VL    16(R4), T_0
+	VLEIB $7, $104, H_1
-	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
+	VSLB  H_1, H_4, H_3
-	VAQ   T_0, H_0, H_0
+	VO    H_3, H_0, H_0
-	VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
+	VLEIB $7, $24, H_1
-	VST   H_0, (R1)
+	VSRLB H_1, H_4, H_1
 	// update state
 	VSTEG $1, H_0, 0(R1)
 	VSTEG $0, H_0, 8(R1)
 	VSTEG $1, H_1, 16(R1)
 	RET
-b2:
+b2:  // 2 or fewer blocks remaining
 	CMPBLE R3, $16, b1
-	// 2 blocks remaining
+	// Load the 2 remaining blocks (17-32 bytes remaining).
-	SUB    $17, R3
+	MOVD $-17(R3), R0    // index of final byte to load modulo 16
-	VL     (R2), T_0
+	VL   (R2), T_0       // load full 16 byte block
-	VLL    R3, 16(R2), T_1
+	VLL  R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, T_1
 	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $12, $1, F_4
 	VLEIB  $4, $1, F_4
-	// setup [r²,r]
+	// The Poly1305 algorithm requires that a 1 bit be appended to
-	VLVGG $1, RSAVE_0, R_0
+	// each message block. If the final block is less than 16 bytes
-	VLVGG $1, RSAVE_1, R_1
+	// long then it is easiest to insert the 1 before the message
-	VLVGG $1, RSAVE_2, R_2
+	// block is split into 26-bit limbs. If, on the other hand, the
-	VLVGG $1, RSAVE_3, R_3
+	// final message block is 16 bytes long then we append the 1 bit
-	VLVGG $1, RSAVE_4, R_4
+	// after expansion as normal.
-	VPDI  $0, R5_1, R5SAVE_1, R5_1
+	MOVBZ  $1, R0
-	VPDI  $0, R5_2, R5SAVE_2, R5_2
+	MOVD   $-16(R3), R3   // index of byte in last block to insert 1 at (could be 16)
-	VPDI  $0, R5_3, R5SAVE_3, R5_3
+	CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
-	VPDI  $0, R5_4, R5SAVE_4, R5_4
+	VLVGB  R3, R0, T_1    // insert 1 into the byte at index R3
 	// Split both blocks into 26-bit limbs in the appropriate lanes.
 	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
 	// Append a 1 byte to the end of the second to last block.
 	VLEIB $4, $1, M_4
 	// Append a 1 byte to the end of the last block only if it is a
 	// full 16 byte block.
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $12, $1, M_4
 	// Finally, set up the coefficients for the final multiplication.
 	// We have previously saved r and 5r in the 32-bit even indexes
 	// of the R_[0-4] and R5_[1-4] coefficient registers.
 	//
 	// We want lane 0 to be multiplied by r² so that can be kept the
 	// same. We want lane 1 to be multiplied by r so we need to move
 	// the saved r value into the 32-bit odd index in lane 1 by
 	// rotating the 64-bit lane by 32.
 	VGBM   $0x00ff, T_0         // [0, 0xffffffffffffffff] - mask lane 1 only
 	VERIMG $32, R_0, T_0, R_0   // [_,  r²₂₆[0], _,  r₂₆[0]]
 	VERIMG $32, R_1, T_0, R_1   // [_,  r²₂₆[1], _,  r₂₆[1]]
 	VERIMG $32, R_2, T_0, R_2   // [_,  r²₂₆[2], _,  r₂₆[2]]
 	VERIMG $32, R_3, T_0, R_3   // [_,  r²₂₆[3], _,  r₂₆[3]]
 	VERIMG $32, R_4, T_0, R_4   // [_,  r²₂₆[4], _,  r₂₆[4]]
 	VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]]
 	VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]]
 	VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]]
 	VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]]
 	MOVD $0, R3
 	BR   multiply
 skip:
 	VZERO H_0
 	VZERO H_1
 	VZERO H_2
 	VZERO H_3
 	VZERO H_4
 	CMPBEQ R3, $0, finish
-b1:
+b1:  // 1 block remaining
-	// 1 block remaining
+
-	SUB    $1, R3
+	// Load the final block (1-16 bytes). This will be placed into
-	VLL    R3, (R2), T_0
+	// lane 0.
-	ADD    $1, R3
+	MOVD $-1(R3), R0
 	VLL  R0, (R2), T_0 // pad to 16 bytes with zeros
 	// The Poly1305 algorithm requires that a 1 bit be appended to
 	// each message block. If the final block is less than 16 bytes
 	// long then it is easiest to insert the 1 before the message
 	// block is split into 26-bit limbs. If, on the other hand, the
 	// final message block is 16 bytes long then we append the 1 bit
 	// after expansion as normal.
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, T_0
 	VZERO  T_1
 	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $4, $1, F_4
 	VLEIG  $1, $1, R_0
 	VZERO  R_1
 	VZERO  R_2
 	VZERO  R_3
 	VZERO  R_4
 	VZERO  R5_1
 	VZERO  R5_2
 	VZERO  R5_3
 	VZERO  R5_4
-	// setup [r, 1]
+	// Set the message block in lane 1 to the value 0 so that it
-	VLVGG $0, RSAVE_0, R_0
+	// can be accumulated without affecting the final result.
-	VLVGG $0, RSAVE_1, R_1
+	VZERO T_1
-	VLVGG $0, RSAVE_2, R_2
+
-	VLVGG $0, RSAVE_3, R_3
+	// Split the final message block into 26-bit limbs in lane 0.
-	VLVGG $0, RSAVE_4, R_4
+	// Lane 1 will contain 0.
-	VPDI  $0, R5SAVE_1, R5_1, R5_1
+	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
-	VPDI  $0, R5SAVE_2, R5_2, R5_2
+
-	VPDI  $0, R5SAVE_3, R5_3, R5_3
+	// Append a 1 byte to the end of the last block only if it is a
-	VPDI  $0, R5SAVE_4, R5_4, R5_4
+	// full 16 byte block.
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $4, $1, M_4
 	// We have previously saved r and 5r in the 32-bit even indexes
 	// of the R_[0-4] and R5_[1-4] coefficient registers.
 	//
 	// We want lane 0 to be multiplied by r so we need to move the
 	// saved r value into the 32-bit odd index in lane 0. We want
 	// lane 1 to be set to the value 1. This makes multiplication
 	// a no-op. We do this by setting lane 1 in every register to 0
 	// and then just setting the 32-bit index 3 in R_0 to 1.
 	VZERO T_0
 	MOVD  $0, R0
 	MOVD  $0x10111213, R12
 	VLVGP R12, R0, T_1         // [_, 0x10111213, _, 0x00000000]
 	VPERM T_0, R_0, T_1, R_0   // [_,  r₂₆[0], _, 0]
 	VPERM T_0, R_1, T_1, R_1   // [_,  r₂₆[1], _, 0]
 	VPERM T_0, R_2, T_1, R_2   // [_,  r₂₆[2], _, 0]
 	VPERM T_0, R_3, T_1, R_3   // [_,  r₂₆[3], _, 0]
 	VPERM T_0, R_4, T_1, R_4   // [_,  r₂₆[4], _, 0]
 	VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0]
 	VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0]
 	VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0]
 	VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0]
 	// Set the value of lane 1 to be 1.
 	VLEIF $3, $1, R_0 // [_,  r₂₆[0], _, 1]
 	MOVD $0, R3
 	BR   multiply
--- a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
@ -1,909 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build s390x,go1.11,!gccgo,!appengine
 #include "textflag.h"
 // Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
 // constants
 #define EX0   V1
 #define EX1   V2
 #define EX2   V3
 // temporaries
 #define T_0 V4
 #define T_1 V5
 #define T_2 V6
 #define T_3 V7
 #define T_4 V8
 #define T_5 V9
 #define T_6 V10
 #define T_7 V11
 #define T_8 V12
 #define T_9 V13
 #define T_10 V14
 // r**2 & r**4
 #define R_0  V15
 #define R_1  V16
 #define R_2  V17
 #define R5_1 V18
 #define R5_2 V19
 // key (r)
 #define RSAVE_0 R7
 #define RSAVE_1 R8
 #define RSAVE_2 R9
 #define R5SAVE_1 R10
 #define R5SAVE_2 R11
 // message block
 #define M0 V20
 #define M1 V21
 #define M2 V22
 #define M3 V23
 #define M4 V24
 #define M5 V25
 // accumulator
 #define H0_0 V26
 #define H1_0 V27
 #define H2_0 V28
 #define H0_1 V29
 #define H1_1 V30
 #define H2_1 V31
 GLOBL ·keyMask<>(SB), RODATA, $16
 DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
 DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
 GLOBL ·bswapMask<>(SB), RODATA, $16
 DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
 DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
 GLOBL ·constants<>(SB), RODATA, $48
 // EX0
 DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+8(SB)/8, $0x0000050403020100
 // EX1
 DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+24(SB)/8, $0x00000a0908070605
 // EX2
 DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
 GLOBL ·c<>(SB), RODATA, $48
 // EX0
 DATA ·c<>+0(SB)/8, $0x0000050403020100
 DATA ·c<>+8(SB)/8, $0x0000151413121110
 // EX1
 DATA ·c<>+16(SB)/8, $0x00000a0908070605
 DATA ·c<>+24(SB)/8, $0x00001a1918171615
 // EX2
 DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
 DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
 GLOBL ·reduce<>(SB), RODATA, $32
 // 44 bit
 DATA ·reduce<>+0(SB)/8, $0x0
 DATA ·reduce<>+8(SB)/8, $0xfffffffffff
 // 42 bit
 DATA ·reduce<>+16(SB)/8, $0x0
 DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
 // h = (f*g) % (2**130-5) [partial reduction]
 // uses T_0...T_9 temporary registers
 // input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
 // output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
 //
 // NOTE: the expansion names the file-level registers T_0...T_9 and V0
 // directly; the t0...t9 macro parameters are never referenced in the body,
 // so whatever the caller passes for them is ignored. The m4_* and m5_*
 // arguments are written by the VMSLG instructions as well as read, so they
 // do not survive the macro.
 #define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
 	\ // Eliminate the dependency for the last 2 VMSLs
 	VMSLG m02_0, r_2, m4_2, m4_2                       \
 	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
 	VMSLG m02_0, r_0, m4_0, m4_0                       \
 	VMSLG m02_1, r5_2, V0, T_0                         \
 	VMSLG m02_0, r_1, m4_1, m4_1                       \
 	VMSLG m02_1, r_0, V0, T_1                          \
 	VMSLG m02_1, r_1, V0, T_2                          \
 	VMSLG m02_2, r5_1, V0, T_3                         \
 	VMSLG m02_2, r5_2, V0, T_4                         \
 	VMSLG m13_0, r_0, m5_0, m5_0                       \
 	VMSLG m13_1, r5_2, V0, T_5                         \
 	VMSLG m13_0, r_1, m5_1, m5_1                       \
 	VMSLG m13_1, r_0, V0, T_6                          \
 	VMSLG m13_1, r_1, V0, T_7                          \
 	VMSLG m13_2, r5_1, V0, T_8                         \
 	VMSLG m13_2, r5_2, V0, T_9                         \
 	VMSLG m02_2, r_0, m4_2, m4_2                       \
 	VMSLG m13_2, r_0, m5_2, m5_2                       \
 	VAQ   m4_0, T_0, m02_0                             \
 	VAQ   m4_1, T_1, m02_1                             \
 	VAQ   m5_0, T_5, m13_0                             \
 	VAQ   m5_1, T_6, m13_1                             \
 	VAQ   m02_0, T_3, m02_0                            \
 	VAQ   m02_1, T_4, m02_1                            \
 	VAQ   m13_0, T_8, m13_0                            \
 	VAQ   m13_1, T_9, m13_1                            \
 	VAQ   m4_2, T_2, m02_2                             \
 	VAQ   m5_2, T_7, m13_2                             \
 // SQUARE uses three limbs of r and r_2*5 to output square of r
 // uses T_1, T_5 and T_7 temporary registers (these are what the call sites
 // in this file pass for TEMP0, TEMP1 and TEMP2)
 // input: r_0, r_1, r_2, r5_2
 // temp: TEMP0, TEMP1, TEMP2
 // output: p0, p1, p2
 //
 // Each TEMP product is added into its accumulator twice (the two groups of
 // VAQ instructions), so those terms are counted double. p0, p1 and p2 are
 // passed as the third VMSLG operand as well as the destination, so their
 // incoming contents take part in the result; the call sites in this file
 // zero them before expanding the macro.
 #define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
 	VMSLG r_0, r_0, p0, p0     \
 	VMSLG r_1, r5_2, V0, TEMP0 \
 	VMSLG r_2, r5_2, p1, p1    \
 	VMSLG r_0, r_1, V0, TEMP1  \
 	VMSLG r_1, r_1, p2, p2     \
 	VMSLG r_0, r_2, V0, TEMP2  \
 	VAQ   TEMP0, p0, p0        \
 	VAQ   TEMP1, p1, p1        \
 	VAQ   TEMP2, p2, p2        \
 	VAQ   TEMP0, p0, p0        \
 	VAQ   TEMP1, p1, p1        \
 	VAQ   TEMP2, p2, p2        \
 // carry h0->h1->h2->h0 || h3->h4->h5->h3
 // uses T_2, T_4, T_5, T_7, T_8, T_9
 //       t6,  t7,  t8,  t9, t10, t11
 // input: h0, h1, h2, h3, h4, h5
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
 // output: h0, h1, h2, h3, h4, h5
 //
 // The 44 and 42 bit clear masks are loaded from the address held in R12, so
 // R12 must point at the ·reduce<> table before this macro is expanded.
 // After the first carry pass the two limb sets are merged into h3, h4 and h5
 // (one set per 64-bit half) and a second carry pass is run on the merged
 // registers.
 #define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
 	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
 	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
 	VREPIB $4, t8         \ // 4 bit shift mask
 	VREPIB $2, t11        \ // 2 bit shift mask
 	VSRLB  t10, h0, t0    \ // h0 byte shift
 	VSRLB  t10, h1, t1    \ // h1 byte shift
 	VSRLB  t10, h2, t2    \ // h2 byte shift
 	VSRLB  t10, h3, t3    \ // h3 byte shift
 	VSRLB  t10, h4, t4    \ // h4 byte shift
 	VSRLB  t10, h5, t5    \ // h5 byte shift
 	VSRL   t8, t0, t0     \ // h0 bit shift
 	VSRL   t8, t1, t1     \ // h1 bit shift
 	VSRL   t11, t2, t2    \ // h2 bit shift
 	VSRL   t8, t3, t3     \ // h3 bit shift
 	VSRL   t8, t4, t4     \ // h4 bit shift
 	VESLG  $2, t2, t9     \ // h2 carry x5
 	VSRL   t11, t5, t5    \ // h5 bit shift
 	VN     t6, h0, h0     \ // h0 clear carry
 	VAQ    t2, t9, t2     \ // h2 carry x5
 	VESLG  $2, t5, t9     \ // h5 carry x5
 	VN     t6, h1, h1     \ // h1 clear carry
 	VN     t7, h2, h2     \ // h2 clear carry
 	VAQ    t5, t9, t5     \ // h5 carry x5
 	VN     t6, h3, h3     \ // h3 clear carry
 	VN     t6, h4, h4     \ // h4 clear carry
 	VN     t7, h5, h5     \ // h5 clear carry
 	VAQ    t0, h1, h1     \ // h0->h1
 	VAQ    t3, h4, h4     \ // h3->h4
 	VAQ    t1, h2, h2     \ // h1->h2
 	VAQ    t4, h5, h5     \ // h4->h5
 	VAQ    t2, h0, h0     \ // h2->h0
 	VAQ    t5, h3, h3     \ // h5->h3
 	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
 	VREPG  $1, t7, t7     \
 	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
 	VSLDB  $8, h1, h1, h1 \
 	VSLDB  $8, h2, h2, h2 \
 	VO     h0, h3, h3     \
 	VO     h1, h4, h4     \
 	VO     h2, h5, h5     \
 	VESRLG $44, h3, t0    \ // 44 bit shift right
 	VESRLG $44, h4, t1    \
 	VESRLG $42, h5, t2    \
 	VN     t6, h3, h3     \ // clear carry bits
 	VN     t6, h4, h4     \
 	VN     t7, h5, h5     \
 	VESLG  $2, t2, t9     \ // multiply carry by 5
 	VAQ    t9, t2, t2     \
 	VAQ    t0, h4, h4     \
 	VAQ    t1, h5, h5     \
 	VAQ    t2, h3, h3     \
 // carry h0->h1->h2->h0
 // input: h0, h1, h2
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
 // output: h0, h1, h2
 //
 // Unlike REDUCE, the 44 and 42 bit clear masks are built in registers (VGBM
 // followed by VESRLG) instead of being loaded through R12. The
 // shift / mask / add carry sequence is performed twice in a row; the carry
 // out of h2 is multiplied by 5 (t2 + (t2 << 2)) before it is added to h0.
 #define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
 	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
 	VREPIB $4, t4        \ // 4 bit shift mask
 	VREPIB $2, t7        \ // 2 bit shift mask
 	VGBM   $0x003F, t5   \ // mask to clear carry bits
 	VSRLB  t3, h0, t0    \
 	VSRLB  t3, h1, t1    \
 	VSRLB  t3, h2, t2    \
 	VESRLG $4, t5, t5    \ // 44 bit clear mask
 	VSRL   t4, t0, t0    \
 	VSRL   t4, t1, t1    \
 	VSRL   t7, t2, t2    \
 	VESRLG $2, t5, t6    \ // 42 bit clear mask
 	VESLG  $2, t2, t8    \
 	VAQ    t8, t2, t2    \
 	VN     t5, h0, h0    \
 	VN     t5, h1, h1    \
 	VN     t6, h2, h2    \
 	VAQ    t0, h1, h1    \
 	VAQ    t1, h2, h2    \
 	VAQ    t2, h0, h0    \
 	VSRLB  t3, h0, t0    \
 	VSRLB  t3, h1, t1    \
 	VSRLB  t3, h2, t2    \
 	VSRL   t4, t0, t0    \
 	VSRL   t4, t1, t1    \
 	VSRL   t7, t2, t2    \
 	VN     t5, h0, h0    \
 	VN     t5, h1, h1    \
 	VESLG  $2, t2, t8    \
 	VN     t6, h2, h2    \
 	VAQ    t0, h1, h1    \
 	VAQ    t8, t2, t2    \
 	VAQ    t1, h2, h2    \
 	VAQ    t2, h0, h0    \
 // expands two message blocks into the lower halves of the d registers
 // moves the contents of the d registers into upper halves
 // input: in1, in2, d0, d1, d2, d3, d4, d5
 // temp: TEMP0, TEMP1, TEMP2, TEMP3
 // output: d0, d1, d2, d3, d4, d5
 //
 // in1 is expanded into d0, d1, d2 and in2 into d3, d4, d5. The VPERM
 // instructions select bytes using the file-level EX0, EX1 and EX2 registers,
 // so the caller must have loaded the permute masks into them beforehand.
 #define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
 	VGBM   $0xff3f, TEMP0      \
 	VGBM   $0xff1f, TEMP1      \
 	VESLG  $4, d1, TEMP2       \
 	VESLG  $4, d4, TEMP3       \
 	VESRLG $4, TEMP0, TEMP0    \
 	VPERM  in1, d0, EX0, d0    \
 	VPERM  in2, d3, EX0, d3    \
 	VPERM  in1, d2, EX2, d2    \
 	VPERM  in2, d5, EX2, d5    \
 	VPERM  in1, TEMP2, EX1, d1 \
 	VPERM  in2, TEMP3, EX1, d4 \
 	VN     TEMP0, d0, d0       \
 	VN     TEMP0, d3, d3       \
 	VESRLG $4, d1, d1          \
 	VESRLG $4, d4, d4          \
 	VN     TEMP1, d2, d2       \
 	VN     TEMP1, d5, d5       \
 	VN     TEMP0, d1, d1       \
 	VN     TEMP0, d4, d4       \
 // expands one message block into the lower halves of the d registers
 // moves the contents of the d registers into upper halves
 // input: in, d0, d1, d2
 // temp: TEMP0, TEMP1, TEMP2
 // output: d0, d1, d2
 //
 // Single-block counterpart of EXPACC. The VPERM instructions select bytes
 // using the file-level EX0, EX1 and EX2 registers, so the caller must have
 // loaded the permute masks into them beforehand.
 #define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
 	VGBM   $0xff3f, TEMP0     \
 	VESLG  $4, d1, TEMP2      \
 	VGBM   $0xff1f, TEMP1     \
 	VPERM  in, d0, EX0, d0    \
 	VESRLG $4, TEMP0, TEMP0   \
 	VPERM  in, d2, EX2, d2    \
 	VPERM  in, TEMP2, EX1, d1 \
 	VN     TEMP0, d0, d0      \
 	VN     TEMP1, d2, d2      \
 	VESRLG $4, d1, d1         \
 	VN     TEMP0, d1, d1      \
 // pack h2:h0 into h1:h0 (no carry)
 // input: h0, h1, h2
 // output: h0, h1, h2
 //
 // All three registers are overwritten. h1 doubles as scratch: it briefly
 // holds the byte-shift count used by VSLB before its upper half is cleared
 // by the last instruction.
 #define PACK(h0, h1, h2) \
 	VMRLG  h1, h2, h2  \ // copy h1 to upper half h2
 	VESLG  $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
 	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
 	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
 	VLEIG  $1, $0, h1  \ // clear h2 stuff from lower half of h1
 	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
 	VLEIG  $0, $0, h2  \ // clear upper half of h2
 	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
 	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
 	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
 	VO     h0, h2, h0  \ // combine h0 with the shifted h2
 	VLEIG  $0, $0, h1  \ // clear upper half of h1
 // if h > 2**130-5 then h -= 2**130-5
 // input: h0, h1
 // temp: t0, t1, t2
 // output: h0
 //
 // The macro contains no branch: t0 receives h0+5, t1 is turned into a
 // selection mask from the carry chain, and the final VN / VNC / VO sequence
 // picks either the original h0 or t0 according to that mask.
 #define MOD(h0, h1, t0, t1, t2) \
 	VZERO t0          \
 	VLEIG $1, $5, t0  \
 	VACCQ h0, t0, t1  \
 	VAQ   h0, t0, t0  \
 	VONE  t2          \
 	VLEIG $1, $-4, t2 \
 	VAQ   t2, t1, t1  \
 	VACCQ h1, t1, t1  \
 	VONE  t2          \
 	VAQ   t2, t1, t1  \
 	VN    h0, t1, t2  \
 	VNC   t0, t1, t1  \
 	VO    t1, t2, h0  \
 // func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]key)
 // NOTE(review): "*[32]key" above is presumably "*[32]byte"; confirm against
 // the Go declaration, which is not part of this file.
 //
 // Register use established below: R1=out, R2=m (advanced as input is
 // consumed), R3=number of message bytes still to process, R4=key, R12=address
 // of the ·reduce<> masks, V0=zero.
 TEXT ·poly1305vmsl(SB), $0-32
 	// This code processes 6 + up to 4 blocks (32 bytes) per iteration
 	// using the algorithm described in:
 	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
 	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 	// And as modified for VMSL as described in
 	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
 	// O'Farrell et al, CASCON 2017, p48-55
 	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
 	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
 	VZERO V0                // c
 	// load EX0, EX1 and EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2        // c
 	// setup r
 	VL    (R4), T_0
 	MOVD  $·keyMask<>(SB), R6
 	VL    (R6), T_1
 	VN    T_0, T_1, T_0
 	VZERO T_2                 // limbs for r
 	VZERO T_3
 	VZERO T_4
 	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
 	// T_2, T_3, T_4: [0, r]
 	// setup r*20
 	VLEIG $0, $0, T_0
 	VLEIG $1, $20, T_0       // T_0: [0, 20]
 	VZERO T_5
 	VZERO T_6
 	VMSLG T_0, T_3, T_5, T_5
 	VMSLG T_0, T_4, T_6, T_6
 	// store r for final block in GR
 	VLGVG $1, T_2, RSAVE_0  // c
 	VLGVG $1, T_3, RSAVE_1  // c
 	VLGVG $1, T_4, RSAVE_2  // c
 	VLGVG $1, T_5, R5SAVE_1 // c
 	VLGVG $1, T_6, R5SAVE_2 // c
 	// initialize h
 	VZERO H0_0
 	VZERO H1_0
 	VZERO H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	// initialize pointer for reduce constants
 	MOVD $·reduce<>(SB), R12
 	// calculate r**2 and 20*(r**2)
 	VZERO R_0
 	VZERO R_1
 	VZERO R_2
 	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
 	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
 	VZERO R5_1
 	VZERO R5_2
 	VMSLG T_0, R_1, R5_1, R5_1
 	VMSLG T_0, R_2, R5_2, R5_2
 	// skip r**4 calculation if 3 blocks or less
 	CMPBLE R3, $48, b4
 	// calculate r**4 and 20*(r**4)
 	VZERO T_8
 	VZERO T_9
 	VZERO T_10
 	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
 	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
 	VZERO T_2
 	VZERO T_3
 	VMSLG T_0, T_9, T_2, T_2
 	VMSLG T_0, T_10, T_3, T_3
 	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
 	VSLDB $8, T_8, T_8, T_8
 	VSLDB $8, T_9, T_9, T_9
 	VSLDB $8, T_10, T_10, T_10
 	VSLDB $8, T_2, T_2, T_2
 	VSLDB $8, T_3, T_3, T_3
 	VO T_8, R_0, R_0
 	VO T_9, R_1, R_1
 	VO T_10, R_2, R_2
 	VO T_2, R5_1, R5_1
 	VO T_3, R5_2, R5_2
 	CMPBLE R3, $80, load // less than or equal to 5 blocks in message
 	// 6(or 5+1) blocks
 	// R3 is lowered to mlen-81 for use as the VLL length operand of the
 	// sixth block, then raised by one again so that it counts the bytes
 	// left after the first 80.
 	SUB    $81, R3
 	VLM    (R2), M0, M4
 	VLL    R3, 80(R2), M5
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	// Only when fewer than 16 bytes were left for the sixth block: insert
 	// a 1 byte (R0) at index R3 of M5, i.e. right after the message bytes.
 	CMPBGE R3, $16, 2(PC)
 	VLVGB  R3, R0, M5
 	MOVD   $96(R2), R2
 	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	// presumably the 2**128 padding bit of each full block, in limb form
 	// (TODO confirm)
 	VLEIB  $2, $1, H2_0
 	VLEIB  $2, $1, H2_1
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	VZERO  M0
 	VZERO  M1
 	VZERO  M2
 	VZERO  M3
 	VZERO  T_4
 	VZERO  T_10
 	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
 	VLR    T_4, M4
 	VLEIB  $10, $1, M2
 	// the sixth block gets the same byte set only when it was a full block
 	CMPBLT R3, $16, 2(PC)
 	VLEIB  $10, $1, T_10
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	SUB    $16, R3
 	// nothing left after the sixth block: finish via square; otherwise
 	// fall through into load/loop
 	CMPBLE R3, $0, square
 // load replaces the EX0-EX2 permute masks with the ·c<> table. loop then
 // consumes 64 bytes (four blocks) per iteration for as long as more than 64
 // bytes remain in R3; with 64 or fewer left it branches to add. If the loop
 // exits with R3 == 0, the two accumulator halves are reduced, summed and
 // control falls through to the code that follows this section.
 load:
 	// load EX0, EX1 and EX2
 	MOVD $·c<>(SB), R5
 	VLM  (R5), EX0, EX2
 loop:
 	CMPBLE R3, $64, add // last 4 or fewer blocks left (add falls through to b4)
 	// next 4 full blocks
 	VLM  (R2), M2, M5
 	SUB  $64, R3
 	MOVD $64(R2), R2
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
 	// expacc in-lined to create [m2, m3] limbs
 	VGBM   $0x3f3f, T_0     // 44 bit clear mask
 	VGBM   $0x1f1f, T_1     // 40 bit clear mask
 	VPERM  M2, M3, EX0, T_3
 	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
 	VPERM  M2, M3, EX1, T_4
 	VPERM  M2, M3, EX2, T_5
 	VN     T_0, T_3, T_3
 	VESRLG $4, T_4, T_4
 	VN     T_1, T_5, T_5
 	VN     T_0, T_4, T_4
 	VMRHG  H0_1, T_3, H0_0
 	VMRHG  H1_1, T_4, H1_0
 	VMRHG  H2_1, T_5, H2_0
 	VMRLG  H0_1, T_3, H0_1
 	VMRLG  H1_1, T_4, H1_1
 	VMRLG  H2_1, T_5, H2_1
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	// same expansion for [m4, m5]
 	VPERM  M4, M5, EX0, T_3
 	VPERM  M4, M5, EX1, T_4
 	VPERM  M4, M5, EX2, T_5
 	VN     T_0, T_3, T_3
 	VESRLG $4, T_4, T_4
 	VN     T_1, T_5, T_5
 	VN     T_0, T_4, T_4
 	VMRHG  V0, T_3, M0
 	VMRHG  V0, T_4, M1
 	VMRHG  V0, T_5, M2
 	VMRLG  V0, T_3, M3
 	VMRLG  V0, T_4, M4
 	VMRLG  V0, T_5, M5
 	VLEIB  $10, $1, M2
 	VLEIB  $10, $1, M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	CMPBNE R3, $0, loop
 	// R3 == 0: the whole message has been consumed
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	// load EX0, EX1, EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2
 	// sum vectors
 	VAQ H0_0, H0_1, H0_0
 	VAQ H1_0, H1_1, H1_0
 	VAQ H2_0, H2_1, H2_0
 	// h may be >= 2*(2**130-5) so we need to reduce it again
 	// M0...M4 are used as temps here
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 // next is the common exit path: it carries h1 into h2, packs the limbs with
 // PACK, applies MOD, adds the 16 bytes found at 16(R4) (the second half of
 // the key) and stores the 16-byte result through R1 (out).
 next:  // carry h1->h2
 	VLEIB  $7, $0x28, T_1
 	VREPIB $4, T_2
 	VGBM   $0x003F, T_3
 	VESRLG $4, T_3
 	// byte shift
 	VSRLB T_1, H1_0, T_4
 	// bit shift
 	VSRL T_2, T_4, T_4
 	// clear h1 carry bits
 	VN T_3, H1_0, H1_0
 	// add carry
 	VAQ T_4, H2_0, H2_0
 	// h is now < 2*(2**130-5)
 	// pack h into h1 (hi) and h0 (lo)
 	PACK(H0_0, H1_0, H2_0)
 	// if h > 2**130-5 then h -= 2**130-5
 	MOD(H0_0, H1_0, T_0, T_1, T_2)
 	// h += s
 	MOVD  $·bswapMask<>(SB), R5
 	VL    (R5), T_1
 	VL    16(R4), T_0
 	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
 	VAQ   T_0, H0_0, H0_0
 	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
 	VST   H0_0, (R1)
 	RET
 // add is entered from loop once 64 or fewer bytes remain. It restores the
 // ·constants<> permute masks, reduces the accumulators and splits them back
 // into separate registers.
 add:
 	// load EX0, EX1, EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	// The branch target is the label on the next line, so execution
 	// continues at b4 whether or not the branch is taken.
 	CMPBLE R3, $64, b4
 // b4 handles more than 48 remaining bytes (48 or fewer go to b3): three full
 // blocks are loaded together with a fourth block that may be short.
 b4:
 	CMPBLE R3, $48, b3 // 3 blocks or less
 	// 4(3+1) blocks remaining
 	// R3 is lowered to (remaining-49) for use as the VLL length operand of
 	// the fourth block, then raised by one so it counts that block's bytes.
 	SUB    $49, R3
 	VLM    (R2), M0, M2
 	VLL    R3, 48(R2), M3
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	// short fourth block only: insert a 1 byte (R0) at index R3 of M3
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M3
 	MOVD   $64(R2), R2
 	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	VZERO  M0
 	VZERO  M1
 	VZERO  M4
 	VZERO  M5
 	VZERO  T_4
 	VZERO  T_10
 	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
 	VLR    T_4, M2
 	VLEIB  $10, $1, M4
 	// the fourth block gets the same byte set only when it was a full block
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_10
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	SUB    $16, R3
 	CMPBLE R3, $0, square // this condition must always hold true!
 // b3 handles more than 32 remaining bytes (32 or fewer go to b2): two full
 // blocks plus a third that may be short. The accumulator is first multiplied
 // by [r**2, r]; m0 and m1 are then added one at a time and m2 is folded in
 // with a final multiply before branching to next.
 b3:
 	CMPBLE R3, $32, b2
 	// 3 blocks remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
 	// load m0, m1 (full) and m2 (possibly short); R3 ends up holding the
 	// byte count of m2
 	SUB    $33, R3
 	VLM    (R2), M0, M1
 	VLL    R3, 32(R2), M2
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	// short third block only: insert a 1 byte (R0) at index R3 of M2
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M2
 	// H += m0
 	VZERO T_1
 	VZERO T_2
 	VZERO T_3
 	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
 	VLEIB $10, $1, T_3
 	VAG   H0_0, T_1, H0_0
 	VAG   H1_0, T_2, H1_0
 	VAG   H2_0, T_3, H2_0
 	VZERO M0
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	VZERO T_10
 	// (H+m0)*r
 	// V0 is passed as the m5_1 argument here and is therefore written by
 	// MULTIPLY; it is zeroed again right after the REDUCE2 below.
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
 	// H += m1
 	VZERO V0
 	VZERO T_1
 	VZERO T_2
 	VZERO T_3
 	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
 	VLEIB $10, $1, T_3
 	VAQ   H0_0, T_1, H0_0
 	VAQ   H1_0, T_2, H1_0
 	VAQ   H2_0, T_3, H2_0
 	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
 	// [H, m2] * [r**2, r]
 	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, H2_0
 	VZERO  M0
 	VZERO  M1
 	VZERO  M2
 	VZERO  M3
 	VZERO  M4
 	VZERO  M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
 	SUB    $16, R3
 	CMPBLE R3, $0, next   // this condition must always hold true!
 // b2 handles more than 16 remaining bytes (16 or fewer go to b1): one full
 // block plus a second that may be short. The accumulator is multiplied by
 // [r**2, r], both blocks are expanded and added, and the sum is multiplied
 // once more before branching to next.
 b2:
 	CMPBLE R3, $16, b1
 	// 2 blocks remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG V0, H0_1, H0_0
 	VMRHG V0, H1_1, H1_0
 	VMRHG V0, H2_1, H2_0
 	VMRLG V0, H0_1, H0_1
 	VMRLG V0, H1_1, H1_1
 	VMRLG V0, H2_1, H2_1
 	// move h to the left and 0s at the right
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	// get message blocks and append 1 to start
 	// (R3 ends up holding the byte count of the second block)
 	SUB    $17, R3
 	VL     (R2), M0
 	VLL    R3, 16(R2), M1
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	// short second block only: insert a 1 byte (R0) at index R3 of M1
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M1
 	VZERO  T_6
 	VZERO  T_7
 	VZERO  T_8
 	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
 	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
 	VLEIB  $2, $1, T_8
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_8
 	// add [m0, m1] to h
 	VAG H0_0, T_6, H0_0
 	VAG H1_0, T_7, H1_0
 	VAG H2_0, T_8, H2_0
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	VZERO T_10
 	VZERO M0
 	// at this point R_0 .. R5_2 look like [r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
 	SUB    $16, R3, R3
 	CMPBLE R3, $0, next
 // b1 handles a single remaining block of 1 to 16 bytes (nothing remaining
 // goes straight to next). The accumulator is multiplied by [r**2, r], the
 // block is expanded and added, and the sum is multiplied once more before
 // branching to next.
 b1:
 	CMPBLE R3, $0, next
 	// 1 block remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	// set up [0, m0] limbs
 	// (R3-1 is the VLL length operand; R3 is restored afterwards)
 	SUB    $1, R3
 	VLL    R3, (R2), M0
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	// short block only: insert a 1 byte (R0) at index R3 of M0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M0
 	VZERO  T_1
 	VZERO  T_2
 	VZERO  T_3
 	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_3
 	// h+m0
 	VAQ H0_0, T_1, H0_0
 	VAQ H1_0, T_2, H1_0
 	VAQ H2_0, T_3, H2_0
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	BR next
 // square is reached when no message bytes remain after a 6-block or 4-block
 // step. It combines the two accumulator halves into one register set,
 // multiplies them by [r**2, r], reduces, and branches to next.
 square:
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// (h0*r**2) + (h1*r)
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	BR next
--- a/vendor/golang.org/x/crypto/ssh/certs.go
+++ b/vendor/golang.org/x/crypto/ssh/certs.go
@ -22,7 +22,9 @@ const (
 	CertAlgoECDSA256v01   = "ecdsa-sha2-nistp256-cert-v01@openssh.com"
 	CertAlgoECDSA384v01   = "ecdsa-sha2-nistp384-cert-v01@openssh.com"
 	CertAlgoECDSA521v01   = "ecdsa-sha2-nistp521-cert-v01@openssh.com"
 	CertAlgoSKECDSA256v01 = "sk-ecdsa-sha2-nistp256-cert-v01@openssh.com"
 	CertAlgoED25519v01    = "ssh-ed25519-cert-v01@openssh.com"
 	CertAlgoSKED25519v01  = "sk-ssh-ed25519-cert-v01@openssh.com"
 )
 // Certificate types distinguish between host and user
@ -37,6 +39,7 @@ const (
 type Signature struct {
 	Format string
 	Blob   []byte
 	Rest   []byte `ssh:"rest"`
 }
 // CertTimeInfinity can be used for OpenSSHCertV01.ValidBefore to indicate that
@ -411,8 +414,8 @@ func (c *CertChecker) CheckCert(principal string, cert *Certificate) error {
 	return nil
 }
-// SignCert sets c.SignatureKey to the authority's public key and stores a
+// SignCert signs the certificate with an authority, setting the Nonce,
-// Signature, by authority, in the certificate.
+// SignatureKey, and Signature fields.
 func (c *Certificate) SignCert(rand io.Reader, authority Signer) error {
 	c.Nonce = make([]byte, 32)
 	if _, err := io.ReadFull(rand, c.Nonce); err != nil {
@ -434,7 +437,9 @@ var certAlgoNames = map[string]string{
 	KeyAlgoECDSA256:   CertAlgoECDSA256v01,
 	KeyAlgoECDSA384:   CertAlgoECDSA384v01,
 	KeyAlgoECDSA521:   CertAlgoECDSA521v01,
 	KeyAlgoSKECDSA256: CertAlgoSKECDSA256v01,
 	KeyAlgoED25519:    CertAlgoED25519v01,
 	KeyAlgoSKED25519:  CertAlgoSKED25519v01,
 }
 // certToPrivAlgo returns the underlying algorithm for a certificate algorithm.
@ -518,6 +523,12 @@ func parseSignatureBody(in []byte) (out *Signature, rest []byte, ok bool) {
 		return
 	}
 	switch out.Format {
 	case KeyAlgoSKECDSA256, CertAlgoSKECDSA256v01, KeyAlgoSKED25519, CertAlgoSKED25519v01:
 		out.Rest = in
 		return out, nil, ok
 	}
 	return out, in, ok
 }
--- a/vendor/golang.org/x/crypto/ssh/cipher.go
+++ b/vendor/golang.org/x/crypto/ssh/cipher.go
@ -16,9 +16,8 @@ import (
 	"hash"
 	"io"
 	"io/ioutil"
 	"math/bits"
-	"golang.org/x/crypto/internal/chacha20"
+	"golang.org/x/crypto/chacha20"
 	"golang.org/x/crypto/poly1305"
 )
@ -120,7 +119,7 @@ var cipherModes = map[string]*cipherMode{
 	chacha20Poly1305ID: {64, 0, newChaCha20Cipher},
 	// CBC mode is insecure and so is not included in the default config.
-	// (See http://www.isg.rhul.ac.uk/~kp/SandPfinal.pdf). If absolutely
+	// (See https://www.ieee-security.org/TC/SP2013/papers/4977a526.pdf). If absolutely
 	// needed, it's possible to specify a custom Config to enable it.
 	// You should expect that an active attacker can recover plaintext if
 	// you do.
@ -642,8 +641,8 @@ const chacha20Poly1305ID = "chacha20-poly1305@openssh.com"
 // the methods here also implement padding, which RFC4253 Section 6
 // also requires of stream ciphers.
 type chacha20Poly1305Cipher struct {
-	lengthKey  [8]uint32
+	lengthKey  [32]byte
-	contentKey [8]uint32
+	contentKey [32]byte
 	buf        []byte
 }
@ -656,21 +655,21 @@ func newChaCha20Cipher(key, unusedIV, unusedMACKey []byte, unusedAlgs directionA
 		buf: make([]byte, 256),
 	}
-	for i := range c.contentKey {
+	copy(c.contentKey[:], key[:32])
-		c.contentKey[i] = binary.LittleEndian.Uint32(key[i*4 : (i+1)*4])
+	copy(c.lengthKey[:], key[32:])
 	}
 	for i := range c.lengthKey {
 		c.lengthKey[i] = binary.LittleEndian.Uint32(key[(i+8)*4 : (i+9)*4])
 	}
 	return c, nil
 }
 func (c *chacha20Poly1305Cipher) readCipherPacket(seqNum uint32, r io.Reader) ([]byte, error) {
-	nonce := [3]uint32{0, 0, bits.ReverseBytes32(seqNum)}
+	nonce := make([]byte, 12)
-	s := chacha20.New(c.contentKey, nonce)
+	binary.BigEndian.PutUint32(nonce[8:], seqNum)
-	var polyKey [32]byte
+	s, err := chacha20.NewUnauthenticatedCipher(c.contentKey[:], nonce)
 	if err != nil {
 		return nil, err
 	}
 	var polyKey, discardBuf [32]byte
 	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.Advance() // skip next 32 bytes
+	s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
 	encryptedLength := c.buf[:4]
 	if _, err := io.ReadFull(r, encryptedLength); err != nil {
@ -678,7 +677,11 @@ func (c *chacha20Poly1305Cipher) readCipherPacket(seqNum uint32, r io.Reader) ([
 	}
 	var lenBytes [4]byte
-	chacha20.New(c.lengthKey, nonce).XORKeyStream(lenBytes[:], encryptedLength)
+	ls, err := chacha20.NewUnauthenticatedCipher(c.lengthKey[:], nonce)
 	if err != nil {
 		return nil, err
 	}
 	ls.XORKeyStream(lenBytes[:], encryptedLength)
 	length := binary.BigEndian.Uint32(lenBytes[:])
 	if length > maxPacket {
@ -724,11 +727,15 @@ func (c *chacha20Poly1305Cipher) readCipherPacket(seqNum uint32, r io.Reader) ([
 }
 func (c *chacha20Poly1305Cipher) writeCipherPacket(seqNum uint32, w io.Writer, rand io.Reader, payload []byte) error {
-	nonce := [3]uint32{0, 0, bits.ReverseBytes32(seqNum)}
+	nonce := make([]byte, 12)
-	s := chacha20.New(c.contentKey, nonce)
+	binary.BigEndian.PutUint32(nonce[8:], seqNum)
-	var polyKey [32]byte
+	s, err := chacha20.NewUnauthenticatedCipher(c.contentKey[:], nonce)
 	if err != nil {
 		return err
 	}
 	var polyKey, discardBuf [32]byte
 	s.XORKeyStream(polyKey[:], polyKey[:])
-	s.Advance() // skip next 32 bytes
+	s.XORKeyStream(discardBuf[:], discardBuf[:]) // skip the next 32 bytes
 	// There is no blocksize, so fall back to multiple of 8 byte
 	// padding, as described in RFC 4253, Sec 6.
@ -748,7 +755,11 @@ func (c *chacha20Poly1305Cipher) writeCipherPacket(seqNum uint32, w io.Writer, r
 	}
 	binary.BigEndian.PutUint32(c.buf, uint32(1+len(payload)+padding))
-	chacha20.New(c.lengthKey, nonce).XORKeyStream(c.buf, c.buf[:4])
+	ls, err := chacha20.NewUnauthenticatedCipher(c.lengthKey[:], nonce)
 	if err != nil {
 		return err
 	}
 	ls.XORKeyStream(c.buf, c.buf[:4])
 	c.buf[4] = byte(padding)
 	copy(c.buf[5:], payload)
 	packetEnd := 5 + len(payload) + padding
--- a/vendor/golang.org/x/crypto/ssh/internal/bcrypt_pbkdf/bcrypt_pbkdf.go
+++ b/vendor/golang.org/x/crypto/ssh/internal/bcrypt_pbkdf/bcrypt_pbkdf.go
@ -0,0 +1,93 @@
 // Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package bcrypt_pbkdf implements bcrypt_pbkdf(3) from OpenBSD.
 //
 // See https://flak.tedunangst.com/post/bcrypt-pbkdf and
 // https://cvsweb.openbsd.org/cgi-bin/cvsweb/src/lib/libutil/bcrypt_pbkdf.c.
 package bcrypt_pbkdf
 import (
 	"crypto/sha512"
 	"errors"
 	"golang.org/x/crypto/blowfish"
 )
 const blockSize = 32
 // Key derives a key of keyLen bytes from password and salt using the
 // bcrypt_pbkdf construction with the given number of rounds.
 //
 // It returns an error when rounds is less than 1, password is empty, salt is
 // empty or longer than 1<<20 bytes, or keyLen is negative or greater than
 // 1024. A keyLen of zero yields an empty key.
 func Key(password, salt []byte, rounds, keyLen int) ([]byte, error) {
 	if rounds < 1 {
 		return nil, errors.New("bcrypt_pbkdf: number of rounds is too small")
 	}
 	if len(password) == 0 {
 		return nil, errors.New("bcrypt_pbkdf: empty password")
 	}
 	if len(salt) == 0 || len(salt) > 1<<20 {
 		return nil, errors.New("bcrypt_pbkdf: bad salt length")
 	}
 	// A negative keyLen would otherwise reach make or the final slice
 	// expression below and panic at run time instead of reporting an error.
 	if keyLen < 0 {
 		return nil, errors.New("bcrypt_pbkdf: keyLen is negative")
 	}
 	if keyLen > 1024 {
 		return nil, errors.New("bcrypt_pbkdf: keyLen is too large")
 	}
 	// Output is produced in whole blocks of blockSize bytes and truncated
 	// to keyLen at the end.
 	numBlocks := (keyLen + blockSize - 1) / blockSize
 	key := make([]byte, numBlocks*blockSize)
 	// shapass is the SHA-512 digest of the password; it keys every
 	// bcryptHash call below.
 	h := sha512.New()
 	h.Write(password)
 	shapass := h.Sum(nil)
 	// shasalt is an empty buffer with room for one digest, so that
 	// h.Sum(shasalt) can append into it without allocating each time.
 	shasalt := make([]byte, 0, sha512.Size)
 	cnt, tmp := make([]byte, 4), make([]byte, blockSize)
 	for block := 1; block <= numBlocks; block++ {
 		// First round: hash salt followed by the big-endian block counter.
 		h.Reset()
 		h.Write(salt)
 		cnt[0] = byte(block >> 24)
 		cnt[1] = byte(block >> 16)
 		cnt[2] = byte(block >> 8)
 		cnt[3] = byte(block)
 		h.Write(cnt)
 		bcryptHash(tmp, shapass, h.Sum(shasalt))
 		out := make([]byte, blockSize)
 		copy(out, tmp)
 		// Remaining rounds: feed the previous output back in and XOR each
 		// result into the accumulated block.
 		for i := 2; i <= rounds; i++ {
 			h.Reset()
 			h.Write(tmp)
 			bcryptHash(tmp, shapass, h.Sum(shasalt))
 			for j := 0; j < len(out); j++ {
 				out[j] ^= tmp[j]
 			}
 		}
 		// Interleave the blocks: byte i of this block lands at index
 		// i*numBlocks + (block-1) of the output.
 		for i, v := range out {
 			key[i*numBlocks+(block-1)] = v
 		}
 	}
 	return key[:keyLen], nil
 }
 var magic = []byte("OxychromaticBlowfishSwatDynamite")
 // bcryptHash writes a 32-byte hash into out. It builds a Blowfish cipher from
 // shapass and shasalt, repeatedly expands its key with both, encrypts the
 // magic string with it, and finally byte-swaps each 4-byte word of the result.
 // It panics if the cipher cannot be constructed.
 func bcryptHash(out, shapass, shasalt []byte) {
 	bf, err := blowfish.NewSaltedCipher(shapass, shasalt)
 	if err != nil {
 		panic(err)
 	}
 	// Run the key expansion 64 times, alternating salt and password.
 	for round := 0; round < 64; round++ {
 		blowfish.ExpandKey(shasalt, bf)
 		blowfish.ExpandKey(shapass, bf)
 	}
 	copy(out, magic)
 	// Encrypt each 8-byte block of the magic string 64 times in place.
 	for off := 0; off < 32; off += 8 {
 		blk := out[off : off+8]
 		for pass := 0; pass < 64; pass++ {
 			bf.Encrypt(blk, blk)
 		}
 	}
 	// Swap bytes due to different endianness: reverse every 4-byte word.
 	for off := 0; off < 32; off += 4 {
 		out[off], out[off+1], out[off+2], out[off+3] = out[off+3], out[off+2], out[off+1], out[off]
 	}
 }
--- a/vendor/golang.org/x/crypto/ssh/kex.go
+++ b/vendor/golang.org/x/crypto/ssh/kex.go
@ -212,7 +212,7 @@ func (group *dhGroup) Server(c packetConn, randSource io.Reader, magics *handsha
 		HostKey:   hostKeyBytes,
 		Signature: sig,
 		Hash:      crypto.SHA1,
-	}, nil
+	}, err
 }
 // ecdh performs Elliptic Curve Diffie-Hellman key exchange as
@ -572,7 +572,7 @@ func (gex *dhGEXSHA) diffieHellman(theirPublic, myPrivate *big.Int) (*big.Int, e
 	return new(big.Int).Exp(theirPublic, myPrivate, gex.p), nil
 }
-func (gex *dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshakeMagics) (*kexResult, error) {
+func (gex dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshakeMagics) (*kexResult, error) {
 	// Send GexRequest
 	kexDHGexRequest := kexDHGexRequestMsg{
 		MinBits:      dhGroupExchangeMinimumBits,
@ -677,7 +677,7 @@ func (gex *dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshak
 // Server half implementation of the Diffie Hellman Key Exchange with SHA1 and SHA256.
 //
 // This is a minimal implementation to satisfy the automated tests.
-func (gex *dhGEXSHA) Server(c packetConn, randSource io.Reader, magics *handshakeMagics, priv Signer) (result *kexResult, err error) {
+func (gex dhGEXSHA) Server(c packetConn, randSource io.Reader, magics *handshakeMagics, priv Signer) (result *kexResult, err error) {
 	// Receive GexRequest
 	packet, err := c.readPacket()
 	if err != nil {
--- a/vendor/golang.org/x/crypto/ssh/keys.go
+++ b/vendor/golang.org/x/crypto/ssh/keys.go
@ -7,6 +7,8 @@ package ssh
 import (
 	"bytes"
 	"crypto"
 	"crypto/aes"
 	"crypto/cipher"
 	"crypto/dsa"
 	"crypto/ecdsa"
 	"crypto/elliptic"
@ -25,6 +27,7 @@ import (
 	"strings"
 	"golang.org/x/crypto/ed25519"
 	"golang.org/x/crypto/ssh/internal/bcrypt_pbkdf"
 )
 // These constants represent the algorithm names for key types supported by this
@ -33,9 +36,11 @@ const (
 	KeyAlgoRSA        = "ssh-rsa"
 	KeyAlgoDSA        = "ssh-dss"
 	KeyAlgoECDSA256   = "ecdsa-sha2-nistp256"
 	KeyAlgoSKECDSA256 = "sk-ecdsa-sha2-nistp256@openssh.com"
 	KeyAlgoECDSA384   = "ecdsa-sha2-nistp384"
 	KeyAlgoECDSA521   = "ecdsa-sha2-nistp521"
 	KeyAlgoED25519    = "ssh-ed25519"
 	KeyAlgoSKED25519  = "sk-ssh-ed25519@openssh.com"
 )
 // These constants represent non-default signature algorithms that are supported
@ -58,9 +63,13 @@ func parsePubKey(in []byte, algo string) (pubKey PublicKey, rest []byte, err err
 		return parseDSA(in)
 	case KeyAlgoECDSA256, KeyAlgoECDSA384, KeyAlgoECDSA521:
 		return parseECDSA(in)
 	case KeyAlgoSKECDSA256:
 		return parseSKECDSA(in)
 	case KeyAlgoED25519:
 		return parseED25519(in)
-	case CertAlgoRSAv01, CertAlgoDSAv01, CertAlgoECDSA256v01, CertAlgoECDSA384v01, CertAlgoECDSA521v01, CertAlgoED25519v01:
+	case KeyAlgoSKED25519:
 		return parseSKEd25519(in)
 	case CertAlgoRSAv01, CertAlgoDSAv01, CertAlgoECDSA256v01, CertAlgoECDSA384v01, CertAlgoECDSA521v01, CertAlgoSKECDSA256v01, CertAlgoED25519v01, CertAlgoSKED25519v01:
 		cert, err := parseCert(in, certToPrivAlgo(algo))
 		if err != nil {
 			return nil, nil, err
@ -553,9 +562,11 @@ func parseED25519(in []byte) (out PublicKey, rest []byte, err error) {
 		return nil, nil, err
 	}
-	key := ed25519.PublicKey(w.KeyBytes)
+	if l := len(w.KeyBytes); l != ed25519.PublicKeySize {
 		return nil, nil, fmt.Errorf("invalid size %d for Ed25519 public key", l)
 	}
-	return (ed25519PublicKey)(key), w.Rest, nil
+	return ed25519PublicKey(w.KeyBytes), w.Rest, nil
 }
 func (k ed25519PublicKey) Marshal() []byte {
@ -573,9 +584,11 @@ func (k ed25519PublicKey) Verify(b []byte, sig *Signature) error {
 	if sig.Format != k.Type() {
 		return fmt.Errorf("ssh: signature type %s for key type %s", sig.Format, k.Type())
 	}
 	if l := len(k); l != ed25519.PublicKeySize {
 		return fmt.Errorf("ssh: invalid size %d for Ed25519 public key", l)
 	}
-	edKey := (ed25519.PublicKey)(k)
+	if ok := ed25519.Verify(ed25519.PublicKey(k), b, sig.Blob); !ok {
 	if ok := ed25519.Verify(edKey, b, sig.Blob); !ok {
 		return errors.New("ssh: signature did not verify")
 	}
@ -685,6 +698,224 @@ func (k *ecdsaPublicKey) CryptoPublicKey() crypto.PublicKey {
 	return (*ecdsa.PublicKey)(k)
 }
 // skFields holds the additional fields present in U2F/FIDO2 signatures.
 // See openssh/PROTOCOL.u2f 'SSH U2F Signatures' for details.
 //
 // The security-key public key Verify methods decode it from Signature.Rest
 // and include both fields in the message whose signature is checked.
 type skFields struct {
 	// Flags contains U2F/FIDO2 flags such as 'user present'
 	Flags byte
 	// Counter is a monotonic signature counter which can be
 	// used to detect concurrent use of a private key, should
 	// it be extracted from hardware.
 	Counter uint32
 }
 // skECDSAPublicKey is the public key for the KeyAlgoSKECDSA256 algorithm. It
 // embeds an ecdsa.PublicKey and adds the application string, whose digest is
 // part of every message checked by Verify.
 type skECDSAPublicKey struct {
 	// application is a URL-like string, typically "ssh:" for SSH.
 	// see openssh/PROTOCOL.u2f for details.
 	application string
 	ecdsa.PublicKey
 }
 // Type returns the algorithm name of the key, KeyAlgoSKECDSA256.
 func (k *skECDSAPublicKey) Type() string {
 	return KeyAlgoSKECDSA256
 }
 // nistID returns the curve identifier written by Marshal. It is always
 // "nistp256", the only curve parseSKECDSA accepts.
 func (k *skECDSAPublicKey) nistID() string {
 	return "nistp256"
 }
 // parseSKECDSA decodes a security-key ECDSA public key from in. The encoding
 // carries a curve name, the encoded curve point and the application string;
 // whatever follows is returned as rest. Only the "nistp256" curve is accepted,
 // and the point must unmarshal onto P-256.
 func parseSKECDSA(in []byte) (out PublicKey, rest []byte, err error) {
 	var wire struct {
 		Curve       string
 		KeyBytes    []byte
 		Application string
 		Rest        []byte `ssh:"rest"`
 	}
 	if err := Unmarshal(in, &wire); err != nil {
 		return nil, nil, err
 	}
 	if wire.Curve != "nistp256" {
 		return nil, nil, errors.New("ssh: unsupported curve")
 	}
 	p256 := elliptic.P256()
 	x, y := elliptic.Unmarshal(p256, wire.KeyBytes)
 	if x == nil || y == nil {
 		return nil, nil, errors.New("ssh: invalid curve point")
 	}
 	parsed := &skECDSAPublicKey{
 		application: wire.Application,
 		PublicKey:   ecdsa.PublicKey{Curve: p256, X: x, Y: y},
 	}
 	return parsed, wire.Rest, nil
 }
 // Marshal serializes the key as the algorithm name, the curve identifier,
 // the point encoded by elliptic.Marshal, and the application string.
 func (k *skECDSAPublicKey) Marshal() []byte {
 	// See RFC 5656, section 3.1.
 	wire := struct {
 		Name        string
 		ID          string
 		Key         []byte
 		Application string
 	}{
 		Name:        k.Type(),
 		ID:          k.nistID(),
 		Key:         elliptic.Marshal(k.Curve, k.X, k.Y),
 		Application: k.application,
 	}
 	return Marshal(&wire)
 }
 // Verify reports whether sig is a valid security-key ECDSA signature over
 // data. The ECDSA signature in sig.Blob is checked against the digest of a
 // marshaled message made of the application digest, the flags and counter
 // decoded from sig.Rest, and the digest of data. It returns nil on success and
 // an error if the format does not match, decoding fails, or the signature does
 // not verify.
 func (k *skECDSAPublicKey) Verify(data []byte, sig *Signature) error {
 	if keyType := k.Type(); sig.Format != keyType {
 		return fmt.Errorf("ssh: signature type %s for key type %s", sig.Format, keyType)
 	}
 	// All three digests use the hash selected for the key's curve.
 	hashFn := ecHash(k.Curve)
 	sum := func(msg []byte) []byte {
 		h := hashFn.New()
 		h.Write(msg)
 		return h.Sum(nil)
 	}
 	appDigest := sum([]byte(k.application))
 	dataDigest := sum(data)
 	var rs struct {
 		R *big.Int
 		S *big.Int
 	}
 	if err := Unmarshal(sig.Blob, &rs); err != nil {
 		return err
 	}
 	var extra skFields
 	if err := Unmarshal(sig.Rest, &extra); err != nil {
 		return err
 	}
 	signed := struct {
 		ApplicationDigest []byte `ssh:"rest"`
 		Flags             byte
 		Counter           uint32
 		MessageDigest     []byte `ssh:"rest"`
 	}{
 		ApplicationDigest: appDigest,
 		Flags:             extra.Flags,
 		Counter:           extra.Counter,
 		MessageDigest:     dataDigest,
 	}
 	digest := sum(Marshal(signed))
 	if !ecdsa.Verify(&k.PublicKey, digest, rs.R, rs.S) {
 		return errors.New("ssh: signature did not verify")
 	}
 	return nil
 }
 // skEd25519PublicKey is the public key for the KeyAlgoSKED25519 algorithm. It
 // embeds an ed25519.PublicKey and adds the application string, whose digest
 // is part of every message checked by Verify.
 type skEd25519PublicKey struct {
 	// application is a URL-like string, typically "ssh:" for SSH.
 	// see openssh/PROTOCOL.u2f for details.
 	application string
 	ed25519.PublicKey
 }
 // Type returns the algorithm name of the key, KeyAlgoSKED25519.
 func (k *skEd25519PublicKey) Type() string {
 	return KeyAlgoSKED25519
 }
 // parseSKEd25519 decodes a security-key Ed25519 public key from in. The
 // encoding carries the key bytes and the application string; whatever follows
 // is returned as rest. The key bytes must be exactly ed25519.PublicKeySize
 // long.
 func parseSKEd25519(in []byte) (out PublicKey, rest []byte, err error) {
 	var wire struct {
 		KeyBytes    []byte
 		Application string
 		Rest        []byte `ssh:"rest"`
 	}
 	if err := Unmarshal(in, &wire); err != nil {
 		return nil, nil, err
 	}
 	if size := len(wire.KeyBytes); size != ed25519.PublicKeySize {
 		return nil, nil, fmt.Errorf("invalid size %d for Ed25519 public key", size)
 	}
 	parsed := &skEd25519PublicKey{
 		application: wire.Application,
 		PublicKey:   ed25519.PublicKey(wire.KeyBytes),
 	}
 	return parsed, wire.Rest, nil
 }
 // Marshal serializes the key as the algorithm name, the raw public key bytes,
 // and the application string.
 func (k *skEd25519PublicKey) Marshal() []byte {
 	wire := struct {
 		Name        string
 		KeyBytes    []byte
 		Application string
 	}{
 		Name:        KeyAlgoSKED25519,
 		KeyBytes:    []byte(k.PublicKey),
 		Application: k.application,
 	}
 	return Marshal(&wire)
 }
 // Verify reports whether sig is a valid security-key Ed25519 signature over
 // data. The signature in sig.Blob is checked against a marshaled message made
 // of the SHA-256 digest of the application string, the flags and counter
 // decoded from sig.Rest, and the SHA-256 digest of data. It returns nil on
 // success and an error if the format or key size is wrong, decoding fails, or
 // the signature does not verify.
 func (k *skEd25519PublicKey) Verify(data []byte, sig *Signature) error {
 	if keyType := k.Type(); sig.Format != keyType {
 		return fmt.Errorf("ssh: signature type %s for key type %s", sig.Format, keyType)
 	}
 	if size := len(k.PublicKey); size != ed25519.PublicKeySize {
 		return fmt.Errorf("invalid size %d for Ed25519 public key", size)
 	}
 	appDigest := sha256.Sum256([]byte(k.application))
 	dataDigest := sha256.Sum256(data)
 	var rawSig struct {
 		Signature []byte `ssh:"rest"`
 	}
 	if err := Unmarshal(sig.Blob, &rawSig); err != nil {
 		return err
 	}
 	var extra skFields
 	if err := Unmarshal(sig.Rest, &extra); err != nil {
 		return err
 	}
 	signed := struct {
 		ApplicationDigest []byte `ssh:"rest"`
 		Flags             byte
 		Counter           uint32
 		MessageDigest     []byte `ssh:"rest"`
 	}{
 		ApplicationDigest: appDigest[:],
 		Flags:             extra.Flags,
 		Counter:           extra.Counter,
 		MessageDigest:     dataDigest[:],
 	}
 	if !ed25519.Verify(k.PublicKey, Marshal(signed), rawSig.Signature) {
 		return errors.New("ssh: signature did not verify")
 	}
 	return nil
 }
 // NewSignerFromKey takes an *rsa.PrivateKey, *dsa.PrivateKey,
 // *ecdsa.PrivateKey or any other crypto.Signer and returns a
 // corresponding Signer instance. ECDSA keys must use P-256, P-384 or
@ -830,14 +1061,18 @@ func NewPublicKey(key interface{}) (PublicKey, error) {
 	case *dsa.PublicKey:
 		return (*dsaPublicKey)(key), nil
 	case ed25519.PublicKey:
-		return (ed25519PublicKey)(key), nil
+		if l := len(key); l != ed25519.PublicKeySize {
 			return nil, fmt.Errorf("ssh: invalid size %d for Ed25519 public key", l)
 		}
 		return ed25519PublicKey(key), nil
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %T", key)
 	}
 }
 // ParsePrivateKey returns a Signer from a PEM encoded private key. It supports
-// the same keys as ParseRawPrivateKey.
+// the same keys as ParseRawPrivateKey. If the private key is encrypted, it
 // will return a PassphraseMissingError.
 func ParsePrivateKey(pemBytes []byte) (Signer, error) {
 	key, err := ParseRawPrivateKey(pemBytes)
 	if err != nil {
@ -850,8 +1085,8 @@ func ParsePrivateKey(pemBytes []byte) (Signer, error) {
 // ParsePrivateKeyWithPassphrase returns a Signer from a PEM encoded private
 // key and passphrase. It supports the same keys as
 // ParseRawPrivateKeyWithPassphrase.
-func ParsePrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (Signer, error) {
+func ParsePrivateKeyWithPassphrase(pemBytes, passphrase []byte) (Signer, error) {
-	key, err := ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase)
+	key, err := ParseRawPrivateKeyWithPassphrase(pemBytes, passphrase)
 	if err != nil {
 		return nil, err
 	}
@ -867,8 +1102,21 @@ func encryptedBlock(block *pem.Block) bool {
 	return strings.Contains(block.Headers["Proc-Type"], "ENCRYPTED")
 }
 // A PassphraseMissingError indicates that parsing this private key requires a
 // passphrase. Use ParsePrivateKeyWithPassphrase.
 //
 // The error interface is implemented on the pointer receiver, so values are
 // returned as *PassphraseMissingError.
 type PassphraseMissingError struct {
 	// PublicKey will be set if the private key format includes an unencrypted
 	// public key along with the encrypted private key.
 	PublicKey PublicKey
 }
 // Error implements the error interface. The message is fixed and does not
 // depend on the PublicKey field.
 func (*PassphraseMissingError) Error() string {
 	return "ssh: this private key is passphrase protected"
 }
 // ParseRawPrivateKey returns a private key from a PEM encoded private key. It
-// supports RSA (PKCS#1), PKCS#8, DSA (OpenSSL), and ECDSA private keys.
+// supports RSA (PKCS#1), PKCS#8, DSA (OpenSSL), and ECDSA private keys. If the
 // private key is encrypted, it will return a PassphraseMissingError.
 func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	block, _ := pem.Decode(pemBytes)
 	if block == nil {
@ -876,7 +1124,7 @@ func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	}
 	if encryptedBlock(block) {
-		return nil, errors.New("ssh: cannot decode encrypted private keys")
+		return nil, &PassphraseMissingError{}
 	}
 	switch block.Type {
@ -890,34 +1138,36 @@ func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	case "DSA PRIVATE KEY":
 		return ParseDSAPrivateKey(block.Bytes)
 	case "OPENSSH PRIVATE KEY":
-		return parseOpenSSHPrivateKey(block.Bytes)
+		return parseOpenSSHPrivateKey(block.Bytes, unencryptedOpenSSHKey)
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %q", block.Type)
 	}
 }
 // ParseRawPrivateKeyWithPassphrase returns a private key decrypted with
-// passphrase from a PEM encoded private key. If wrong passphrase, return
+// passphrase from a PEM encoded private key. If the passphrase is wrong, it
-// x509.IncorrectPasswordError.
+// will return x509.IncorrectPasswordError.
-func ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (interface{}, error) {
+func ParseRawPrivateKeyWithPassphrase(pemBytes, passphrase []byte) (interface{}, error) {
 	block, _ := pem.Decode(pemBytes)
 	if block == nil {
 		return nil, errors.New("ssh: no key found")
 	}
 	buf := block.Bytes
-	if encryptedBlock(block) {
+	if block.Type == "OPENSSH PRIVATE KEY" {
-		if x509.IsEncryptedPEMBlock(block) {
+		return parseOpenSSHPrivateKey(block.Bytes, passphraseProtectedOpenSSHKey(passphrase))
-			var err error
+	}
-			buf, err = x509.DecryptPEMBlock(block, passPhrase)
+
 	if !encryptedBlock(block) || !x509.IsEncryptedPEMBlock(block) {
 		return nil, errors.New("ssh: not an encrypted key")
 	}
 	buf, err := x509.DecryptPEMBlock(block, passphrase)
 	if err != nil {
 		if err == x509.IncorrectPasswordError {
 			return nil, err
 		}
 		return nil, fmt.Errorf("ssh: cannot decode encrypted private keys: %v", err)
 	}
 		}
 	}
 	switch block.Type {
 	case "RSA PRIVATE KEY":
@ -926,8 +1176,6 @@ func ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (interface{},
 		return x509.ParseECPrivateKey(buf)
 	case "DSA PRIVATE KEY":
 		return ParseDSAPrivateKey(buf)
 	case "OPENSSH PRIVATE KEY":
 		return parseOpenSSHPrivateKey(buf)
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %q", block.Type)
 	}
@ -965,9 +1213,68 @@ func ParseDSAPrivateKey(der []byte) (*dsa.PrivateKey, error) {
 	}, nil
 }
-// Implemented based on the documentation at
+func unencryptedOpenSSHKey(cipherName, kdfName, kdfOpts string, privKeyBlock []byte) ([]byte, error) {
-// https://github.com/openssh/openssh-portable/blob/master/PROTOCOL.key
+	if kdfName != "none" || cipherName != "none" {
-func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
+		return nil, &PassphraseMissingError{}
 	}
 	if kdfOpts != "" {
 		return nil, errors.New("ssh: invalid openssh private key")
 	}
 	return privKeyBlock, nil
 }
 // passphraseProtectedOpenSSHKey returns an openSSHDecryptFunc that decrypts
 // an OpenSSH private key block with passphrase. The returned function
 // requires the "bcrypt" KDF and either the "aes256-ctr" or "aes256-cbc"
 // cipher; it derives a 32-byte AES key and 16-byte IV with bcrypt_pbkdf from
 // the salt and round count in kdfOpts. Decryption happens in place, so the
 // returned slice is privKeyBlock itself.
 func passphraseProtectedOpenSSHKey(passphrase []byte) openSSHDecryptFunc {
 	return func(cipherName, kdfName, kdfOpts string, privKeyBlock []byte) ([]byte, error) {
 		switch {
 		case kdfName == "none" || cipherName == "none":
 			return nil, errors.New("ssh: key is not password protected")
 		case kdfName != "bcrypt":
 			return nil, fmt.Errorf("ssh: unknown KDF %q, only supports %q", kdfName, "bcrypt")
 		}
 		var params struct {
 			Salt   string
 			Rounds uint32
 		}
 		if err := Unmarshal([]byte(kdfOpts), &params); err != nil {
 			return nil, err
 		}
 		// One KDF call yields the AES-256 key followed by the IV.
 		const keySize, ivSize = 32, 16
 		derived, err := bcrypt_pbkdf.Key(passphrase, []byte(params.Salt), int(params.Rounds), keySize+ivSize)
 		if err != nil {
 			return nil, err
 		}
 		block, err := aes.NewCipher(derived[:keySize])
 		if err != nil {
 			return nil, err
 		}
 		iv := derived[keySize:]
 		switch cipherName {
 		case "aes256-ctr":
 			cipher.NewCTR(block, iv).XORKeyStream(privKeyBlock, privKeyBlock)
 		case "aes256-cbc":
 			// CryptBlocks needs whole blocks; reject anything else up front.
 			if len(privKeyBlock)%block.BlockSize() != 0 {
 				return nil, fmt.Errorf("ssh: invalid encrypted private key length, not a multiple of the block size")
 			}
 			cipher.NewCBCDecrypter(block, iv).CryptBlocks(privKeyBlock, privKeyBlock)
 		default:
 			return nil, fmt.Errorf("ssh: unknown cipher %q, only supports %q or %q", cipherName, "aes256-ctr", "aes256-cbc")
 		}
 		return privKeyBlock, nil
 	}
 }
 // openSSHDecryptFunc unwraps the private key block of an OpenSSH key file.
 // It receives the cipher name, KDF name and KDF options recorded in the file
 // together with the (possibly encrypted) block, and returns the plaintext
 // block or an error.
 type openSSHDecryptFunc func(CipherName, KdfName, KdfOpts string, PrivKeyBlock []byte) ([]byte, error)
 // parseOpenSSHPrivateKey parses an OpenSSH private key, using the decrypt
 // function to unwrap the encrypted portion. unencryptedOpenSSHKey can be used
 // as the decrypt function to parse an unencrypted private key. See
 // https://github.com/openssh/openssh-portable/blob/master/PROTOCOL.key.
 func parseOpenSSHPrivateKey(key []byte, decrypt openSSHDecryptFunc) (crypto.PrivateKey, error) {
 	const magic = "openssh-key-v1\x00"
 	if len(key) < len(magic) || string(key[:len(magic)]) != magic {
 		return nil, errors.New("ssh: invalid openssh private key format")
@ -986,9 +1293,22 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 	if err := Unmarshal(remaining, &w); err != nil {
 		return nil, err
 	}
 	if w.NumKeys != 1 {
 		// We only support single key files, and so does OpenSSH.
 		// https://github.com/openssh/openssh-portable/blob/4103a3ec7/sshkey.c#L4171
 		return nil, errors.New("ssh: multi-key files are not supported")
 	}
-	if w.KdfName != "none" || w.CipherName != "none" {
+	privKeyBlock, err := decrypt(w.CipherName, w.KdfName, w.KdfOpts, w.PrivKeyBlock)
-		return nil, errors.New("ssh: cannot decode encrypted private keys")
+	if err != nil {
 		if err, ok := err.(*PassphraseMissingError); ok {
 			pub, errPub := ParsePublicKey(w.PubKey)
 			if errPub != nil {
 				return nil, fmt.Errorf("ssh: failed to parse embedded public key: %v", errPub)
 			}
 			err.PublicKey = pub
 		}
 		return nil, err
 	}
 	pk1 := struct {
@ -998,15 +1318,13 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 		Rest    []byte `ssh:"rest"`
 	}{}
-	if err := Unmarshal(w.PrivKeyBlock, &pk1); err != nil {
+	if err := Unmarshal(privKeyBlock, &pk1); err != nil || pk1.Check1 != pk1.Check2 {
-		return nil, err
+		if w.CipherName != "none" {
 			return nil, x509.IncorrectPasswordError
 		}
 		return nil, errors.New("ssh: malformed OpenSSH key")
 	}
 	if pk1.Check1 != pk1.Check2 {
 		return nil, errors.New("ssh: checkint mismatch")
 	}
 	// we only handle ed25519 and rsa keys currently
 	switch pk1.Keytype {
 	case KeyAlgoRSA:
 		// https://github.com/openssh/openssh-portable/blob/master/sshkey.c#L2760-L2773
@ -1025,10 +1343,8 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 			return nil, err
 		}
-		for i, b := range key.Pad {
+		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
-			if int(b) != i+1 {
+			return nil, err
 				return nil, errors.New("ssh: padding not as expected")
 			}
 		}
 		pk := &rsa.PrivateKey{
@ -1063,20 +1379,78 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 			return nil, errors.New("ssh: private key unexpected length")
 		}
-		for i, b := range key.Pad {
+		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
-			if int(b) != i+1 {
+			return nil, err
 				return nil, errors.New("ssh: padding not as expected")
 			}
 		}
 		pk := ed25519.PrivateKey(make([]byte, ed25519.PrivateKeySize))
 		copy(pk, key.Priv)
 		return &pk, nil
 	case KeyAlgoECDSA256, KeyAlgoECDSA384, KeyAlgoECDSA521:
 		key := struct {
 			Curve   string
 			Pub     []byte
 			D       *big.Int
 			Comment string
 			Pad     []byte `ssh:"rest"`
 		}{}
 		if err := Unmarshal(pk1.Rest, &key); err != nil {
 			return nil, err
 		}
 		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
 			return nil, err
 		}
 		var curve elliptic.Curve
 		switch key.Curve {
 		case "nistp256":
 			curve = elliptic.P256()
 		case "nistp384":
 			curve = elliptic.P384()
 		case "nistp521":
 			curve = elliptic.P521()
 		default:
 			return nil, errors.New("ssh: unhandled elliptic curve: " + key.Curve)
 		}
 		X, Y := elliptic.Unmarshal(curve, key.Pub)
 		if X == nil || Y == nil {
 			return nil, errors.New("ssh: failed to unmarshal public key")
 		}
 		if key.D.Cmp(curve.Params().N) >= 0 {
 			return nil, errors.New("ssh: scalar is out of range")
 		}
 		x, y := curve.ScalarBaseMult(key.D.Bytes())
 		if x.Cmp(X) != 0 || y.Cmp(Y) != 0 {
 			return nil, errors.New("ssh: public key does not match private key")
 		}
 		return &ecdsa.PrivateKey{
 			PublicKey: ecdsa.PublicKey{
 				Curve: curve,
 				X:     X,
 				Y:     Y,
 			},
 			D: key.D,
 		}, nil
 	default:
 		return nil, errors.New("ssh: unhandled key type")
 	}
 }
 // checkOpenSSHKeyPadding verifies that pad is the byte sequence 1, 2, 3, ...
 // and returns an error at the first byte that breaks the sequence. An empty
 // pad is valid.
 func checkOpenSSHKeyPadding(pad []byte) error {
 	for want := 1; want <= len(pad); want++ {
 		if int(pad[want-1]) != want {
 			return errors.New("ssh: padding not as expected")
 		}
 	}
 	return nil
 }
 // FingerprintLegacyMD5 returns the user presentation of the key's
 // fingerprint as described by RFC 4716 section 4.
 func FingerprintLegacyMD5(pubKey PublicKey) string {
--- a/vendor/golang.org/x/crypto/ssh/mux.go
+++ b/vendor/golang.org/x/crypto/ssh/mux.go
@ -240,7 +240,7 @@ func (m *mux) onePacket() error {
 	id := binary.BigEndian.Uint32(packet[1:])
 	ch := m.chanList.getChan(id)
 	if ch == nil {
-		return fmt.Errorf("ssh: invalid channel %d", id)
+		return m.handleUnknownChannelPacket(id, packet)
 	}
 	return ch.handlePacket(packet)
@ -328,3 +328,24 @@ func (m *mux) openChannel(chanType string, extra []byte) (*channel, error) {
 		return nil, fmt.Errorf("ssh: unexpected packet in response to channel open: %T", msg)
 	}
 }
 // handleUnknownChannelPacket deals with a packet addressed to channel id when
 // no such channel is registered. A channel request is answered with a failure
 // message if the peer asked for a reply and is otherwise ignored; any other
 // message type yields an "invalid channel" error. Decode errors are returned
 // as is.
 func (m *mux) handleUnknownChannelPacket(id uint32, packet []byte) error {
 	decoded, err := decode(packet)
 	if err != nil {
 		return err
 	}
 	req, isRequest := decoded.(*channelRequestMsg)
 	if !isRequest {
 		return fmt.Errorf("ssh: invalid channel %d", id)
 	}
 	// RFC 4254 section 5.4 says unrecognized channel requests should
 	// receive a failure response.
 	if !req.WantReply {
 		return nil
 	}
 	return m.sendMessage(channelRequestFailureMsg{
 		PeersID: req.PeersID,
 	})
 }
--- a/vendor/golang.org/x/crypto/ssh/server.go
+++ b/vendor/golang.org/x/crypto/ssh/server.go
@ -284,8 +284,8 @@ func (s *connection) serverHandshake(config *ServerConfig) (*Permissions, error)
 func isAcceptableAlgo(algo string) bool {
 	switch algo {
-	case KeyAlgoRSA, KeyAlgoDSA, KeyAlgoECDSA256, KeyAlgoECDSA384, KeyAlgoECDSA521, KeyAlgoED25519,
+	case KeyAlgoRSA, KeyAlgoDSA, KeyAlgoECDSA256, KeyAlgoECDSA384, KeyAlgoECDSA521, KeyAlgoSKECDSA256, KeyAlgoED25519, KeyAlgoSKED25519,
-		CertAlgoRSAv01, CertAlgoDSAv01, CertAlgoECDSA256v01, CertAlgoECDSA384v01, CertAlgoECDSA521v01, CertAlgoED25519v01:
+		CertAlgoRSAv01, CertAlgoDSAv01, CertAlgoECDSA256v01, CertAlgoECDSA384v01, CertAlgoECDSA521v01, CertAlgoSKECDSA256v01, CertAlgoED25519v01, CertAlgoSKED25519v01:
 		return true
 	}
 	return false
--- a/vendor/golang.org/x/crypto/ssh/terminal/terminal.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/terminal.go
@ -7,6 +7,7 @@ package terminal
 import (
 	"bytes"
 	"io"
 	"runtime"
 	"strconv"
 	"sync"
 	"unicode/utf8"
@ -112,6 +113,7 @@ func NewTerminal(c io.ReadWriter, prompt string) *Terminal {
 }
 const (
 	keyCtrlC     = 3
 	keyCtrlD     = 4
 	keyCtrlU     = 21
 	keyEnter     = '\r'
@ -150,8 +152,12 @@ func bytesToKey(b []byte, pasteActive bool) (rune, []byte) {
 		switch b[0] {
 		case 1: // ^A
 			return keyHome, b[1:]
 		case 2: // ^B
 			return keyLeft, b[1:]
 		case 5: // ^E
 			return keyEnd, b[1:]
 		case 6: // ^F
 			return keyRight, b[1:]
 		case 8: // ^H
 			return keyBackspace, b[1:]
 		case 11: // ^K
@ -737,6 +743,9 @@ func (t *Terminal) readLine() (line string, err error) {
 						return "", io.EOF
 					}
 				}
 				if key == keyCtrlC {
 					return "", io.EOF
 				}
 				if key == keyPasteStart {
 					t.pasteActive = true
 					if len(t.line) == 0 {
@ -939,6 +948,8 @@ func (s *stRingBuffer) NthPreviousEntry(n int) (value string, ok bool) {
 // readPasswordLine reads from reader until it finds \n or io.EOF.
 // The slice returned does not include the \n.
 // readPasswordLine also ignores any \r it finds.
 // Windows uses \r as end of line. So, on Windows, readPasswordLine
 // reads until it finds \r and ignores any \n it finds during processing.
 func readPasswordLine(reader io.Reader) ([]byte, error) {
 	var buf [1]byte
 	var ret []byte
@ -947,10 +958,20 @@ func readPasswordLine(reader io.Reader) ([]byte, error) {
 		n, err := reader.Read(buf[:])
 		if n > 0 {
 			switch buf[0] {
 			case '\b':
 				if len(ret) > 0 {
 					ret = ret[:len(ret)-1]
 				}
 			case '\n':
 				if runtime.GOOS != "windows" {
 					return ret, nil
 				}
 				// otherwise ignore \n
 			case '\r':
-				// remove \r from passwords on Windows
+				if runtime.GOOS == "windows" {
 					return ret, nil
 				}
 				// otherwise ignore \r
 			default:
 				ret = append(ret, buf[0])
 			}
--- a/vendor/golang.org/x/crypto/ssh/terminal/util_windows.go
+++ b/vendor/golang.org/x/crypto/ssh/terminal/util_windows.go
@ -85,8 +85,8 @@ func ReadPassword(fd int) ([]byte, error) {
 	}
 	old := st
-	st &^= (windows.ENABLE_ECHO_INPUT)
+	st &^= (windows.ENABLE_ECHO_INPUT | windows.ENABLE_LINE_INPUT)
-	st |= (windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT)
+	st |= (windows.ENABLE_PROCESSED_OUTPUT | windows.ENABLE_PROCESSED_INPUT)
 	if err := windows.SetConsoleMode(windows.Handle(fd), st); err != nil {
 		return nil, err
 	}
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -194,11 +194,13 @@ github.com/stretchr/testify/assert
 github.com/stretchr/testify/require
 # github.com/xo/dburl v0.0.0-20191005012637-293c3298d6c0
 github.com/xo/dburl
-# golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550
+# golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9
 golang.org/x/crypto/blake2b
 golang.org/x/crypto/blowfish
 golang.org/x/crypto/chacha20
 golang.org/x/crypto/curve25519
 golang.org/x/crypto/ed25519
 golang.org/x/crypto/ed25519/internal/edwards25519
 golang.org/x/crypto/internal/chacha20
 golang.org/x/crypto/internal/subtle
 golang.org/x/crypto/md4
 golang.org/x/crypto/nacl/box
@ -207,6 +209,7 @@ golang.org/x/crypto/pbkdf2
 golang.org/x/crypto/poly1305
 golang.org/x/crypto/salsa20/salsa
 golang.org/x/crypto/ssh
 golang.org/x/crypto/ssh/internal/bcrypt_pbkdf
 golang.org/x/crypto/ssh/terminal
 # golang.org/x/net v0.0.0-20191014212845-da9a3fd4c582
 golang.org/x/net/bpf