From e9aa0072a5d412d1f11c8683722f614cb1855a2c Mon Sep 17 00:00:00 2001 From: Russ Magee Date: Fri, 21 Feb 2020 17:21:19 -0800 Subject: [PATCH] Initial aead/chacha20 support (ChaCha20_12) Signed-off-by: Russ Magee --- go.mod | 1 + go.sum | 2 + vendor/github.com/aead/chacha20/LICENSE | 21 + .../github.com/aead/chacha20/chacha/chacha.go | 197 +++ .../aead/chacha20/chacha/chachaAVX2_amd64.s | 406 +++++++ .../aead/chacha20/chacha/chacha_386.go | 60 + .../aead/chacha20/chacha/chacha_386.s | 163 +++ .../aead/chacha20/chacha/chacha_amd64.go | 76 ++ .../aead/chacha20/chacha/chacha_amd64.s | 1072 +++++++++++++++++ .../aead/chacha20/chacha/chacha_generic.go | 319 +++++ .../aead/chacha20/chacha/chacha_ref.go | 33 + .../github.com/aead/chacha20/chacha/const.s | 53 + .../github.com/aead/chacha20/chacha/macro.s | 163 +++ vendor/modules.txt | 2 + xs/xs.go | 2 +- xsnet/chan.go | 13 + xsnet/consts.go | 1 + xsnet/net.go | 10 +- 18 files changed, 2592 insertions(+), 2 deletions(-) create mode 100644 vendor/github.com/aead/chacha20/LICENSE create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha.go create mode 100644 vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_386.go create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_386.s create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_amd64.go create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_amd64.s create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_generic.go create mode 100644 vendor/github.com/aead/chacha20/chacha/chacha_ref.go create mode 100644 vendor/github.com/aead/chacha20/chacha/const.s create mode 100644 vendor/github.com/aead/chacha20/chacha/macro.s diff --git a/go.mod b/go.mod index acded96..1e35ba1 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9 blitter.com/go/mtwist v1.0.1 // indirect blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae + github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f github.com/klauspost/cpuid v1.2.2 // indirect github.com/klauspost/reedsolomon v1.9.3 // indirect diff --git a/go.sum b/go.sum index 32142b8..7760711 100644 --- a/go.sum +++ b/go.sum @@ -28,6 +28,8 @@ git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c h1:SGOx git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c/go.mod h1:QrbgzU5EL/1jaMD5pD4Tiikj3R5elPMa+RMwFUTGwQU= git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2 h1:89TYv/+wotJ+QWrH5B/yN0pEQutr2V/5za0VoYiVGCM= git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2/go.mod h1:weMqACFGzJs4Ni+K9shsRd02N4LkDrtGlkRxISK+II0= +github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da h1:KjTM2ks9d14ZYCvmHS9iAKVt9AyzRSqNU1qabPih5BY= +github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da/go.mod h1:eHEWzANqSiWQsof+nXEI9bUVUyV6F53Fp89EuCh2EAA= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f h1:UWGE8Vi+1Agt0lrvnd7UsmvwqWKRzb9byK9iQmsbY0Y= diff --git a/vendor/github.com/aead/chacha20/LICENSE b/vendor/github.com/aead/chacha20/LICENSE new file mode 100644 index 0000000..b6a9210 --- /dev/null +++ b/vendor/github.com/aead/chacha20/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2016 Andreas Auernhammer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/aead/chacha20/chacha/chacha.go b/vendor/github.com/aead/chacha20/chacha/chacha.go new file mode 100644 index 0000000..c2b39da --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha.go @@ -0,0 +1,197 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// Package chacha implements some low-level functions of the +// ChaCha cipher family. +package chacha // import "github.com/aead/chacha20/chacha" + +import ( + "encoding/binary" + "errors" + "math" +) + +const ( + // NonceSize is the size of the ChaCha20 nonce in bytes. + NonceSize = 8 + + // INonceSize is the size of the IETF-ChaCha20 nonce in bytes. + INonceSize = 12 + + // XNonceSize is the size of the XChaCha20 nonce in bytes. + XNonceSize = 24 + + // KeySize is the size of the key in bytes. + KeySize = 32 +) + +var ( + useSSE2 bool + useSSSE3 bool + useAVX bool + useAVX2 bool +) + +var ( + errKeySize = errors.New("chacha20/chacha: bad key length") + errInvalidNonce = errors.New("chacha20/chacha: bad nonce length") +) + +func setup(state *[64]byte, nonce, key []byte) (err error) { + if len(key) != KeySize { + err = errKeySize + return + } + var Nonce [16]byte + switch len(nonce) { + case NonceSize: + copy(Nonce[8:], nonce) + initialize(state, key, &Nonce) + case INonceSize: + copy(Nonce[4:], nonce) + initialize(state, key, &Nonce) + case XNonceSize: + var tmpKey [32]byte + var hNonce [16]byte + + copy(hNonce[:], nonce[:16]) + copy(tmpKey[:], key) + HChaCha20(&tmpKey, &hNonce, &tmpKey) + copy(Nonce[8:], nonce[16:]) + initialize(state, tmpKey[:], &Nonce) + + // BUG(aead): A "good" compiler will remove this (optimizations) + // But using the provided key instead of tmpKey, + // will change the key (-> probably confuses users) + for i := range tmpKey { + tmpKey[i] = 0 + } + default: + err = errInvalidNonce + } + return +} + +// XORKeyStream crypts bytes from src to dst using the given nonce and key. +// The length of the nonce determinds the version of ChaCha20: +// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period. +// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period. +// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period. +// The rounds argument specifies the number of rounds performed for keystream +// generation - valid values are 8, 12 or 20. The src and dst may be the same slice +// but otherwise should not overlap. If len(dst) < len(src) this function panics. +// If the nonce is neither 64, 96 nor 192 bits long, this function panics. +func XORKeyStream(dst, src, nonce, key []byte, rounds int) { + if rounds != 20 && rounds != 12 && rounds != 8 { + panic("chacha20/chacha: bad number of rounds") + } + if len(dst) < len(src) { + panic("chacha20/chacha: dst buffer is to small") + } + if len(nonce) == INonceSize && uint64(len(src)) > (1<<38) { + panic("chacha20/chacha: src is too large") + } + + var block, state [64]byte + if err := setup(&state, nonce, key); err != nil { + panic(err) + } + xorKeyStream(dst, src, &block, &state, rounds) +} + +// Cipher implements ChaCha20/r (XChaCha20/r) for a given number of rounds r. +type Cipher struct { + state, block [64]byte + off int + rounds int // 20 for ChaCha20 + noncesize int +} + +// NewCipher returns a new *chacha.Cipher implementing the ChaCha20/r or XChaCha20/r +// (r = 8, 12 or 20) stream cipher. The nonce must be unique for one key for all time. +// The length of the nonce determinds the version of ChaCha20: +// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period. +// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period. +// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period. +// If the nonce is neither 64, 96 nor 192 bits long, a non-nil error is returned. +func NewCipher(nonce, key []byte, rounds int) (*Cipher, error) { + if rounds != 20 && rounds != 12 && rounds != 8 { + panic("chacha20/chacha: bad number of rounds") + } + + c := new(Cipher) + if err := setup(&(c.state), nonce, key); err != nil { + return nil, err + } + c.rounds = rounds + + if len(nonce) == INonceSize { + c.noncesize = INonceSize + } else { + c.noncesize = NonceSize + } + + return c, nil +} + +// XORKeyStream crypts bytes from src to dst. Src and dst may be the same slice +// but otherwise should not overlap. If len(dst) < len(src) the function panics. +func (c *Cipher) XORKeyStream(dst, src []byte) { + if len(dst) < len(src) { + panic("chacha20/chacha: dst buffer is to small") + } + + if c.off > 0 { + n := len(c.block[c.off:]) + if len(src) <= n { + for i, v := range src { + dst[i] = v ^ c.block[c.off] + c.off++ + } + if c.off == 64 { + c.off = 0 + } + return + } + + for i, v := range c.block[c.off:] { + dst[i] = src[i] ^ v + } + src = src[n:] + dst = dst[n:] + c.off = 0 + } + + // check for counter overflow + blocksToXOR := len(src) / 64 + if len(src)%64 != 0 { + blocksToXOR++ + } + var overflow bool + if c.noncesize == INonceSize { + overflow = binary.LittleEndian.Uint32(c.state[48:]) > math.MaxUint32-uint32(blocksToXOR) + } else { + overflow = binary.LittleEndian.Uint64(c.state[48:]) > math.MaxUint64-uint64(blocksToXOR) + } + if overflow { + panic("chacha20/chacha: counter overflow") + } + + c.off += xorKeyStream(dst, src, &(c.block), &(c.state), c.rounds) +} + +// SetCounter skips ctr * 64 byte blocks. SetCounter(0) resets the cipher. +// This function always skips the unused keystream of the current 64 byte block. +func (c *Cipher) SetCounter(ctr uint64) { + if c.noncesize == INonceSize { + binary.LittleEndian.PutUint32(c.state[48:], uint32(ctr)) + } else { + binary.LittleEndian.PutUint64(c.state[48:], ctr) + } + c.off = 0 +} + +// HChaCha20 generates 32 pseudo-random bytes from a 128 bit nonce and a 256 bit secret key. +// It can be used as a key-derivation-function (KDF). +func HChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { hChaCha20(out, nonce, key) } diff --git a/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s b/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s new file mode 100644 index 0000000..c2b5f52 --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chachaAVX2_amd64.s @@ -0,0 +1,406 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build amd64,!gccgo,!appengine,!nacl + +#include "const.s" +#include "macro.s" + +#define TWO 0(SP) +#define C16 32(SP) +#define C8 64(SP) +#define STATE_0 96(SP) +#define STATE_1 128(SP) +#define STATE_2 160(SP) +#define STATE_3 192(SP) +#define TMP_0 224(SP) +#define TMP_1 256(SP) + +// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int +TEXT ·xorKeyStreamAVX2(SB), 4, $320-80 + MOVQ dst_base+0(FP), DI + MOVQ src_base+24(FP), SI + MOVQ block+48(FP), BX + MOVQ state+56(FP), AX + MOVQ rounds+64(FP), DX + MOVQ src_len+32(FP), CX + + MOVQ SP, R8 + ADDQ $32, SP + ANDQ $-32, SP + + VMOVDQU 0(AX), Y2 + VMOVDQU 32(AX), Y3 + VPERM2I128 $0x22, Y2, Y0, Y0 + VPERM2I128 $0x33, Y2, Y1, Y1 + VPERM2I128 $0x22, Y3, Y2, Y2 + VPERM2I128 $0x33, Y3, Y3, Y3 + + TESTQ CX, CX + JZ done + + VMOVDQU ·one_AVX2<>(SB), Y4 + VPADDD Y4, Y3, Y3 + + VMOVDQA Y0, STATE_0 + VMOVDQA Y1, STATE_1 + VMOVDQA Y2, STATE_2 + VMOVDQA Y3, STATE_3 + + VMOVDQU ·rol16_AVX2<>(SB), Y4 + VMOVDQU ·rol8_AVX2<>(SB), Y5 + VMOVDQU ·two_AVX2<>(SB), Y6 + VMOVDQA Y4, Y14 + VMOVDQA Y5, Y15 + VMOVDQA Y4, C16 + VMOVDQA Y5, C8 + VMOVDQA Y6, TWO + + CMPQ CX, $64 + JBE between_0_and_64 + CMPQ CX, $192 + JBE between_64_and_192 + CMPQ CX, $320 + JBE between_192_and_320 + CMPQ CX, $448 + JBE between_320_and_448 + +at_least_512: + VMOVDQA Y0, Y4 + VMOVDQA Y1, Y5 + VMOVDQA Y2, Y6 + VPADDQ TWO, Y3, Y7 + VMOVDQA Y0, Y8 + VMOVDQA Y1, Y9 + VMOVDQA Y2, Y10 + VPADDQ TWO, Y7, Y11 + VMOVDQA Y0, Y12 + VMOVDQA Y1, Y13 + VMOVDQA Y2, Y14 + VPADDQ TWO, Y11, Y15 + + MOVQ DX, R9 + +chacha_loop_512: + VMOVDQA Y8, TMP_0 + CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8) + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8) + VMOVDQA TMP_0, Y8 + VMOVDQA Y0, TMP_0 + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8) + CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8) + CHACHA_SHUFFLE_AVX(Y1, Y2, Y3) + CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) + CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) + CHACHA_SHUFFLE_AVX(Y13, Y14, Y15) + + CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8) + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8) + VMOVDQA TMP_0, Y0 + VMOVDQA Y8, TMP_0 + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8) + CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8) + VMOVDQA TMP_0, Y8 + CHACHA_SHUFFLE_AVX(Y3, Y2, Y1) + CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) + CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) + CHACHA_SHUFFLE_AVX(Y15, Y14, Y13) + SUBQ $2, R9 + JA chacha_loop_512 + + VMOVDQA Y12, TMP_0 + VMOVDQA Y13, TMP_1 + VPADDD STATE_0, Y0, Y0 + VPADDD STATE_1, Y1, Y1 + VPADDD STATE_2, Y2, Y2 + VPADDD STATE_3, Y3, Y3 + XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13) + VMOVDQA STATE_0, Y0 + VMOVDQA STATE_1, Y1 + VMOVDQA STATE_2, Y2 + VMOVDQA STATE_3, Y3 + VPADDQ TWO, Y3, Y3 + + VPADDD Y0, Y4, Y4 + VPADDD Y1, Y5, Y5 + VPADDD Y2, Y6, Y6 + VPADDD Y3, Y7, Y7 + XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13) + VPADDQ TWO, Y3, Y3 + + VPADDD Y0, Y8, Y8 + VPADDD Y1, Y9, Y9 + VPADDD Y2, Y10, Y10 + VPADDD Y3, Y11, Y11 + XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) + VPADDQ TWO, Y3, Y3 + + VPADDD TMP_0, Y0, Y12 + VPADDD TMP_1, Y1, Y13 + VPADDD Y2, Y14, Y14 + VPADDD Y3, Y15, Y15 + VPADDQ TWO, Y3, Y3 + + CMPQ CX, $512 + JB less_than_512 + + XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5) + VMOVDQA Y3, STATE_3 + ADDQ $512, SI + ADDQ $512, DI + SUBQ $512, CX + CMPQ CX, $448 + JA at_least_512 + + TESTQ CX, CX + JZ done + + VMOVDQA C16, Y14 + VMOVDQA C8, Y15 + + CMPQ CX, $64 + JBE between_0_and_64 + CMPQ CX, $192 + JBE between_64_and_192 + CMPQ CX, $320 + JBE between_192_and_320 + JMP between_320_and_448 + +less_than_512: + XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5) + EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4) + ADDQ $448, SI + ADDQ $448, DI + SUBQ $448, CX + JMP finalize + +between_320_and_448: + VMOVDQA Y0, Y4 + VMOVDQA Y1, Y5 + VMOVDQA Y2, Y6 + VPADDQ TWO, Y3, Y7 + VMOVDQA Y0, Y8 + VMOVDQA Y1, Y9 + VMOVDQA Y2, Y10 + VPADDQ TWO, Y7, Y11 + + MOVQ DX, R9 + +chacha_loop_384: + CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y1, Y2, Y3) + CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) + CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) + CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y3, Y2, Y1) + CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) + CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) + SUBQ $2, R9 + JA chacha_loop_384 + + VPADDD STATE_0, Y0, Y0 + VPADDD STATE_1, Y1, Y1 + VPADDD STATE_2, Y2, Y2 + VPADDD STATE_3, Y3, Y3 + XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13) + VMOVDQA STATE_0, Y0 + VMOVDQA STATE_1, Y1 + VMOVDQA STATE_2, Y2 + VMOVDQA STATE_3, Y3 + VPADDQ TWO, Y3, Y3 + + VPADDD Y0, Y4, Y4 + VPADDD Y1, Y5, Y5 + VPADDD Y2, Y6, Y6 + VPADDD Y3, Y7, Y7 + XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13) + VPADDQ TWO, Y3, Y3 + + VPADDD Y0, Y8, Y8 + VPADDD Y1, Y9, Y9 + VPADDD Y2, Y10, Y10 + VPADDD Y3, Y11, Y11 + VPADDQ TWO, Y3, Y3 + + CMPQ CX, $384 + JB less_than_384 + + XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) + SUBQ $384, CX + TESTQ CX, CX + JE done + + ADDQ $384, SI + ADDQ $384, DI + JMP between_0_and_64 + +less_than_384: + XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13) + EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12) + ADDQ $320, SI + ADDQ $320, DI + SUBQ $320, CX + JMP finalize + +between_192_and_320: + VMOVDQA Y0, Y4 + VMOVDQA Y1, Y5 + VMOVDQA Y2, Y6 + VMOVDQA Y3, Y7 + VMOVDQA Y0, Y8 + VMOVDQA Y1, Y9 + VMOVDQA Y2, Y10 + VPADDQ TWO, Y3, Y11 + + MOVQ DX, R9 + +chacha_loop_256: + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) + CHACHA_SHUFFLE_AVX(Y9, Y10, Y11) + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) + CHACHA_SHUFFLE_AVX(Y11, Y10, Y9) + SUBQ $2, R9 + JA chacha_loop_256 + + VPADDD Y0, Y4, Y4 + VPADDD Y1, Y5, Y5 + VPADDD Y2, Y6, Y6 + VPADDD Y3, Y7, Y7 + VPADDQ TWO, Y3, Y3 + XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) + VPADDD Y0, Y8, Y8 + VPADDD Y1, Y9, Y9 + VPADDD Y2, Y10, Y10 + VPADDD Y3, Y11, Y11 + VPADDQ TWO, Y3, Y3 + + CMPQ CX, $256 + JB less_than_256 + + XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13) + SUBQ $256, CX + TESTQ CX, CX + JE done + + ADDQ $256, SI + ADDQ $256, DI + JMP between_0_and_64 + +less_than_256: + XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13) + EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12) + ADDQ $192, SI + ADDQ $192, DI + SUBQ $192, CX + JMP finalize + +between_64_and_192: + VMOVDQA Y0, Y4 + VMOVDQA Y1, Y5 + VMOVDQA Y2, Y6 + VMOVDQA Y3, Y7 + + MOVQ DX, R9 + +chacha_loop_128: + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y5, Y6, Y7) + CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15) + CHACHA_SHUFFLE_AVX(Y7, Y6, Y5) + SUBQ $2, R9 + JA chacha_loop_128 + + VPADDD Y0, Y4, Y4 + VPADDD Y1, Y5, Y5 + VPADDD Y2, Y6, Y6 + VPADDD Y3, Y7, Y7 + VPADDQ TWO, Y3, Y3 + + CMPQ CX, $128 + JB less_than_128 + + XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) + SUBQ $128, CX + TESTQ CX, CX + JE done + + ADDQ $128, SI + ADDQ $128, DI + JMP between_0_and_64 + +less_than_128: + XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13) + EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13) + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, CX + JMP finalize + +between_0_and_64: + VMOVDQA X0, X4 + VMOVDQA X1, X5 + VMOVDQA X2, X6 + VMOVDQA X3, X7 + + MOVQ DX, R9 + +chacha_loop_64: + CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15) + CHACHA_SHUFFLE_AVX(X5, X6, X7) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15) + CHACHA_SHUFFLE_AVX(X7, X6, X5) + SUBQ $2, R9 + JA chacha_loop_64 + + VPADDD X0, X4, X4 + VPADDD X1, X5, X5 + VPADDD X2, X6, X6 + VPADDD X3, X7, X7 + VMOVDQU ·one<>(SB), X0 + VPADDQ X0, X3, X3 + + CMPQ CX, $64 + JB less_than_64 + + XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13) + SUBQ $64, CX + JMP done + +less_than_64: + VMOVDQU X4, 0(BX) + VMOVDQU X5, 16(BX) + VMOVDQU X6, 32(BX) + VMOVDQU X7, 48(BX) + +finalize: + XORQ R11, R11 + XORQ R12, R12 + MOVQ CX, BP + +xor_loop: + MOVB 0(SI), R11 + MOVB 0(BX), R12 + XORQ R11, R12 + MOVB R12, 0(DI) + INCQ SI + INCQ BX + INCQ DI + DECQ BP + JA xor_loop + +done: + VMOVDQU X3, 48(AX) + VZEROUPPER + MOVQ R8, SP + MOVQ CX, ret+72(FP) + RET + diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_386.go b/vendor/github.com/aead/chacha20/chacha/chacha_386.go new file mode 100644 index 0000000..97e533d --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_386.go @@ -0,0 +1,60 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build 386,!gccgo,!appengine,!nacl + +package chacha + +import ( + "encoding/binary" + + "golang.org/x/sys/cpu" +) + +func init() { + useSSE2 = cpu.X86.HasSSE2 + useSSSE3 = cpu.X86.HasSSSE3 + useAVX = false + useAVX2 = false +} + +func initialize(state *[64]byte, key []byte, nonce *[16]byte) { + binary.LittleEndian.PutUint32(state[0:], sigma[0]) + binary.LittleEndian.PutUint32(state[4:], sigma[1]) + binary.LittleEndian.PutUint32(state[8:], sigma[2]) + binary.LittleEndian.PutUint32(state[12:], sigma[3]) + copy(state[16:], key[:]) + copy(state[48:], nonce[:]) +} + +// This function is implemented in chacha_386.s +//go:noescape +func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) + +// This function is implemented in chacha_386.s +//go:noescape +func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) + +// This function is implemented in chacha_386.s +//go:noescape +func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int + +func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { + switch { + case useSSSE3: + hChaCha20SSSE3(out, nonce, key) + case useSSE2: + hChaCha20SSE2(out, nonce, key) + default: + hChaCha20Generic(out, nonce, key) + } +} + +func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { + if useSSE2 { + return xorKeyStreamSSE2(dst, src, block, state, rounds) + } else { + return xorKeyStreamGeneric(dst, src, block, state, rounds) + } +} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_386.s b/vendor/github.com/aead/chacha20/chacha/chacha_386.s new file mode 100644 index 0000000..262fc86 --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_386.s @@ -0,0 +1,163 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build 386,!gccgo,!appengine,!nacl + +#include "const.s" +#include "macro.s" + +// FINALIZE xors len bytes from src and block using +// the temp. registers t0 and t1 and writes the result +// to dst. +#define FINALIZE(dst, src, block, len, t0, t1) \ + XORL t0, t0; \ + XORL t1, t1; \ + FINALIZE_LOOP:; \ + MOVB 0(src), t0; \ + MOVB 0(block), t1; \ + XORL t0, t1; \ + MOVB t1, 0(dst); \ + INCL src; \ + INCL block; \ + INCL dst; \ + DECL len; \ + JG FINALIZE_LOOP \ + +#define Dst DI +#define Nonce AX +#define Key BX +#define Rounds DX + +// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) +TEXT ·hChaCha20SSE2(SB), 4, $0-12 + MOVL out+0(FP), Dst + MOVL nonce+4(FP), Nonce + MOVL key+8(FP), Key + + MOVOU ·sigma<>(SB), X0 + MOVOU 0*16(Key), X1 + MOVOU 1*16(Key), X2 + MOVOU 0*16(Nonce), X3 + MOVL $20, Rounds + +chacha_loop: + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_SHUFFLE_SSE(X3, X2, X1) + SUBL $2, Rounds + JNZ chacha_loop + + MOVOU X0, 0*16(Dst) + MOVOU X3, 1*16(Dst) + RET + +// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) +TEXT ·hChaCha20SSSE3(SB), 4, $0-12 + MOVL out+0(FP), Dst + MOVL nonce+4(FP), Nonce + MOVL key+8(FP), Key + + MOVOU ·sigma<>(SB), X0 + MOVOU 0*16(Key), X1 + MOVOU 1*16(Key), X2 + MOVOU 0*16(Nonce), X3 + MOVL $20, Rounds + + MOVOU ·rol16<>(SB), X5 + MOVOU ·rol8<>(SB), X6 + +chacha_loop: + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_SSE(X3, X2, X1) + SUBL $2, Rounds + JNZ chacha_loop + + MOVOU X0, 0*16(Dst) + MOVOU X3, 1*16(Dst) + RET + +#undef Dst +#undef Nonce +#undef Key +#undef Rounds + +#define State AX +#define Dst DI +#define Src SI +#define Len DX +#define Tmp0 BX +#define Tmp1 BP + +// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int +TEXT ·xorKeyStreamSSE2(SB), 4, $0-40 + MOVL dst_base+0(FP), Dst + MOVL src_base+12(FP), Src + MOVL state+28(FP), State + MOVL src_len+16(FP), Len + MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0 + + MOVOU 0*16(State), X0 + MOVOU 1*16(State), X1 + MOVOU 2*16(State), X2 + MOVOU 3*16(State), X3 + TESTL Len, Len + JZ DONE + +GENERATE_KEYSTREAM: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + MOVL rounds+32(FP), Tmp0 + +CHACHA_LOOP: + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBL $2, Tmp0 + JA CHACHA_LOOP + + MOVOU 0*16(State), X0 // Restore X0 from state + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + MOVOU ·one<>(SB), X0 + PADDQ X0, X3 + + CMPL Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0) + MOVOU 0*16(State), X0 // Restore X0 from state + ADDL $64, Src + ADDL $64, Dst + SUBL $64, Len + JZ DONE + JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte + +BUFFER_KEYSTREAM: + MOVL block+24(FP), State + MOVOU X4, 0(State) + MOVOU X5, 16(State) + MOVOU X6, 32(State) + MOVOU X7, 48(State) + MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64 + FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1) + +DONE: + MOVL state+28(FP), State + MOVOU X3, 3*16(State) + RET + +#undef State +#undef Dst +#undef Src +#undef Len +#undef Tmp0 +#undef Tmp1 diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go new file mode 100644 index 0000000..635f7de --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.go @@ -0,0 +1,76 @@ +// Copyright (c) 2017 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build go1.7,amd64,!gccgo,!appengine,!nacl + +package chacha + +import "golang.org/x/sys/cpu" + +func init() { + useSSE2 = cpu.X86.HasSSE2 + useSSSE3 = cpu.X86.HasSSSE3 + useAVX = cpu.X86.HasAVX + useAVX2 = cpu.X86.HasAVX2 +} + +// This function is implemented in chacha_amd64.s +//go:noescape +func initialize(state *[64]byte, key []byte, nonce *[16]byte) + +// This function is implemented in chacha_amd64.s +//go:noescape +func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) + +// This function is implemented in chacha_amd64.s +//go:noescape +func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) + +// This function is implemented in chachaAVX2_amd64.s +//go:noescape +func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte) + +// This function is implemented in chacha_amd64.s +//go:noescape +func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int + +// This function is implemented in chacha_amd64.s +//go:noescape +func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int + +// This function is implemented in chacha_amd64.s +//go:noescape +func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int + +// This function is implemented in chachaAVX2_amd64.s +//go:noescape +func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int + +func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { + switch { + case useAVX: + hChaCha20AVX(out, nonce, key) + case useSSSE3: + hChaCha20SSSE3(out, nonce, key) + case useSSE2: + hChaCha20SSE2(out, nonce, key) + default: + hChaCha20Generic(out, nonce, key) + } +} + +func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { + switch { + case useAVX2: + return xorKeyStreamAVX2(dst, src, block, state, rounds) + case useAVX: + return xorKeyStreamAVX(dst, src, block, state, rounds) + case useSSSE3: + return xorKeyStreamSSSE3(dst, src, block, state, rounds) + case useSSE2: + return xorKeyStreamSSE2(dst, src, block, state, rounds) + default: + return xorKeyStreamGeneric(dst, src, block, state, rounds) + } +} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s new file mode 100644 index 0000000..26a2383 --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_amd64.s @@ -0,0 +1,1072 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build amd64,!gccgo,!appengine,!nacl + +#include "const.s" +#include "macro.s" + +// FINALIZE xors len bytes from src and block using +// the temp. registers t0 and t1 and writes the result +// to dst. +#define FINALIZE(dst, src, block, len, t0, t1) \ + XORQ t0, t0; \ + XORQ t1, t1; \ + FINALIZE_LOOP:; \ + MOVB 0(src), t0; \ + MOVB 0(block), t1; \ + XORQ t0, t1; \ + MOVB t1, 0(dst); \ + INCQ src; \ + INCQ block; \ + INCQ dst; \ + DECQ len; \ + JG FINALIZE_LOOP \ + +#define Dst DI +#define Nonce AX +#define Key BX +#define Rounds DX + +// func initialize(state *[64]byte, key []byte, nonce *[16]byte) +TEXT ·initialize(SB), 4, $0-40 + MOVQ state+0(FP), Dst + MOVQ key+8(FP), Key + MOVQ nonce+32(FP), Nonce + + MOVOU ·sigma<>(SB), X0 + MOVOU 0*16(Key), X1 + MOVOU 1*16(Key), X2 + MOVOU 0*16(Nonce), X3 + + MOVOU X0, 0*16(Dst) + MOVOU X1, 1*16(Dst) + MOVOU X2, 2*16(Dst) + MOVOU X3, 3*16(Dst) + RET + +// func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte) +TEXT ·hChaCha20AVX(SB), 4, $0-24 + MOVQ out+0(FP), Dst + MOVQ nonce+8(FP), Nonce + MOVQ key+16(FP), Key + + VMOVDQU ·sigma<>(SB), X0 + VMOVDQU 0*16(Key), X1 + VMOVDQU 1*16(Key), X2 + VMOVDQU 0*16(Nonce), X3 + VMOVDQU ·rol16_AVX2<>(SB), X5 + VMOVDQU ·rol8_AVX2<>(SB), X6 + MOVQ $20, Rounds + +CHACHA_LOOP: + CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_AVX(X1, X2, X3) + CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_AVX(X3, X2, X1) + SUBQ $2, Rounds + JNZ CHACHA_LOOP + + VMOVDQU X0, 0*16(Dst) + VMOVDQU X3, 1*16(Dst) + VZEROUPPER + RET + +// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte) +TEXT ·hChaCha20SSE2(SB), 4, $0-24 + MOVQ out+0(FP), Dst + MOVQ nonce+8(FP), Nonce + MOVQ key+16(FP), Key + + MOVOU ·sigma<>(SB), X0 + MOVOU 0*16(Key), X1 + MOVOU 1*16(Key), X2 + MOVOU 0*16(Nonce), X3 + MOVQ $20, Rounds + +CHACHA_LOOP: + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_SHUFFLE_SSE(X3, X2, X1) + SUBQ $2, Rounds + JNZ CHACHA_LOOP + + MOVOU X0, 0*16(Dst) + MOVOU X3, 1*16(Dst) + RET + +// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte) +TEXT ·hChaCha20SSSE3(SB), 4, $0-24 + MOVQ out+0(FP), Dst + MOVQ nonce+8(FP), Nonce + MOVQ key+16(FP), Key + + MOVOU ·sigma<>(SB), X0 + MOVOU 0*16(Key), X1 + MOVOU 1*16(Key), X2 + MOVOU 0*16(Nonce), X3 + MOVOU ·rol16<>(SB), X5 + MOVOU ·rol8<>(SB), X6 + MOVQ $20, Rounds + +chacha_loop: + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6) + CHACHA_SHUFFLE_SSE(X3, X2, X1) + SUBQ $2, Rounds + JNZ chacha_loop + + MOVOU X0, 0*16(Dst) + MOVOU X3, 1*16(Dst) + RET + +#undef Dst +#undef Nonce +#undef Key +#undef Rounds + +#define Dst DI +#define Src SI +#define Len R12 +#define Rounds DX +#define Buffer BX +#define State AX +#define Stack SP +#define SavedSP R8 +#define Tmp0 R9 +#define Tmp1 R10 +#define Tmp2 R11 + +// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int +TEXT ·xorKeyStreamSSE2(SB), 4, $112-80 + MOVQ dst_base+0(FP), Dst + MOVQ src_base+24(FP), Src + MOVQ block+48(FP), Buffer + MOVQ state+56(FP), State + MOVQ rounds+64(FP), Rounds + MOVQ src_len+32(FP), Len + + MOVOU 0*16(State), X0 + MOVOU 1*16(State), X1 + MOVOU 2*16(State), X2 + MOVOU 3*16(State), X3 + + MOVQ Stack, SavedSP + ADDQ $16, Stack + ANDQ $-16, Stack + + TESTQ Len, Len + JZ DONE + + MOVOU ·one<>(SB), X4 + MOVO X0, 0*16(Stack) + MOVO X1, 1*16(Stack) + MOVO X2, 2*16(Stack) + MOVO X3, 3*16(Stack) + MOVO X4, 4*16(Stack) + + CMPQ Len, $64 + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 + JLE GENERATE_KEYSTREAM_192 + +GENERATE_KEYSTREAM_256: + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X3, X15 + PADDQ 4*16(Stack), X15 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X15, X11 + PADDQ 4*16(Stack), X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X11, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + + MOVO X3, 3*16(Stack) // Save X3 + +CHACHA_LOOP_256: + MOVO X4, 5*16(Stack) + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4) + MOVO 5*16(Stack), X4 + MOVO X0, 5*16(Stack) + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + MOVO 5*16(Stack), X0 + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_SHUFFLE_SSE(X13, X14, X15) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + MOVO X4, 5*16(Stack) + CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4) + CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4) + MOVO 5*16(Stack), X4 + MOVO X0, 5*16(Stack) + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + MOVO 5*16(Stack), X0 + CHACHA_SHUFFLE_SSE(X3, X2, X1) + CHACHA_SHUFFLE_SSE(X15, X14, X13) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_256 + + PADDL 0*16(Stack), X0 + PADDL 1*16(Stack), X1 + PADDL 2*16(Stack), X2 + PADDL 3*16(Stack), X3 + MOVO X4, 5*16(Stack) // Save X4 + XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4) + MOVO 5*16(Stack), X4 // Restore X4 + + MOVO 0*16(Stack), X0 + MOVO 1*16(Stack), X1 + MOVO 2*16(Stack), X2 + MOVO 3*16(Stack), X3 + PADDQ 4*16(Stack), X3 + + PADDL X0, X12 + PADDL X1, X13 + PADDL X2, X14 + PADDL X3, X15 + PADDQ 4*16(Stack), X3 + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0) + XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0) + MOVO 0*16(Stack), X0 // Restore X0 + ADDQ $192, Dst + ADDQ $192, Src + SUBQ $192, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream + JG GENERATE_KEYSTREAM_256 + +GENERATE_KEYSTREAM_192: + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X3, X15 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X3, X11 + PADDQ 4*16(Stack), X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X11, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + +CHACHA_LOOP_192: + CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0) + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + CHACHA_SHUFFLE_SSE(X13, X14, X15) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0) + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0) + CHACHA_SHUFFLE_SSE(X15, X14, X13) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_192 + + MOVO 0*16(Stack), X0 // Restore X0 + PADDL X0, X12 + PADDL X1, X13 + PADDL X2, X14 + PADDL X3, X15 + PADDQ 4*16(Stack), X3 + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0) + XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0) + MOVO 0*16(Stack), X0 // Restore X0 + ADDQ $128, Dst + ADDQ $128, Src + SUBQ $128, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + +GENERATE_KEYSTREAM_128: + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X3, X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + +CHACHA_LOOP_128: + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_128 + + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream + +GENERATE_KEYSTREAM_64: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + MOVQ Rounds, Tmp0 + +CHACHA_LOOP_64: + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_64 + + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Src + ADDQ $64, Dst + SUBQ $64, Len + JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. + +BUFFER_KEYSTREAM: + MOVOU X4, 0*16(Buffer) + MOVOU X5, 1*16(Buffer) + MOVOU X6, 2*16(Buffer) + MOVOU X7, 3*16(Buffer) + MOVQ Len, Tmp0 + FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) + +DONE: + MOVQ SavedSP, Stack // Restore stack pointer + MOVOU X3, 3*16(State) + MOVQ Len, ret+72(FP) + RET + +// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int +TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80 + MOVQ dst_base+0(FP), Dst + MOVQ src_base+24(FP), Src + MOVQ block+48(FP), Buffer + MOVQ state+56(FP), State + MOVQ rounds+64(FP), Rounds + MOVQ src_len+32(FP), Len + + MOVOU 0*16(State), X0 + MOVOU 1*16(State), X1 + MOVOU 2*16(State), X2 + MOVOU 3*16(State), X3 + + MOVQ Stack, SavedSP + ADDQ $16, Stack + ANDQ $-16, Stack + + TESTQ Len, Len + JZ DONE + + MOVOU ·one<>(SB), X4 + MOVOU ·rol16<>(SB), X5 + MOVOU ·rol8<>(SB), X6 + MOVO X0, 0*16(Stack) + MOVO X1, 1*16(Stack) + MOVO X2, 2*16(Stack) + MOVO X3, 3*16(Stack) + MOVO X4, 4*16(Stack) + MOVO X5, 6*16(Stack) + MOVO X6, 7*16(Stack) + + CMPQ Len, $64 + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 + JLE GENERATE_KEYSTREAM_192 + +GENERATE_KEYSTREAM_256: + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X3, X15 + PADDQ 4*16(Stack), X15 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X15, X11 + PADDQ 4*16(Stack), X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X11, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + + MOVO X3, 3*16(Stack) // Save X3 + +CHACHA_LOOP_256: + MOVO X4, 5*16(Stack) + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) + MOVO 5*16(Stack), X4 + MOVO X0, 5*16(Stack) + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) + MOVO 5*16(Stack), X0 + CHACHA_SHUFFLE_SSE(X1, X2, X3) + CHACHA_SHUFFLE_SSE(X13, X14, X15) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + MOVO X4, 5*16(Stack) + CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) + MOVO 5*16(Stack), X4 + MOVO X0, 5*16(Stack) + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) + MOVO 5*16(Stack), X0 + CHACHA_SHUFFLE_SSE(X3, X2, X1) + CHACHA_SHUFFLE_SSE(X15, X14, X13) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_256 + + PADDL 0*16(Stack), X0 + PADDL 1*16(Stack), X1 + PADDL 2*16(Stack), X2 + PADDL 3*16(Stack), X3 + MOVO X4, 5*16(Stack) // Save X4 + XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4) + MOVO 5*16(Stack), X4 // Restore X4 + + MOVO 0*16(Stack), X0 + MOVO 1*16(Stack), X1 + MOVO 2*16(Stack), X2 + MOVO 3*16(Stack), X3 + PADDQ 4*16(Stack), X3 + + PADDL X0, X12 + PADDL X1, X13 + PADDL X2, X14 + PADDL X3, X15 + PADDQ 4*16(Stack), X3 + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0) + XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0) + MOVO 0*16(Stack), X0 // Restore X0 + ADDQ $192, Dst + ADDQ $192, Src + SUBQ $192, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream + JG GENERATE_KEYSTREAM_256 + +GENERATE_KEYSTREAM_192: + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X3, X15 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X3, X11 + PADDQ 4*16(Stack), X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X11, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + + MOVO 6*16(Stack), X1 // Load 16 bit rotate-left constant + MOVO 7*16(Stack), X2 // Load 8 bit rotate-left constant + +CHACHA_LOOP_192: + CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2) + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2) + CHACHA_SHUFFLE_SSE(X13, X14, X15) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2) + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2) + CHACHA_SHUFFLE_SSE(X15, X14, X13) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_192 + + MOVO 0*16(Stack), X0 // Restore X0 + MOVO 1*16(Stack), X1 // Restore X1 + MOVO 2*16(Stack), X2 // Restore X2 + PADDL X0, X12 + PADDL X1, X13 + PADDL X2, X14 + PADDL X3, X15 + PADDQ 4*16(Stack), X3 + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0) + XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0) + MOVO 0*16(Stack), X0 // Restore X0 + ADDQ $128, Dst + ADDQ $128, Src + SUBQ $128, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + +GENERATE_KEYSTREAM_128: + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X3, X11 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ 4*16(Stack), X7 + MOVQ Rounds, Tmp0 + + MOVO 6*16(Stack), X13 // Load 16 bit rotate-left constant + MOVO 7*16(Stack), X14 // Load 8 bit rotate-left constant + +CHACHA_LOOP_128: + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14) + CHACHA_SHUFFLE_SSE(X9, X10, X11) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14) + CHACHA_SHUFFLE_SSE(X11, X10, X9) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_128 + + PADDL X0, X8 + PADDL X1, X9 + PADDL X2, X10 + PADDL X3, X11 + PADDQ 4*16(Stack), X3 + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream + +GENERATE_KEYSTREAM_64: + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + MOVQ Rounds, Tmp0 + + MOVO 6*16(Stack), X9 // Load 16 bit rotate-left constant + MOVO 7*16(Stack), X10 // Load 8 bit rotate-left constant + +CHACHA_LOOP_64: + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10) + CHACHA_SHUFFLE_SSE(X5, X6, X7) + CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10) + CHACHA_SHUFFLE_SSE(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_64 + + PADDL X0, X4 + PADDL X1, X5 + PADDL X2, X6 + PADDL X3, X7 + PADDQ 4*16(Stack), X3 + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Src + ADDQ $64, Dst + SUBQ $64, Len + JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. + +BUFFER_KEYSTREAM: + MOVOU X4, 0*16(Buffer) + MOVOU X5, 1*16(Buffer) + MOVOU X6, 2*16(Buffer) + MOVOU X7, 3*16(Buffer) + MOVQ Len, Tmp0 + FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) + +DONE: + MOVQ SavedSP, Stack // Restore stack pointer + MOVOU X3, 3*16(State) + MOVQ Len, ret+72(FP) + RET + +// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int +TEXT ·xorKeyStreamAVX(SB), 4, $144-80 + MOVQ dst_base+0(FP), Dst + MOVQ src_base+24(FP), Src + MOVQ block+48(FP), Buffer + MOVQ state+56(FP), State + MOVQ rounds+64(FP), Rounds + MOVQ src_len+32(FP), Len + + VMOVDQU 0*16(State), X0 + VMOVDQU 1*16(State), X1 + VMOVDQU 2*16(State), X2 + VMOVDQU 3*16(State), X3 + + MOVQ Stack, SavedSP + ADDQ $16, Stack + ANDQ $-16, Stack + + TESTQ Len, Len + JZ DONE + + VMOVDQU ·one<>(SB), X4 + VMOVDQU ·rol16<>(SB), X5 + VMOVDQU ·rol8<>(SB), X6 + VMOVDQA X0, 0*16(Stack) + VMOVDQA X1, 1*16(Stack) + VMOVDQA X2, 2*16(Stack) + VMOVDQA X3, 3*16(Stack) + VMOVDQA X4, 4*16(Stack) + VMOVDQA X5, 6*16(Stack) + VMOVDQA X6, 7*16(Stack) + + CMPQ Len, $64 + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 + JLE GENERATE_KEYSTREAM_192 + +GENERATE_KEYSTREAM_256: + VMOVDQA X0, X12 + VMOVDQA X1, X13 + VMOVDQA X2, X14 + VMOVDQA X3, X15 + VPADDQ 4*16(Stack), X15, X15 + VMOVDQA X0, X8 + VMOVDQA X1, X9 + VMOVDQA X2, X10 + VMOVDQA X15, X11 + VPADDQ 4*16(Stack), X11, X11 + VMOVDQA X0, X4 + VMOVDQA X1, X5 + VMOVDQA X2, X6 + VMOVDQA X11, X7 + VPADDQ 4*16(Stack), X7, X7 + MOVQ Rounds, Tmp0 + + VMOVDQA X3, 3*16(Stack) // Save X3 + +CHACHA_LOOP_256: + VMOVDQA X4, 5*16(Stack) + CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) + VMOVDQA 5*16(Stack), X4 + VMOVDQA X0, 5*16(Stack) + CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) + VMOVDQA 5*16(Stack), X0 + CHACHA_SHUFFLE_AVX(X1, X2, X3) + CHACHA_SHUFFLE_AVX(X13, X14, X15) + CHACHA_SHUFFLE_AVX(X9, X10, X11) + CHACHA_SHUFFLE_AVX(X5, X6, X7) + VMOVDQA X4, 5*16(Stack) + CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack)) + VMOVDQA 5*16(Stack), X4 + VMOVDQA X0, 5*16(Stack) + CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack)) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack)) + VMOVDQA 5*16(Stack), X0 + CHACHA_SHUFFLE_AVX(X3, X2, X1) + CHACHA_SHUFFLE_AVX(X15, X14, X13) + CHACHA_SHUFFLE_AVX(X11, X10, X9) + CHACHA_SHUFFLE_AVX(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_256 + + VPADDD 0*16(Stack), X0, X0 + VPADDD 1*16(Stack), X1, X1 + VPADDD 2*16(Stack), X2, X2 + VPADDD 3*16(Stack), X3, X3 + VMOVDQA X4, 5*16(Stack) // Save X4 + XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4) + VMOVDQA 5*16(Stack), X4 // Restore X4 + + VMOVDQA 0*16(Stack), X0 + VMOVDQA 1*16(Stack), X1 + VMOVDQA 2*16(Stack), X2 + VMOVDQA 3*16(Stack), X3 + VPADDQ 4*16(Stack), X3, X3 + + VPADDD X0, X12, X12 + VPADDD X1, X13, X13 + VPADDD X2, X14, X14 + VPADDD X3, X15, X15 + VPADDQ 4*16(Stack), X3, X3 + VPADDD X0, X8, X8 + VPADDD X1, X9, X9 + VPADDD X2, X10, X10 + VPADDD X3, X11, X11 + VPADDQ 4*16(Stack), X3, X3 + VPADDD X0, X4, X4 + VPADDD X1, X5, X5 + VPADDD X2, X6, X6 + VPADDD X3, X7, X7 + VPADDQ 4*16(Stack), X3, X3 + + XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0) + XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0) + VMOVDQA 0*16(Stack), X0 // Restore X0 + ADDQ $192, Dst + ADDQ $192, Src + SUBQ $192, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + CMPQ Len, $128 // If 64 < Len <= 128 -> gen. only 128 byte keystream. + JLE GENERATE_KEYSTREAM_128 + CMPQ Len, $192 // If Len > 192 -> repeat, otherwise Len > 128 && Len <= 192 -> gen. 192 byte keystream + JG GENERATE_KEYSTREAM_256 + +GENERATE_KEYSTREAM_192: + VMOVDQA X0, X12 + VMOVDQA X1, X13 + VMOVDQA X2, X14 + VMOVDQA X3, X15 + VMOVDQA X0, X8 + VMOVDQA X1, X9 + VMOVDQA X2, X10 + VMOVDQA X3, X11 + VPADDQ 4*16(Stack), X11, X11 + VMOVDQA X0, X4 + VMOVDQA X1, X5 + VMOVDQA X2, X6 + VMOVDQA X11, X7 + VPADDQ 4*16(Stack), X7, X7 + MOVQ Rounds, Tmp0 + + VMOVDQA 6*16(Stack), X1 // Load 16 bit rotate-left constant + VMOVDQA 7*16(Stack), X2 // Load 8 bit rotate-left constant + +CHACHA_LOOP_192: + CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2) + CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2) + CHACHA_SHUFFLE_AVX(X13, X14, X15) + CHACHA_SHUFFLE_AVX(X9, X10, X11) + CHACHA_SHUFFLE_AVX(X5, X6, X7) + CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2) + CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2) + CHACHA_SHUFFLE_AVX(X15, X14, X13) + CHACHA_SHUFFLE_AVX(X11, X10, X9) + CHACHA_SHUFFLE_AVX(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_192 + + VMOVDQA 0*16(Stack), X0 // Restore X0 + VMOVDQA 1*16(Stack), X1 // Restore X1 + VMOVDQA 2*16(Stack), X2 // Restore X2 + VPADDD X0, X12, X12 + VPADDD X1, X13, X13 + VPADDD X2, X14, X14 + VPADDD X3, X15, X15 + VPADDQ 4*16(Stack), X3, X3 + VPADDD X0, X8, X8 + VPADDD X1, X9, X9 + VPADDD X2, X10, X10 + VPADDD X3, X11, X11 + VPADDQ 4*16(Stack), X3, X3 + VPADDD X0, X4, X4 + VPADDD X1, X5, X5 + VPADDD X2, X6, X6 + VPADDD X3, X7, X7 + VPADDQ 4*16(Stack), X3, X3 + + XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0) + XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0) + VMOVDQA 0*16(Stack), X0 // Restore X0 + ADDQ $128, Dst + ADDQ $128, Src + SUBQ $128, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE + CMPQ Len, $64 // If Len <= 64 -> gen. only 64 byte keystream. + JLE GENERATE_KEYSTREAM_64 + +GENERATE_KEYSTREAM_128: + VMOVDQA X0, X8 + VMOVDQA X1, X9 + VMOVDQA X2, X10 + VMOVDQA X3, X11 + VMOVDQA X0, X4 + VMOVDQA X1, X5 + VMOVDQA X2, X6 + VMOVDQA X3, X7 + VPADDQ 4*16(Stack), X7, X7 + MOVQ Rounds, Tmp0 + + VMOVDQA 6*16(Stack), X13 // Load 16 bit rotate-left constant + VMOVDQA 7*16(Stack), X14 // Load 8 bit rotate-left constant + +CHACHA_LOOP_128: + CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14) + CHACHA_SHUFFLE_AVX(X9, X10, X11) + CHACHA_SHUFFLE_AVX(X5, X6, X7) + CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14) + CHACHA_SHUFFLE_AVX(X11, X10, X9) + CHACHA_SHUFFLE_AVX(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_128 + + VPADDD X0, X8, X8 + VPADDD X1, X9, X9 + VPADDD X2, X10, X10 + VPADDD X3, X11, X11 + VPADDQ 4*16(Stack), X3, X3 + VPADDD X0, X4, X4 + VPADDD X1, X5, X5 + VPADDD X2, X6, X6 + VPADDD X3, X7, X7 + VPADDQ 4*16(Stack), X3, X3 + + XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Dst + ADDQ $64, Src + SUBQ $64, Len + JZ DONE // If Len == 0 -> DONE, otherwise Len <= 64 -> gen 64 byte keystream + +GENERATE_KEYSTREAM_64: + VMOVDQA X0, X4 + VMOVDQA X1, X5 + VMOVDQA X2, X6 + VMOVDQA X3, X7 + MOVQ Rounds, Tmp0 + + VMOVDQA 6*16(Stack), X9 // Load 16 bit rotate-left constant + VMOVDQA 7*16(Stack), X10 // Load 8 bit rotate-left constant + +CHACHA_LOOP_64: + CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10) + CHACHA_SHUFFLE_AVX(X5, X6, X7) + CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10) + CHACHA_SHUFFLE_AVX(X7, X6, X5) + SUBQ $2, Tmp0 + JNZ CHACHA_LOOP_64 + + VPADDD X0, X4, X4 + VPADDD X1, X5, X5 + VPADDD X2, X6, X6 + VPADDD X3, X7, X7 + VPADDQ 4*16(Stack), X3, X3 + + CMPQ Len, $64 + JL BUFFER_KEYSTREAM + + XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8) + ADDQ $64, Src + ADDQ $64, Dst + SUBQ $64, Len + JMP DONE // jump directly to DONE - there is no keystream to buffer, Len == 0 always true. + +BUFFER_KEYSTREAM: + VMOVDQU X4, 0*16(Buffer) + VMOVDQU X5, 1*16(Buffer) + VMOVDQU X6, 2*16(Buffer) + VMOVDQU X7, 3*16(Buffer) + MOVQ Len, Tmp0 + FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2) + +DONE: + MOVQ SavedSP, Stack // Restore stack pointer + VMOVDQU X3, 3*16(State) + VZEROUPPER + MOVQ Len, ret+72(FP) + RET + +#undef Dst +#undef Src +#undef Len +#undef Rounds +#undef Buffer +#undef State +#undef Stack +#undef SavedSP +#undef Tmp0 +#undef Tmp1 +#undef Tmp2 diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_generic.go b/vendor/github.com/aead/chacha20/chacha/chacha_generic.go new file mode 100644 index 0000000..8832d5b --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_generic.go @@ -0,0 +1,319 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +package chacha + +import "encoding/binary" + +var sigma = [4]uint32{0x61707865, 0x3320646e, 0x79622d32, 0x6b206574} + +func xorKeyStreamGeneric(dst, src []byte, block, state *[64]byte, rounds int) int { + for len(src) >= 64 { + chachaGeneric(block, state, rounds) + + for i, v := range block { + dst[i] = src[i] ^ v + } + src = src[64:] + dst = dst[64:] + } + + n := len(src) + if n > 0 { + chachaGeneric(block, state, rounds) + for i, v := range src { + dst[i] = v ^ block[i] + } + } + return n +} + +func chachaGeneric(dst *[64]byte, state *[64]byte, rounds int) { + v00 := binary.LittleEndian.Uint32(state[0:]) + v01 := binary.LittleEndian.Uint32(state[4:]) + v02 := binary.LittleEndian.Uint32(state[8:]) + v03 := binary.LittleEndian.Uint32(state[12:]) + v04 := binary.LittleEndian.Uint32(state[16:]) + v05 := binary.LittleEndian.Uint32(state[20:]) + v06 := binary.LittleEndian.Uint32(state[24:]) + v07 := binary.LittleEndian.Uint32(state[28:]) + v08 := binary.LittleEndian.Uint32(state[32:]) + v09 := binary.LittleEndian.Uint32(state[36:]) + v10 := binary.LittleEndian.Uint32(state[40:]) + v11 := binary.LittleEndian.Uint32(state[44:]) + v12 := binary.LittleEndian.Uint32(state[48:]) + v13 := binary.LittleEndian.Uint32(state[52:]) + v14 := binary.LittleEndian.Uint32(state[56:]) + v15 := binary.LittleEndian.Uint32(state[60:]) + + s00, s01, s02, s03, s04, s05, s06, s07 := v00, v01, v02, v03, v04, v05, v06, v07 + s08, s09, s10, s11, s12, s13, s14, s15 := v08, v09, v10, v11, v12, v13, v14, v15 + + for i := 0; i < rounds; i += 2 { + v00 += v04 + v12 ^= v00 + v12 = (v12 << 16) | (v12 >> 16) + v08 += v12 + v04 ^= v08 + v04 = (v04 << 12) | (v04 >> 20) + v00 += v04 + v12 ^= v00 + v12 = (v12 << 8) | (v12 >> 24) + v08 += v12 + v04 ^= v08 + v04 = (v04 << 7) | (v04 >> 25) + v01 += v05 + v13 ^= v01 + v13 = (v13 << 16) | (v13 >> 16) + v09 += v13 + v05 ^= v09 + v05 = (v05 << 12) | (v05 >> 20) + v01 += v05 + v13 ^= v01 + v13 = (v13 << 8) | (v13 >> 24) + v09 += v13 + v05 ^= v09 + v05 = (v05 << 7) | (v05 >> 25) + v02 += v06 + v14 ^= v02 + v14 = (v14 << 16) | (v14 >> 16) + v10 += v14 + v06 ^= v10 + v06 = (v06 << 12) | (v06 >> 20) + v02 += v06 + v14 ^= v02 + v14 = (v14 << 8) | (v14 >> 24) + v10 += v14 + v06 ^= v10 + v06 = (v06 << 7) | (v06 >> 25) + v03 += v07 + v15 ^= v03 + v15 = (v15 << 16) | (v15 >> 16) + v11 += v15 + v07 ^= v11 + v07 = (v07 << 12) | (v07 >> 20) + v03 += v07 + v15 ^= v03 + v15 = (v15 << 8) | (v15 >> 24) + v11 += v15 + v07 ^= v11 + v07 = (v07 << 7) | (v07 >> 25) + v00 += v05 + v15 ^= v00 + v15 = (v15 << 16) | (v15 >> 16) + v10 += v15 + v05 ^= v10 + v05 = (v05 << 12) | (v05 >> 20) + v00 += v05 + v15 ^= v00 + v15 = (v15 << 8) | (v15 >> 24) + v10 += v15 + v05 ^= v10 + v05 = (v05 << 7) | (v05 >> 25) + v01 += v06 + v12 ^= v01 + v12 = (v12 << 16) | (v12 >> 16) + v11 += v12 + v06 ^= v11 + v06 = (v06 << 12) | (v06 >> 20) + v01 += v06 + v12 ^= v01 + v12 = (v12 << 8) | (v12 >> 24) + v11 += v12 + v06 ^= v11 + v06 = (v06 << 7) | (v06 >> 25) + v02 += v07 + v13 ^= v02 + v13 = (v13 << 16) | (v13 >> 16) + v08 += v13 + v07 ^= v08 + v07 = (v07 << 12) | (v07 >> 20) + v02 += v07 + v13 ^= v02 + v13 = (v13 << 8) | (v13 >> 24) + v08 += v13 + v07 ^= v08 + v07 = (v07 << 7) | (v07 >> 25) + v03 += v04 + v14 ^= v03 + v14 = (v14 << 16) | (v14 >> 16) + v09 += v14 + v04 ^= v09 + v04 = (v04 << 12) | (v04 >> 20) + v03 += v04 + v14 ^= v03 + v14 = (v14 << 8) | (v14 >> 24) + v09 += v14 + v04 ^= v09 + v04 = (v04 << 7) | (v04 >> 25) + } + + v00 += s00 + v01 += s01 + v02 += s02 + v03 += s03 + v04 += s04 + v05 += s05 + v06 += s06 + v07 += s07 + v08 += s08 + v09 += s09 + v10 += s10 + v11 += s11 + v12 += s12 + v13 += s13 + v14 += s14 + v15 += s15 + + s12++ + binary.LittleEndian.PutUint32(state[48:], s12) + if s12 == 0 { // indicates overflow + s13++ + binary.LittleEndian.PutUint32(state[52:], s13) + } + + binary.LittleEndian.PutUint32(dst[0:], v00) + binary.LittleEndian.PutUint32(dst[4:], v01) + binary.LittleEndian.PutUint32(dst[8:], v02) + binary.LittleEndian.PutUint32(dst[12:], v03) + binary.LittleEndian.PutUint32(dst[16:], v04) + binary.LittleEndian.PutUint32(dst[20:], v05) + binary.LittleEndian.PutUint32(dst[24:], v06) + binary.LittleEndian.PutUint32(dst[28:], v07) + binary.LittleEndian.PutUint32(dst[32:], v08) + binary.LittleEndian.PutUint32(dst[36:], v09) + binary.LittleEndian.PutUint32(dst[40:], v10) + binary.LittleEndian.PutUint32(dst[44:], v11) + binary.LittleEndian.PutUint32(dst[48:], v12) + binary.LittleEndian.PutUint32(dst[52:], v13) + binary.LittleEndian.PutUint32(dst[56:], v14) + binary.LittleEndian.PutUint32(dst[60:], v15) +} + +func hChaCha20Generic(out *[32]byte, nonce *[16]byte, key *[32]byte) { + v00 := sigma[0] + v01 := sigma[1] + v02 := sigma[2] + v03 := sigma[3] + v04 := binary.LittleEndian.Uint32(key[0:]) + v05 := binary.LittleEndian.Uint32(key[4:]) + v06 := binary.LittleEndian.Uint32(key[8:]) + v07 := binary.LittleEndian.Uint32(key[12:]) + v08 := binary.LittleEndian.Uint32(key[16:]) + v09 := binary.LittleEndian.Uint32(key[20:]) + v10 := binary.LittleEndian.Uint32(key[24:]) + v11 := binary.LittleEndian.Uint32(key[28:]) + v12 := binary.LittleEndian.Uint32(nonce[0:]) + v13 := binary.LittleEndian.Uint32(nonce[4:]) + v14 := binary.LittleEndian.Uint32(nonce[8:]) + v15 := binary.LittleEndian.Uint32(nonce[12:]) + + for i := 0; i < 20; i += 2 { + v00 += v04 + v12 ^= v00 + v12 = (v12 << 16) | (v12 >> 16) + v08 += v12 + v04 ^= v08 + v04 = (v04 << 12) | (v04 >> 20) + v00 += v04 + v12 ^= v00 + v12 = (v12 << 8) | (v12 >> 24) + v08 += v12 + v04 ^= v08 + v04 = (v04 << 7) | (v04 >> 25) + v01 += v05 + v13 ^= v01 + v13 = (v13 << 16) | (v13 >> 16) + v09 += v13 + v05 ^= v09 + v05 = (v05 << 12) | (v05 >> 20) + v01 += v05 + v13 ^= v01 + v13 = (v13 << 8) | (v13 >> 24) + v09 += v13 + v05 ^= v09 + v05 = (v05 << 7) | (v05 >> 25) + v02 += v06 + v14 ^= v02 + v14 = (v14 << 16) | (v14 >> 16) + v10 += v14 + v06 ^= v10 + v06 = (v06 << 12) | (v06 >> 20) + v02 += v06 + v14 ^= v02 + v14 = (v14 << 8) | (v14 >> 24) + v10 += v14 + v06 ^= v10 + v06 = (v06 << 7) | (v06 >> 25) + v03 += v07 + v15 ^= v03 + v15 = (v15 << 16) | (v15 >> 16) + v11 += v15 + v07 ^= v11 + v07 = (v07 << 12) | (v07 >> 20) + v03 += v07 + v15 ^= v03 + v15 = (v15 << 8) | (v15 >> 24) + v11 += v15 + v07 ^= v11 + v07 = (v07 << 7) | (v07 >> 25) + v00 += v05 + v15 ^= v00 + v15 = (v15 << 16) | (v15 >> 16) + v10 += v15 + v05 ^= v10 + v05 = (v05 << 12) | (v05 >> 20) + v00 += v05 + v15 ^= v00 + v15 = (v15 << 8) | (v15 >> 24) + v10 += v15 + v05 ^= v10 + v05 = (v05 << 7) | (v05 >> 25) + v01 += v06 + v12 ^= v01 + v12 = (v12 << 16) | (v12 >> 16) + v11 += v12 + v06 ^= v11 + v06 = (v06 << 12) | (v06 >> 20) + v01 += v06 + v12 ^= v01 + v12 = (v12 << 8) | (v12 >> 24) + v11 += v12 + v06 ^= v11 + v06 = (v06 << 7) | (v06 >> 25) + v02 += v07 + v13 ^= v02 + v13 = (v13 << 16) | (v13 >> 16) + v08 += v13 + v07 ^= v08 + v07 = (v07 << 12) | (v07 >> 20) + v02 += v07 + v13 ^= v02 + v13 = (v13 << 8) | (v13 >> 24) + v08 += v13 + v07 ^= v08 + v07 = (v07 << 7) | (v07 >> 25) + v03 += v04 + v14 ^= v03 + v14 = (v14 << 16) | (v14 >> 16) + v09 += v14 + v04 ^= v09 + v04 = (v04 << 12) | (v04 >> 20) + v03 += v04 + v14 ^= v03 + v14 = (v14 << 8) | (v14 >> 24) + v09 += v14 + v04 ^= v09 + v04 = (v04 << 7) | (v04 >> 25) + } + + binary.LittleEndian.PutUint32(out[0:], v00) + binary.LittleEndian.PutUint32(out[4:], v01) + binary.LittleEndian.PutUint32(out[8:], v02) + binary.LittleEndian.PutUint32(out[12:], v03) + binary.LittleEndian.PutUint32(out[16:], v12) + binary.LittleEndian.PutUint32(out[20:], v13) + binary.LittleEndian.PutUint32(out[24:], v14) + binary.LittleEndian.PutUint32(out[28:], v15) +} diff --git a/vendor/github.com/aead/chacha20/chacha/chacha_ref.go b/vendor/github.com/aead/chacha20/chacha/chacha_ref.go new file mode 100644 index 0000000..526877c --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/chacha_ref.go @@ -0,0 +1,33 @@ +// Copyright (c) 2016 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build !amd64,!386 gccgo appengine nacl + +package chacha + +import "encoding/binary" + +func init() { + useSSE2 = false + useSSSE3 = false + useAVX = false + useAVX2 = false +} + +func initialize(state *[64]byte, key []byte, nonce *[16]byte) { + binary.LittleEndian.PutUint32(state[0:], sigma[0]) + binary.LittleEndian.PutUint32(state[4:], sigma[1]) + binary.LittleEndian.PutUint32(state[8:], sigma[2]) + binary.LittleEndian.PutUint32(state[12:], sigma[3]) + copy(state[16:], key[:]) + copy(state[48:], nonce[:]) +} + +func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int { + return xorKeyStreamGeneric(dst, src, block, state, rounds) +} + +func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { + hChaCha20Generic(out, nonce, key) +} diff --git a/vendor/github.com/aead/chacha20/chacha/const.s b/vendor/github.com/aead/chacha20/chacha/const.s new file mode 100644 index 0000000..c7a94a4 --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/const.s @@ -0,0 +1,53 @@ +// Copyright (c) 2018 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl + +#include "textflag.h" + +DATA ·sigma<>+0x00(SB)/4, $0x61707865 +DATA ·sigma<>+0x04(SB)/4, $0x3320646e +DATA ·sigma<>+0x08(SB)/4, $0x79622d32 +DATA ·sigma<>+0x0C(SB)/4, $0x6b206574 +GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16 // The 4 ChaCha initialization constants + +// SSE2/SSE3/AVX constants + +DATA ·one<>+0x00(SB)/8, $1 +DATA ·one<>+0x08(SB)/8, $0 +GLOBL ·one<>(SB), (NOPTR+RODATA), $16 // The constant 1 as 128 bit value + +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 16 bit left rotate constant + +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 8 bit left rotate constant + +// AVX2 constants + +DATA ·one_AVX2<>+0x00(SB)/8, $0 +DATA ·one_AVX2<>+0x08(SB)/8, $0 +DATA ·one_AVX2<>+0x10(SB)/8, $1 +DATA ·one_AVX2<>+0x18(SB)/8, $0 +GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32 // The constant 1 as 256 bit value + +DATA ·two_AVX2<>+0x00(SB)/8, $2 +DATA ·two_AVX2<>+0x08(SB)/8, $0 +DATA ·two_AVX2<>+0x10(SB)/8, $2 +DATA ·two_AVX2<>+0x18(SB)/8, $0 +GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32 + +DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 16 bit left rotate constant + +DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B +GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 8 bit left rotate constant diff --git a/vendor/github.com/aead/chacha20/chacha/macro.s b/vendor/github.com/aead/chacha20/chacha/macro.s new file mode 100644 index 0000000..780108f --- /dev/null +++ b/vendor/github.com/aead/chacha20/chacha/macro.s @@ -0,0 +1,163 @@ +// Copyright (c) 2018 Andreas Auernhammer. All rights reserved. +// Use of this source code is governed by a license that can be +// found in the LICENSE file. + +// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl + +// ROTL_SSE rotates all 4 32 bit values of the XMM register v +// left by n bits using SSE2 instructions (0 <= n <= 32). +// The XMM register t is used as a temp. register. +#define ROTL_SSE(n, t, v) \ + MOVO v, t; \ + PSLLL $n, t; \ + PSRLL $(32-n), v; \ + PXOR t, v + +// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v +// left by n bits using AVX/AVX2 instructions (0 <= n <= 32). +// The AVX/AVX2 register t is used as a temp. register. +#define ROTL_AVX(n, t, v) \ + VPSLLD $n, v, t; \ + VPSRLD $(32-n), v, v; \ + VPXOR v, t, v + +// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the +// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE2 for +// rotations. The XMM register t is used as a temp. register. +#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE(16, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE(12, t, v1); \ + PADDL v1, v0; \ + PXOR v0, v3; \ + ROTL_SSE(8, t, v3); \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE(7, t, v1) + +// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the +// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit +// rotations. The XMM register t is used as a temp. register. +// +// r16 holds the PSHUFB constant for a 16 bit left rotate. +// r8 holds the PSHUFB constant for a 8 bit left rotate. +#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \ + PADDL v1, v0; \ + PXOR v0, v3; \ + PSHUFB r16, v3; \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE(12, t, v1); \ + PADDL v1, v0; \ + PXOR v0, v3; \ + PSHUFB r8, v3; \ + PADDL v3, v2; \ + PXOR v2, v1; \ + ROTL_SSE(7, t, v1) + +// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the +// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit +// rotations. The AVX/AVX2 register t is used as a temp. register. +// +// r16 holds the VPSHUFB constant for a 16 bit left rotate. +// r8 holds the VPSHUFB constant for a 8 bit left rotate. +#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \ + VPADDD v0, v1, v0; \ + VPXOR v3, v0, v3; \ + VPSHUFB r16, v3, v3; \ + VPADDD v2, v3, v2; \ + VPXOR v1, v2, v1; \ + ROTL_AVX(12, t, v1); \ + VPADDD v0, v1, v0; \ + VPXOR v3, v0, v3; \ + VPSHUFB r8, v3, v3; \ + VPADDD v2, v3, v2; \ + VPXOR v1, v2, v1; \ + ROTL_AVX(7, t, v1) + +// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the +// 3 XMM registers v1, v2 and v3. The inverse shuffle is +// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1). +#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \ + PSHUFL $0x39, v1, v1; \ + PSHUFL $0x4E, v2, v2; \ + PSHUFL $0x93, v3, v3 + +// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the +// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is +// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1). +#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \ + VPSHUFD $0x39, v1, v1; \ + VPSHUFD $0x4E, v2, v2; \ + VPSHUFD $0x93, v3, v3 + +// XOR_SSE extracts 4x16 byte vectors from src at +// off, xors all vectors with the corresponding XMM +// register (v0 - v3) and writes the result to dst +// at off. +// The XMM register t is used as a temp. register. +#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \ + MOVOU 0+off(src), t; \ + PXOR v0, t; \ + MOVOU t, 0+off(dst); \ + MOVOU 16+off(src), t; \ + PXOR v1, t; \ + MOVOU t, 16+off(dst); \ + MOVOU 32+off(src), t; \ + PXOR v2, t; \ + MOVOU t, 32+off(dst); \ + MOVOU 48+off(src), t; \ + PXOR v3, t; \ + MOVOU t, 48+off(dst) + +// XOR_AVX extracts 4x16 byte vectors from src at +// off, xors all vectors with the corresponding AVX +// register (v0 - v3) and writes the result to dst +// at off. +// The XMM register t is used as a temp. register. +#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \ + VPXOR 0+off(src), v0, t; \ + VMOVDQU t, 0+off(dst); \ + VPXOR 16+off(src), v1, t; \ + VMOVDQU t, 16+off(dst); \ + VPXOR 32+off(src), v2, t; \ + VMOVDQU t, 32+off(dst); \ + VPXOR 48+off(src), v3, t; \ + VMOVDQU t, 48+off(dst) + +#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \ + VMOVDQU (0+off)(src), t0; \ + VPERM2I128 $32, v1, v0, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (0+off)(dst); \ + VMOVDQU (32+off)(src), t0; \ + VPERM2I128 $32, v3, v2, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (32+off)(dst); \ + VMOVDQU (64+off)(src), t0; \ + VPERM2I128 $49, v1, v0, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (64+off)(dst); \ + VMOVDQU (96+off)(src), t0; \ + VPERM2I128 $49, v3, v2, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (96+off)(dst) + +#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \ + VMOVDQU (0+off)(src), t0; \ + VPERM2I128 $32, v1, v0, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (0+off)(dst); \ + VMOVDQU (32+off)(src), t0; \ + VPERM2I128 $32, v3, v2, t1; \ + VPXOR t0, t1, t0; \ + VMOVDQU t0, (32+off)(dst); \ + +#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \ + VPERM2I128 $49, v1, v0, t0; \ + VMOVDQU t0, 0(dst); \ + VPERM2I128 $49, v3, v2, t0; \ + VMOVDQU t0, 32(dst) diff --git a/vendor/modules.txt b/vendor/modules.txt index 3cdfc45..3c7837b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -15,6 +15,8 @@ blitter.com/go/kyber blitter.com/go/mtwist # blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae blitter.com/go/newhope +# github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da +github.com/aead/chacha20/chacha # github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f github.com/jameskeane/bcrypt # github.com/klauspost/cpuid v1.2.2 diff --git a/xs/xs.go b/xs/xs.go index c670e25..41c2c1b 100755 --- a/xs/xs.go +++ b/xs/xs.go @@ -624,7 +624,7 @@ func main() { flag.BoolVar(&vopt, "v", false, "show version") flag.BoolVar(&dbg, "d", false, "debug logging") - flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1]") + flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1 | C_CHACHA20_12]") flag.StringVar(&hmacAlg, "m", "H_SHA256", "session `HMAC` [H_SHA256 | H_SHA512]") flag.StringVar(&kexAlg, "k", "KEX_HERRADURA512", "KEx `alg` [KEX_HERRADURA{256/512/1024/2048} | KEX_KYBER{512/768/1024} | KEX_NEWHOPE | KEX_NEWHOPE_SIMPLE]") flag.StringVar(&kcpMode, "K", "unused", "KCP `alg`, one of [KCP_NONE | KCP_AES | KCP_BLOWFISH | KCP_CAST5 | KCP_SM4 | KCP_SALSA20 | KCP_SIMPLEXOR | KCP_TEA | KCP_3DES | KCP_TWOFISH | KCP_XTEA] to use KCP (github.com/xtaci/kcp-go) reliable UDP instead of TCP") diff --git a/xsnet/chan.go b/xsnet/chan.go index 6df470a..79dfd68 100644 --- a/xsnet/chan.go +++ b/xsnet/chan.go @@ -21,6 +21,7 @@ import ( "log" "blitter.com/go/cryptmt" + "github.com/aead/chacha20/chacha" "golang.org/x/crypto/blowfish" "golang.org/x/crypto/twofish" @@ -104,6 +105,18 @@ func (hc Conn) getStream(keymat []byte) (rc cipher.Stream, mc hash.Hash, err err case CAlgCryptMT1: rc = cryptmt.New(nil, nil, keymat) log.Printf("[cipher CRYPTMT1 (%d)]\n", copts) + case CAlgChaCha20_12: + keymat = expandKeyMat(keymat, chacha.KeySize) + key = keymat[0:chacha.KeySize] + ivlen = chacha.INonceSize + iv = keymat[chacha.KeySize : chacha.KeySize+ivlen] + rc, err = chacha.NewCipher(iv, key, 20) + if err != nil { + log.Printf("[ChaCha20 config error]\n") + fmt.Printf("[ChaCha20 config error]\n") + } + // TODO: SetCounter() to something derived from key or nonce or extra keymat? + log.Printf("[cipher CHACHA20_12 (%d)]\n", copts) default: log.Printf("[invalid cipher (%d)]\n", copts) fmt.Printf("DOOFUS SET A VALID CIPHER ALG (%d)\n", copts) diff --git a/xsnet/consts.go b/xsnet/consts.go index 6492c67..840937a 100644 --- a/xsnet/consts.go +++ b/xsnet/consts.go @@ -99,6 +99,7 @@ const ( CAlgTwofish128 // golang.org/x/crypto/twofish CAlgBlowfish64 // golang.org/x/crypto/blowfish CAlgCryptMT1 //cryptmt using mtwist64 + CAlgChaCha20_12 CAlgNoneDisallowed ) diff --git a/xsnet/net.go b/xsnet/net.go index 56ef46d..3d1034a 100644 --- a/xsnet/net.go +++ b/xsnet/net.go @@ -145,6 +145,8 @@ func (c *CSCipherAlg) String() string { return "C_BLOWFISH_64" case CAlgCryptMT1: return "C_CRYPTMT1" + case CAlgChaCha20_12: + return "C_CHACHA20_12" default: return "C_ERR_UNK" } @@ -280,6 +282,8 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) { hc.kex = KEX_HERRADURA512 log.Printf("[KEx alg %d ?? defaults to %d]\n", kexAlg, hc.kex) } + + //hc.logCipherText = true // !!! DEBUGGING ONLY !!! NEVER DEPLOY this uncommented !!! return } @@ -298,7 +302,7 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) { // // Session (symmetric) crypto // -// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1 +// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1 C_CHACHA20_12 // // Session HMACs // @@ -322,6 +326,10 @@ func (hc *Conn) applyConnExtensions(extensions ...string) { log.Println("[extension arg = C_CRYPTMT1]") hc.cipheropts &= (0xFFFFFF00) hc.cipheropts |= CAlgCryptMT1 + case "C_CHACHA20_12": + log.Println("[extension arg = C_CHACHA20_12]") + hc.cipheropts &= (0xFFFFFF00) + hc.cipheropts |= CAlgChaCha20_12 case "H_SHA256": log.Println("[extension arg = H_SHA256]") hc.cipheropts &= (0xFFFF00FF)