mirror of https://gogs.blitter.com/RLabs/xs
Initial aead/chacha20 support (ChaCha20_12)
Signed-off-by: Russ Magee <rmagee@gmail.com>
This commit is contained in:
parent
8de76520e4
commit
e9aa0072a5
1
go.mod
1
go.mod
|
@ -9,6 +9,7 @@ require (
|
|||
blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9
|
||||
blitter.com/go/mtwist v1.0.1 // indirect
|
||||
blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae
|
||||
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da
|
||||
github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f
|
||||
github.com/klauspost/cpuid v1.2.2 // indirect
|
||||
github.com/klauspost/reedsolomon v1.9.3 // indirect
|
||||
|
|
2
go.sum
2
go.sum
|
@ -28,6 +28,8 @@ git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c h1:SGOx
|
|||
git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c/go.mod h1:QrbgzU5EL/1jaMD5pD4Tiikj3R5elPMa+RMwFUTGwQU=
|
||||
git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2 h1:89TYv/+wotJ+QWrH5B/yN0pEQutr2V/5za0VoYiVGCM=
|
||||
git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2/go.mod h1:weMqACFGzJs4Ni+K9shsRd02N4LkDrtGlkRxISK+II0=
|
||||
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da h1:KjTM2ks9d14ZYCvmHS9iAKVt9AyzRSqNU1qabPih5BY=
|
||||
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da/go.mod h1:eHEWzANqSiWQsof+nXEI9bUVUyV6F53Fp89EuCh2EAA=
|
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f h1:UWGE8Vi+1Agt0lrvnd7UsmvwqWKRzb9byK9iQmsbY0Y=
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 Andreas Auernhammer
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,197 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Package chacha implements some low-level functions of the
|
||||
// ChaCha cipher family.
|
||||
package chacha // import "github.com/aead/chacha20/chacha"
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"math"
|
||||
)
|
||||
|
||||
const (
|
||||
// NonceSize is the size of the ChaCha20 nonce in bytes.
|
||||
NonceSize = 8
|
||||
|
||||
// INonceSize is the size of the IETF-ChaCha20 nonce in bytes.
|
||||
INonceSize = 12
|
||||
|
||||
// XNonceSize is the size of the XChaCha20 nonce in bytes.
|
||||
XNonceSize = 24
|
||||
|
||||
// KeySize is the size of the key in bytes.
|
||||
KeySize = 32
|
||||
)
|
||||
|
||||
var (
|
||||
useSSE2 bool
|
||||
useSSSE3 bool
|
||||
useAVX bool
|
||||
useAVX2 bool
|
||||
)
|
||||
|
||||
var (
|
||||
errKeySize = errors.New("chacha20/chacha: bad key length")
|
||||
errInvalidNonce = errors.New("chacha20/chacha: bad nonce length")
|
||||
)
|
||||
|
||||
func setup(state *[64]byte, nonce, key []byte) (err error) {
|
||||
if len(key) != KeySize {
|
||||
err = errKeySize
|
||||
return
|
||||
}
|
||||
var Nonce [16]byte
|
||||
switch len(nonce) {
|
||||
case NonceSize:
|
||||
copy(Nonce[8:], nonce)
|
||||
initialize(state, key, &Nonce)
|
||||
case INonceSize:
|
||||
copy(Nonce[4:], nonce)
|
||||
initialize(state, key, &Nonce)
|
||||
case XNonceSize:
|
||||
var tmpKey [32]byte
|
||||
var hNonce [16]byte
|
||||
|
||||
copy(hNonce[:], nonce[:16])
|
||||
copy(tmpKey[:], key)
|
||||
HChaCha20(&tmpKey, &hNonce, &tmpKey)
|
||||
copy(Nonce[8:], nonce[16:])
|
||||
initialize(state, tmpKey[:], &Nonce)
|
||||
|
||||
// BUG(aead): A "good" compiler will remove this (optimizations)
|
||||
// But using the provided key instead of tmpKey,
|
||||
// will change the key (-> probably confuses users)
|
||||
for i := range tmpKey {
|
||||
tmpKey[i] = 0
|
||||
}
|
||||
default:
|
||||
err = errInvalidNonce
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// XORKeyStream crypts bytes from src to dst using the given nonce and key.
|
||||
// The length of the nonce determinds the version of ChaCha20:
|
||||
// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period.
|
||||
// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period.
|
||||
// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period.
|
||||
// The rounds argument specifies the number of rounds performed for keystream
|
||||
// generation - valid values are 8, 12 or 20. The src and dst may be the same slice
|
||||
// but otherwise should not overlap. If len(dst) < len(src) this function panics.
|
||||
// If the nonce is neither 64, 96 nor 192 bits long, this function panics.
|
||||
func XORKeyStream(dst, src, nonce, key []byte, rounds int) {
|
||||
if rounds != 20 && rounds != 12 && rounds != 8 {
|
||||
panic("chacha20/chacha: bad number of rounds")
|
||||
}
|
||||
if len(dst) < len(src) {
|
||||
panic("chacha20/chacha: dst buffer is to small")
|
||||
}
|
||||
if len(nonce) == INonceSize && uint64(len(src)) > (1<<38) {
|
||||
panic("chacha20/chacha: src is too large")
|
||||
}
|
||||
|
||||
var block, state [64]byte
|
||||
if err := setup(&state, nonce, key); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
xorKeyStream(dst, src, &block, &state, rounds)
|
||||
}
|
||||
|
||||
// Cipher implements ChaCha20/r (XChaCha20/r) for a given number of rounds r.
|
||||
type Cipher struct {
|
||||
state, block [64]byte
|
||||
off int
|
||||
rounds int // 20 for ChaCha20
|
||||
noncesize int
|
||||
}
|
||||
|
||||
// NewCipher returns a new *chacha.Cipher implementing the ChaCha20/r or XChaCha20/r
|
||||
// (r = 8, 12 or 20) stream cipher. The nonce must be unique for one key for all time.
|
||||
// The length of the nonce determinds the version of ChaCha20:
|
||||
// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period.
|
||||
// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period.
|
||||
// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period.
|
||||
// If the nonce is neither 64, 96 nor 192 bits long, a non-nil error is returned.
|
||||
func NewCipher(nonce, key []byte, rounds int) (*Cipher, error) {
|
||||
if rounds != 20 && rounds != 12 && rounds != 8 {
|
||||
panic("chacha20/chacha: bad number of rounds")
|
||||
}
|
||||
|
||||
c := new(Cipher)
|
||||
if err := setup(&(c.state), nonce, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
c.rounds = rounds
|
||||
|
||||
if len(nonce) == INonceSize {
|
||||
c.noncesize = INonceSize
|
||||
} else {
|
||||
c.noncesize = NonceSize
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
// XORKeyStream crypts bytes from src to dst. Src and dst may be the same slice
|
||||
// but otherwise should not overlap. If len(dst) < len(src) the function panics.
|
||||
func (c *Cipher) XORKeyStream(dst, src []byte) {
|
||||
if len(dst) < len(src) {
|
||||
panic("chacha20/chacha: dst buffer is to small")
|
||||
}
|
||||
|
||||
if c.off > 0 {
|
||||
n := len(c.block[c.off:])
|
||||
if len(src) <= n {
|
||||
for i, v := range src {
|
||||
dst[i] = v ^ c.block[c.off]
|
||||
c.off++
|
||||
}
|
||||
if c.off == 64 {
|
||||
c.off = 0
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
for i, v := range c.block[c.off:] {
|
||||
dst[i] = src[i] ^ v
|
||||
}
|
||||
src = src[n:]
|
||||
dst = dst[n:]
|
||||
c.off = 0
|
||||
}
|
||||
|
||||
// check for counter overflow
|
||||
blocksToXOR := len(src) / 64
|
||||
if len(src)%64 != 0 {
|
||||
blocksToXOR++
|
||||
}
|
||||
var overflow bool
|
||||
if c.noncesize == INonceSize {
|
||||
overflow = binary.LittleEndian.Uint32(c.state[48:]) > math.MaxUint32-uint32(blocksToXOR)
|
||||
} else {
|
||||
overflow = binary.LittleEndian.Uint64(c.state[48:]) > math.MaxUint64-uint64(blocksToXOR)
|
||||
}
|
||||
if overflow {
|
||||
panic("chacha20/chacha: counter overflow")
|
||||
}
|
||||
|
||||
c.off += xorKeyStream(dst, src, &(c.block), &(c.state), c.rounds)
|
||||
}
|
||||
|
||||
// SetCounter skips ctr * 64 byte blocks. SetCounter(0) resets the cipher.
|
||||
// This function always skips the unused keystream of the current 64 byte block.
|
||||
func (c *Cipher) SetCounter(ctr uint64) {
|
||||
if c.noncesize == INonceSize {
|
||||
binary.LittleEndian.PutUint32(c.state[48:], uint32(ctr))
|
||||
} else {
|
||||
binary.LittleEndian.PutUint64(c.state[48:], ctr)
|
||||
}
|
||||
c.off = 0
|
||||
}
|
||||
|
||||
// HChaCha20 generates 32 pseudo-random bytes from a 128 bit nonce and a 256 bit secret key.
|
||||
// It can be used as a key-derivation-function (KDF).
|
||||
func HChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { hChaCha20(out, nonce, key) }
|
|
@ -0,0 +1,406 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "const.s"
|
||||
#include "macro.s"
|
||||
|
||||
#define TWO 0(SP)
|
||||
#define C16 32(SP)
|
||||
#define C8 64(SP)
|
||||
#define STATE_0 96(SP)
|
||||
#define STATE_1 128(SP)
|
||||
#define STATE_2 160(SP)
|
||||
#define STATE_3 192(SP)
|
||||
#define TMP_0 224(SP)
|
||||
#define TMP_1 256(SP)
|
||||
|
||||
// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
TEXT ·xorKeyStreamAVX2(SB), 4, $320-80
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ src_base+24(FP), SI
|
||||
MOVQ block+48(FP), BX
|
||||
MOVQ state+56(FP), AX
|
||||
MOVQ rounds+64(FP), DX
|
||||
MOVQ src_len+32(FP), CX
|
||||
|
||||
MOVQ SP, R8
|
||||
ADDQ $32, SP
|
||||
ANDQ $-32, SP
|
||||
|
||||
VMOVDQU 0(AX), Y2
|
||||
VMOVDQU 32(AX), Y3
|
||||
VPERM2I128 $0x22, Y2, Y0, Y0
|
||||
VPERM2I128 $0x33, Y2, Y1, Y1
|
||||
VPERM2I128 $0x22, Y3, Y2, Y2
|
||||
VPERM2I128 $0x33, Y3, Y3, Y3
|
||||
|
||||
TESTQ CX, CX
|
||||
JZ done
|
||||
|
||||
VMOVDQU ·one_AVX2<>(SB), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
|
||||
VMOVDQA Y0, STATE_0
|
||||
VMOVDQA Y1, STATE_1
|
||||
VMOVDQA Y2, STATE_2
|
||||
VMOVDQA Y3, STATE_3
|
||||
|
||||
VMOVDQU ·rol16_AVX2<>(SB), Y4
|
||||
VMOVDQU ·rol8_AVX2<>(SB), Y5
|
||||
VMOVDQU ·two_AVX2<>(SB), Y6
|
||||
VMOVDQA Y4, Y14
|
||||
VMOVDQA Y5, Y15
|
||||
VMOVDQA Y4, C16
|
||||
VMOVDQA Y5, C8
|
||||
VMOVDQA Y6, TWO
|
||||
|
||||
CMPQ CX, $64
|
||||
JBE between_0_and_64
|
||||
CMPQ CX, $192
|
||||
JBE between_64_and_192
|
||||
CMPQ CX, $320
|
||||
JBE between_192_and_320
|
||||
CMPQ CX, $448
|
||||
JBE between_320_and_448
|
||||
|
||||
at_least_512:
|
||||
VMOVDQA Y0, Y4
|
||||
VMOVDQA Y1, Y5
|
||||
VMOVDQA Y2, Y6
|
||||
VPADDQ TWO, Y3, Y7
|
||||
VMOVDQA Y0, Y8
|
||||
VMOVDQA Y1, Y9
|
||||
VMOVDQA Y2, Y10
|
||||
VPADDQ TWO, Y7, Y11
|
||||
VMOVDQA Y0, Y12
|
||||
VMOVDQA Y1, Y13
|
||||
VMOVDQA Y2, Y14
|
||||
VPADDQ TWO, Y11, Y15
|
||||
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_512:
|
||||
VMOVDQA Y8, TMP_0
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
VMOVDQA TMP_0, Y8
|
||||
VMOVDQA Y0, TMP_0
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)
|
||||
|
||||
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
|
||||
VMOVDQA TMP_0, Y0
|
||||
VMOVDQA Y8, TMP_0
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
|
||||
VMOVDQA TMP_0, Y8
|
||||
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_512
|
||||
|
||||
VMOVDQA Y12, TMP_0
|
||||
VMOVDQA Y13, TMP_1
|
||||
VPADDD STATE_0, Y0, Y0
|
||||
VPADDD STATE_1, Y1, Y1
|
||||
VPADDD STATE_2, Y2, Y2
|
||||
VPADDD STATE_3, Y3, Y3
|
||||
XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
|
||||
VMOVDQA STATE_0, Y0
|
||||
VMOVDQA STATE_1, Y1
|
||||
VMOVDQA STATE_2, Y2
|
||||
VMOVDQA STATE_3, Y3
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
VPADDD Y0, Y4, Y4
|
||||
VPADDD Y1, Y5, Y5
|
||||
VPADDD Y2, Y6, Y6
|
||||
VPADDD Y3, Y7, Y7
|
||||
XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
VPADDD Y0, Y8, Y8
|
||||
VPADDD Y1, Y9, Y9
|
||||
VPADDD Y2, Y10, Y10
|
||||
VPADDD Y3, Y11, Y11
|
||||
XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
VPADDD TMP_0, Y0, Y12
|
||||
VPADDD TMP_1, Y1, Y13
|
||||
VPADDD Y2, Y14, Y14
|
||||
VPADDD Y3, Y15, Y15
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
CMPQ CX, $512
|
||||
JB less_than_512
|
||||
|
||||
XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
|
||||
VMOVDQA Y3, STATE_3
|
||||
ADDQ $512, SI
|
||||
ADDQ $512, DI
|
||||
SUBQ $512, CX
|
||||
CMPQ CX, $448
|
||||
JA at_least_512
|
||||
|
||||
TESTQ CX, CX
|
||||
JZ done
|
||||
|
||||
VMOVDQA C16, Y14
|
||||
VMOVDQA C8, Y15
|
||||
|
||||
CMPQ CX, $64
|
||||
JBE between_0_and_64
|
||||
CMPQ CX, $192
|
||||
JBE between_64_and_192
|
||||
CMPQ CX, $320
|
||||
JBE between_192_and_320
|
||||
JMP between_320_and_448
|
||||
|
||||
less_than_512:
|
||||
XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
|
||||
EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4)
|
||||
ADDQ $448, SI
|
||||
ADDQ $448, DI
|
||||
SUBQ $448, CX
|
||||
JMP finalize
|
||||
|
||||
between_320_and_448:
|
||||
VMOVDQA Y0, Y4
|
||||
VMOVDQA Y1, Y5
|
||||
VMOVDQA Y2, Y6
|
||||
VPADDQ TWO, Y3, Y7
|
||||
VMOVDQA Y0, Y8
|
||||
VMOVDQA Y1, Y9
|
||||
VMOVDQA Y2, Y10
|
||||
VPADDQ TWO, Y7, Y11
|
||||
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_384:
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_384
|
||||
|
||||
VPADDD STATE_0, Y0, Y0
|
||||
VPADDD STATE_1, Y1, Y1
|
||||
VPADDD STATE_2, Y2, Y2
|
||||
VPADDD STATE_3, Y3, Y3
|
||||
XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
|
||||
VMOVDQA STATE_0, Y0
|
||||
VMOVDQA STATE_1, Y1
|
||||
VMOVDQA STATE_2, Y2
|
||||
VMOVDQA STATE_3, Y3
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
VPADDD Y0, Y4, Y4
|
||||
VPADDD Y1, Y5, Y5
|
||||
VPADDD Y2, Y6, Y6
|
||||
VPADDD Y3, Y7, Y7
|
||||
XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
VPADDD Y0, Y8, Y8
|
||||
VPADDD Y1, Y9, Y9
|
||||
VPADDD Y2, Y10, Y10
|
||||
VPADDD Y3, Y11, Y11
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
CMPQ CX, $384
|
||||
JB less_than_384
|
||||
|
||||
XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
|
||||
SUBQ $384, CX
|
||||
TESTQ CX, CX
|
||||
JE done
|
||||
|
||||
ADDQ $384, SI
|
||||
ADDQ $384, DI
|
||||
JMP between_0_and_64
|
||||
|
||||
less_than_384:
|
||||
XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
|
||||
EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
|
||||
ADDQ $320, SI
|
||||
ADDQ $320, DI
|
||||
SUBQ $320, CX
|
||||
JMP finalize
|
||||
|
||||
between_192_and_320:
|
||||
VMOVDQA Y0, Y4
|
||||
VMOVDQA Y1, Y5
|
||||
VMOVDQA Y2, Y6
|
||||
VMOVDQA Y3, Y7
|
||||
VMOVDQA Y0, Y8
|
||||
VMOVDQA Y1, Y9
|
||||
VMOVDQA Y2, Y10
|
||||
VPADDQ TWO, Y3, Y11
|
||||
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_256:
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_256
|
||||
|
||||
VPADDD Y0, Y4, Y4
|
||||
VPADDD Y1, Y5, Y5
|
||||
VPADDD Y2, Y6, Y6
|
||||
VPADDD Y3, Y7, Y7
|
||||
VPADDQ TWO, Y3, Y3
|
||||
XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
|
||||
VPADDD Y0, Y8, Y8
|
||||
VPADDD Y1, Y9, Y9
|
||||
VPADDD Y2, Y10, Y10
|
||||
VPADDD Y3, Y11, Y11
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
CMPQ CX, $256
|
||||
JB less_than_256
|
||||
|
||||
XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
|
||||
SUBQ $256, CX
|
||||
TESTQ CX, CX
|
||||
JE done
|
||||
|
||||
ADDQ $256, SI
|
||||
ADDQ $256, DI
|
||||
JMP between_0_and_64
|
||||
|
||||
less_than_256:
|
||||
XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
|
||||
EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
|
||||
ADDQ $192, SI
|
||||
ADDQ $192, DI
|
||||
SUBQ $192, CX
|
||||
JMP finalize
|
||||
|
||||
between_64_and_192:
|
||||
VMOVDQA Y0, Y4
|
||||
VMOVDQA Y1, Y5
|
||||
VMOVDQA Y2, Y6
|
||||
VMOVDQA Y3, Y7
|
||||
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_128:
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
|
||||
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
|
||||
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_128
|
||||
|
||||
VPADDD Y0, Y4, Y4
|
||||
VPADDD Y1, Y5, Y5
|
||||
VPADDD Y2, Y6, Y6
|
||||
VPADDD Y3, Y7, Y7
|
||||
VPADDQ TWO, Y3, Y3
|
||||
|
||||
CMPQ CX, $128
|
||||
JB less_than_128
|
||||
|
||||
XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
|
||||
SUBQ $128, CX
|
||||
TESTQ CX, CX
|
||||
JE done
|
||||
|
||||
ADDQ $128, SI
|
||||
ADDQ $128, DI
|
||||
JMP between_0_and_64
|
||||
|
||||
less_than_128:
|
||||
XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
|
||||
EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13)
|
||||
ADDQ $64, SI
|
||||
ADDQ $64, DI
|
||||
SUBQ $64, CX
|
||||
JMP finalize
|
||||
|
||||
between_0_and_64:
|
||||
VMOVDQA X0, X4
|
||||
VMOVDQA X1, X5
|
||||
VMOVDQA X2, X6
|
||||
VMOVDQA X3, X7
|
||||
|
||||
MOVQ DX, R9
|
||||
|
||||
chacha_loop_64:
|
||||
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE_AVX(X5, X6, X7)
|
||||
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
|
||||
CHACHA_SHUFFLE_AVX(X7, X6, X5)
|
||||
SUBQ $2, R9
|
||||
JA chacha_loop_64
|
||||
|
||||
VPADDD X0, X4, X4
|
||||
VPADDD X1, X5, X5
|
||||
VPADDD X2, X6, X6
|
||||
VPADDD X3, X7, X7
|
||||
VMOVDQU ·one<>(SB), X0
|
||||
VPADDQ X0, X3, X3
|
||||
|
||||
CMPQ CX, $64
|
||||
JB less_than_64
|
||||
|
||||
XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13)
|
||||
SUBQ $64, CX
|
||||
JMP done
|
||||
|
||||
less_than_64:
|
||||
VMOVDQU X4, 0(BX)
|
||||
VMOVDQU X5, 16(BX)
|
||||
VMOVDQU X6, 32(BX)
|
||||
VMOVDQU X7, 48(BX)
|
||||
|
||||
finalize:
|
||||
XORQ R11, R11
|
||||
XORQ R12, R12
|
||||
MOVQ CX, BP
|
||||
|
||||
xor_loop:
|
||||
MOVB 0(SI), R11
|
||||
MOVB 0(BX), R12
|
||||
XORQ R11, R12
|
||||
MOVB R12, 0(DI)
|
||||
INCQ SI
|
||||
INCQ BX
|
||||
INCQ DI
|
||||
DECQ BP
|
||||
JA xor_loop
|
||||
|
||||
done:
|
||||
VMOVDQU X3, 48(AX)
|
||||
VZEROUPPER
|
||||
MOVQ R8, SP
|
||||
MOVQ CX, ret+72(FP)
|
||||
RET
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl
|
||||
|
||||
package chacha
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
|
||||
"golang.org/x/sys/cpu"
|
||||
)
|
||||
|
||||
func init() {
|
||||
useSSE2 = cpu.X86.HasSSE2
|
||||
useSSSE3 = cpu.X86.HasSSSE3
|
||||
useAVX = false
|
||||
useAVX2 = false
|
||||
}
|
||||
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
|
||||
binary.LittleEndian.PutUint32(state[0:], sigma[0])
|
||||
binary.LittleEndian.PutUint32(state[4:], sigma[1])
|
||||
binary.LittleEndian.PutUint32(state[8:], sigma[2])
|
||||
binary.LittleEndian.PutUint32(state[12:], sigma[3])
|
||||
copy(state[16:], key[:])
|
||||
copy(state[48:], nonce[:])
|
||||
}
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_386.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
switch {
|
||||
case useSSSE3:
|
||||
hChaCha20SSSE3(out, nonce, key)
|
||||
case useSSE2:
|
||||
hChaCha20SSE2(out, nonce, key)
|
||||
default:
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
if useSSE2 {
|
||||
return xorKeyStreamSSE2(dst, src, block, state, rounds)
|
||||
} else {
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,163 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "const.s"
|
||||
#include "macro.s"
|
||||
|
||||
// FINALIZE xors len bytes from src and block using
|
||||
// the temp. registers t0 and t1 and writes the result
|
||||
// to dst.
|
||||
#define FINALIZE(dst, src, block, len, t0, t1) \
|
||||
XORL t0, t0; \
|
||||
XORL t1, t1; \
|
||||
FINALIZE_LOOP:; \
|
||||
MOVB 0(src), t0; \
|
||||
MOVB 0(block), t1; \
|
||||
XORL t0, t1; \
|
||||
MOVB t1, 0(dst); \
|
||||
INCL src; \
|
||||
INCL block; \
|
||||
INCL dst; \
|
||||
DECL len; \
|
||||
JG FINALIZE_LOOP \
|
||||
|
||||
#define Dst DI
|
||||
#define Nonce AX
|
||||
#define Key BX
|
||||
#define Rounds DX
|
||||
|
||||
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSE2(SB), 4, $0-12
|
||||
MOVL out+0(FP), Dst
|
||||
MOVL nonce+4(FP), Nonce
|
||||
MOVL key+8(FP), Key
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0*16(Key), X1
|
||||
MOVOU 1*16(Key), X2
|
||||
MOVOU 0*16(Nonce), X3
|
||||
MOVL $20, Rounds
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE_SSE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
|
||||
CHACHA_SHUFFLE_SSE(X3, X2, X1)
|
||||
SUBL $2, Rounds
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0*16(Dst)
|
||||
MOVOU X3, 1*16(Dst)
|
||||
RET
|
||||
|
||||
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
|
||||
MOVL out+0(FP), Dst
|
||||
MOVL nonce+4(FP), Nonce
|
||||
MOVL key+8(FP), Key
|
||||
|
||||
MOVOU ·sigma<>(SB), X0
|
||||
MOVOU 0*16(Key), X1
|
||||
MOVOU 1*16(Key), X2
|
||||
MOVOU 0*16(Nonce), X3
|
||||
MOVL $20, Rounds
|
||||
|
||||
MOVOU ·rol16<>(SB), X5
|
||||
MOVOU ·rol8<>(SB), X6
|
||||
|
||||
chacha_loop:
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE_SSE(X1, X2, X3)
|
||||
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
|
||||
CHACHA_SHUFFLE_SSE(X3, X2, X1)
|
||||
SUBL $2, Rounds
|
||||
JNZ chacha_loop
|
||||
|
||||
MOVOU X0, 0*16(Dst)
|
||||
MOVOU X3, 1*16(Dst)
|
||||
RET
|
||||
|
||||
#undef Dst
|
||||
#undef Nonce
|
||||
#undef Key
|
||||
#undef Rounds
|
||||
|
||||
#define State AX
|
||||
#define Dst DI
|
||||
#define Src SI
|
||||
#define Len DX
|
||||
#define Tmp0 BX
|
||||
#define Tmp1 BP
|
||||
|
||||
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
|
||||
MOVL dst_base+0(FP), Dst
|
||||
MOVL src_base+12(FP), Src
|
||||
MOVL state+28(FP), State
|
||||
MOVL src_len+16(FP), Len
|
||||
MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0
|
||||
|
||||
MOVOU 0*16(State), X0
|
||||
MOVOU 1*16(State), X1
|
||||
MOVOU 2*16(State), X2
|
||||
MOVOU 3*16(State), X3
|
||||
TESTL Len, Len
|
||||
JZ DONE
|
||||
|
||||
GENERATE_KEYSTREAM:
|
||||
MOVO X0, X4
|
||||
MOVO X1, X5
|
||||
MOVO X2, X6
|
||||
MOVO X3, X7
|
||||
MOVL rounds+32(FP), Tmp0
|
||||
|
||||
CHACHA_LOOP:
|
||||
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
|
||||
CHACHA_SHUFFLE_SSE(X5, X6, X7)
|
||||
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
|
||||
CHACHA_SHUFFLE_SSE(X7, X6, X5)
|
||||
SUBL $2, Tmp0
|
||||
JA CHACHA_LOOP
|
||||
|
||||
MOVOU 0*16(State), X0 // Restore X0 from state
|
||||
PADDL X0, X4
|
||||
PADDL X1, X5
|
||||
PADDL X2, X6
|
||||
PADDL X3, X7
|
||||
MOVOU ·one<>(SB), X0
|
||||
PADDQ X0, X3
|
||||
|
||||
CMPL Len, $64
|
||||
JL BUFFER_KEYSTREAM
|
||||
|
||||
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
|
||||
MOVOU 0*16(State), X0 // Restore X0 from state
|
||||
ADDL $64, Src
|
||||
ADDL $64, Dst
|
||||
SUBL $64, Len
|
||||
JZ DONE
|
||||
JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte
|
||||
|
||||
BUFFER_KEYSTREAM:
|
||||
MOVL block+24(FP), State
|
||||
MOVOU X4, 0(State)
|
||||
MOVOU X5, 16(State)
|
||||
MOVOU X6, 32(State)
|
||||
MOVOU X7, 48(State)
|
||||
MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64
|
||||
FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)
|
||||
|
||||
DONE:
|
||||
MOVL state+28(FP), State
|
||||
MOVOU X3, 3*16(State)
|
||||
RET
|
||||
|
||||
#undef State
|
||||
#undef Dst
|
||||
#undef Src
|
||||
#undef Len
|
||||
#undef Tmp0
|
||||
#undef Tmp1
|
|
@ -0,0 +1,76 @@
|
|||
// Copyright (c) 2017 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build go1.7,amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
package chacha
|
||||
|
||||
import "golang.org/x/sys/cpu"
|
||||
|
||||
func init() {
|
||||
useSSE2 = cpu.X86.HasSSE2
|
||||
useSSSE3 = cpu.X86.HasSSSE3
|
||||
useAVX = cpu.X86.HasAVX
|
||||
useAVX2 = cpu.X86.HasAVX2
|
||||
}
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chachaAVX2_amd64.s
|
||||
//go:noescape
|
||||
func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte)
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chacha_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
// This function is implemented in chachaAVX2_amd64.s
|
||||
//go:noescape
|
||||
func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
switch {
|
||||
case useAVX:
|
||||
hChaCha20AVX(out, nonce, key)
|
||||
case useSSSE3:
|
||||
hChaCha20SSSE3(out, nonce, key)
|
||||
case useSSE2:
|
||||
hChaCha20SSE2(out, nonce, key)
|
||||
default:
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
switch {
|
||||
case useAVX2:
|
||||
return xorKeyStreamAVX2(dst, src, block, state, rounds)
|
||||
case useAVX:
|
||||
return xorKeyStreamAVX(dst, src, block, state, rounds)
|
||||
case useSSSE3:
|
||||
return xorKeyStreamSSSE3(dst, src, block, state, rounds)
|
||||
case useSSE2:
|
||||
return xorKeyStreamSSE2(dst, src, block, state, rounds)
|
||||
default:
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,319 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
package chacha
|
||||
|
||||
import "encoding/binary"
|
||||
|
||||
var sigma = [4]uint32{0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}
|
||||
|
||||
func xorKeyStreamGeneric(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
for len(src) >= 64 {
|
||||
chachaGeneric(block, state, rounds)
|
||||
|
||||
for i, v := range block {
|
||||
dst[i] = src[i] ^ v
|
||||
}
|
||||
src = src[64:]
|
||||
dst = dst[64:]
|
||||
}
|
||||
|
||||
n := len(src)
|
||||
if n > 0 {
|
||||
chachaGeneric(block, state, rounds)
|
||||
for i, v := range src {
|
||||
dst[i] = v ^ block[i]
|
||||
}
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func chachaGeneric(dst *[64]byte, state *[64]byte, rounds int) {
|
||||
v00 := binary.LittleEndian.Uint32(state[0:])
|
||||
v01 := binary.LittleEndian.Uint32(state[4:])
|
||||
v02 := binary.LittleEndian.Uint32(state[8:])
|
||||
v03 := binary.LittleEndian.Uint32(state[12:])
|
||||
v04 := binary.LittleEndian.Uint32(state[16:])
|
||||
v05 := binary.LittleEndian.Uint32(state[20:])
|
||||
v06 := binary.LittleEndian.Uint32(state[24:])
|
||||
v07 := binary.LittleEndian.Uint32(state[28:])
|
||||
v08 := binary.LittleEndian.Uint32(state[32:])
|
||||
v09 := binary.LittleEndian.Uint32(state[36:])
|
||||
v10 := binary.LittleEndian.Uint32(state[40:])
|
||||
v11 := binary.LittleEndian.Uint32(state[44:])
|
||||
v12 := binary.LittleEndian.Uint32(state[48:])
|
||||
v13 := binary.LittleEndian.Uint32(state[52:])
|
||||
v14 := binary.LittleEndian.Uint32(state[56:])
|
||||
v15 := binary.LittleEndian.Uint32(state[60:])
|
||||
|
||||
s00, s01, s02, s03, s04, s05, s06, s07 := v00, v01, v02, v03, v04, v05, v06, v07
|
||||
s08, s09, s10, s11, s12, s13, s14, s15 := v08, v09, v10, v11, v12, v13, v14, v15
|
||||
|
||||
for i := 0; i < rounds; i += 2 {
|
||||
v00 += v04
|
||||
v12 ^= v00
|
||||
v12 = (v12 << 16) | (v12 >> 16)
|
||||
v08 += v12
|
||||
v04 ^= v08
|
||||
v04 = (v04 << 12) | (v04 >> 20)
|
||||
v00 += v04
|
||||
v12 ^= v00
|
||||
v12 = (v12 << 8) | (v12 >> 24)
|
||||
v08 += v12
|
||||
v04 ^= v08
|
||||
v04 = (v04 << 7) | (v04 >> 25)
|
||||
v01 += v05
|
||||
v13 ^= v01
|
||||
v13 = (v13 << 16) | (v13 >> 16)
|
||||
v09 += v13
|
||||
v05 ^= v09
|
||||
v05 = (v05 << 12) | (v05 >> 20)
|
||||
v01 += v05
|
||||
v13 ^= v01
|
||||
v13 = (v13 << 8) | (v13 >> 24)
|
||||
v09 += v13
|
||||
v05 ^= v09
|
||||
v05 = (v05 << 7) | (v05 >> 25)
|
||||
v02 += v06
|
||||
v14 ^= v02
|
||||
v14 = (v14 << 16) | (v14 >> 16)
|
||||
v10 += v14
|
||||
v06 ^= v10
|
||||
v06 = (v06 << 12) | (v06 >> 20)
|
||||
v02 += v06
|
||||
v14 ^= v02
|
||||
v14 = (v14 << 8) | (v14 >> 24)
|
||||
v10 += v14
|
||||
v06 ^= v10
|
||||
v06 = (v06 << 7) | (v06 >> 25)
|
||||
v03 += v07
|
||||
v15 ^= v03
|
||||
v15 = (v15 << 16) | (v15 >> 16)
|
||||
v11 += v15
|
||||
v07 ^= v11
|
||||
v07 = (v07 << 12) | (v07 >> 20)
|
||||
v03 += v07
|
||||
v15 ^= v03
|
||||
v15 = (v15 << 8) | (v15 >> 24)
|
||||
v11 += v15
|
||||
v07 ^= v11
|
||||
v07 = (v07 << 7) | (v07 >> 25)
|
||||
v00 += v05
|
||||
v15 ^= v00
|
||||
v15 = (v15 << 16) | (v15 >> 16)
|
||||
v10 += v15
|
||||
v05 ^= v10
|
||||
v05 = (v05 << 12) | (v05 >> 20)
|
||||
v00 += v05
|
||||
v15 ^= v00
|
||||
v15 = (v15 << 8) | (v15 >> 24)
|
||||
v10 += v15
|
||||
v05 ^= v10
|
||||
v05 = (v05 << 7) | (v05 >> 25)
|
||||
v01 += v06
|
||||
v12 ^= v01
|
||||
v12 = (v12 << 16) | (v12 >> 16)
|
||||
v11 += v12
|
||||
v06 ^= v11
|
||||
v06 = (v06 << 12) | (v06 >> 20)
|
||||
v01 += v06
|
||||
v12 ^= v01
|
||||
v12 = (v12 << 8) | (v12 >> 24)
|
||||
v11 += v12
|
||||
v06 ^= v11
|
||||
v06 = (v06 << 7) | (v06 >> 25)
|
||||
v02 += v07
|
||||
v13 ^= v02
|
||||
v13 = (v13 << 16) | (v13 >> 16)
|
||||
v08 += v13
|
||||
v07 ^= v08
|
||||
v07 = (v07 << 12) | (v07 >> 20)
|
||||
v02 += v07
|
||||
v13 ^= v02
|
||||
v13 = (v13 << 8) | (v13 >> 24)
|
||||
v08 += v13
|
||||
v07 ^= v08
|
||||
v07 = (v07 << 7) | (v07 >> 25)
|
||||
v03 += v04
|
||||
v14 ^= v03
|
||||
v14 = (v14 << 16) | (v14 >> 16)
|
||||
v09 += v14
|
||||
v04 ^= v09
|
||||
v04 = (v04 << 12) | (v04 >> 20)
|
||||
v03 += v04
|
||||
v14 ^= v03
|
||||
v14 = (v14 << 8) | (v14 >> 24)
|
||||
v09 += v14
|
||||
v04 ^= v09
|
||||
v04 = (v04 << 7) | (v04 >> 25)
|
||||
}
|
||||
|
||||
v00 += s00
|
||||
v01 += s01
|
||||
v02 += s02
|
||||
v03 += s03
|
||||
v04 += s04
|
||||
v05 += s05
|
||||
v06 += s06
|
||||
v07 += s07
|
||||
v08 += s08
|
||||
v09 += s09
|
||||
v10 += s10
|
||||
v11 += s11
|
||||
v12 += s12
|
||||
v13 += s13
|
||||
v14 += s14
|
||||
v15 += s15
|
||||
|
||||
s12++
|
||||
binary.LittleEndian.PutUint32(state[48:], s12)
|
||||
if s12 == 0 { // indicates overflow
|
||||
s13++
|
||||
binary.LittleEndian.PutUint32(state[52:], s13)
|
||||
}
|
||||
|
||||
binary.LittleEndian.PutUint32(dst[0:], v00)
|
||||
binary.LittleEndian.PutUint32(dst[4:], v01)
|
||||
binary.LittleEndian.PutUint32(dst[8:], v02)
|
||||
binary.LittleEndian.PutUint32(dst[12:], v03)
|
||||
binary.LittleEndian.PutUint32(dst[16:], v04)
|
||||
binary.LittleEndian.PutUint32(dst[20:], v05)
|
||||
binary.LittleEndian.PutUint32(dst[24:], v06)
|
||||
binary.LittleEndian.PutUint32(dst[28:], v07)
|
||||
binary.LittleEndian.PutUint32(dst[32:], v08)
|
||||
binary.LittleEndian.PutUint32(dst[36:], v09)
|
||||
binary.LittleEndian.PutUint32(dst[40:], v10)
|
||||
binary.LittleEndian.PutUint32(dst[44:], v11)
|
||||
binary.LittleEndian.PutUint32(dst[48:], v12)
|
||||
binary.LittleEndian.PutUint32(dst[52:], v13)
|
||||
binary.LittleEndian.PutUint32(dst[56:], v14)
|
||||
binary.LittleEndian.PutUint32(dst[60:], v15)
|
||||
}
|
||||
|
||||
func hChaCha20Generic(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
v00 := sigma[0]
|
||||
v01 := sigma[1]
|
||||
v02 := sigma[2]
|
||||
v03 := sigma[3]
|
||||
v04 := binary.LittleEndian.Uint32(key[0:])
|
||||
v05 := binary.LittleEndian.Uint32(key[4:])
|
||||
v06 := binary.LittleEndian.Uint32(key[8:])
|
||||
v07 := binary.LittleEndian.Uint32(key[12:])
|
||||
v08 := binary.LittleEndian.Uint32(key[16:])
|
||||
v09 := binary.LittleEndian.Uint32(key[20:])
|
||||
v10 := binary.LittleEndian.Uint32(key[24:])
|
||||
v11 := binary.LittleEndian.Uint32(key[28:])
|
||||
v12 := binary.LittleEndian.Uint32(nonce[0:])
|
||||
v13 := binary.LittleEndian.Uint32(nonce[4:])
|
||||
v14 := binary.LittleEndian.Uint32(nonce[8:])
|
||||
v15 := binary.LittleEndian.Uint32(nonce[12:])
|
||||
|
||||
for i := 0; i < 20; i += 2 {
|
||||
v00 += v04
|
||||
v12 ^= v00
|
||||
v12 = (v12 << 16) | (v12 >> 16)
|
||||
v08 += v12
|
||||
v04 ^= v08
|
||||
v04 = (v04 << 12) | (v04 >> 20)
|
||||
v00 += v04
|
||||
v12 ^= v00
|
||||
v12 = (v12 << 8) | (v12 >> 24)
|
||||
v08 += v12
|
||||
v04 ^= v08
|
||||
v04 = (v04 << 7) | (v04 >> 25)
|
||||
v01 += v05
|
||||
v13 ^= v01
|
||||
v13 = (v13 << 16) | (v13 >> 16)
|
||||
v09 += v13
|
||||
v05 ^= v09
|
||||
v05 = (v05 << 12) | (v05 >> 20)
|
||||
v01 += v05
|
||||
v13 ^= v01
|
||||
v13 = (v13 << 8) | (v13 >> 24)
|
||||
v09 += v13
|
||||
v05 ^= v09
|
||||
v05 = (v05 << 7) | (v05 >> 25)
|
||||
v02 += v06
|
||||
v14 ^= v02
|
||||
v14 = (v14 << 16) | (v14 >> 16)
|
||||
v10 += v14
|
||||
v06 ^= v10
|
||||
v06 = (v06 << 12) | (v06 >> 20)
|
||||
v02 += v06
|
||||
v14 ^= v02
|
||||
v14 = (v14 << 8) | (v14 >> 24)
|
||||
v10 += v14
|
||||
v06 ^= v10
|
||||
v06 = (v06 << 7) | (v06 >> 25)
|
||||
v03 += v07
|
||||
v15 ^= v03
|
||||
v15 = (v15 << 16) | (v15 >> 16)
|
||||
v11 += v15
|
||||
v07 ^= v11
|
||||
v07 = (v07 << 12) | (v07 >> 20)
|
||||
v03 += v07
|
||||
v15 ^= v03
|
||||
v15 = (v15 << 8) | (v15 >> 24)
|
||||
v11 += v15
|
||||
v07 ^= v11
|
||||
v07 = (v07 << 7) | (v07 >> 25)
|
||||
v00 += v05
|
||||
v15 ^= v00
|
||||
v15 = (v15 << 16) | (v15 >> 16)
|
||||
v10 += v15
|
||||
v05 ^= v10
|
||||
v05 = (v05 << 12) | (v05 >> 20)
|
||||
v00 += v05
|
||||
v15 ^= v00
|
||||
v15 = (v15 << 8) | (v15 >> 24)
|
||||
v10 += v15
|
||||
v05 ^= v10
|
||||
v05 = (v05 << 7) | (v05 >> 25)
|
||||
v01 += v06
|
||||
v12 ^= v01
|
||||
v12 = (v12 << 16) | (v12 >> 16)
|
||||
v11 += v12
|
||||
v06 ^= v11
|
||||
v06 = (v06 << 12) | (v06 >> 20)
|
||||
v01 += v06
|
||||
v12 ^= v01
|
||||
v12 = (v12 << 8) | (v12 >> 24)
|
||||
v11 += v12
|
||||
v06 ^= v11
|
||||
v06 = (v06 << 7) | (v06 >> 25)
|
||||
v02 += v07
|
||||
v13 ^= v02
|
||||
v13 = (v13 << 16) | (v13 >> 16)
|
||||
v08 += v13
|
||||
v07 ^= v08
|
||||
v07 = (v07 << 12) | (v07 >> 20)
|
||||
v02 += v07
|
||||
v13 ^= v02
|
||||
v13 = (v13 << 8) | (v13 >> 24)
|
||||
v08 += v13
|
||||
v07 ^= v08
|
||||
v07 = (v07 << 7) | (v07 >> 25)
|
||||
v03 += v04
|
||||
v14 ^= v03
|
||||
v14 = (v14 << 16) | (v14 >> 16)
|
||||
v09 += v14
|
||||
v04 ^= v09
|
||||
v04 = (v04 << 12) | (v04 >> 20)
|
||||
v03 += v04
|
||||
v14 ^= v03
|
||||
v14 = (v14 << 8) | (v14 >> 24)
|
||||
v09 += v14
|
||||
v04 ^= v09
|
||||
v04 = (v04 << 7) | (v04 >> 25)
|
||||
}
|
||||
|
||||
binary.LittleEndian.PutUint32(out[0:], v00)
|
||||
binary.LittleEndian.PutUint32(out[4:], v01)
|
||||
binary.LittleEndian.PutUint32(out[8:], v02)
|
||||
binary.LittleEndian.PutUint32(out[12:], v03)
|
||||
binary.LittleEndian.PutUint32(out[16:], v12)
|
||||
binary.LittleEndian.PutUint32(out[20:], v13)
|
||||
binary.LittleEndian.PutUint32(out[24:], v14)
|
||||
binary.LittleEndian.PutUint32(out[28:], v15)
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!386 gccgo appengine nacl
|
||||
|
||||
package chacha
|
||||
|
||||
import "encoding/binary"
|
||||
|
||||
func init() {
|
||||
useSSE2 = false
|
||||
useSSSE3 = false
|
||||
useAVX = false
|
||||
useAVX2 = false
|
||||
}
|
||||
|
||||
func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
|
||||
binary.LittleEndian.PutUint32(state[0:], sigma[0])
|
||||
binary.LittleEndian.PutUint32(state[4:], sigma[1])
|
||||
binary.LittleEndian.PutUint32(state[8:], sigma[2])
|
||||
binary.LittleEndian.PutUint32(state[12:], sigma[3])
|
||||
copy(state[16:], key[:])
|
||||
copy(state[48:], nonce[:])
|
||||
}
|
||||
|
||||
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
|
||||
return xorKeyStreamGeneric(dst, src, block, state, rounds)
|
||||
}
|
||||
|
||||
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
|
||||
hChaCha20Generic(out, nonce, key)
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA ·sigma<>+0x00(SB)/4, $0x61707865
|
||||
DATA ·sigma<>+0x04(SB)/4, $0x3320646e
|
||||
DATA ·sigma<>+0x08(SB)/4, $0x79622d32
|
||||
DATA ·sigma<>+0x0C(SB)/4, $0x6b206574
|
||||
GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16 // The 4 ChaCha initialization constants
|
||||
|
||||
// SSE2/SSE3/AVX constants
|
||||
|
||||
DATA ·one<>+0x00(SB)/8, $1
|
||||
DATA ·one<>+0x08(SB)/8, $0
|
||||
GLOBL ·one<>(SB), (NOPTR+RODATA), $16 // The constant 1 as 128 bit value
|
||||
|
||||
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 16 bit left rotate constant
|
||||
|
||||
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 8 bit left rotate constant
|
||||
|
||||
// AVX2 constants
|
||||
|
||||
DATA ·one_AVX2<>+0x00(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·one_AVX2<>+0x10(SB)/8, $1
|
||||
DATA ·one_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32 // The constant 1 as 256 bit value
|
||||
|
||||
DATA ·two_AVX2<>+0x00(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x08(SB)/8, $0
|
||||
DATA ·two_AVX2<>+0x10(SB)/8, $2
|
||||
DATA ·two_AVX2<>+0x18(SB)/8, $0
|
||||
GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32
|
||||
|
||||
DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
|
||||
DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302
|
||||
DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
|
||||
GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 16 bit left rotate constant
|
||||
|
||||
DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
|
||||
DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003
|
||||
DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
|
||||
GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 8 bit left rotate constant
|
|
@ -0,0 +1,163 @@
|
|||
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
|
||||
|
||||
// ROTL_SSE rotates all 4 32 bit values of the XMM register v
|
||||
// left by n bits using SSE2 instructions (0 <= n <= 32).
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define ROTL_SSE(n, t, v) \
|
||||
MOVO v, t; \
|
||||
PSLLL $n, t; \
|
||||
PSRLL $(32-n), v; \
|
||||
PXOR t, v
|
||||
|
||||
// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
|
||||
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
|
||||
// The AVX/AVX2 register t is used as a temp. register.
|
||||
#define ROTL_AVX(n, t, v) \
|
||||
VPSLLD $n, v, t; \
|
||||
VPSRLD $(32-n), v, v; \
|
||||
VPXOR v, t, v
|
||||
|
||||
// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
|
||||
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE2 for
|
||||
// rotations. The XMM register t is used as a temp. register.
|
||||
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE(16, t, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(12, t, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
ROTL_SSE(8, t, v3); \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(7, t, v1)
|
||||
|
||||
// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
|
||||
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
|
||||
// rotations. The XMM register t is used as a temp. register.
|
||||
//
|
||||
// r16 holds the PSHUFB constant for a 16 bit left rotate.
|
||||
// r8 holds the PSHUFB constant for a 8 bit left rotate.
|
||||
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r16, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(12, t, v1); \
|
||||
PADDL v1, v0; \
|
||||
PXOR v0, v3; \
|
||||
PSHUFB r8, v3; \
|
||||
PADDL v3, v2; \
|
||||
PXOR v2, v1; \
|
||||
ROTL_SSE(7, t, v1)
|
||||
|
||||
// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
|
||||
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
|
||||
// rotations. The AVX/AVX2 register t is used as a temp. register.
|
||||
//
|
||||
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
|
||||
// r8 holds the VPSHUFB constant for a 8 bit left rotate.
|
||||
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB r16, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL_AVX(12, t, v1); \
|
||||
VPADDD v0, v1, v0; \
|
||||
VPXOR v3, v0, v3; \
|
||||
VPSHUFB r8, v3, v3; \
|
||||
VPADDD v2, v3, v2; \
|
||||
VPXOR v1, v2, v1; \
|
||||
ROTL_AVX(7, t, v1)
|
||||
|
||||
// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
|
||||
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
|
||||
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
|
||||
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
|
||||
PSHUFL $0x39, v1, v1; \
|
||||
PSHUFL $0x4E, v2, v2; \
|
||||
PSHUFL $0x93, v3, v3
|
||||
|
||||
// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
|
||||
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
|
||||
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
|
||||
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
|
||||
VPSHUFD $0x39, v1, v1; \
|
||||
VPSHUFD $0x4E, v2, v2; \
|
||||
VPSHUFD $0x93, v3, v3
|
||||
|
||||
// XOR_SSE extracts 4x16 byte vectors from src at
|
||||
// off, xors all vectors with the corresponding XMM
|
||||
// register (v0 - v3) and writes the result to dst
|
||||
// at off.
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
|
||||
MOVOU 0+off(src), t; \
|
||||
PXOR v0, t; \
|
||||
MOVOU t, 0+off(dst); \
|
||||
MOVOU 16+off(src), t; \
|
||||
PXOR v1, t; \
|
||||
MOVOU t, 16+off(dst); \
|
||||
MOVOU 32+off(src), t; \
|
||||
PXOR v2, t; \
|
||||
MOVOU t, 32+off(dst); \
|
||||
MOVOU 48+off(src), t; \
|
||||
PXOR v3, t; \
|
||||
MOVOU t, 48+off(dst)
|
||||
|
||||
// XOR_AVX extracts 4x16 byte vectors from src at
|
||||
// off, xors all vectors with the corresponding AVX
|
||||
// register (v0 - v3) and writes the result to dst
|
||||
// at off.
|
||||
// The XMM register t is used as a temp. register.
|
||||
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
|
||||
VPXOR 0+off(src), v0, t; \
|
||||
VMOVDQU t, 0+off(dst); \
|
||||
VPXOR 16+off(src), v1, t; \
|
||||
VMOVDQU t, 16+off(dst); \
|
||||
VPXOR 32+off(src), v2, t; \
|
||||
VMOVDQU t, 32+off(dst); \
|
||||
VPXOR 48+off(src), v3, t; \
|
||||
VMOVDQU t, 48+off(dst)
|
||||
|
||||
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
VMOVDQU (64+off)(src), t0; \
|
||||
VPERM2I128 $49, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (64+off)(dst); \
|
||||
VMOVDQU (96+off)(src), t0; \
|
||||
VPERM2I128 $49, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (96+off)(dst)
|
||||
|
||||
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
|
||||
VMOVDQU (0+off)(src), t0; \
|
||||
VPERM2I128 $32, v1, v0, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (0+off)(dst); \
|
||||
VMOVDQU (32+off)(src), t0; \
|
||||
VPERM2I128 $32, v3, v2, t1; \
|
||||
VPXOR t0, t1, t0; \
|
||||
VMOVDQU t0, (32+off)(dst); \
|
||||
|
||||
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
|
||||
VPERM2I128 $49, v1, v0, t0; \
|
||||
VMOVDQU t0, 0(dst); \
|
||||
VPERM2I128 $49, v3, v2, t0; \
|
||||
VMOVDQU t0, 32(dst)
|
|
@ -15,6 +15,8 @@ blitter.com/go/kyber
|
|||
blitter.com/go/mtwist
|
||||
# blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae
|
||||
blitter.com/go/newhope
|
||||
# github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da
|
||||
github.com/aead/chacha20/chacha
|
||||
# github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f
|
||||
github.com/jameskeane/bcrypt
|
||||
# github.com/klauspost/cpuid v1.2.2
|
||||
|
|
2
xs/xs.go
2
xs/xs.go
|
@ -624,7 +624,7 @@ func main() {
|
|||
|
||||
flag.BoolVar(&vopt, "v", false, "show version")
|
||||
flag.BoolVar(&dbg, "d", false, "debug logging")
|
||||
flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1]")
|
||||
flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1 | C_CHACHA20_12]")
|
||||
flag.StringVar(&hmacAlg, "m", "H_SHA256", "session `HMAC` [H_SHA256 | H_SHA512]")
|
||||
flag.StringVar(&kexAlg, "k", "KEX_HERRADURA512", "KEx `alg` [KEX_HERRADURA{256/512/1024/2048} | KEX_KYBER{512/768/1024} | KEX_NEWHOPE | KEX_NEWHOPE_SIMPLE]")
|
||||
flag.StringVar(&kcpMode, "K", "unused", "KCP `alg`, one of [KCP_NONE | KCP_AES | KCP_BLOWFISH | KCP_CAST5 | KCP_SM4 | KCP_SALSA20 | KCP_SIMPLEXOR | KCP_TEA | KCP_3DES | KCP_TWOFISH | KCP_XTEA] to use KCP (github.com/xtaci/kcp-go) reliable UDP instead of TCP")
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"log"
|
||||
|
||||
"blitter.com/go/cryptmt"
|
||||
"github.com/aead/chacha20/chacha"
|
||||
"golang.org/x/crypto/blowfish"
|
||||
"golang.org/x/crypto/twofish"
|
||||
|
||||
|
@ -104,6 +105,18 @@ func (hc Conn) getStream(keymat []byte) (rc cipher.Stream, mc hash.Hash, err err
|
|||
case CAlgCryptMT1:
|
||||
rc = cryptmt.New(nil, nil, keymat)
|
||||
log.Printf("[cipher CRYPTMT1 (%d)]\n", copts)
|
||||
case CAlgChaCha20_12:
|
||||
keymat = expandKeyMat(keymat, chacha.KeySize)
|
||||
key = keymat[0:chacha.KeySize]
|
||||
ivlen = chacha.INonceSize
|
||||
iv = keymat[chacha.KeySize : chacha.KeySize+ivlen]
|
||||
rc, err = chacha.NewCipher(iv, key, 20)
|
||||
if err != nil {
|
||||
log.Printf("[ChaCha20 config error]\n")
|
||||
fmt.Printf("[ChaCha20 config error]\n")
|
||||
}
|
||||
// TODO: SetCounter() to something derived from key or nonce or extra keymat?
|
||||
log.Printf("[cipher CHACHA20_12 (%d)]\n", copts)
|
||||
default:
|
||||
log.Printf("[invalid cipher (%d)]\n", copts)
|
||||
fmt.Printf("DOOFUS SET A VALID CIPHER ALG (%d)\n", copts)
|
||||
|
|
|
@ -99,6 +99,7 @@ const (
|
|||
CAlgTwofish128 // golang.org/x/crypto/twofish
|
||||
CAlgBlowfish64 // golang.org/x/crypto/blowfish
|
||||
CAlgCryptMT1 //cryptmt using mtwist64
|
||||
CAlgChaCha20_12
|
||||
CAlgNoneDisallowed
|
||||
)
|
||||
|
||||
|
|
10
xsnet/net.go
10
xsnet/net.go
|
@ -145,6 +145,8 @@ func (c *CSCipherAlg) String() string {
|
|||
return "C_BLOWFISH_64"
|
||||
case CAlgCryptMT1:
|
||||
return "C_CRYPTMT1"
|
||||
case CAlgChaCha20_12:
|
||||
return "C_CHACHA20_12"
|
||||
default:
|
||||
return "C_ERR_UNK"
|
||||
}
|
||||
|
@ -280,6 +282,8 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) {
|
|||
hc.kex = KEX_HERRADURA512
|
||||
log.Printf("[KEx alg %d ?? defaults to %d]\n", kexAlg, hc.kex)
|
||||
}
|
||||
|
||||
//hc.logCipherText = true // !!! DEBUGGING ONLY !!! NEVER DEPLOY this uncommented !!!
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -298,7 +302,7 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) {
|
|||
//
|
||||
// Session (symmetric) crypto
|
||||
//
|
||||
// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1
|
||||
// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1 C_CHACHA20_12
|
||||
//
|
||||
// Session HMACs
|
||||
//
|
||||
|
@ -322,6 +326,10 @@ func (hc *Conn) applyConnExtensions(extensions ...string) {
|
|||
log.Println("[extension arg = C_CRYPTMT1]")
|
||||
hc.cipheropts &= (0xFFFFFF00)
|
||||
hc.cipheropts |= CAlgCryptMT1
|
||||
case "C_CHACHA20_12":
|
||||
log.Println("[extension arg = C_CHACHA20_12]")
|
||||
hc.cipheropts &= (0xFFFFFF00)
|
||||
hc.cipheropts |= CAlgChaCha20_12
|
||||
case "H_SHA256":
|
||||
log.Println("[extension arg = H_SHA256]")
|
||||
hc.cipheropts &= (0xFFFF00FF)
|
||||
|
|
Loading…
Reference in New Issue