mirror of https://gogs.blitter.com/RLabs/xs
1073 lines
24 KiB
ArmAsm
1073 lines
24 KiB
ArmAsm
|
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
|
||
|
// Use of this source code is governed by a license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
// +build amd64,!gccgo,!appengine,!nacl
|
||
|
|
||
|
#include "const.s"
|
||
|
#include "macro.s"
|
||
|
|
||
|
// FINALIZE XORs len bytes of src with len bytes of block, one byte per
// iteration, and stores every result byte to dst.
//
// All six arguments are 64-bit general purpose registers:
//   dst, src, block - byte pointers; each is advanced by len
//   len             - byte count; counted down to zero
//   t0, t1          - scratch; clobbered
//
// The loop condition is tested after the body, so len must be >= 1 on entry.
// The macro emits the label FINALIZE_LOOP, so it can be expanded at most once
// per TEXT block.
//
// The byte loads zero-extend into the full scratch registers, so no separate
// zeroing of t0/t1 is required. The final line deliberately has no trailing
// backslash so the macro cannot absorb whatever line follows it.
#define FINALIZE(dst, src, block, len, t0, t1) \
	FINALIZE_LOOP:; \
	MOVBQZX 0(src), t0; \
	MOVBQZX 0(block), t1; \
	XORQ    t0, t1; \
	MOVB    t1, 0(dst); \
	INCQ    src; \
	INCQ    block; \
	INCQ    dst; \
	DECQ    len; \
	JG      FINALIZE_LOOP
|
||
|
|
||
|
// Register aliases used by initialize and the hChaCha20* routines below.
//   Dst    - DI, loaded with the first pointer argument (state / out)
//   Nonce  - AX, loaded with the nonce pointer
//   Key    - BX, loaded with the key pointer
//   Rounds - DX, loop counter; the hChaCha20* routines start it at 20 and
//            subtract 2 per loop iteration (initialize does not use it)
#define Dst DI
|
||
|
#define Nonce AX
|
||
|
#define Key BX
|
||
|
#define Rounds DX
|
||
|
|
||
|
// func initialize(state *[64]byte, key []byte, nonce *[16]byte)
//
// Fills the 64-byte state with four 16-byte rows, in this order:
//   row 0: the ·sigma<> constant
//   row 1: bytes  0..15 at key's data pointer
//   row 2: bytes 16..31 at key's data pointer
//   row 3: the 16 bytes at nonce
// The length of key is not inspected here; 32 readable bytes are assumed.
TEXT ·initialize(SB), 4, $0-40
	MOVQ nonce+32(FP), Nonce
	MOVQ key+8(FP), Key
	MOVQ state+0(FP), Dst

	// Every row is read before the first one is written.
	MOVOU 0*16(Nonce), X3
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Key), X1
	MOVOU ·sigma<>(SB), X0

	MOVOU X0, 0*16(Dst)
	MOVOU X1, 1*16(Dst)
	MOVOU X2, 2*16(Dst)
	MOVOU X3, 3*16(Dst)
	RET
|
||
|
|
||
|
// func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the four rows (·sigma<>, key[0:16], key[16:32], nonce) in X0..X3,
// runs the round loop ten times (Rounds counts from 20 down in steps of 2)
// and writes rows 0 and 3 of the result to out.
//
// CHACHA_QROUND_AVX and CHACHA_SHUFFLE_AVX are defined outside this file
// view. X4 is handed to the quarter-round macro as scratch; X5 and X6 carry
// the low 16 bytes of ·rol16_AVX2<> and ·rol8_AVX2<> (presumably the byte
// shuffle masks for the 16-bit and 8-bit rotations - confirm in const.s).
TEXT ·hChaCha20AVX(SB), 4, $0-24
	MOVQ key+16(FP), Key
	MOVQ nonce+8(FP), Nonce
	MOVQ out+0(FP), Dst
	MOVQ $20, Rounds

	VMOVDQU ·rol16_AVX2<>(SB), X5
	VMOVDQU ·rol8_AVX2<>(SB), X6
	VMOVDQU ·sigma<>(SB), X0
	VMOVDQU 0*16(Key), X1
	VMOVDQU 1*16(Key), X2
	VMOVDQU 0*16(Nonce), X3

hchacha_rounds:
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_AVX(X1, X2, X3)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_AVX(X3, X2, X1)
	SUBQ $2, Rounds
	JNE  hchacha_rounds

	// Only the first and the last row form the output.
	VMOVDQU X0, 0*16(Dst)
	VMOVDQU X3, 1*16(Dst)
	VZEROUPPER
	RET
|
||
|
|
||
|
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the four rows (·sigma<>, key[0:16], key[16:32], nonce) in X0..X3,
// runs the round loop ten times (Rounds counts from 20 down in steps of 2)
// and writes rows 0 and 3 of the result to out.
//
// CHACHA_QROUND_SSE2 and CHACHA_SHUFFLE_SSE are defined outside this file
// view; X4 is handed to the quarter-round macro as scratch.
TEXT ·hChaCha20SSE2(SB), 4, $0-24
	MOVQ key+16(FP), Key
	MOVQ nonce+8(FP), Nonce
	MOVQ out+0(FP), Dst
	MOVQ $20, Rounds

	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3

hchacha_rounds:
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBQ $2, Rounds
	JNE  hchacha_rounds

	// Only the first and the last row form the output.
	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET
|
||
|
|
||
|
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
//
// Builds the four rows (·sigma<>, key[0:16], key[16:32], nonce) in X0..X3,
// runs the round loop ten times (Rounds counts from 20 down in steps of 2)
// and writes rows 0 and 3 of the result to out.
//
// CHACHA_QROUND_SSSE3 and CHACHA_SHUFFLE_SSE are defined outside this file
// view. X4 is handed to the quarter-round macro as scratch; X5 and X6 hold
// ·rol16<> and ·rol8<> (presumably the byte shuffle masks for the 16-bit and
// 8-bit rotations - confirm in const.s).
TEXT ·hChaCha20SSSE3(SB), 4, $0-24
	MOVQ key+16(FP), Key
	MOVQ nonce+8(FP), Nonce
	MOVQ out+0(FP), Dst
	MOVQ $20, Rounds

	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6
	MOVOU ·sigma<>(SB), X0
	MOVOU 0*16(Key), X1
	MOVOU 1*16(Key), X2
	MOVOU 0*16(Nonce), X3

hchacha_rounds:
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	SUBQ $2, Rounds
	JNE  hchacha_rounds

	// Only the first and the last row form the output.
	MOVOU X0, 0*16(Dst)
	MOVOU X3, 1*16(Dst)
	RET
|
||
|
|
||
|
// Drop the aliases used by initialize and the hChaCha20* routines so the
// names can be redefined for the xorKeyStream* routines.
#undef Dst
|
||
|
#undef Nonce
|
||
|
#undef Key
|
||
|
#undef Rounds
|
||
|
|
||
|
// Register aliases used by the xorKeyStream* routines below.
//   Dst     - DI,  loaded from dst_base; advanced as output is written
//   Src     - SI,  loaded from src_base; advanced as input is consumed
//   Len     - R12, loaded from src_len; bytes still to process, also the
//                  value stored to the result slot on return
//   Rounds  - DX,  loaded from the rounds argument
//   Buffer  - BX,  loaded from block; receives the last keystream block
//                  when the input ends with a partial block
//   State   - AX,  loaded from state; row 3 is written back on return
//   Stack   - SP,  re-aligned to 16 bytes inside each routine
//   SavedSP - R8,  holds the original SP until it is restored
//   Tmp0    - R9,  per-pass round counter, and byte count for FINALIZE
//   Tmp1    - R10, scratch passed to FINALIZE
//   Tmp2    - R11, scratch passed to FINALIZE
#define Dst DI
|
||
|
#define Src SI
|
||
|
#define Len R12
|
||
|
#define Rounds DX
|
||
|
#define Buffer BX
|
||
|
#define State AX
|
||
|
#define Stack SP
|
||
|
#define SavedSP R8
|
||
|
#define Tmp0 R9
|
||
|
#define Tmp1 R10
|
||
|
#define Tmp2 R11
|
||
|
|
||
|
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs len(src) bytes of src with keystream into dst, generating up to four
// 64-byte blocks per pass. Row 3 of state is advanced by ·one<> once per
// generated block and written back on return. When the input ends with a
// partial block, the complete last keystream block is stored to block and
// the number of tail bytes taken from it is returned; otherwise 0 is returned.
//
// CHACHA_QROUND_SSE2, CHACHA_SHUFFLE_SSE and XOR_SSE are defined outside this
// file view. XOR_SSE receives Dst, Src, a byte offset, four row registers and
// one scratch register.
//
// Stack slots (SP is aligned to 16 bytes by hand, original SP in SavedSP):
//   0*16..3*16  state rows as loaded on entry (row 3 is refreshed at the
//               start of every four-block pass)
//   4*16        ·one<>
//   5*16        spill slot for the scratch register of the four-block pass
TEXT ·xorKeyStreamSSE2(SB), 4, $112-80
	MOVQ state+56(FP), State
	MOVQ block+48(FP), Buffer
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len
	MOVQ src_base+24(FP), Src
	MOVQ dst_base+0(FP), Dst

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Align the scratch area so MOVO can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JEQ   finish

	MOVOU ·one<>(SB), X4
	MOVO  X0, 0*16(Stack)
	MOVO  X1, 1*16(Stack)
	MOVO  X2, 2*16(Stack)
	MOVO  X3, 3*16(Stack)
	MOVO  X4, 4*16(Stack)

	// Pick the smallest pass that covers the input.
	CMPQ Len, $64
	JLE  gen_1block
	CMPQ Len, $128
	JLE  gen_2blocks
	CMPQ Len, $192
	JLE  gen_3blocks

gen_4blocks:
	// Rows 0-2 are the same for all four blocks.
	MOVO X0, X12
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X13
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X14
	MOVO X2, X10
	MOVO X2, X6

	// Row 3 of blocks 1..3 is X3 stepped by one, two and three.
	MOVO  X3, X15
	PADDQ 4*16(Stack), X15
	MOVO  X15, X11
	PADDQ 4*16(Stack), X11
	MOVO  X11, X7
	PADDQ 4*16(Stack), X7

	MOVO X3, 3*16(Stack) // keep the current row 3 for the feed-forward
	MOVQ Rounds, Tmp0

rounds_4blocks:
	// All sixteen XMM registers hold block data, so the scratch register
	// is borrowed from a block that is parked in the spill slot meanwhile.
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X4)
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_4blocks

	// Block 0: add the saved input rows and emit it right away.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO  X4, 5*16(Stack) // X4 is live block data; park it while it serves as scratch
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO  5*16(Stack), X4

	// Reload the input rows; row 3 becomes the input of block 1.
	MOVO  0*16(Stack), X0
	MOVO  1*16(Stack), X1
	MOVO  2*16(Stack), X2
	MOVO  3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	// Feed-forward for blocks 1..3, rows 0-2.
	PADDL X0, X12
	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X13
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X14
	PADDL X2, X10
	PADDL X2, X6

	// Row 3, stepping X3 after each block; it ends four steps past its start.
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // X0 served as scratch above
	ADDQ $192, Src
	ADDQ $192, Dst
	SUBQ $192, Len

	// The fourth block (X4..X7) is either consumed whole or buffered.
	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64  // Len <= 64       -> one more block
	JLE  gen_1block
	CMPQ Len, $128 // 64 < Len <= 128 -> two more blocks
	JLE  gen_2blocks
	CMPQ Len, $192 // Len > 192 -> another four-block pass, else fall into three
	JGT  gen_4blocks

gen_3blocks:
	MOVO X0, X12
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X13
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X14
	MOVO X2, X10
	MOVO X2, X6

	MOVO  X3, X15
	MOVO  X3, X11
	PADDQ 4*16(Stack), X11
	MOVO  X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ  Rounds, Tmp0

rounds_3blocks:
	// X0 doubles as scratch here; it is reloaded from the stack afterwards.
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X12, X13, X14, X15, X0)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X0)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_3blocks

	MOVO  0*16(Stack), X0
	PADDL X0, X12
	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X13
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X14
	PADDL X2, X10
	PADDL X2, X6
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // X0 served as scratch above
	ADDQ $128, Src
	ADDQ $128, Dst
	SUBQ $128, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64
	JLE  gen_1block

gen_2blocks:
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X10
	MOVO X2, X6

	MOVO  X3, X11
	MOVO  X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ  Rounds, Tmp0

rounds_2blocks:
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X8, X9, X10, X11, X12)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X12)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_2blocks

	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X10
	PADDL X2, X6
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish // nothing left; otherwise fall into the single-block pass

gen_1block:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

rounds_1block:
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSE2(X4, X5, X6, X7, X8)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_1block

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JMP  finish // a whole block was consumed, so there is nothing to buffer

buffer_tail:
	// Fewer than 64 bytes remain: publish the block, then XOR byte-wise.
	// Len itself is left untouched because it is the return value.
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ  Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

finish:
	MOVQ  SavedSP, Stack // SP must be restored before FP-relative stores
	MOVOU X3, 3*16(State)
	MOVQ  Len, ret+72(FP)
	RET
|
||
|
|
||
|
// func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs len(src) bytes of src with keystream into dst, generating up to four
// 64-byte blocks per pass. Row 3 of state is advanced by ·one<> once per
// generated block and written back on return. When the input ends with a
// partial block, the complete last keystream block is stored to block and
// the number of tail bytes taken from it is returned; otherwise 0 is returned.
//
// CHACHA_QROUND_SSSE3, CHACHA_SHUFFLE_SSE and XOR_SSE are defined outside
// this file view. The quarter-round macro takes four rows, one scratch
// register and the two rotate constants (register or memory operands).
//
// Stack slots (SP is aligned to 16 bytes by hand, original SP in SavedSP):
//   0*16..3*16  state rows as loaded on entry (row 3 is refreshed at the
//               start of every four-block pass)
//   4*16        ·one<>
//   5*16        spill slot for the scratch register of the four-block pass
//   6*16        ·rol16<> (16 bit rotate-left constant)
//   7*16        ·rol8<>  (8 bit rotate-left constant)
TEXT ·xorKeyStreamSSSE3(SB), 4, $144-80
	MOVQ state+56(FP), State
	MOVQ block+48(FP), Buffer
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len
	MOVQ src_base+24(FP), Src
	MOVQ dst_base+0(FP), Dst

	MOVOU 0*16(State), X0
	MOVOU 1*16(State), X1
	MOVOU 2*16(State), X2
	MOVOU 3*16(State), X3

	// Align the scratch area so MOVO can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JEQ   finish

	MOVOU ·one<>(SB), X4
	MOVOU ·rol16<>(SB), X5
	MOVOU ·rol8<>(SB), X6
	MOVO  X0, 0*16(Stack)
	MOVO  X1, 1*16(Stack)
	MOVO  X2, 2*16(Stack)
	MOVO  X3, 3*16(Stack)
	MOVO  X4, 4*16(Stack)
	MOVO  X5, 6*16(Stack)
	MOVO  X6, 7*16(Stack)

	// Pick the smallest pass that covers the input.
	CMPQ Len, $64
	JLE  gen_1block
	CMPQ Len, $128
	JLE  gen_2blocks
	CMPQ Len, $192
	JLE  gen_3blocks

gen_4blocks:
	// Rows 0-2 are the same for all four blocks.
	MOVO X0, X12
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X13
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X14
	MOVO X2, X10
	MOVO X2, X6

	// Row 3 of blocks 1..3 is X3 stepped by one, two and three.
	MOVO  X3, X15
	PADDQ 4*16(Stack), X15
	MOVO  X15, X11
	PADDQ 4*16(Stack), X11
	MOVO  X11, X7
	PADDQ 4*16(Stack), X7

	MOVO X3, 3*16(Stack) // keep the current row 3 for the feed-forward
	MOVQ Rounds, Tmp0

rounds_4blocks:
	// All sixteen XMM registers hold block data, so the rotate constants
	// are read from the stack and the scratch register is borrowed from a
	// block that is parked in the spill slot meanwhile.
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X1, X2, X3)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	MOVO X4, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X4
	MOVO X0, 5*16(Stack)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	MOVO 5*16(Stack), X0
	CHACHA_SHUFFLE_SSE(X3, X2, X1)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_4blocks

	// Block 0: add the saved input rows and emit it right away.
	PADDL 0*16(Stack), X0
	PADDL 1*16(Stack), X1
	PADDL 2*16(Stack), X2
	PADDL 3*16(Stack), X3
	MOVO  X4, 5*16(Stack) // X4 is live block data; park it while it serves as scratch
	XOR_SSE(Dst, Src, 0, X0, X1, X2, X3, X4)
	MOVO  5*16(Stack), X4

	// Reload the input rows; row 3 becomes the input of block 1.
	MOVO  0*16(Stack), X0
	MOVO  1*16(Stack), X1
	MOVO  2*16(Stack), X2
	MOVO  3*16(Stack), X3
	PADDQ 4*16(Stack), X3

	// Feed-forward for blocks 1..3, rows 0-2.
	PADDL X0, X12
	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X13
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X14
	PADDL X2, X10
	PADDL X2, X6

	// Row 3, stepping X3 after each block; it ends four steps past its start.
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 128, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // X0 served as scratch above
	ADDQ $192, Src
	ADDQ $192, Dst
	SUBQ $192, Len

	// The fourth block (X4..X7) is either consumed whole or buffered.
	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64  // Len <= 64       -> one more block
	JLE  gen_1block
	CMPQ Len, $128 // 64 < Len <= 128 -> two more blocks
	JLE  gen_2blocks
	CMPQ Len, $192 // Len > 192 -> another four-block pass, else fall into three
	JGT  gen_4blocks

gen_3blocks:
	MOVO X0, X12
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X13
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X14
	MOVO X2, X10
	MOVO X2, X6

	MOVO  X3, X15
	MOVO  X3, X11
	PADDQ 4*16(Stack), X11
	MOVO  X11, X7
	PADDQ 4*16(Stack), X7
	MOVQ  Rounds, Tmp0

	// Rows 0-2 have been copied, so X0..X2 are free to serve as scratch
	// and rotate constants during the rounds.
	MOVO 6*16(Stack), X1 // 16 bit rotate-left constant
	MOVO 7*16(Stack), X2 // 8 bit rotate-left constant

rounds_3blocks:
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X13, X14, X15)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_SSE(X15, X14, X13)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_3blocks

	// Bring rows 0-2 back before the feed-forward.
	MOVO  0*16(Stack), X0
	MOVO  1*16(Stack), X1
	MOVO  2*16(Stack), X2
	PADDL X0, X12
	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X13
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X14
	PADDL X2, X10
	PADDL X2, X6
	PADDL X3, X15
	PADDQ 4*16(Stack), X3
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_SSE(Dst, Src, 64, X8, X9, X10, X11, X0)
	MOVO 0*16(Stack), X0 // X0 served as scratch above
	ADDQ $128, Src
	ADDQ $128, Dst
	SUBQ $128, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64
	JLE  gen_1block

gen_2blocks:
	MOVO X0, X8
	MOVO X0, X4
	MOVO X1, X9
	MOVO X1, X5
	MOVO X2, X10
	MOVO X2, X6

	MOVO  X3, X11
	MOVO  X3, X7
	PADDQ 4*16(Stack), X7
	MOVQ  Rounds, Tmp0

	MOVO 6*16(Stack), X13 // 16 bit rotate-left constant
	MOVO 7*16(Stack), X14 // 8 bit rotate-left constant

rounds_2blocks:
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X9, X10, X11)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_SSE(X11, X10, X9)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_2blocks

	PADDL X0, X8
	PADDL X0, X4
	PADDL X1, X9
	PADDL X1, X5
	PADDL X2, X10
	PADDL X2, X6
	PADDL X3, X11
	PADDQ 4*16(Stack), X3
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	XOR_SSE(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish // nothing left; otherwise fall into the single-block pass

gen_1block:
	MOVO X0, X4
	MOVO X1, X5
	MOVO X2, X6
	MOVO X3, X7
	MOVQ Rounds, Tmp0

	MOVO 6*16(Stack), X9  // 16 bit rotate-left constant
	MOVO 7*16(Stack), X10 // 8 bit rotate-left constant

rounds_1block:
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X5, X6, X7)
	CHACHA_QROUND_SSSE3(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_SSE(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_1block

	PADDL X0, X4
	PADDL X1, X5
	PADDL X2, X6
	PADDL X3, X7
	PADDQ 4*16(Stack), X3

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JMP  finish // a whole block was consumed, so there is nothing to buffer

buffer_tail:
	// Fewer than 64 bytes remain: publish the block, then XOR byte-wise.
	// Len itself is left untouched because it is the return value.
	MOVOU X4, 0*16(Buffer)
	MOVOU X5, 1*16(Buffer)
	MOVOU X6, 2*16(Buffer)
	MOVOU X7, 3*16(Buffer)
	MOVQ  Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

finish:
	MOVQ  SavedSP, Stack // SP must be restored before FP-relative stores
	MOVOU X3, 3*16(State)
	MOVQ  Len, ret+72(FP)
	RET
|
||
|
|
||
|
// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs len(src) bytes of src with keystream into dst, generating up to four
// 64-byte blocks per pass. Row 3 of state is advanced by ·one<> once per
// generated block and written back on return. When the input ends with a
// partial block, the complete last keystream block is stored to block and
// the number of tail bytes taken from it is returned; otherwise 0 is returned.
//
// CHACHA_QROUND_AVX, CHACHA_SHUFFLE_AVX and XOR_AVX are defined outside this
// file view. The quarter-round macro takes four rows, one scratch register
// and the two rotate constants (register or memory operands).
//
// Stack slots (SP is aligned to 16 bytes by hand, original SP in SavedSP):
//   0*16..3*16  state rows as loaded on entry (row 3 is refreshed at the
//               start of every four-block pass)
//   4*16        ·one<>
//   5*16        spill slot for the scratch register of the four-block pass
//   6*16        ·rol16<> (16 bit rotate-left constant)
//   7*16        ·rol8<>  (8 bit rotate-left constant)
TEXT ·xorKeyStreamAVX(SB), 4, $144-80
	MOVQ state+56(FP), State
	MOVQ block+48(FP), Buffer
	MOVQ rounds+64(FP), Rounds
	MOVQ src_len+32(FP), Len
	MOVQ src_base+24(FP), Src
	MOVQ dst_base+0(FP), Dst

	VMOVDQU 0*16(State), X0
	VMOVDQU 1*16(State), X1
	VMOVDQU 2*16(State), X2
	VMOVDQU 3*16(State), X3

	// Align the scratch area so VMOVDQA can be used on it.
	MOVQ Stack, SavedSP
	ADDQ $16, Stack
	ANDQ $-16, Stack

	TESTQ Len, Len
	JEQ   finish

	VMOVDQU ·one<>(SB), X4
	VMOVDQU ·rol16<>(SB), X5
	VMOVDQU ·rol8<>(SB), X6
	VMOVDQA X0, 0*16(Stack)
	VMOVDQA X1, 1*16(Stack)
	VMOVDQA X2, 2*16(Stack)
	VMOVDQA X3, 3*16(Stack)
	VMOVDQA X4, 4*16(Stack)
	VMOVDQA X5, 6*16(Stack)
	VMOVDQA X6, 7*16(Stack)

	// Pick the smallest pass that covers the input.
	CMPQ Len, $64
	JLE  gen_1block
	CMPQ Len, $128
	JLE  gen_2blocks
	CMPQ Len, $192
	JLE  gen_3blocks

gen_4blocks:
	// Rows 0-2 are the same for all four blocks.
	VMOVDQA X0, X12
	VMOVDQA X0, X8
	VMOVDQA X0, X4
	VMOVDQA X1, X13
	VMOVDQA X1, X9
	VMOVDQA X1, X5
	VMOVDQA X2, X14
	VMOVDQA X2, X10
	VMOVDQA X2, X6

	// Row 3 of blocks 1..3 is X3 stepped by one, two and three.
	VMOVDQA X3, X15
	VPADDQ  4*16(Stack), X15, X15
	VMOVDQA X15, X11
	VPADDQ  4*16(Stack), X11, X11
	VMOVDQA X11, X7
	VPADDQ  4*16(Stack), X7, X7

	VMOVDQA X3, 3*16(Stack) // keep the current row 3 for the feed-forward
	MOVQ    Rounds, Tmp0

rounds_4blocks:
	// All sixteen XMM registers hold block data, so the rotate constants
	// are read from the stack and the scratch register is borrowed from a
	// block that is parked in the spill slot meanwhile.
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X1, X2, X3)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	VMOVDQA X4, 5*16(Stack)
	CHACHA_QROUND_AVX(X0, X1, X2, X3, X4, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X4, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X4
	VMOVDQA X0, 5*16(Stack)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, 6*16(Stack), 7*16(Stack))
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, 6*16(Stack), 7*16(Stack))
	VMOVDQA 5*16(Stack), X0
	CHACHA_SHUFFLE_AVX(X3, X2, X1)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_4blocks

	// Block 0: add the saved input rows and emit it right away.
	VPADDD  0*16(Stack), X0, X0
	VPADDD  1*16(Stack), X1, X1
	VPADDD  2*16(Stack), X2, X2
	VPADDD  3*16(Stack), X3, X3
	VMOVDQA X4, 5*16(Stack) // X4 is live block data; park it while it serves as scratch
	XOR_AVX(Dst, Src, 0, X0, X1, X2, X3, X4)
	VMOVDQA 5*16(Stack), X4

	// Reload the input rows; row 3 becomes the input of block 1.
	VMOVDQA 0*16(Stack), X0
	VMOVDQA 1*16(Stack), X1
	VMOVDQA 2*16(Stack), X2
	VMOVDQA 3*16(Stack), X3
	VPADDQ  4*16(Stack), X3, X3

	// Feed-forward for blocks 1..3, rows 0-2.
	VPADDD X0, X12, X12
	VPADDD X0, X8, X8
	VPADDD X0, X4, X4
	VPADDD X1, X13, X13
	VPADDD X1, X9, X9
	VPADDD X1, X5, X5
	VPADDD X2, X14, X14
	VPADDD X2, X10, X10
	VPADDD X2, X6, X6

	// Row 3, stepping X3 after each block; it ends four steps past its start.
	VPADDD X3, X15, X15
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 64, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 128, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // X0 served as scratch above
	ADDQ    $192, Src
	ADDQ    $192, Dst
	SUBQ    $192, Len

	// The fourth block (X4..X7) is either consumed whole or buffered.
	CMPQ Len, $64
	JLT  buffer_tail

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64  // Len <= 64       -> one more block
	JLE  gen_1block
	CMPQ Len, $128 // 64 < Len <= 128 -> two more blocks
	JLE  gen_2blocks
	CMPQ Len, $192 // Len > 192 -> another four-block pass, else fall into three
	JGT  gen_4blocks

gen_3blocks:
	VMOVDQA X0, X12
	VMOVDQA X0, X8
	VMOVDQA X0, X4
	VMOVDQA X1, X13
	VMOVDQA X1, X9
	VMOVDQA X1, X5
	VMOVDQA X2, X14
	VMOVDQA X2, X10
	VMOVDQA X2, X6

	VMOVDQA X3, X15
	VMOVDQA X3, X11
	VPADDQ  4*16(Stack), X11, X11
	VMOVDQA X11, X7
	VPADDQ  4*16(Stack), X7, X7
	MOVQ    Rounds, Tmp0

	// Rows 0-2 have been copied, so X0..X2 are free to serve as scratch
	// and rotate constants during the rounds.
	VMOVDQA 6*16(Stack), X1 // 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X2 // 8 bit rotate-left constant

rounds_3blocks:
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X12, X13, X14, X15, X0, X1, X2)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X0, X1, X2)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X0, X1, X2)
	CHACHA_SHUFFLE_AVX(X15, X14, X13)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_3blocks

	// Bring rows 0-2 back before the feed-forward.
	VMOVDQA 0*16(Stack), X0
	VMOVDQA 1*16(Stack), X1
	VMOVDQA 2*16(Stack), X2
	VPADDD  X0, X12, X12
	VPADDD  X0, X8, X8
	VPADDD  X0, X4, X4
	VPADDD  X1, X13, X13
	VPADDD  X1, X9, X9
	VPADDD  X1, X5, X5
	VPADDD  X2, X14, X14
	VPADDD  X2, X10, X10
	VPADDD  X2, X6, X6
	VPADDD  X3, X15, X15
	VPADDQ  4*16(Stack), X3, X3
	VPADDD  X3, X11, X11
	VPADDQ  4*16(Stack), X3, X3
	VPADDD  X3, X7, X7
	VPADDQ  4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X12, X13, X14, X15, X0)
	XOR_AVX(Dst, Src, 64, X8, X9, X10, X11, X0)
	VMOVDQA 0*16(Stack), X0 // X0 served as scratch above
	ADDQ    $128, Src
	ADDQ    $128, Dst
	SUBQ    $128, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish
	CMPQ Len, $64
	JLE  gen_1block

gen_2blocks:
	VMOVDQA X0, X8
	VMOVDQA X0, X4
	VMOVDQA X1, X9
	VMOVDQA X1, X5
	VMOVDQA X2, X10
	VMOVDQA X2, X6

	VMOVDQA X3, X11
	VMOVDQA X3, X7
	VPADDQ  4*16(Stack), X7, X7
	MOVQ    Rounds, Tmp0

	VMOVDQA 6*16(Stack), X13 // 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X14 // 8 bit rotate-left constant

rounds_2blocks:
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X9, X10, X11)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X8, X9, X10, X11, X12, X13, X14)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X12, X13, X14)
	CHACHA_SHUFFLE_AVX(X11, X10, X9)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_2blocks

	VPADDD X0, X8, X8
	VPADDD X0, X4, X4
	VPADDD X1, X9, X9
	VPADDD X1, X5, X5
	VPADDD X2, X10, X10
	VPADDD X2, X6, X6
	VPADDD X3, X11, X11
	VPADDQ 4*16(Stack), X3, X3
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	XOR_AVX(Dst, Src, 0, X8, X9, X10, X11, X12)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Src
	ADDQ $64, Dst
	SUBQ $64, Len
	JEQ  finish // nothing left; otherwise fall into the single-block pass

gen_1block:
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7
	MOVQ    Rounds, Tmp0

	VMOVDQA 6*16(Stack), X9  // 16 bit rotate-left constant
	VMOVDQA 7*16(Stack), X10 // 8 bit rotate-left constant

rounds_1block:
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X8, X9, X10)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, Tmp0
	JNE  rounds_1block

	VPADDD X0, X4, X4
	VPADDD X1, X5, X5
	VPADDD X2, X6, X6
	VPADDD X3, X7, X7
	VPADDQ 4*16(Stack), X3, X3

	CMPQ Len, $64
	JLT  buffer_tail

	XOR_AVX(Dst, Src, 0, X4, X5, X6, X7, X8)
	ADDQ $64, Dst
	ADDQ $64, Src
	SUBQ $64, Len
	JMP  finish // a whole block was consumed, so there is nothing to buffer

buffer_tail:
	// Fewer than 64 bytes remain: publish the block, then XOR byte-wise.
	// Len itself is left untouched because it is the return value.
	VMOVDQU X4, 0*16(Buffer)
	VMOVDQU X5, 1*16(Buffer)
	VMOVDQU X6, 2*16(Buffer)
	VMOVDQU X7, 3*16(Buffer)
	MOVQ    Len, Tmp0
	FINALIZE(Dst, Src, Buffer, Tmp0, Tmp1, Tmp2)

finish:
	MOVQ    SavedSP, Stack // SP must be restored before FP-relative stores
	VMOVDQU X3, 3*16(State)
	VZEROUPPER
	MOVQ    Len, ret+72(FP)
	RET
|
||
|
|
||
|
// Release the register aliases used by the xorKeyStream* routines above so
// the names do not leak into anything that follows.
#undef Dst
|
||
|
#undef Src
|
||
|
#undef Len
|
||
|
#undef Rounds
|
||
|
#undef Buffer
|
||
|
#undef State
|
||
|
#undef Stack
|
||
|
#undef SavedSP
|
||
|
#undef Tmp0
|
||
|
#undef Tmp1
|
||
|
#undef Tmp2
|