xs/vendor/github.com/aead/chacha20/chacha/chacha_386.s

164 lines
3.5 KiB
ArmAsm

// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl
#include "const.s"
#include "macro.s"
// FINALIZE xors len bytes from src and block using
// the temp. registers t0 and t1 and writes the result
// to dst.
#define FINALIZE(dst, src, block, len, t0, t1) \
XORL t0, t0; \
XORL t1, t1; \
FINALIZE_LOOP:; \
MOVB 0(src), t0; \
MOVB 0(block), t1; \
XORL t0, t1; \
MOVB t1, 0(dst); \
INCL src; \
INCL block; \
INCL dst; \
DECL len; \
JG FINALIZE_LOOP \
#define Dst DI
#define Nonce AX
#define Key BX
#define Rounds DX
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSE2(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
chacha_loop:
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
MOVOU ·rol16<>(SB), X5
MOVOU ·rol8<>(SB), X6
chacha_loop:
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
#undef Dst
#undef Nonce
#undef Key
#undef Rounds
#define State AX
#define Dst DI
#define Src SI
#define Len DX
#define Tmp0 BX
#define Tmp1 BP
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
MOVL dst_base+0(FP), Dst
MOVL src_base+12(FP), Src
MOVL state+28(FP), State
MOVL src_len+16(FP), Len
MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0
MOVOU 0*16(State), X0
MOVOU 1*16(State), X1
MOVOU 2*16(State), X2
MOVOU 3*16(State), X3
TESTL Len, Len
JZ DONE
GENERATE_KEYSTREAM:
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
MOVL rounds+32(FP), Tmp0
CHACHA_LOOP:
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X5, X6, X7)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X7, X6, X5)
SUBL $2, Tmp0
JA CHACHA_LOOP
MOVOU 0*16(State), X0 // Restore X0 from state
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
MOVOU ·one<>(SB), X0
PADDQ X0, X3
CMPL Len, $64
JL BUFFER_KEYSTREAM
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
MOVOU 0*16(State), X0 // Restore X0 from state
ADDL $64, Src
ADDL $64, Dst
SUBL $64, Len
JZ DONE
JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte
BUFFER_KEYSTREAM:
MOVL block+24(FP), State
MOVOU X4, 0(State)
MOVOU X5, 16(State)
MOVOU X6, 32(State)
MOVOU X7, 48(State)
MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64
FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)
DONE:
MOVL state+28(FP), State
MOVOU X3, 3*16(State)
RET
#undef State
#undef Dst
#undef Src
#undef Len
#undef Tmp0
#undef Tmp1