// xs/vendor/blitter.com/go/chacha20/internal/hardware/impl_amd64.s

// Copyright (C) 2019 Yawning Angel
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

// +build !noasm

#include "textflag.h"

DATA ·chacha_constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha_constants<>+0x04(SB)/4, $0x3320646E
DATA ·chacha_constants<>+0x08(SB)/4, $0x79622D32
DATA ·chacha_constants<>+0x0c(SB)/4, $0x6B206574
DATA ·chacha_constants<>+0x10(SB)/8, $0x0504070601000302
DATA ·chacha_constants<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
DATA ·chacha_constants<>+0x20(SB)/8, $0x0605040702010003
DATA ·chacha_constants<>+0x28(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·chacha_constants<>(SB), (NOPTR+RODATA), $48
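
// ·chacha_constants holds the ChaCha "expand 32-byte k" sigma constant,
// followed by two PSHUFB shuffle masks: the first rotates each 32-bit lane
// left by 16 bits, the second by 8 bits.  They implement the 16- and 8-bit
// rotations of the ChaCha quarter round.
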
// func blocksAVX2(s *[api.StateSize]uint32, in, out []byte)
TEXT ·blocksAVX2(SB), NOSPLIT, $576-56
// This is Andrew Moon's AVX2 ChaCha implementation taken from
// supercop-20171218, with some minor changes, primarily calling
// convention and assembly dialect related.
// Align the stack on a 64 byte boundary.
MOVQ SP, BP
ADDQ $64, BP
ANDQ $-64, BP
// Go calling convention -> SYSV AMD64 (and a fixup).
MOVQ s+0(FP), DI // &s -> DI
ADDQ $16, DI // Skip the ChaCha constants in the chachaState.
MOVQ in+8(FP), SI // &in[0] -> SI
MOVQ out+32(FP), DX // &out[0] -> DX
MOVQ in_len+16(FP), CX // len(in) -> CX
// Begin the main body of `chacha_blocks_avx2`.
//
// Mostly a direct translation except:
// * The number of rounds is always 20.
// * %rbp is used instead of %rsp.
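//
// Scratch frame layout (offsets from the 64 byte aligned BP), as used below:
//   0(BP)   - sigma constants
//   16(BP)  - key words 0..3
//   32(BP)  - key words 4..7
//   48(BP)  - 64-bit block counter, then nonce words
//   96(BP)  - per-round spill slot
//   128(BP) - low dwords of the per-block counters
//   160(BP) - high dwords of the per-block counters
//   192(BP) - second half of the wide state, saved during writeback
//   448(BP) - rotate-by-16 shuffle mask (stored twice for a full YMM load)
//   480(BP) - rotate-by-8 shuffle mask (stored twice for a full YMM load)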
LEAQ ·chacha_constants<>(SB), AX
VMOVDQU 0(AX), X8
VMOVDQU 16(AX), X6
VMOVDQU 32(AX), X7
VMOVDQU 0(DI), X9
VMOVDQU 16(DI), X10
VMOVDQU 32(DI), X11
// MOVQ 48(DI), AX
MOVQ $1, R9
VMOVDQA X8, 0(BP)
VMOVDQA X9, 16(BP)
VMOVDQA X10, 32(BP)
VMOVDQA X11, 48(BP)
// MOVQ AX, 64(BP)
VMOVDQA X6, 448(BP)
VMOVDQA X6, 464(BP)
VMOVDQA X7, 480(BP)
VMOVDQA X7, 496(BP)
CMPQ CX, $512
JAE chacha_blocks_avx2_atleast512
CMPQ CX, $256
JAE chacha_blocks_avx2_atleast256
JMP chacha_blocks_avx2_below256
chacha_blocks_avx2_atleast512:
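// Materialize eight consecutive block counters (ctr+0 .. ctr+7), split them
// into low/high dword lanes at 128(BP)/160(BP), and store ctr+8 back to
// 48(BP) for the next batch.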
MOVQ 48(BP), AX
LEAQ 1(AX), R8
LEAQ 2(AX), R9
LEAQ 3(AX), R10
LEAQ 4(AX), BX
LEAQ 5(AX), R11
LEAQ 6(AX), R12
LEAQ 7(AX), R13
LEAQ 8(AX), R14
MOVL AX, 128(BP)
MOVL R8, 4+128(BP)
MOVL R9, 8+128(BP)
MOVL R10, 12+128(BP)
MOVL BX, 16+128(BP)
MOVL R11, 20+128(BP)
MOVL R12, 24+128(BP)
MOVL R13, 28+128(BP)
SHRQ $32, AX
SHRQ $32, R8
SHRQ $32, R9
SHRQ $32, R10
SHRQ $32, BX
SHRQ $32, R11
SHRQ $32, R12
SHRQ $32, R13
MOVL AX, 160(BP)
MOVL R8, 4+160(BP)
MOVL R9, 8+160(BP)
MOVL R10, 12+160(BP)
MOVL BX, 16+160(BP)
MOVL R11, 20+160(BP)
MOVL R12, 24+160(BP)
MOVL R13, 28+160(BP)
MOVQ R14, 48(BP)
// MOVQ 64(BP), AX
MOVQ $20, AX
VPBROADCASTD 0(BP), Y0
VPBROADCASTD 4+0(BP), Y1
VPBROADCASTD 8+0(BP), Y2
VPBROADCASTD 12+0(BP), Y3
VPBROADCASTD 16(BP), Y4
VPBROADCASTD 4+16(BP), Y5
VPBROADCASTD 8+16(BP), Y6
VPBROADCASTD 12+16(BP), Y7
VPBROADCASTD 32(BP), Y8
VPBROADCASTD 4+32(BP), Y9
VPBROADCASTD 8+32(BP), Y10
VPBROADCASTD 12+32(BP), Y11
VPBROADCASTD 8+48(BP), Y14
VPBROADCASTD 12+48(BP), Y15
VMOVDQA 128(BP), Y12
VMOVDQA 160(BP), Y13
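// Each pass through mainloop1 performs one column round and one diagonal
// round (two of the 20 ChaCha rounds) across eight blocks held in Y0..Y15,
// spilling one register through 96(BP) as scratch.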
chacha_blocks_avx2_mainloop1:
VPADDD Y0, Y4, Y0
VPADDD Y1, Y5, Y1
VPXOR Y12, Y0, Y12
VPXOR Y13, Y1, Y13
VPADDD Y2, Y6, Y2
VPADDD Y3, Y7, Y3
VPXOR Y14, Y2, Y14
VPXOR Y15, Y3, Y15
VPSHUFB 448(BP), Y12, Y12
VPSHUFB 448(BP), Y13, Y13
VPADDD Y8, Y12, Y8
VPADDD Y9, Y13, Y9
VPSHUFB 448(BP), Y14, Y14
VPSHUFB 448(BP), Y15, Y15
VPADDD Y10, Y14, Y10
VPADDD Y11, Y15, Y11
VMOVDQA Y12, 96(BP)
VPXOR Y4, Y8, Y4
VPXOR Y5, Y9, Y5
VPSLLD $ 12, Y4, Y12
VPSRLD $20, Y4, Y4
VPXOR Y4, Y12, Y4
VPSLLD $ 12, Y5, Y12
VPSRLD $20, Y5, Y5
VPXOR Y5, Y12, Y5
VPXOR Y6, Y10, Y6
VPXOR Y7, Y11, Y7
VPSLLD $ 12, Y6, Y12
VPSRLD $20, Y6, Y6
VPXOR Y6, Y12, Y6
VPSLLD $ 12, Y7, Y12
VPSRLD $20, Y7, Y7
VPXOR Y7, Y12, Y7
VPADDD Y0, Y4, Y0
VPADDD Y1, Y5, Y1
VPXOR 96(BP), Y0, Y12
VPXOR Y13, Y1, Y13
VPADDD Y2, Y6, Y2
VPADDD Y3, Y7, Y3
VPXOR Y14, Y2, Y14
VPXOR Y15, Y3, Y15
VPSHUFB 480(BP), Y12, Y12
VPSHUFB 480(BP), Y13, Y13
VPADDD Y8, Y12, Y8
VPADDD Y9, Y13, Y9
VPSHUFB 480(BP), Y14, Y14
VPSHUFB 480(BP), Y15, Y15
VPADDD Y10, Y14, Y10
VPADDD Y11, Y15, Y11
VMOVDQA Y12, 96(BP)
VPXOR Y4, Y8, Y4
VPXOR Y5, Y9, Y5
VPSLLD $ 7, Y4, Y12
VPSRLD $25, Y4, Y4
VPXOR Y4, Y12, Y4
VPSLLD $ 7, Y5, Y12
VPSRLD $25, Y5, Y5
VPXOR Y5, Y12, Y5
VPXOR Y6, Y10, Y6
VPXOR Y7, Y11, Y7
VPSLLD $ 7, Y6, Y12
VPSRLD $25, Y6, Y6
VPXOR Y6, Y12, Y6
VPSLLD $ 7, Y7, Y12
VPSRLD $25, Y7, Y7
VPXOR Y7, Y12, Y7
VPADDD Y0, Y5, Y0
VPADDD Y1, Y6, Y1
VPXOR Y15, Y0, Y15
VPXOR 96(BP), Y1, Y12
VPADDD Y2, Y7, Y2
VPADDD Y3, Y4, Y3
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPSHUFB 448(BP), Y15, Y15
VPSHUFB 448(BP), Y12, Y12
VPADDD Y10, Y15, Y10
VPADDD Y11, Y12, Y11
VPSHUFB 448(BP), Y13, Y13
VPSHUFB 448(BP), Y14, Y14
VPADDD Y8, Y13, Y8
VPADDD Y9, Y14, Y9
VMOVDQA Y15, 96(BP)
VPXOR Y5, Y10, Y5
VPXOR Y6, Y11, Y6
VPSLLD $ 12, Y5, Y15
VPSRLD $20, Y5, Y5
VPXOR Y5, Y15, Y5
VPSLLD $ 12, Y6, Y15
VPSRLD $20, Y6, Y6
VPXOR Y6, Y15, Y6
VPXOR Y7, Y8, Y7
VPXOR Y4, Y9, Y4
VPSLLD $ 12, Y7, Y15
VPSRLD $20, Y7, Y7
VPXOR Y7, Y15, Y7
VPSLLD $ 12, Y4, Y15
VPSRLD $20, Y4, Y4
VPXOR Y4, Y15, Y4
VPADDD Y0, Y5, Y0
VPADDD Y1, Y6, Y1
VPXOR 96(BP), Y0, Y15
VPXOR Y12, Y1, Y12
VPADDD Y2, Y7, Y2
VPADDD Y3, Y4, Y3
VPXOR Y13, Y2, Y13
VPXOR Y14, Y3, Y14
VPSHUFB 480(BP), Y15, Y15
VPSHUFB 480(BP), Y12, Y12
VPADDD Y10, Y15, Y10
VPADDD Y11, Y12, Y11
VPSHUFB 480(BP), Y13, Y13
VPSHUFB 480(BP), Y14, Y14
VPADDD Y8, Y13, Y8
VPADDD Y9, Y14, Y9
VMOVDQA Y15, 96(BP)
VPXOR Y5, Y10, Y5
VPXOR Y6, Y11, Y6
VPSLLD $ 7, Y5, Y15
VPSRLD $25, Y5, Y5
VPXOR Y5, Y15, Y5
VPSLLD $ 7, Y6, Y15
VPSRLD $25, Y6, Y6
VPXOR Y6, Y15, Y6
VPXOR Y7, Y8, Y7
VPXOR Y4, Y9, Y4
VPSLLD $ 7, Y7, Y15
VPSRLD $25, Y7, Y7
VPXOR Y7, Y15, Y7
VPSLLD $ 7, Y4, Y15
VPSRLD $25, Y4, Y4
VPXOR Y4, Y15, Y4
VMOVDQA 96(BP), Y15
SUBQ $2, AX
JNZ chacha_blocks_avx2_mainloop1
VMOVDQA Y8, 192(BP)
VMOVDQA Y9, 224(BP)
VMOVDQA Y10, 256(BP)
VMOVDQA Y11, 288(BP)
VMOVDQA Y12, 320(BP)
VMOVDQA Y13, 352(BP)
VMOVDQA Y14, 384(BP)
VMOVDQA Y15, 416(BP)
VPBROADCASTD 0(BP), Y8
VPBROADCASTD 4+0(BP), Y9
VPBROADCASTD 8+0(BP), Y10
VPBROADCASTD 12+0(BP), Y11
VPBROADCASTD 16(BP), Y12
VPBROADCASTD 4+16(BP), Y13
VPBROADCASTD 8+16(BP), Y14
VPBROADCASTD 12+16(BP), Y15
VPADDD Y8, Y0, Y0
VPADDD Y9, Y1, Y1
VPADDD Y10, Y2, Y2
VPADDD Y11, Y3, Y3
VPADDD Y12, Y4, Y4
VPADDD Y13, Y5, Y5
VPADDD Y14, Y6, Y6
VPADDD Y15, Y7, Y7
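// Transpose the per-word vectors back into block order: interleave dwords,
// then qwords, then swap 128-bit lanes, so each YMM register ends up holding
// 32 contiguous bytes of output.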
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKLDQ Y3, Y2, Y9
VPUNPCKHDQ Y1, Y0, Y12
VPUNPCKHDQ Y3, Y2, Y13
VPUNPCKLDQ Y5, Y4, Y10
VPUNPCKLDQ Y7, Y6, Y11
VPUNPCKHDQ Y5, Y4, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y9, Y8, Y0
VPUNPCKLQDQ Y11, Y10, Y1
VPUNPCKHQDQ Y9, Y8, Y2
VPUNPCKHQDQ Y11, Y10, Y3
VPUNPCKLQDQ Y13, Y12, Y4
VPUNPCKLQDQ Y15, Y14, Y5
VPUNPCKHQDQ Y13, Y12, Y6
VPUNPCKHQDQ Y15, Y14, Y7
VPERM2I128 $0x20, Y1, Y0, Y8
VPERM2I128 $0x20, Y3, Y2, Y9
VPERM2I128 $0x31, Y1, Y0, Y12
VPERM2I128 $0x31, Y3, Y2, Y13
VPERM2I128 $0x20, Y5, Y4, Y10
VPERM2I128 $0x20, Y7, Y6, Y11
VPERM2I128 $0x31, Y5, Y4, Y14
VPERM2I128 $0x31, Y7, Y6, Y15
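// A nil input slice (SI == 0) means there is nothing to XOR; emit raw
// keystream instead.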
ANDQ SI, SI
JZ chacha_blocks_avx2_noinput1
VPXOR 0(SI), Y8, Y8
VPXOR 64(SI), Y9, Y9
VPXOR 128(SI), Y10, Y10
VPXOR 192(SI), Y11, Y11
VPXOR 256(SI), Y12, Y12
VPXOR 320(SI), Y13, Y13
VPXOR 384(SI), Y14, Y14
VPXOR 448(SI), Y15, Y15
VMOVDQU Y8, 0(DX)
VMOVDQU Y9, 64(DX)
VMOVDQU Y10, 128(DX)
VMOVDQU Y11, 192(DX)
VMOVDQU Y12, 256(DX)
VMOVDQU Y13, 320(DX)
VMOVDQU Y14, 384(DX)
VMOVDQU Y15, 448(DX)
VMOVDQA 192(BP), Y0
VMOVDQA 224(BP), Y1
VMOVDQA 256(BP), Y2
VMOVDQA 288(BP), Y3
VMOVDQA 320(BP), Y4
VMOVDQA 352(BP), Y5
VMOVDQA 384(BP), Y6
VMOVDQA 416(BP), Y7
VPBROADCASTD 32(BP), Y8
VPBROADCASTD 4+32(BP), Y9
VPBROADCASTD 8+32(BP), Y10
VPBROADCASTD 12+32(BP), Y11
VMOVDQA 128(BP), Y12
VMOVDQA 160(BP), Y13
VPBROADCASTD 8+48(BP), Y14
VPBROADCASTD 12+48(BP), Y15
VPADDD Y8, Y0, Y0
VPADDD Y9, Y1, Y1
VPADDD Y10, Y2, Y2
VPADDD Y11, Y3, Y3
VPADDD Y12, Y4, Y4
VPADDD Y13, Y5, Y5
VPADDD Y14, Y6, Y6
VPADDD Y15, Y7, Y7
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKLDQ Y3, Y2, Y9
VPUNPCKHDQ Y1, Y0, Y12
VPUNPCKHDQ Y3, Y2, Y13
VPUNPCKLDQ Y5, Y4, Y10
VPUNPCKLDQ Y7, Y6, Y11
VPUNPCKHDQ Y5, Y4, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y9, Y8, Y0
VPUNPCKLQDQ Y11, Y10, Y1
VPUNPCKHQDQ Y9, Y8, Y2
VPUNPCKHQDQ Y11, Y10, Y3
VPUNPCKLQDQ Y13, Y12, Y4
VPUNPCKLQDQ Y15, Y14, Y5
VPUNPCKHQDQ Y13, Y12, Y6
VPUNPCKHQDQ Y15, Y14, Y7
VPERM2I128 $0x20, Y1, Y0, Y8
VPERM2I128 $0x20, Y3, Y2, Y9
VPERM2I128 $0x31, Y1, Y0, Y12
VPERM2I128 $0x31, Y3, Y2, Y13
VPERM2I128 $0x20, Y5, Y4, Y10
VPERM2I128 $0x20, Y7, Y6, Y11
VPERM2I128 $0x31, Y5, Y4, Y14
VPERM2I128 $0x31, Y7, Y6, Y15
VPXOR 32(SI), Y8, Y8
VPXOR 96(SI), Y9, Y9
VPXOR 160(SI), Y10, Y10
VPXOR 224(SI), Y11, Y11
VPXOR 288(SI), Y12, Y12
VPXOR 352(SI), Y13, Y13
VPXOR 416(SI), Y14, Y14
VPXOR 480(SI), Y15, Y15
VMOVDQU Y8, 32(DX)
VMOVDQU Y9, 96(DX)
VMOVDQU Y10, 160(DX)
VMOVDQU Y11, 224(DX)
VMOVDQU Y12, 288(DX)
VMOVDQU Y13, 352(DX)
VMOVDQU Y14, 416(DX)
VMOVDQU Y15, 480(DX)
ADDQ $512, SI
JMP chacha_blocks_avx2_mainloop1_cont
chacha_blocks_avx2_noinput1:
VMOVDQU Y8, 0(DX)
VMOVDQU Y9, 64(DX)
VMOVDQU Y10, 128(DX)
VMOVDQU Y11, 192(DX)
VMOVDQU Y12, 256(DX)
VMOVDQU Y13, 320(DX)
VMOVDQU Y14, 384(DX)
VMOVDQU Y15, 448(DX)
VMOVDQA 192(BP), Y0
VMOVDQA 224(BP), Y1
VMOVDQA 256(BP), Y2
VMOVDQA 288(BP), Y3
VMOVDQA 320(BP), Y4
VMOVDQA 352(BP), Y5
VMOVDQA 384(BP), Y6
VMOVDQA 416(BP), Y7
VPBROADCASTD 32(BP), Y8
VPBROADCASTD 4+32(BP), Y9
VPBROADCASTD 8+32(BP), Y10
VPBROADCASTD 12+32(BP), Y11
VMOVDQA 128(BP), Y12
VMOVDQA 160(BP), Y13
VPBROADCASTD 8+48(BP), Y14
VPBROADCASTD 12+48(BP), Y15
VPADDD Y8, Y0, Y0
VPADDD Y9, Y1, Y1
VPADDD Y10, Y2, Y2
VPADDD Y11, Y3, Y3
VPADDD Y12, Y4, Y4
VPADDD Y13, Y5, Y5
VPADDD Y14, Y6, Y6
VPADDD Y15, Y7, Y7
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKLDQ Y3, Y2, Y9
VPUNPCKHDQ Y1, Y0, Y12
VPUNPCKHDQ Y3, Y2, Y13
VPUNPCKLDQ Y5, Y4, Y10
VPUNPCKLDQ Y7, Y6, Y11
VPUNPCKHDQ Y5, Y4, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y9, Y8, Y0
VPUNPCKLQDQ Y11, Y10, Y1
VPUNPCKHQDQ Y9, Y8, Y2
VPUNPCKHQDQ Y11, Y10, Y3
VPUNPCKLQDQ Y13, Y12, Y4
VPUNPCKLQDQ Y15, Y14, Y5
VPUNPCKHQDQ Y13, Y12, Y6
VPUNPCKHQDQ Y15, Y14, Y7
VPERM2I128 $0x20, Y1, Y0, Y8
VPERM2I128 $0x20, Y3, Y2, Y9
VPERM2I128 $0x31, Y1, Y0, Y12
VPERM2I128 $0x31, Y3, Y2, Y13
VPERM2I128 $0x20, Y5, Y4, Y10
VPERM2I128 $0x20, Y7, Y6, Y11
VPERM2I128 $0x31, Y5, Y4, Y14
VPERM2I128 $0x31, Y7, Y6, Y15
VMOVDQU Y8, 32(DX)
VMOVDQU Y9, 96(DX)
VMOVDQU Y10, 160(DX)
VMOVDQU Y11, 224(DX)
VMOVDQU Y12, 288(DX)
VMOVDQU Y13, 352(DX)
VMOVDQU Y14, 416(DX)
VMOVDQU Y15, 480(DX)
chacha_blocks_avx2_mainloop1_cont:
ADDQ $512, DX
SUBQ $512, CX
CMPQ CX, $512
JAE chacha_blocks_avx2_atleast512
CMPQ CX, $256
JB chacha_blocks_avx2_below256_fixup
chacha_blocks_avx2_atleast256:
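// As above, but for four parallel blocks: materialize ctr+0 .. ctr+3 and
// store ctr+4 back to 48(BP).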
MOVQ 48(BP), AX
LEAQ 1(AX), R8
LEAQ 2(AX), R9
LEAQ 3(AX), R10
LEAQ 4(AX), BX
MOVL AX, 128(BP)
MOVL R8, 4+128(BP)
MOVL R9, 8+128(BP)
MOVL R10, 12+128(BP)
SHRQ $32, AX
SHRQ $32, R8
SHRQ $32, R9
SHRQ $32, R10
MOVL AX, 160(BP)
MOVL R8, 4+160(BP)
MOVL R9, 8+160(BP)
MOVL R10, 12+160(BP)
MOVQ BX, 48(BP)
// MOVQ 64(BP), AX
MOVQ $20, AX
VPBROADCASTD 0(BP), X0
VPBROADCASTD 4+0(BP), X1
VPBROADCASTD 8+0(BP), X2
VPBROADCASTD 12+0(BP), X3
VPBROADCASTD 16(BP), X4
VPBROADCASTD 4+16(BP), X5
VPBROADCASTD 8+16(BP), X6
VPBROADCASTD 12+16(BP), X7
VPBROADCASTD 32(BP), X8
VPBROADCASTD 4+32(BP), X9
VPBROADCASTD 8+32(BP), X10
VPBROADCASTD 12+32(BP), X11
VMOVDQA 128(BP), X12
VMOVDQA 160(BP), X13
VPBROADCASTD 8+48(BP), X14
VPBROADCASTD 12+48(BP), X15
chacha_blocks_avx2_mainloop2:
VPADDD X0, X4, X0
VPADDD X1, X5, X1
VPXOR X12, X0, X12
VPXOR X13, X1, X13
VPADDD X2, X6, X2
VPADDD X3, X7, X3
VPXOR X14, X2, X14
VPXOR X15, X3, X15
VPSHUFB 448(BP), X12, X12
VPSHUFB 448(BP), X13, X13
VPADDD X8, X12, X8
VPADDD X9, X13, X9
VPSHUFB 448(BP), X14, X14
VPSHUFB 448(BP), X15, X15
VPADDD X10, X14, X10
VPADDD X11, X15, X11
VMOVDQA X12, 96(BP)
VPXOR X4, X8, X4
VPXOR X5, X9, X5
VPSLLD $ 12, X4, X12
VPSRLD $20, X4, X4
VPXOR X4, X12, X4
VPSLLD $ 12, X5, X12
VPSRLD $20, X5, X5
VPXOR X5, X12, X5
VPXOR X6, X10, X6
VPXOR X7, X11, X7
VPSLLD $ 12, X6, X12
VPSRLD $20, X6, X6
VPXOR X6, X12, X6
VPSLLD $ 12, X7, X12
VPSRLD $20, X7, X7
VPXOR X7, X12, X7
VPADDD X0, X4, X0
VPADDD X1, X5, X1
VPXOR 96(BP), X0, X12
VPXOR X13, X1, X13
VPADDD X2, X6, X2
VPADDD X3, X7, X3
VPXOR X14, X2, X14
VPXOR X15, X3, X15
VPSHUFB 480(BP), X12, X12
VPSHUFB 480(BP), X13, X13
VPADDD X8, X12, X8
VPADDD X9, X13, X9
VPSHUFB 480(BP), X14, X14
VPSHUFB 480(BP), X15, X15
VPADDD X10, X14, X10
VPADDD X11, X15, X11
VMOVDQA X12, 96(BP)
VPXOR X4, X8, X4
VPXOR X5, X9, X5
VPSLLD $ 7, X4, X12
VPSRLD $25, X4, X4
VPXOR X4, X12, X4
VPSLLD $ 7, X5, X12
VPSRLD $25, X5, X5
VPXOR X5, X12, X5
VPXOR X6, X10, X6
VPXOR X7, X11, X7
VPSLLD $ 7, X6, X12
VPSRLD $25, X6, X6
VPXOR X6, X12, X6
VPSLLD $ 7, X7, X12
VPSRLD $25, X7, X7
VPXOR X7, X12, X7
VPADDD X0, X5, X0
VPADDD X1, X6, X1
VPXOR X15, X0, X15
VPXOR 96(BP), X1, X12
VPADDD X2, X7, X2
VPADDD X3, X4, X3
VPXOR X13, X2, X13
VPXOR X14, X3, X14
VPSHUFB 448(BP), X15, X15
VPSHUFB 448(BP), X12, X12
VPADDD X10, X15, X10
VPADDD X11, X12, X11
VPSHUFB 448(BP), X13, X13
VPSHUFB 448(BP), X14, X14
VPADDD X8, X13, X8
VPADDD X9, X14, X9
VMOVDQA X15, 96(BP)
VPXOR X5, X10, X5
VPXOR X6, X11, X6
VPSLLD $ 12, X5, X15
VPSRLD $20, X5, X5
VPXOR X5, X15, X5
VPSLLD $ 12, X6, X15
VPSRLD $20, X6, X6
VPXOR X6, X15, X6
VPXOR X7, X8, X7
VPXOR X4, X9, X4
VPSLLD $ 12, X7, X15
VPSRLD $20, X7, X7
VPXOR X7, X15, X7
VPSLLD $ 12, X4, X15
VPSRLD $20, X4, X4
VPXOR X4, X15, X4
VPADDD X0, X5, X0
VPADDD X1, X6, X1
VPXOR 96(BP), X0, X15
VPXOR X12, X1, X12
VPADDD X2, X7, X2
VPADDD X3, X4, X3
VPXOR X13, X2, X13
VPXOR X14, X3, X14
VPSHUFB 480(BP), X15, X15
VPSHUFB 480(BP), X12, X12
VPADDD X10, X15, X10
VPADDD X11, X12, X11
VPSHUFB 480(BP), X13, X13
VPSHUFB 480(BP), X14, X14
VPADDD X8, X13, X8
VPADDD X9, X14, X9
VMOVDQA X15, 96(BP)
VPXOR X5, X10, X5
VPXOR X6, X11, X6
VPSLLD $ 7, X5, X15
VPSRLD $25, X5, X5
VPXOR X5, X15, X5
VPSLLD $ 7, X6, X15
VPSRLD $25, X6, X6
VPXOR X6, X15, X6
VPXOR X7, X8, X7
VPXOR X4, X9, X4
VPSLLD $ 7, X7, X15
VPSRLD $25, X7, X7
VPXOR X7, X15, X7
VPSLLD $ 7, X4, X15
VPSRLD $25, X4, X4
VPXOR X4, X15, X4
VMOVDQA 96(BP), X15
SUBQ $2, AX
JNZ chacha_blocks_avx2_mainloop2
VMOVDQA X8, 192(BP)
VMOVDQA X9, 208(BP)
VMOVDQA X10, 224(BP)
VMOVDQA X11, 240(BP)
VMOVDQA X12, 256(BP)
VMOVDQA X13, 272(BP)
VMOVDQA X14, 288(BP)
VMOVDQA X15, 304(BP)
VPBROADCASTD 0(BP), X8
VPBROADCASTD 4+0(BP), X9
VPBROADCASTD 8+0(BP), X10
VPBROADCASTD 12+0(BP), X11
VPBROADCASTD 16(BP), X12
VPBROADCASTD 4+16(BP), X13
VPBROADCASTD 8+16(BP), X14
VPBROADCASTD 12+16(BP), X15
VPADDD X8, X0, X0
VPADDD X9, X1, X1
VPADDD X10, X2, X2
VPADDD X11, X3, X3
VPADDD X12, X4, X4
VPADDD X13, X5, X5
VPADDD X14, X6, X6
VPADDD X15, X7, X7
VPUNPCKLDQ X1, X0, X8
VPUNPCKLDQ X3, X2, X9
VPUNPCKHDQ X1, X0, X12
VPUNPCKHDQ X3, X2, X13
VPUNPCKLDQ X5, X4, X10
VPUNPCKLDQ X7, X6, X11
VPUNPCKHDQ X5, X4, X14
VPUNPCKHDQ X7, X6, X15
VPUNPCKLQDQ X9, X8, X0
VPUNPCKLQDQ X11, X10, X1
VPUNPCKHQDQ X9, X8, X2
VPUNPCKHQDQ X11, X10, X3
VPUNPCKLQDQ X13, X12, X4
VPUNPCKLQDQ X15, X14, X5
VPUNPCKHQDQ X13, X12, X6
VPUNPCKHQDQ X15, X14, X7
ANDQ SI, SI
JZ chacha_blocks_avx2_noinput2
VPXOR 0(SI), X0, X0
VPXOR 16(SI), X1, X1
VPXOR 64(SI), X2, X2
VPXOR 80(SI), X3, X3
VPXOR 128(SI), X4, X4
VPXOR 144(SI), X5, X5
VPXOR 192(SI), X6, X6
VPXOR 208(SI), X7, X7
VMOVDQU X0, 0(DX)
VMOVDQU X1, 16(DX)
VMOVDQU X2, 64(DX)
VMOVDQU X3, 80(DX)
VMOVDQU X4, 128(DX)
VMOVDQU X5, 144(DX)
VMOVDQU X6, 192(DX)
VMOVDQU X7, 208(DX)
VMOVDQA 192(BP), X0
VMOVDQA 208(BP), X1
VMOVDQA 224(BP), X2
VMOVDQA 240(BP), X3
VMOVDQA 256(BP), X4
VMOVDQA 272(BP), X5
VMOVDQA 288(BP), X6
VMOVDQA 304(BP), X7
VPBROADCASTD 32(BP), X8
VPBROADCASTD 4+32(BP), X9
VPBROADCASTD 8+32(BP), X10
VPBROADCASTD 12+32(BP), X11
VMOVDQA 128(BP), X12
VMOVDQA 160(BP), X13
VPBROADCASTD 8+48(BP), X14
VPBROADCASTD 12+48(BP), X15
VPADDD X8, X0, X0
VPADDD X9, X1, X1
VPADDD X10, X2, X2
VPADDD X11, X3, X3
VPADDD X12, X4, X4
VPADDD X13, X5, X5
VPADDD X14, X6, X6
VPADDD X15, X7, X7
VPUNPCKLDQ X1, X0, X8
VPUNPCKLDQ X3, X2, X9
VPUNPCKHDQ X1, X0, X12
VPUNPCKHDQ X3, X2, X13
VPUNPCKLDQ X5, X4, X10
VPUNPCKLDQ X7, X6, X11
VPUNPCKHDQ X5, X4, X14
VPUNPCKHDQ X7, X6, X15
VPUNPCKLQDQ X9, X8, X0
VPUNPCKLQDQ X11, X10, X1
VPUNPCKHQDQ X9, X8, X2
VPUNPCKHQDQ X11, X10, X3
VPUNPCKLQDQ X13, X12, X4
VPUNPCKLQDQ X15, X14, X5
VPUNPCKHQDQ X13, X12, X6
VPUNPCKHQDQ X15, X14, X7
VPXOR 32(SI), X0, X0
VPXOR 48(SI), X1, X1
VPXOR 96(SI), X2, X2
VPXOR 112(SI), X3, X3
VPXOR 160(SI), X4, X4
VPXOR 176(SI), X5, X5
VPXOR 224(SI), X6, X6
VPXOR 240(SI), X7, X7
VMOVDQU X0, 32(DX)
VMOVDQU X1, 48(DX)
VMOVDQU X2, 96(DX)
VMOVDQU X3, 112(DX)
VMOVDQU X4, 160(DX)
VMOVDQU X5, 176(DX)
VMOVDQU X6, 224(DX)
VMOVDQU X7, 240(DX)
ADDQ $256, SI
JMP chacha_blocks_avx2_mainloop2_cont
chacha_blocks_avx2_noinput2:
VMOVDQU X0, 0(DX)
VMOVDQU X1, 16(DX)
VMOVDQU X2, 64(DX)
VMOVDQU X3, 80(DX)
VMOVDQU X4, 128(DX)
VMOVDQU X5, 144(DX)
VMOVDQU X6, 192(DX)
VMOVDQU X7, 208(DX)
VMOVDQA 192(BP), X0
VMOVDQA 208(BP), X1
VMOVDQA 224(BP), X2
VMOVDQA 240(BP), X3
VMOVDQA 256(BP), X4
VMOVDQA 272(BP), X5
VMOVDQA 288(BP), X6
VMOVDQA 304(BP), X7
VPBROADCASTD 32(BP), X8
VPBROADCASTD 4+32(BP), X9
VPBROADCASTD 8+32(BP), X10
VPBROADCASTD 12+32(BP), X11
VMOVDQA 128(BP), X12
VMOVDQA 160(BP), X13
VPBROADCASTD 8+48(BP), X14
VPBROADCASTD 12+48(BP), X15
VPADDD X8, X0, X0
VPADDD X9, X1, X1
VPADDD X10, X2, X2
VPADDD X11, X3, X3
VPADDD X12, X4, X4
VPADDD X13, X5, X5
VPADDD X14, X6, X6
VPADDD X15, X7, X7
VPUNPCKLDQ X1, X0, X8
VPUNPCKLDQ X3, X2, X9
VPUNPCKHDQ X1, X0, X12
VPUNPCKHDQ X3, X2, X13
VPUNPCKLDQ X5, X4, X10
VPUNPCKLDQ X7, X6, X11
VPUNPCKHDQ X5, X4, X14
VPUNPCKHDQ X7, X6, X15
VPUNPCKLQDQ X9, X8, X0
VPUNPCKLQDQ X11, X10, X1
VPUNPCKHQDQ X9, X8, X2
VPUNPCKHQDQ X11, X10, X3
VPUNPCKLQDQ X13, X12, X4
VPUNPCKLQDQ X15, X14, X5
VPUNPCKHQDQ X13, X12, X6
VPUNPCKHQDQ X15, X14, X7
VMOVDQU X0, 32(DX)
VMOVDQU X1, 48(DX)
VMOVDQU X2, 96(DX)
VMOVDQU X3, 112(DX)
VMOVDQU X4, 160(DX)
VMOVDQU X5, 176(DX)
VMOVDQU X6, 224(DX)
VMOVDQU X7, 240(DX)
chacha_blocks_avx2_mainloop2_cont:
ADDQ $256, DX
SUBQ $256, CX
CMPQ CX, $256
JAE chacha_blocks_avx2_atleast256
chacha_blocks_avx2_below256_fixup:
VMOVDQA 448(BP), X6
VMOVDQA 480(BP), X7
VMOVDQA 0(BP), X8
VMOVDQA 16(BP), X9
VMOVDQA 32(BP), X10
VMOVDQA 48(BP), X11
MOVQ $1, R9
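// Process the remaining data one 64-byte block at a time.  X5 holds the
// 64-bit counter increment; a final partial block is staged through the
// aligned stack buffer so the loads and stores below never run past the
// caller's slices.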
chacha_blocks_avx2_below256:
VMOVQ R9, X5
ANDQ CX, CX
JZ chacha_blocks_avx2_done
CMPQ CX, $64
JAE chacha_blocks_avx2_above63
MOVQ DX, R9
ANDQ SI, SI
JZ chacha_blocks_avx2_noinput3
MOVQ CX, R10
MOVQ BP, DX
ADDQ R10, SI
ADDQ R10, DX
NEGQ R10
chacha_blocks_avx2_copyinput:
MOVB (SI)(R10*1), AX
MOVB AX, (DX)(R10*1)
INCQ R10
JNZ chacha_blocks_avx2_copyinput
MOVQ BP, SI
chacha_blocks_avx2_noinput3:
MOVQ BP, DX
chacha_blocks_avx2_above63:
VMOVDQA X8, X0
VMOVDQA X9, X1
VMOVDQA X10, X2
VMOVDQA X11, X3
// MOVQ 64(BP), AX
MOVQ $20, AX
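// Single-block ChaCha rounds: X0..X3 hold the four state rows, X6/X7 are the
// 16- and 8-bit rotate masks, and the PSHUFD shuffles diagonalize and then
// restore the rows between the column and diagonal rounds.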
chacha_blocks_avx2_mainloop3:
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X6, X3, X3
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSLLD $12, X1, X4
VPSRLD $20, X1, X1
VPXOR X1, X4, X1
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X7, X3, X3
VPSHUFD $0x93, X0, X0
VPADDD X2, X3, X2
VPSHUFD $0x4e, X3, X3
VPXOR X1, X2, X1
VPSHUFD $0x39, X2, X2
VPSLLD $7, X1, X4
VPSRLD $25, X1, X1
VPXOR X1, X4, X1
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X6, X3, X3
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSLLD $12, X1, X4
VPSRLD $20, X1, X1
VPXOR X1, X4, X1
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X7, X3, X3
VPSHUFD $0x39, X0, X0
VPADDD X2, X3, X2
VPSHUFD $0x4e, X3, X3
VPXOR X1, X2, X1
VPSHUFD $0x93, X2, X2
VPSLLD $7, X1, X4
VPSRLD $25, X1, X1
VPXOR X1, X4, X1
SUBQ $2, AX
JNZ chacha_blocks_avx2_mainloop3
VPADDD X0, X8, X0
VPADDD X1, X9, X1
VPADDD X2, X10, X2
VPADDD X3, X11, X3
ANDQ SI, SI
JZ chacha_blocks_avx2_noinput4
VPXOR 0(SI), X0, X0
VPXOR 16(SI), X1, X1
VPXOR 32(SI), X2, X2
VPXOR 48(SI), X3, X3
ADDQ $64, SI
chacha_blocks_avx2_noinput4:
VMOVDQU X0, 0(DX)
VMOVDQU X1, 16(DX)
VMOVDQU X2, 32(DX)
VMOVDQU X3, 48(DX)
VPADDQ X11, X5, X11
CMPQ CX, $64
JBE chacha_blocks_avx2_mainloop3_finishup
ADDQ $64, DX
SUBQ $64, CX
JMP chacha_blocks_avx2_below256
chacha_blocks_avx2_mainloop3_finishup:
CMPQ CX, $64
JE chacha_blocks_avx2_done
ADDQ CX, R9
ADDQ CX, DX
NEGQ CX
chacha_blocks_avx2_copyoutput:
MOVB (DX)(CX*1), AX
MOVB AX, (R9)(CX*1)
INCQ CX
JNZ chacha_blocks_avx2_copyoutput
chacha_blocks_avx2_done:
VMOVDQU X11, 32(DI)
VZEROUPPER
RET

// func hChaChaAVX2(key, nonce []byte, dst *byte)
TEXT ·hChaChaAVX2(SB), NOSPLIT|NOFRAME, $0-56
MOVQ key+0(FP), DI
MOVQ nonce+24(FP), SI
MOVQ dst+48(FP), DX
MOVL $20, CX
LEAQ ·chacha_constants<>(SB), AX
VMOVDQA 0(AX), X0
VMOVDQA 16(AX), X6
VMOVDQA 32(AX), X5
VMOVDQU 0(DI), X1
VMOVDQU 16(DI), X2
VMOVDQU 0(SI), X3
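// HChaCha20 core: run the 20 ChaCha rounds over constants|key|nonce with no
// final state addition, then emit state words 0..3 and 12..15 (X0 and X3).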
hchacha_mainloop_avx2:
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X6, X3, X3
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSLLD $12, X1, X4
VPSRLD $20, X1, X1
VPXOR X1, X4, X1
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X5, X3, X3
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSLLD $7, X1, X4
VPSRLD $25, X1, X1
VPSHUFD $0x93, X0, X0
VPXOR X1, X4, X1
VPSHUFD $0x4e, X3, X3
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X6, X3, X3
VPSHUFD $0x39, X2, X2
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSLLD $12, X1, X4
VPSRLD $20, X1, X1
VPXOR X1, X4, X1
VPADDD X0, X1, X0
VPXOR X3, X0, X3
VPSHUFB X5, X3, X3
VPADDD X2, X3, X2
VPXOR X1, X2, X1
VPSHUFD $0x39, X0, X0
VPSLLD $7, X1, X4
VPSHUFD $0x4e, X3, X3
VPSRLD $25, X1, X1
VPSHUFD $0x93, X2, X2
VPXOR X1, X4, X1
SUBL $2, CX
JNE hchacha_mainloop_avx2
VMOVDQU X0, (DX)
VMOVDQU X3, 16(DX)
VZEROUPPER
RET

// func blocksSSSE3(s *[api.StateSize]uint32, in, out []byte)
TEXT ·blocksSSSE3(SB), NOSPLIT, $576-56
// This is Andrew Moon's SSSE3 ChaCha implementation taken from
// supercop-20190110, with some minor changes, primarily calling
// convention and assembly dialect related.
// Align the stack on a 64 byte boundary.
MOVQ SP, BP
ADDQ $64, BP
ANDQ $-64, BP
// Go calling convention -> SYSV AMD64 (and a fixup).
MOVQ s+0(FP), DI // &s -> DI
ADDQ $16, DI // Skip the ChaCha constants in the chachaState.
MOVQ in+8(FP), SI // &in[0] -> SI
MOVQ out+32(FP), DX // &out[0] -> DX
MOVQ in_len+16(FP), CX // len(in) -> CX
// Begin the main body of `chacha_blocks_ssse3`.
//
// Mostly a direct translation except:
// * The number of rounds is always 20.
// * %rbp is used instead of %rsp.
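//
// Scratch frame layout (offsets from the 64 byte aligned BP), as used below:
//   0(BP)        - sigma constants
//   16(BP)       - key words 0..3
//   32(BP)       - key words 4..7
//   48(BP)       - 64-bit block counter, then nonce words
//   80(BP)       - rotate-by-16 shuffle mask
//   96(BP)       - rotate-by-8 shuffle mask
//   112(BP)      - per-round spill slot
//   128..368(BP) - state words broadcast for the 4-way path (counters at 320/336)
//   384..496(BP) - second half of the wide state, saved during writeback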
LEAQ ·chacha_constants<>(SB), AX
MOVO 0(AX), X8
MOVO 16(AX), X6
MOVO 32(AX), X7
MOVOU 0(DI), X9
MOVOU 16(DI), X10
MOVOU 32(DI), X11
// MOVQ 48(DI), AX
MOVQ $1, R9
MOVO X8, 0(BP)
MOVO X9, 16(BP)
MOVO X10, 32(BP)
MOVO X11, 48(BP)
MOVO X6, 80(BP)
MOVO X7, 96(BP)
// MOVQ AX, 64(BP)
CMPQ CX, $256
JB chacha_blocks_ssse3_below256
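// Broadcast each 32-bit state word into its own 16-byte slot so the 4-way
// main loop can load them directly; the per-block counter lanes at
// 320(BP)/336(BP) are filled in per batch below.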
PSHUFD $0x00, X8, X0
PSHUFD $0x55, X8, X1
PSHUFD $0xaa, X8, X2
PSHUFD $0xff, X8, X3
MOVO X0, 128(BP)
MOVO X1, 144(BP)
MOVO X2, 160(BP)
MOVO X3, 176(BP)
PSHUFD $0x00, X9, X0
PSHUFD $0x55, X9, X1
PSHUFD $0xaa, X9, X2
PSHUFD $0xff, X9, X3
MOVO X0, 192(BP)
MOVO X1, 208(BP)
MOVO X2, 224(BP)
MOVO X3, 240(BP)
PSHUFD $0x00, X10, X0
PSHUFD $0x55, X10, X1
PSHUFD $0xaa, X10, X2
PSHUFD $0xff, X10, X3
MOVO X0, 256(BP)
MOVO X1, 272(BP)
MOVO X2, 288(BP)
MOVO X3, 304(BP)
PSHUFD $0xaa, X11, X0
PSHUFD $0xff, X11, X1
MOVO X0, 352(BP)
MOVO X1, 368(BP)
JMP chacha_blocks_ssse3_atleast256
// .p2align 6,,63
// # align to 4 mod 64
// nop;nop;nop;nop;
chacha_blocks_ssse3_atleast256:
MOVQ 48(BP), AX
LEAQ 1(AX), R8
LEAQ 2(AX), R9
LEAQ 3(AX), R10
LEAQ 4(AX), BX
MOVL AX, 320(BP)
MOVL R8, 4+320(BP)
MOVL R9, 8+320(BP)
MOVL R10, 12+320(BP)
SHRQ $32, AX
SHRQ $32, R8
SHRQ $32, R9
SHRQ $32, R10
MOVL AX, 336(BP)
MOVL R8, 4+336(BP)
MOVL R9, 8+336(BP)
MOVL R10, 12+336(BP)
MOVQ BX, 48(BP)
// MOVQ 64(BP), AX
MOVQ $20, AX
MOVO 128(BP), X0
MOVO 144(BP), X1
MOVO 160(BP), X2
MOVO 176(BP), X3
MOVO 192(BP), X4
MOVO 208(BP), X5
MOVO 224(BP), X6
MOVO 240(BP), X7
MOVO 256(BP), X8
MOVO 272(BP), X9
MOVO 288(BP), X10
MOVO 304(BP), X11
MOVO 320(BP), X12
MOVO 336(BP), X13
MOVO 352(BP), X14
MOVO 368(BP), X15
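// Each pass performs one column round and one diagonal round across four
// blocks held in X0..X15, spilling one register through 112(BP) as scratch.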
chacha_blocks_ssse3_mainloop1:
PADDD X4, X0
PADDD X5, X1
PXOR X0, X12
PXOR X1, X13
PADDD X6, X2
PADDD X7, X3
PXOR X2, X14
PXOR X3, X15
PSHUFB 80(BP), X12
PSHUFB 80(BP), X13
PADDD X12, X8
PADDD X13, X9
PSHUFB 80(BP), X14
PSHUFB 80(BP), X15
PADDD X14, X10
PADDD X15, X11
MOVO X12, 112(BP)
PXOR X8, X4
PXOR X9, X5
MOVO X4, X12
PSLLL $ 12, X4
PSRLL $20, X12
PXOR X12, X4
MOVO X5, X12
PSLLL $ 12, X5
PSRLL $20, X12
PXOR X12, X5
PXOR X10, X6
PXOR X11, X7
MOVO X6, X12
PSLLL $ 12, X6
PSRLL $20, X12
PXOR X12, X6
MOVO X7, X12
PSLLL $ 12, X7
PSRLL $20, X12
PXOR X12, X7
MOVO 112(BP), X12
PADDD X4, X0
PADDD X5, X1
PXOR X0, X12
PXOR X1, X13
PADDD X6, X2
PADDD X7, X3
PXOR X2, X14
PXOR X3, X15
PSHUFB 96(BP), X12
PSHUFB 96(BP), X13
PADDD X12, X8
PADDD X13, X9
PSHUFB 96(BP), X14
PSHUFB 96(BP), X15
PADDD X14, X10
PADDD X15, X11
MOVO X12, 112(BP)
PXOR X8, X4
PXOR X9, X5
MOVO X4, X12
PSLLL $ 7, X4
PSRLL $25, X12
PXOR X12, X4
MOVO X5, X12
PSLLL $ 7, X5
PSRLL $25, X12
PXOR X12, X5
PXOR X10, X6
PXOR X11, X7
MOVO X6, X12
PSLLL $ 7, X6
PSRLL $25, X12
PXOR X12, X6
MOVO X7, X12
PSLLL $ 7, X7
PSRLL $25, X12
PXOR X12, X7
MOVO 112(BP), X12
PADDD X5, X0
PADDD X6, X1
PXOR X0, X15
PXOR X1, X12
PADDD X7, X2
PADDD X4, X3
PXOR X2, X13
PXOR X3, X14
PSHUFB 80(BP), X15
PSHUFB 80(BP), X12
PADDD X15, X10
PADDD X12, X11
PSHUFB 80(BP), X13
PSHUFB 80(BP), X14
PADDD X13, X8
PADDD X14, X9
MOVO X15, 112(BP)
PXOR X10, X5
PXOR X11, X6
MOVO X5, X15
PSLLL $ 12, X5
PSRLL $20, X15
PXOR X15, X5
MOVO X6, X15
PSLLL $ 12, X6
PSRLL $20, X15
PXOR X15, X6
PXOR X8, X7
PXOR X9, X4
MOVO X7, X15
PSLLL $ 12, X7
PSRLL $20, X15
PXOR X15, X7
MOVO X4, X15
PSLLL $ 12, X4
PSRLL $20, X15
PXOR X15, X4
MOVO 112(BP), X15
PADDD X5, X0
PADDD X6, X1
PXOR X0, X15
PXOR X1, X12
PADDD X7, X2
PADDD X4, X3
PXOR X2, X13
PXOR X3, X14
PSHUFB 96(BP), X15
PSHUFB 96(BP), X12
PADDD X15, X10
PADDD X12, X11
PSHUFB 96(BP), X13
PSHUFB 96(BP), X14
PADDD X13, X8
PADDD X14, X9
MOVO X15, 112(BP)
PXOR X10, X5
PXOR X11, X6
MOVO X5, X15
PSLLL $ 7, X5
PSRLL $25, X15
PXOR X15, X5
MOVO X6, X15
PSLLL $ 7, X6
PSRLL $25, X15
PXOR X15, X6
PXOR X8, X7
PXOR X9, X4
MOVO X7, X15
PSLLL $ 7, X7
PSRLL $25, X15
PXOR X15, X7
MOVO X4, X15
PSLLL $ 7, X4
PSRLL $25, X15
PXOR X15, X4
SUBQ $2, AX
MOVO 112(BP), X15
JNZ chacha_blocks_ssse3_mainloop1
PADDD 128(BP), X0
PADDD 144(BP), X1
PADDD 160(BP), X2
PADDD 176(BP), X3
PADDD 192(BP), X4
PADDD 208(BP), X5
PADDD 224(BP), X6
PADDD 240(BP), X7
PADDD 256(BP), X8
PADDD 272(BP), X9
PADDD 288(BP), X10
PADDD 304(BP), X11
PADDD 320(BP), X12
PADDD 336(BP), X13
PADDD 352(BP), X14
PADDD 368(BP), X15
MOVO X8, 384(BP)
MOVO X9, 400(BP)
MOVO X10, 416(BP)
MOVO X11, 432(BP)
MOVO X12, 448(BP)
MOVO X13, 464(BP)
MOVO X14, 480(BP)
MOVO X15, 496(BP)
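// Words 8..15 of the four blocks were saved to 384..496(BP) above; transpose
// words 0..7 (X0..X7) into block order first, then do the same for the saved
// half once the first half of the output has been written.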
MOVO X0, X8
MOVO X2, X9
MOVO X4, X10
MOVO X6, X11
PUNPCKHLQ X1, X0
PUNPCKHLQ X3, X2
PUNPCKHLQ X5, X4
PUNPCKHLQ X7, X6
PUNPCKLLQ X1, X8
PUNPCKLLQ X3, X9
PUNPCKLLQ X5, X10
PUNPCKLLQ X7, X11
MOVO X0, X1
MOVO X4, X3
MOVO X8, X5
MOVO X10, X7
PUNPCKHQDQ X2, X0
PUNPCKHQDQ X6, X4
PUNPCKHQDQ X9, X8
PUNPCKHQDQ X11, X10
PUNPCKLQDQ X2, X1
PUNPCKLQDQ X6, X3
PUNPCKLQDQ X9, X5
PUNPCKLQDQ X11, X7
ANDQ SI, SI
JZ chacha_blocks_ssse3_noinput1
MOVOU 0(SI), X2
MOVOU 16(SI), X6
MOVOU 64(SI), X9
MOVOU 80(SI), X11
MOVOU 128(SI), X12
MOVOU 144(SI), X13
MOVOU 192(SI), X14
MOVOU 208(SI), X15
PXOR X2, X5
PXOR X6, X7
PXOR X9, X8
PXOR X11, X10
PXOR X12, X1
PXOR X13, X3
PXOR X14, X0
PXOR X15, X4
MOVOU X5, 0(DX)
MOVOU X7, 16(DX)
MOVOU X8, 64(DX)
MOVOU X10, 80(DX)
MOVOU X1, 128(DX)
MOVOU X3, 144(DX)
MOVOU X0, 192(DX)
MOVOU X4, 208(DX)
MOVO 384(BP), X0
MOVO 400(BP), X1
MOVO 416(BP), X2
MOVO 432(BP), X3
MOVO 448(BP), X4
MOVO 464(BP), X5
MOVO 480(BP), X6
MOVO 496(BP), X7
MOVO X0, X8
MOVO X2, X9
MOVO X4, X10
MOVO X6, X11
PUNPCKLLQ X1, X8
PUNPCKLLQ X3, X9
PUNPCKHLQ X1, X0
PUNPCKHLQ X3, X2
PUNPCKLLQ X5, X10
PUNPCKLLQ X7, X11
PUNPCKHLQ X5, X4
PUNPCKHLQ X7, X6
MOVO X8, X1
MOVO X0, X3
MOVO X10, X5
MOVO X4, X7
PUNPCKLQDQ X9, X1
PUNPCKLQDQ X11, X5
PUNPCKHQDQ X9, X8
PUNPCKHQDQ X11, X10
PUNPCKLQDQ X2, X3
PUNPCKLQDQ X6, X7
PUNPCKHQDQ X2, X0
PUNPCKHQDQ X6, X4
MOVOU 32(SI), X2
MOVOU 48(SI), X6
MOVOU 96(SI), X9
MOVOU 112(SI), X11
MOVOU 160(SI), X12
MOVOU 176(SI), X13
MOVOU 224(SI), X14
MOVOU 240(SI), X15
PXOR X2, X1
PXOR X6, X5
PXOR X9, X8
PXOR X11, X10
PXOR X12, X3
PXOR X13, X7
PXOR X14, X0
PXOR X15, X4
MOVOU X1, 32(DX)
MOVOU X5, 48(DX)
MOVOU X8, 96(DX)
MOVOU X10, 112(DX)
MOVOU X3, 160(DX)
MOVOU X7, 176(DX)
MOVOU X0, 224(DX)
MOVOU X4, 240(DX)
ADDQ $256, SI
JMP chacha_blocks_ssse3_mainloop_cont
chacha_blocks_ssse3_noinput1:
MOVOU X5, 0(DX)
MOVOU X7, 16(DX)
MOVOU X8, 64(DX)
MOVOU X10, 80(DX)
MOVOU X1, 128(DX)
MOVOU X3, 144(DX)
MOVOU X0, 192(DX)
MOVOU X4, 208(DX)
MOVO 384(BP), X0
MOVO 400(BP), X1
MOVO 416(BP), X2
MOVO 432(BP), X3
MOVO 448(BP), X4
MOVO 464(BP), X5
MOVO 480(BP), X6
MOVO 496(BP), X7
MOVO X0, X8
MOVO X2, X9
MOVO X4, X10
MOVO X6, X11
PUNPCKLLQ X1, X8
PUNPCKLLQ X3, X9
PUNPCKHLQ X1, X0
PUNPCKHLQ X3, X2
PUNPCKLLQ X5, X10
PUNPCKLLQ X7, X11
PUNPCKHLQ X5, X4
PUNPCKHLQ X7, X6
MOVO X8, X1
MOVO X0, X3
MOVO X10, X5
MOVO X4, X7
PUNPCKLQDQ X9, X1
PUNPCKLQDQ X11, X5
PUNPCKHQDQ X9, X8
PUNPCKHQDQ X11, X10
PUNPCKLQDQ X2, X3
PUNPCKLQDQ X6, X7
PUNPCKHQDQ X2, X0
PUNPCKHQDQ X6, X4
MOVOU X1, 32(DX)
MOVOU X5, 48(DX)
MOVOU X8, 96(DX)
MOVOU X10, 112(DX)
MOVOU X3, 160(DX)
MOVOU X7, 176(DX)
MOVOU X0, 224(DX)
MOVOU X4, 240(DX)
chacha_blocks_ssse3_mainloop_cont:
ADDQ $256, DX
SUBQ $256, CX
CMPQ CX, $256
JAE chacha_blocks_ssse3_atleast256
MOVO 80(BP), X6
MOVO 96(BP), X7
MOVO 0(BP), X8
MOVO 16(BP), X9
MOVO 32(BP), X10
MOVO 48(BP), X11
MOVQ $1, R9
chacha_blocks_ssse3_below256:
MOVQ R9, X5
ANDQ CX, CX
JZ chacha_blocks_ssse3_done
CMPQ CX, $64
JAE chacha_blocks_ssse3_above63
MOVQ DX, R9
ANDQ SI, SI
JZ chacha_blocks_ssse3_noinput2
MOVQ CX, R10
MOVQ BP, DX
ADDQ R10, SI
ADDQ R10, DX
NEGQ R10
chacha_blocks_ssse3_copyinput:
MOVB (SI)(R10*1), AX
MOVB AX, (DX)(R10*1)
INCQ R10
JNZ chacha_blocks_ssse3_copyinput
MOVQ BP, SI
chacha_blocks_ssse3_noinput2:
MOVQ BP, DX
chacha_blocks_ssse3_above63:
MOVO X8, X0
MOVO X9, X1
MOVO X10, X2
MOVO X11, X3
// MOVQ 64(BP), AX
MOVQ $20, AX
chacha_blocks_ssse3_mainloop2:
PADDD X1, X0
PXOR X0, X3
PSHUFB X6, X3
PADDD X3, X2
PXOR X2, X1
MOVO X1, X4
PSLLL $12, X4
PSRLL $20, X1
PXOR X4, X1
PADDD X1, X0
PXOR X0, X3
PSHUFB X7, X3
PSHUFD $0x93, X0, X0
PADDD X3, X2
PSHUFD $0x4e, X3, X3
PXOR X2, X1
PSHUFD $0x39, X2, X2
MOVO X1, X4
PSLLL $7, X4
PSRLL $25, X1
PXOR X4, X1
PADDD X1, X0
PXOR X0, X3
PSHUFB X6, X3
PADDD X3, X2
PXOR X2, X1
MOVO X1, X4
PSLLL $12, X4
PSRLL $20, X1
PXOR X4, X1
PADDD X1, X0
PXOR X0, X3
PSHUFB X7, X3
PSHUFD $0x39, X0, X0
PADDD X3, X2
PSHUFD $0x4e, X3, X3
PXOR X2, X1
PSHUFD $0x93, X2, X2
MOVO X1, X4
PSLLL $7, X4
PSRLL $25, X1
PXOR X4, X1
SUBQ $2, AX
JNZ chacha_blocks_ssse3_mainloop2
PADDD X8, X0
PADDD X9, X1
PADDD X10, X2
PADDD X11, X3
ANDQ SI, SI
JZ chacha_blocks_ssse3_noinput3
MOVOU 0(SI), X12
MOVOU 16(SI), X13
MOVOU 32(SI), X14
MOVOU 48(SI), X15
PXOR X12, X0
PXOR X13, X1
PXOR X14, X2
PXOR X15, X3
ADDQ $64, SI
chacha_blocks_ssse3_noinput3:
MOVOU X0, 0(DX)
MOVOU X1, 16(DX)
MOVOU X2, 32(DX)
MOVOU X3, 48(DX)
PADDQ X5, X11
CMPQ CX, $64
JBE chacha_blocks_ssse3_mainloop2_finishup
ADDQ $64, DX
SUBQ $64, CX
JMP chacha_blocks_ssse3_below256
chacha_blocks_ssse3_mainloop2_finishup:
CMPQ CX, $64
JE chacha_blocks_ssse3_done
ADDQ CX, R9
ADDQ CX, DX
NEGQ CX
chacha_blocks_ssse3_copyoutput:
MOVB (DX)(CX*1), AX
MOVB AX, (R9)(CX*1)
INCQ CX
JNZ chacha_blocks_ssse3_copyoutput
chacha_blocks_ssse3_done:
MOVOU X11, 32(DI)
RET

// func hChaChaSSSE3(key, nonce []byte, dst *byte)
TEXT ·hChaChaSSSE3(SB), NOSPLIT|NOFRAME, $0-56
MOVQ key+0(FP), DI
MOVQ nonce+24(FP), SI
MOVQ dst+48(FP), DX
MOVL $20, CX
LEAQ ·chacha_constants<>(SB), AX
MOVO 0(AX), X0
MOVO 16(AX), X5
MOVO 32(AX), X6
MOVOU 0(DI), X1
MOVOU 16(DI), X2
MOVOU 0(SI), X3
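// HChaCha20 core (SSSE3 variant): same structure as hChaChaAVX2 above,
// emitting state words 0..3 and 12..15 with no final addition.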
hchacha_ssse3_mainloop:
PADDD X1, X0
PXOR X0, X3
PSHUFB X5, X3
PADDD X3, X2
PXOR X2, X1
MOVO X1, X4
PSLLL $12, X1
PSRLL $20, X4
PXOR X4, X1
PADDD X1, X0
PXOR X0, X3
PSHUFB X6, X3
PSHUFD $0x93, X0, X0
PADDD X3, X2
PSHUFD $0x4e, X3, X3
PXOR X2, X1
PSHUFD $0x39, X2, X2
MOVO X1, X4
PSLLL $7, X1
PSRLL $25, X4
PXOR X4, X1
SUBQ $2, CX
PADDD X1, X0
PXOR X0, X3
PSHUFB X5, X3
PADDD X3, X2
PXOR X2, X1
MOVO X1, X4
PSLLL $12, X1
PSRLL $20, X4
PXOR X4, X1
PADDD X1, X0
PXOR X0, X3
PSHUFB X6, X3
PSHUFD $0x39, X0, X0
PADDD X3, X2
PSHUFD $0x4e, X3, X3
PXOR X2, X1
PSHUFD $0x93, X2, X2
MOVO X1, X4
PSLLL $7, X1
PSRLL $25, X4
PXOR X4, X1
JA hchacha_ssse3_mainloop
MOVOU X0, 0(DX)
MOVOU X3, 16(DX)
RET