// Copyright (C) 2019 Yawning Angel
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

// +build !noasm

#include "textflag.h"

DATA ·chacha_constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha_constants<>+0x04(SB)/4, $0x3320646E
DATA ·chacha_constants<>+0x08(SB)/4, $0x79622D32
DATA ·chacha_constants<>+0x0c(SB)/4, $0x6B206574
DATA ·chacha_constants<>+0x10(SB)/8, $0x0504070601000302
DATA ·chacha_constants<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
DATA ·chacha_constants<>+0x20(SB)/8, $0x0605040702010003
DATA ·chacha_constants<>+0x28(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·chacha_constants<>(SB), (NOPTR+RODATA), $48

// func blocksAVX2(s *[api.StateSize]uint32, in, out []byte)
TEXT ·blocksAVX2(SB), NOSPLIT, $576-56
	// This is Andrew Moon's AVX2 ChaCha implementation taken from
	// supercop-20171218, with some minor changes, primarily calling
	// convention and assembly dialect related.

	// Align the stack on a 64 byte boundary.
	MOVQ SP, BP
	ADDQ $64, BP
	ANDQ $-64, BP

	// Go calling convention -> SYSV AMD64 (and a fixup).
	MOVQ s+0(FP), DI       // &s -> DI
	ADDQ $16, DI           // Skip the ChaCha constants in the chachaState.
	MOVQ in+8(FP), SI      // &in[0] -> SI
	MOVQ out+32(FP), DX    // &out[0] -> DX
	MOVQ in_len+16(FP), CX // len(in) -> CX

	// Begin the main body of `chacha_blocks_avx2`.
	//
	// Mostly a direct translation except:
	//  * The number of rounds is always 20.
	//  * %rbp is used instead of %rsp.
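	// For reference, a minimal Go sketch of the ChaCha quarter round
	// that the vectorized loops below unroll (hypothetical helper, not
	// part of this package's API; assumes math/bits):
	//
	//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
	//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
	//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
	//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
	//		return a, b, c, d
	//	}
	//
	// The 16- and 8-bit rotates are done with VPSHUFB byte shuffles
	// (masks stashed at 448(BP) and 480(BP) below), while the 12- and
	// 7-bit rotates use VPSLLD/VPSRLD pairs combined with VPXOR.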
	LEAQ ·chacha_constants<>(SB), AX
	VMOVDQU 0(AX), X8
	VMOVDQU 16(AX), X6
	VMOVDQU 32(AX), X7
	VMOVDQU 0(DI), X9
	VMOVDQU 16(DI), X10
	VMOVDQU 32(DI), X11
	// MOVQ 48(DI), AX
	MOVQ $1, R9
	VMOVDQA X8, 0(BP)
	VMOVDQA X9, 16(BP)
	VMOVDQA X10, 32(BP)
	VMOVDQA X11, 48(BP)
	// MOVQ AX, 64(BP)
	VMOVDQA X6, 448(BP)
	VMOVDQA X6, 464(BP)
	VMOVDQA X7, 480(BP)
	VMOVDQA X7, 496(BP)
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256
	JMP chacha_blocks_avx2_below256

chacha_blocks_avx2_atleast512:
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	LEAQ 5(AX), R11
	LEAQ 6(AX), R12
	LEAQ 7(AX), R13
	LEAQ 8(AX), R14
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	MOVL BX, 16+128(BP)
	MOVL R11, 20+128(BP)
	MOVL R12, 24+128(BP)
	MOVL R13, 28+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	SHRQ $32, BX
	SHRQ $32, R11
	SHRQ $32, R12
	SHRQ $32, R13
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVL BX, 16+160(BP)
	MOVL R11, 20+160(BP)
	MOVL R12, 24+160(BP)
	MOVL R13, 28+160(BP)
	MOVQ R14, 48(BP)
	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), Y0
	VPBROADCASTD 4+0(BP), Y1
	VPBROADCASTD 8+0(BP), Y2
	VPBROADCASTD 12+0(BP), Y3
	VPBROADCASTD 16(BP), Y4
	VPBROADCASTD 4+16(BP), Y5
	VPBROADCASTD 8+16(BP), Y6
	VPBROADCASTD 12+16(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13

chacha_blocks_avx2_mainloop1:
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR Y12, Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPSHUFB 448(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 448(BP), Y14, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $12, Y4, Y12
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $12, Y5, Y12
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $12, Y6, Y12
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $12, Y7, Y12
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y4, Y0
	VPADDD Y1, Y5, Y1
	VPXOR 96(BP), Y0, Y12
	VPXOR Y13, Y1, Y13
	VPADDD Y2, Y6, Y2
	VPADDD Y3, Y7, Y3
	VPXOR Y14, Y2, Y14
	VPXOR Y15, Y3, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPSHUFB 480(BP), Y13, Y13
	VPADDD Y8, Y12, Y8
	VPADDD Y9, Y13, Y9
	VPSHUFB 480(BP), Y14, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPADDD Y10, Y14, Y10
	VPADDD Y11, Y15, Y11
	VMOVDQA Y12, 96(BP)
	VPXOR Y4, Y8, Y4
	VPXOR Y5, Y9, Y5
	VPSLLD $7, Y4, Y12
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y12, Y4
	VPSLLD $7, Y5, Y12
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y12, Y5
	VPXOR Y6, Y10, Y6
	VPXOR Y7, Y11, Y7
	VPSLLD $7, Y6, Y12
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y12, Y6
	VPSLLD $7, Y7, Y12
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y12, Y7
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR Y15, Y0, Y15
	VPXOR 96(BP), Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 448(BP), Y15, Y15
	VPSHUFB 448(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 448(BP), Y13, Y13
	VPSHUFB 448(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $12, Y5, Y15
	VPSRLD $20, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $12, Y6, Y15
	VPSRLD $20, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $12, Y7, Y15
	VPSRLD $20, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $12, Y4, Y15
	VPSRLD $20, Y4, Y4
	VPXOR Y4, Y15, Y4
	VPADDD Y0, Y5, Y0
	VPADDD Y1, Y6, Y1
	VPXOR 96(BP), Y0, Y15
	VPXOR Y12, Y1, Y12
	VPADDD Y2, Y7, Y2
	VPADDD Y3, Y4, Y3
	VPXOR Y13, Y2, Y13
	VPXOR Y14, Y3, Y14
	VPSHUFB 480(BP), Y15, Y15
	VPSHUFB 480(BP), Y12, Y12
	VPADDD Y10, Y15, Y10
	VPADDD Y11, Y12, Y11
	VPSHUFB 480(BP), Y13, Y13
	VPSHUFB 480(BP), Y14, Y14
	VPADDD Y8, Y13, Y8
	VPADDD Y9, Y14, Y9
	VMOVDQA Y15, 96(BP)
	VPXOR Y5, Y10, Y5
	VPXOR Y6, Y11, Y6
	VPSLLD $7, Y5, Y15
	VPSRLD $25, Y5, Y5
	VPXOR Y5, Y15, Y5
	VPSLLD $7, Y6, Y15
	VPSRLD $25, Y6, Y6
	VPXOR Y6, Y15, Y6
	VPXOR Y7, Y8, Y7
	VPXOR Y4, Y9, Y4
	VPSLLD $7, Y7, Y15
	VPSRLD $25, Y7, Y7
	VPXOR Y7, Y15, Y7
	VPSLLD $7, Y4, Y15
	VPSRLD $25, Y4, Y4
	VPXOR Y4, Y15, Y4
	VMOVDQA 96(BP), Y15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop1
	VMOVDQA Y8, 192(BP)
	VMOVDQA Y9, 224(BP)
	VMOVDQA Y10, 256(BP)
	VMOVDQA Y11, 288(BP)
	VMOVDQA Y12, 320(BP)
	VMOVDQA Y13, 352(BP)
	VMOVDQA Y14, 384(BP)
	VMOVDQA Y15, 416(BP)
	VPBROADCASTD 0(BP), Y8
	VPBROADCASTD 4+0(BP), Y9
	VPBROADCASTD 8+0(BP), Y10
	VPBROADCASTD 12+0(BP), Y11
	VPBROADCASTD 16(BP), Y12
	VPBROADCASTD 4+16(BP), Y13
	VPBROADCASTD 8+16(BP), Y14
	VPBROADCASTD 12+16(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput1
	VPXOR 0(SI), Y8, Y8
	VPXOR 64(SI), Y9, Y9
	VPXOR 128(SI), Y10, Y10
	VPXOR 192(SI), Y11, Y11
	VPXOR 256(SI), Y12, Y12
	VPXOR 320(SI), Y13, Y13
	VPXOR 384(SI), Y14, Y14
	VPXOR 448(SI), Y15, Y15
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VPXOR 32(SI), Y8, Y8
	VPXOR 96(SI), Y9, Y9
	VPXOR 160(SI), Y10, Y10
	VPXOR 224(SI), Y11, Y11
	VPXOR 288(SI), Y12, Y12
	VPXOR 352(SI), Y13, Y13
	VPXOR 416(SI), Y14, Y14
	VPXOR 480(SI), Y15, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)
	ADDQ $512, SI
	JMP chacha_blocks_avx2_mainloop1_cont

chacha_blocks_avx2_noinput1:
	VMOVDQU Y8, 0(DX)
	VMOVDQU Y9, 64(DX)
	VMOVDQU Y10, 128(DX)
	VMOVDQU Y11, 192(DX)
	VMOVDQU Y12, 256(DX)
	VMOVDQU Y13, 320(DX)
	VMOVDQU Y14, 384(DX)
	VMOVDQU Y15, 448(DX)
	VMOVDQA 192(BP), Y0
	VMOVDQA 224(BP), Y1
	VMOVDQA 256(BP), Y2
	VMOVDQA 288(BP), Y3
	VMOVDQA 320(BP), Y4
	VMOVDQA 352(BP), Y5
	VMOVDQA 384(BP), Y6
	VMOVDQA 416(BP), Y7
	VPBROADCASTD 32(BP), Y8
	VPBROADCASTD 4+32(BP), Y9
	VPBROADCASTD 8+32(BP), Y10
	VPBROADCASTD 12+32(BP), Y11
	VMOVDQA 128(BP), Y12
	VMOVDQA 160(BP), Y13
	VPBROADCASTD 8+48(BP), Y14
	VPBROADCASTD 12+48(BP), Y15
	VPADDD Y8, Y0, Y0
	VPADDD Y9, Y1, Y1
	VPADDD Y10, Y2, Y2
	VPADDD Y11, Y3, Y3
	VPADDD Y12, Y4, Y4
	VPADDD Y13, Y5, Y5
	VPADDD Y14, Y6, Y6
	VPADDD Y15, Y7, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKLDQ Y3, Y2, Y9
	VPUNPCKHDQ Y1, Y0, Y12
	VPUNPCKHDQ Y3, Y2, Y13
	VPUNPCKLDQ Y5, Y4, Y10
	VPUNPCKLDQ Y7, Y6, Y11
	VPUNPCKHDQ Y5, Y4, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y9, Y8, Y0
	VPUNPCKLQDQ Y11, Y10, Y1
	VPUNPCKHQDQ Y9, Y8, Y2
	VPUNPCKHQDQ Y11, Y10, Y3
	VPUNPCKLQDQ Y13, Y12, Y4
	VPUNPCKLQDQ Y15, Y14, Y5
	VPUNPCKHQDQ Y13, Y12, Y6
	VPUNPCKHQDQ Y15, Y14, Y7
	VPERM2I128 $0x20, Y1, Y0, Y8
	VPERM2I128 $0x20, Y3, Y2, Y9
	VPERM2I128 $0x31, Y1, Y0, Y12
	VPERM2I128 $0x31, Y3, Y2, Y13
	VPERM2I128 $0x20, Y5, Y4, Y10
	VPERM2I128 $0x20, Y7, Y6, Y11
	VPERM2I128 $0x31, Y5, Y4, Y14
	VPERM2I128 $0x31, Y7, Y6, Y15
	VMOVDQU Y8, 32(DX)
	VMOVDQU Y9, 96(DX)
	VMOVDQU Y10, 160(DX)
	VMOVDQU Y11, 224(DX)
	VMOVDQU Y12, 288(DX)
	VMOVDQU Y13, 352(DX)
	VMOVDQU Y14, 416(DX)
	VMOVDQU Y15, 480(DX)

chacha_blocks_avx2_mainloop1_cont:
	ADDQ $512, DX
	SUBQ $512, CX
	CMPQ CX, $512
	JAE chacha_blocks_avx2_atleast512
	CMPQ CX, $256
	JB chacha_blocks_avx2_below256_fixup

chacha_blocks_avx2_atleast256:
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	MOVL AX, 128(BP)
	MOVL R8, 4+128(BP)
	MOVL R9, 8+128(BP)
	MOVL R10, 12+128(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	MOVL AX, 160(BP)
	MOVL R8, 4+160(BP)
	MOVL R9, 8+160(BP)
	MOVL R10, 12+160(BP)
	MOVQ BX, 48(BP)
	// MOVQ 64(BP), AX
	MOVQ $20, AX
	VPBROADCASTD 0(BP), X0
	VPBROADCASTD 4+0(BP), X1
	VPBROADCASTD 8+0(BP), X2
	VPBROADCASTD 12+0(BP), X3
	VPBROADCASTD 16(BP), X4
	VPBROADCASTD 4+16(BP), X5
	VPBROADCASTD 8+16(BP), X6
	VPBROADCASTD 12+16(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15

chacha_blocks_avx2_mainloop2:
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR X12, X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 448(BP), X12, X12
	VPSHUFB 448(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 448(BP), X14, X14
	VPSHUFB 448(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $12, X4, X12
	VPSRLD $20, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $12, X5, X12
	VPSRLD $20, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $12, X6, X12
	VPSRLD $20, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $12, X7, X12
	VPSRLD $20, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X4, X0
	VPADDD X1, X5, X1
	VPXOR 96(BP), X0, X12
	VPXOR X13, X1, X13
	VPADDD X2, X6, X2
	VPADDD X3, X7, X3
	VPXOR X14, X2, X14
	VPXOR X15, X3, X15
	VPSHUFB 480(BP), X12, X12
	VPSHUFB 480(BP), X13, X13
	VPADDD X8, X12, X8
	VPADDD X9, X13, X9
	VPSHUFB 480(BP), X14, X14
	VPSHUFB 480(BP), X15, X15
	VPADDD X10, X14, X10
	VPADDD X11, X15, X11
	VMOVDQA X12, 96(BP)
	VPXOR X4, X8, X4
	VPXOR X5, X9, X5
	VPSLLD $7, X4, X12
	VPSRLD $25, X4, X4
	VPXOR X4, X12, X4
	VPSLLD $7, X5, X12
	VPSRLD $25, X5, X5
	VPXOR X5, X12, X5
	VPXOR X6, X10, X6
	VPXOR X7, X11, X7
	VPSLLD $7, X6, X12
	VPSRLD $25, X6, X6
	VPXOR X6, X12, X6
	VPSLLD $7, X7, X12
	VPSRLD $25, X7, X7
	VPXOR X7, X12, X7
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR X15, X0, X15
	VPXOR 96(BP), X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 448(BP), X15, X15
	VPSHUFB 448(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 448(BP), X13, X13
	VPSHUFB 448(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $12, X5, X15
	VPSRLD $20, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $12, X6, X15
	VPSRLD $20, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $12, X7, X15
	VPSRLD $20, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $12, X4, X15
	VPSRLD $20, X4, X4
	VPXOR X4, X15, X4
	VPADDD X0, X5, X0
	VPADDD X1, X6, X1
	VPXOR 96(BP), X0, X15
	VPXOR X12, X1, X12
	VPADDD X2, X7, X2
	VPADDD X3, X4, X3
	VPXOR X13, X2, X13
	VPXOR X14, X3, X14
	VPSHUFB 480(BP), X15, X15
	VPSHUFB 480(BP), X12, X12
	VPADDD X10, X15, X10
	VPADDD X11, X12, X11
	VPSHUFB 480(BP), X13, X13
	VPSHUFB 480(BP), X14, X14
	VPADDD X8, X13, X8
	VPADDD X9, X14, X9
	VMOVDQA X15, 96(BP)
	VPXOR X5, X10, X5
	VPXOR X6, X11, X6
	VPSLLD $7, X5, X15
	VPSRLD $25, X5, X5
	VPXOR X5, X15, X5
	VPSLLD $7, X6, X15
	VPSRLD $25, X6, X6
	VPXOR X6, X15, X6
	VPXOR X7, X8, X7
	VPXOR X4, X9, X4
	VPSLLD $7, X7, X15
	VPSRLD $25, X7, X7
	VPXOR X7, X15, X7
	VPSLLD $7, X4, X15
	VPSRLD $25, X4, X4
	VPXOR X4, X15, X4
	VMOVDQA 96(BP), X15
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop2
	VMOVDQA X8, 192(BP)
	VMOVDQA X9, 208(BP)
	VMOVDQA X10, 224(BP)
	VMOVDQA X11, 240(BP)
	VMOVDQA X12, 256(BP)
	VMOVDQA X13, 272(BP)
	VMOVDQA X14, 288(BP)
	VMOVDQA X15, 304(BP)
	VPBROADCASTD 0(BP), X8
	VPBROADCASTD 4+0(BP), X9
	VPBROADCASTD 8+0(BP), X10
	VPBROADCASTD 12+0(BP), X11
	VPBROADCASTD 16(BP), X12
	VPBROADCASTD 4+16(BP), X13
	VPBROADCASTD 8+16(BP), X14
	VPBROADCASTD 12+16(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput2
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 64(SI), X2, X2
	VPXOR 80(SI), X3, X3
	VPXOR 128(SI), X4, X4
	VPXOR 144(SI), X5, X5
	VPXOR 192(SI), X6, X6
	VPXOR 208(SI), X7, X7
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VPXOR 32(SI), X0, X0
	VPXOR 48(SI), X1, X1
	VPXOR 96(SI), X2, X2
	VPXOR 112(SI), X3, X3
	VPXOR 160(SI), X4, X4
	VPXOR 176(SI), X5, X5
	VPXOR 224(SI), X6, X6
	VPXOR 240(SI), X7, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)
	ADDQ $256, SI
	JMP chacha_blocks_avx2_mainloop2_cont

chacha_blocks_avx2_noinput2:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 64(DX)
	VMOVDQU X3, 80(DX)
	VMOVDQU X4, 128(DX)
	VMOVDQU X5, 144(DX)
	VMOVDQU X6, 192(DX)
	VMOVDQU X7, 208(DX)
	VMOVDQA 192(BP), X0
	VMOVDQA 208(BP), X1
	VMOVDQA 224(BP), X2
	VMOVDQA 240(BP), X3
	VMOVDQA 256(BP), X4
	VMOVDQA 272(BP), X5
	VMOVDQA 288(BP), X6
	VMOVDQA 304(BP), X7
	VPBROADCASTD 32(BP), X8
	VPBROADCASTD 4+32(BP), X9
	VPBROADCASTD 8+32(BP), X10
	VPBROADCASTD 12+32(BP), X11
	VMOVDQA 128(BP), X12
	VMOVDQA 160(BP), X13
	VPBROADCASTD 8+48(BP), X14
	VPBROADCASTD 12+48(BP), X15
	VPADDD X8, X0, X0
	VPADDD X9, X1, X1
	VPADDD X10, X2, X2
	VPADDD X11, X3, X3
	VPADDD X12, X4, X4
	VPADDD X13, X5, X5
	VPADDD X14, X6, X6
	VPADDD X15, X7, X7
	VPUNPCKLDQ X1, X0, X8
	VPUNPCKLDQ X3, X2, X9
	VPUNPCKHDQ X1, X0, X12
	VPUNPCKHDQ X3, X2, X13
	VPUNPCKLDQ X5, X4, X10
	VPUNPCKLDQ X7, X6, X11
	VPUNPCKHDQ X5, X4, X14
	VPUNPCKHDQ X7, X6, X15
	VPUNPCKLQDQ X9, X8, X0
	VPUNPCKLQDQ X11, X10, X1
	VPUNPCKHQDQ X9, X8, X2
	VPUNPCKHQDQ X11, X10, X3
	VPUNPCKLQDQ X13, X12, X4
	VPUNPCKLQDQ X15, X14, X5
	VPUNPCKHQDQ X13, X12, X6
	VPUNPCKHQDQ X15, X14, X7
	VMOVDQU X0, 32(DX)
	VMOVDQU X1, 48(DX)
	VMOVDQU X2, 96(DX)
	VMOVDQU X3, 112(DX)
	VMOVDQU X4, 160(DX)
	VMOVDQU X5, 176(DX)
	VMOVDQU X6, 224(DX)
	VMOVDQU X7, 240(DX)

chacha_blocks_avx2_mainloop2_cont:
	ADDQ $256, DX
	SUBQ $256, CX
	CMPQ CX, $256
	JAE chacha_blocks_avx2_atleast256

chacha_blocks_avx2_below256_fixup:
	VMOVDQA 448(BP), X6
	VMOVDQA 480(BP), X7
	VMOVDQA 0(BP), X8
	VMOVDQA 16(BP), X9
	VMOVDQA 32(BP), X10
	VMOVDQA 48(BP), X11
	MOVQ $1, R9

chacha_blocks_avx2_below256:
	VMOVQ R9, X5
	ANDQ CX, CX
	JZ chacha_blocks_avx2_done
	CMPQ CX, $64
	JAE chacha_blocks_avx2_above63
	MOVQ DX, R9
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput3
	MOVQ CX, R10
	MOVQ BP, DX
	ADDQ R10, SI
	ADDQ R10, DX
	NEGQ R10

chacha_blocks_avx2_copyinput:
	MOVB (SI)(R10*1), AX
	MOVB AX, (DX)(R10*1)
	INCQ R10
	JNZ chacha_blocks_avx2_copyinput
	MOVQ BP, SI

chacha_blocks_avx2_noinput3:
	MOVQ BP, DX

chacha_blocks_avx2_above63:
	VMOVDQA X8, X0
	VMOVDQA X9, X1
	VMOVDQA X10, X2
	VMOVDQA X11, X3
	// MOVQ 64(BP), AX
	MOVQ $20, AX

chacha_blocks_avx2_mainloop3:
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x93, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x39, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X7, X3, X3
	VPSHUFD $0x39, X0, X0
	VPADDD X2, X3, X2
	VPSHUFD $0x4e, X3, X3
	VPXOR X1, X2, X1
	VPSHUFD $0x93, X2, X2
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPXOR X1, X4, X1
	SUBQ $2, AX
	JNZ chacha_blocks_avx2_mainloop3
	VPADDD X0, X8, X0
	VPADDD X1, X9, X1
	VPADDD X2, X10, X2
	VPADDD X3, X11, X3
	ANDQ SI, SI
	JZ chacha_blocks_avx2_noinput4
	VPXOR 0(SI), X0, X0
	VPXOR 16(SI), X1, X1
	VPXOR 32(SI), X2, X2
	VPXOR 48(SI), X3, X3
	ADDQ $64, SI

chacha_blocks_avx2_noinput4:
	VMOVDQU X0, 0(DX)
	VMOVDQU X1, 16(DX)
	VMOVDQU X2, 32(DX)
	VMOVDQU X3, 48(DX)
	VPADDQ X11, X5, X11
	CMPQ CX, $64
	JBE chacha_blocks_avx2_mainloop3_finishup
	ADDQ $64, DX
	SUBQ $64, CX
	JMP chacha_blocks_avx2_below256

chacha_blocks_avx2_mainloop3_finishup:
	CMPQ CX, $64
	JE chacha_blocks_avx2_done
	ADDQ CX, R9
	ADDQ CX, DX
	NEGQ CX

chacha_blocks_avx2_copyoutput:
	MOVB (DX)(CX*1), AX
	MOVB AX, (R9)(CX*1)
	INCQ CX
	JNZ chacha_blocks_avx2_copyoutput

chacha_blocks_avx2_done:
	VMOVDQU X11, 32(DI)
	VZEROUPPER
	RET

// func hChaChaAVX2(key, nonce []byte, dst *byte)
TEXT ·hChaChaAVX2(SB), NOSPLIT|NOFRAME, $0-56
	MOVQ key+0(FP), DI
	MOVQ nonce+24(FP), SI
	MOVQ dst+48(FP), DX
	MOVL $20, CX
	LEAQ ·chacha_constants<>(SB), AX
	VMOVDQA 0(AX), X0
	VMOVDQA 16(AX), X6
	VMOVDQA 32(AX), X5
	VMOVDQU 0(DI), X1
	VMOVDQU 16(DI), X2
	VMOVDQU 0(SI), X3

hchacha_mainloop_avx2:
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X5, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $7, X1, X4
	VPSRLD $25, X1, X1
	VPSHUFD $0x93, X0, X0
	VPXOR X1, X4, X1
	VPSHUFD $0x4e, X3, X3
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X6, X3, X3
	VPSHUFD $0x39, X2, X2
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSLLD $12, X1, X4
	VPSRLD $20, X1, X1
	VPXOR X1, X4, X1
	VPADDD X0, X1, X0
	VPXOR X3, X0, X3
	VPSHUFB X5, X3, X3
	VPADDD X2, X3, X2
	VPXOR X1, X2, X1
	VPSHUFD $0x39, X0, X0
	VPSLLD $7, X1, X4
	VPSHUFD $0x4e, X3, X3
	VPSRLD $25, X1, X1
	VPSHUFD $0x93, X2, X2
	VPXOR X1, X4, X1
	SUBL $2, CX
	JNE hchacha_mainloop_avx2
	VMOVDQU X0, (DX)
	VMOVDQU X3, 16(DX)
	VZEROUPPER
	RET

// func blocksSSSE3(s *[api.StateSize]uint32, in, out []byte)
TEXT ·blocksSSSE3(SB), NOSPLIT, $576-56
	// This is Andrew Moon's SSSE3 ChaCha implementation taken from
	// supercop-20190110, with some minor changes, primarily calling
	// convention and assembly dialect related.

	// Align the stack on a 64 byte boundary.
	MOVQ SP, BP
	ADDQ $64, BP
	ANDQ $-64, BP

	// Go calling convention -> SYSV AMD64 (and a fixup).
	MOVQ s+0(FP), DI       // &s -> DI
	ADDQ $16, DI           // Skip the ChaCha constants in the chachaState.
	MOVQ in+8(FP), SI      // &in[0] -> SI
	MOVQ out+32(FP), DX    // &out[0] -> DX
	MOVQ in_len+16(FP), CX // len(in) -> CX

	// Begin the main body of `chacha_blocks_ssse3`.
	//
	// Mostly a direct translation except:
	//  * The number of rounds is always 20.
	//  * %rbp is used instead of %rsp.
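	// Unlike the AVX2 path, the SSSE3 instructions here have no
	// non-destructive three-operand forms, so each 12- and 7-bit rotate
	// below is a MOVO copy plus a PSLLL/PSRLL pair folded back together
	// with PXOR. A hedged Go sketch of the same idiom (illustrative
	// helper only, not part of this package):
	//
	//	func rotl32(x uint32, n uint) uint32 {
	//		tmp := x
	//		x <<= n
	//		tmp >>= 32 - n
	//		return x ^ tmp // the shifted halves don't overlap, so XOR == OR
	//	}
	//
	// The 16- and 8-bit rotates again use PSHUFB, with the shuffle masks
	// kept at 80(BP) and 96(BP).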
	LEAQ ·chacha_constants<>(SB), AX
	MOVO 0(AX), X8
	MOVO 16(AX), X6
	MOVO 32(AX), X7
	MOVOU 0(DI), X9
	MOVOU 16(DI), X10
	MOVOU 32(DI), X11
	// MOVQ 48(DI), AX
	MOVQ $1, R9
	MOVO X8, 0(BP)
	MOVO X9, 16(BP)
	MOVO X10, 32(BP)
	MOVO X11, 48(BP)
	MOVO X6, 80(BP)
	MOVO X7, 96(BP)
	// MOVQ AX, 64(BP)
	CMPQ CX, $256
	JB chacha_blocks_ssse3_below256
	PSHUFD $0x00, X8, X0
	PSHUFD $0x55, X8, X1
	PSHUFD $0xaa, X8, X2
	PSHUFD $0xff, X8, X3
	MOVO X0, 128(BP)
	MOVO X1, 144(BP)
	MOVO X2, 160(BP)
	MOVO X3, 176(BP)
	PSHUFD $0x00, X9, X0
	PSHUFD $0x55, X9, X1
	PSHUFD $0xaa, X9, X2
	PSHUFD $0xff, X9, X3
	MOVO X0, 192(BP)
	MOVO X1, 208(BP)
	MOVO X2, 224(BP)
	MOVO X3, 240(BP)
	PSHUFD $0x00, X10, X0
	PSHUFD $0x55, X10, X1
	PSHUFD $0xaa, X10, X2
	PSHUFD $0xff, X10, X3
	MOVO X0, 256(BP)
	MOVO X1, 272(BP)
	MOVO X2, 288(BP)
	MOVO X3, 304(BP)
	PSHUFD $0xaa, X11, X0
	PSHUFD $0xff, X11, X1
	MOVO X0, 352(BP)
	MOVO X1, 368(BP)
	JMP chacha_blocks_ssse3_atleast256

	// .p2align 6,,63
	// # align to 4 mod 64
	// nop;nop;nop;nop;

chacha_blocks_ssse3_atleast256:
	MOVQ 48(BP), AX
	LEAQ 1(AX), R8
	LEAQ 2(AX), R9
	LEAQ 3(AX), R10
	LEAQ 4(AX), BX
	MOVL AX, 320(BP)
	MOVL R8, 4+320(BP)
	MOVL R9, 8+320(BP)
	MOVL R10, 12+320(BP)
	SHRQ $32, AX
	SHRQ $32, R8
	SHRQ $32, R9
	SHRQ $32, R10
	MOVL AX, 336(BP)
	MOVL R8, 4+336(BP)
	MOVL R9, 8+336(BP)
	MOVL R10, 12+336(BP)
	MOVQ BX, 48(BP)
	// MOVQ 64(BP), AX
	MOVQ $20, AX
	MOVO 128(BP), X0
	MOVO 144(BP), X1
	MOVO 160(BP), X2
	MOVO 176(BP), X3
	MOVO 192(BP), X4
	MOVO 208(BP), X5
	MOVO 224(BP), X6
	MOVO 240(BP), X7
	MOVO 256(BP), X8
	MOVO 272(BP), X9
	MOVO 288(BP), X10
	MOVO 304(BP), X11
	MOVO 320(BP), X12
	MOVO 336(BP), X13
	MOVO 352(BP), X14
	MOVO 368(BP), X15

chacha_blocks_ssse3_mainloop1:
	PADDD X4, X0
	PADDD X5, X1
	PXOR X0, X12
	PXOR X1, X13
	PADDD X6, X2
	PADDD X7, X3
	PXOR X2, X14
	PXOR X3, X15
	PSHUFB 80(BP), X12
	PSHUFB 80(BP), X13
	PADDD X12, X8
	PADDD X13, X9
	PSHUFB 80(BP), X14
	PSHUFB 80(BP), X15
	PADDD X14, X10
	PADDD X15, X11
	MOVO X12, 112(BP)
	PXOR X8, X4
	PXOR X9, X5
	MOVO X4, X12
	PSLLL $12, X4
	PSRLL $20, X12
	PXOR X12, X4
	MOVO X5, X12
	PSLLL $12, X5
	PSRLL $20, X12
	PXOR X12, X5
	PXOR X10, X6
	PXOR X11, X7
	MOVO X6, X12
	PSLLL $12, X6
	PSRLL $20, X12
	PXOR X12, X6
	MOVO X7, X12
	PSLLL $12, X7
	PSRLL $20, X12
	PXOR X12, X7
	MOVO 112(BP), X12
	PADDD X4, X0
	PADDD X5, X1
	PXOR X0, X12
	PXOR X1, X13
	PADDD X6, X2
	PADDD X7, X3
	PXOR X2, X14
	PXOR X3, X15
	PSHUFB 96(BP), X12
	PSHUFB 96(BP), X13
	PADDD X12, X8
	PADDD X13, X9
	PSHUFB 96(BP), X14
	PSHUFB 96(BP), X15
	PADDD X14, X10
	PADDD X15, X11
	MOVO X12, 112(BP)
	PXOR X8, X4
	PXOR X9, X5
	MOVO X4, X12
	PSLLL $7, X4
	PSRLL $25, X12
	PXOR X12, X4
	MOVO X5, X12
	PSLLL $7, X5
	PSRLL $25, X12
	PXOR X12, X5
	PXOR X10, X6
	PXOR X11, X7
	MOVO X6, X12
	PSLLL $7, X6
	PSRLL $25, X12
	PXOR X12, X6
	MOVO X7, X12
	PSLLL $7, X7
	PSRLL $25, X12
	PXOR X12, X7
	MOVO 112(BP), X12
	PADDD X5, X0
	PADDD X6, X1
	PXOR X0, X15
	PXOR X1, X12
	PADDD X7, X2
	PADDD X4, X3
	PXOR X2, X13
	PXOR X3, X14
	PSHUFB 80(BP), X15
	PSHUFB 80(BP), X12
	PADDD X15, X10
	PADDD X12, X11
	PSHUFB 80(BP), X13
	PSHUFB 80(BP), X14
	PADDD X13, X8
	PADDD X14, X9
	MOVO X15, 112(BP)
	PXOR X10, X5
	PXOR X11, X6
	MOVO X5, X15
	PSLLL $12, X5
	PSRLL $20, X15
	PXOR X15, X5
	MOVO X6, X15
	PSLLL $12, X6
	PSRLL $20, X15
	PXOR X15, X6
	PXOR X8, X7
	PXOR X9, X4
	MOVO X7, X15
	PSLLL $12, X7
	PSRLL $20, X15
	PXOR X15, X7
	MOVO X4, X15
	PSLLL $12, X4
	PSRLL $20, X15
	PXOR X15, X4
	MOVO 112(BP), X15
	PADDD X5, X0
	PADDD X6, X1
	PXOR X0, X15
	PXOR X1, X12
	PADDD X7, X2
	PADDD X4, X3
	PXOR X2, X13
	PXOR X3, X14
	PSHUFB 96(BP), X15
	PSHUFB 96(BP), X12
	PADDD X15, X10
	PADDD X12, X11
	PSHUFB 96(BP), X13
	PSHUFB 96(BP), X14
	PADDD X13, X8
	PADDD X14, X9
	MOVO X15, 112(BP)
	PXOR X10, X5
	PXOR X11, X6
	MOVO X5, X15
	PSLLL $7, X5
	PSRLL $25, X15
	PXOR X15, X5
	MOVO X6, X15
	PSLLL $7, X6
	PSRLL $25, X15
	PXOR X15, X6
	PXOR X8, X7
	PXOR X9, X4
	MOVO X7, X15
	PSLLL $7, X7
	PSRLL $25, X15
	PXOR X15, X7
	MOVO X4, X15
	PSLLL $7, X4
	PSRLL $25, X15
	PXOR X15, X4
	SUBQ $2, AX
	MOVO 112(BP), X15
	JNZ chacha_blocks_ssse3_mainloop1
	PADDD 128(BP), X0
	PADDD 144(BP), X1
	PADDD 160(BP), X2
	PADDD 176(BP), X3
	PADDD 192(BP), X4
	PADDD 208(BP), X5
	PADDD 224(BP), X6
	PADDD 240(BP), X7
	PADDD 256(BP), X8
	PADDD 272(BP), X9
	PADDD 288(BP), X10
	PADDD 304(BP), X11
	PADDD 320(BP), X12
	PADDD 336(BP), X13
	PADDD 352(BP), X14
	PADDD 368(BP), X15
	MOVO X8, 384(BP)
	MOVO X9, 400(BP)
	MOVO X10, 416(BP)
	MOVO X11, 432(BP)
	MOVO X12, 448(BP)
	MOVO X13, 464(BP)
	MOVO X14, 480(BP)
	MOVO X15, 496(BP)
	MOVO X0, X8
	MOVO X2, X9
	MOVO X4, X10
	MOVO X6, X11
	PUNPCKHLQ X1, X0
	PUNPCKHLQ X3, X2
	PUNPCKHLQ X5, X4
	PUNPCKHLQ X7, X6
	PUNPCKLLQ X1, X8
	PUNPCKLLQ X3, X9
	PUNPCKLLQ X5, X10
	PUNPCKLLQ X7, X11
	MOVO X0, X1
	MOVO X4, X3
	MOVO X8, X5
	MOVO X10, X7
	PUNPCKHQDQ X2, X0
	PUNPCKHQDQ X6, X4
	PUNPCKHQDQ X9, X8
	PUNPCKHQDQ X11, X10
	PUNPCKLQDQ X2, X1
	PUNPCKLQDQ X6, X3
	PUNPCKLQDQ X9, X5
	PUNPCKLQDQ X11, X7
	ANDQ SI, SI
	JZ chacha_blocks_ssse3_noinput1
	MOVOU 0(SI), X2
	MOVOU 16(SI), X6
	MOVOU 64(SI), X9
	MOVOU 80(SI), X11
	MOVOU 128(SI), X12
	MOVOU 144(SI), X13
	MOVOU 192(SI), X14
	MOVOU 208(SI), X15
	PXOR X2, X5
	PXOR X6, X7
	PXOR X9, X8
	PXOR X11, X10
	PXOR X12, X1
	PXOR X13, X3
	PXOR X14, X0
	PXOR X15, X4
	MOVOU X5, 0(DX)
	MOVOU X7, 16(DX)
	MOVOU X8, 64(DX)
	MOVOU X10, 80(DX)
	MOVOU X1, 128(DX)
	MOVOU X3, 144(DX)
	MOVOU X0, 192(DX)
	MOVOU X4, 208(DX)
	MOVO 384(BP), X0
	MOVO 400(BP), X1
	MOVO 416(BP), X2
	MOVO 432(BP), X3
	MOVO 448(BP), X4
	MOVO 464(BP), X5
	MOVO 480(BP), X6
	MOVO 496(BP), X7
	MOVO X0, X8
	MOVO X2, X9
	MOVO X4, X10
	MOVO X6, X11
	PUNPCKLLQ X1, X8
	PUNPCKLLQ X3, X9
	PUNPCKHLQ X1, X0
	PUNPCKHLQ X3, X2
	PUNPCKLLQ X5, X10
	PUNPCKLLQ X7, X11
	PUNPCKHLQ X5, X4
	PUNPCKHLQ X7, X6
	MOVO X8, X1
	MOVO X0, X3
	MOVO X10, X5
	MOVO X4, X7
	PUNPCKLQDQ X9, X1
	PUNPCKLQDQ X11, X5
	PUNPCKHQDQ X9, X8
	PUNPCKHQDQ X11, X10
	PUNPCKLQDQ X2, X3
	PUNPCKLQDQ X6, X7
	PUNPCKHQDQ X2, X0
	PUNPCKHQDQ X6, X4
	MOVOU 32(SI), X2
	MOVOU 48(SI), X6
	MOVOU 96(SI), X9
	MOVOU 112(SI), X11
	MOVOU 160(SI), X12
	MOVOU 176(SI), X13
	MOVOU 224(SI), X14
	MOVOU 240(SI), X15
	PXOR X2, X1
	PXOR X6, X5
	PXOR X9, X8
	PXOR X11, X10
	PXOR X12, X3
	PXOR X13, X7
	PXOR X14, X0
	PXOR X15, X4
	MOVOU X1, 32(DX)
	MOVOU X5, 48(DX)
	MOVOU X8, 96(DX)
	MOVOU X10, 112(DX)
	MOVOU X3, 160(DX)
	MOVOU X7, 176(DX)
	MOVOU X0, 224(DX)
	MOVOU X4, 240(DX)
	ADDQ $256, SI
	JMP chacha_blocks_ssse3_mainloop_cont

chacha_blocks_ssse3_noinput1:
	MOVOU X5, 0(DX)
	MOVOU X7, 16(DX)
	MOVOU X8, 64(DX)
	MOVOU X10, 80(DX)
	MOVOU X1, 128(DX)
	MOVOU X3, 144(DX)
	MOVOU X0, 192(DX)
	MOVOU X4, 208(DX)
	MOVO 384(BP), X0
	MOVO 400(BP), X1
	MOVO 416(BP), X2
	MOVO 432(BP), X3
	MOVO 448(BP), X4
	MOVO 464(BP), X5
	MOVO 480(BP), X6
	MOVO 496(BP), X7
	MOVO X0, X8
	MOVO X2, X9
	MOVO X4, X10
	MOVO X6, X11
	PUNPCKLLQ X1, X8
	PUNPCKLLQ X3, X9
	PUNPCKHLQ X1, X0
	PUNPCKHLQ X3, X2
	PUNPCKLLQ X5, X10
	PUNPCKLLQ X7, X11
	PUNPCKHLQ X5, X4
	PUNPCKHLQ X7, X6
	MOVO X8, X1
	MOVO X0, X3
	MOVO X10, X5
	MOVO X4, X7
	PUNPCKLQDQ X9, X1
	PUNPCKLQDQ X11, X5
	PUNPCKHQDQ X9, X8
	PUNPCKHQDQ X11, X10
	PUNPCKLQDQ X2, X3
	PUNPCKLQDQ X6, X7
	PUNPCKHQDQ X2, X0
	PUNPCKHQDQ X6, X4
	MOVOU X1, 32(DX)
	MOVOU X5, 48(DX)
	MOVOU X8, 96(DX)
	MOVOU X10, 112(DX)
	MOVOU X3, 160(DX)
	MOVOU X7, 176(DX)
	MOVOU X0, 224(DX)
	MOVOU X4, 240(DX)

chacha_blocks_ssse3_mainloop_cont:
	ADDQ $256, DX
	SUBQ $256, CX
	CMPQ CX, $256
	JAE chacha_blocks_ssse3_atleast256
	MOVO 80(BP), X6
	MOVO 96(BP), X7
	MOVO 0(BP), X8
	MOVO 16(BP), X9
	MOVO 32(BP), X10
	MOVO 48(BP), X11
	MOVQ $1, R9

chacha_blocks_ssse3_below256:
	MOVQ R9, X5
	ANDQ CX, CX
	JZ chacha_blocks_ssse3_done
	CMPQ CX, $64
	JAE chacha_blocks_ssse3_above63
	MOVQ DX, R9
	ANDQ SI, SI
	JZ chacha_blocks_ssse3_noinput2
	MOVQ CX, R10
	MOVQ BP, DX
	ADDQ R10, SI
	ADDQ R10, DX
	NEGQ R10

chacha_blocks_ssse3_copyinput:
	MOVB (SI)(R10*1), AX
	MOVB AX, (DX)(R10*1)
	INCQ R10
	JNZ chacha_blocks_ssse3_copyinput
	MOVQ BP, SI

chacha_blocks_ssse3_noinput2:
	MOVQ BP, DX

chacha_blocks_ssse3_above63:
	MOVO X8, X0
	MOVO X9, X1
	MOVO X10, X2
	MOVO X11, X3
	// MOVQ 64(BP), AX
	MOVQ $20, AX

chacha_blocks_ssse3_mainloop2:
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X6, X3
	PADDD X3, X2
	PXOR X2, X1
	MOVO X1, X4
	PSLLL $12, X4
	PSRLL $20, X1
	PXOR X4, X1
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X7, X3
	PSHUFD $0x93, X0, X0
	PADDD X3, X2
	PSHUFD $0x4e, X3, X3
	PXOR X2, X1
	PSHUFD $0x39, X2, X2
	MOVO X1, X4
	PSLLL $7, X4
	PSRLL $25, X1
	PXOR X4, X1
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X6, X3
	PADDD X3, X2
	PXOR X2, X1
	MOVO X1, X4
	PSLLL $12, X4
	PSRLL $20, X1
	PXOR X4, X1
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X7, X3
	PSHUFD $0x39, X0, X0
	PADDD X3, X2
	PSHUFD $0x4e, X3, X3
	PXOR X2, X1
	PSHUFD $0x93, X2, X2
	MOVO X1, X4
	PSLLL $7, X4
	PSRLL $25, X1
	PXOR X4, X1
	SUBQ $2, AX
	JNZ chacha_blocks_ssse3_mainloop2
	PADDD X8, X0
	PADDD X9, X1
	PADDD X10, X2
	PADDD X11, X3
	ANDQ SI, SI
	JZ chacha_blocks_ssse3_noinput3
	MOVOU 0(SI), X12
	MOVOU 16(SI), X13
	MOVOU 32(SI), X14
	MOVOU 48(SI), X15
	PXOR X12, X0
	PXOR X13, X1
	PXOR X14, X2
	PXOR X15, X3
	ADDQ $64, SI

chacha_blocks_ssse3_noinput3:
	MOVOU X0, 0(DX)
	MOVOU X1, 16(DX)
	MOVOU X2, 32(DX)
	MOVOU X3, 48(DX)
	PADDQ X5, X11
	CMPQ CX, $64
	JBE chacha_blocks_ssse3_mainloop2_finishup
	ADDQ $64, DX
	SUBQ $64, CX
	JMP chacha_blocks_ssse3_below256

chacha_blocks_ssse3_mainloop2_finishup:
	CMPQ CX, $64
	JE chacha_blocks_ssse3_done
	ADDQ CX, R9
	ADDQ CX, DX
	NEGQ CX

chacha_blocks_ssse3_copyoutput:
	MOVB (DX)(CX*1), AX
	MOVB AX, (R9)(CX*1)
	INCQ CX
	JNZ chacha_blocks_ssse3_copyoutput

chacha_blocks_ssse3_done:
	MOVOU X11, 32(DI)
	RET

// func hChaChaSSSE3(key, nonce []byte, dst *byte)
TEXT ·hChaChaSSSE3(SB), NOSPLIT|NOFRAME, $0-56
	MOVQ key+0(FP), DI
	MOVQ nonce+24(FP), SI
	MOVQ dst+48(FP), DX
	MOVL $20, CX
	LEAQ ·chacha_constants<>(SB), AX
	MOVO 0(AX), X0
	MOVO 16(AX), X5
	MOVO 32(AX), X6
	MOVOU 0(DI), X1
	MOVOU 16(DI), X2
	MOVOU 0(SI), X3

hchacha_ssse3_mainloop:
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X5, X3
	PADDD X3, X2
	PXOR X2, X1
	MOVO X1, X4
	PSLLL $12, X1
	PSRLL $20, X4
	PXOR X4, X1
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X6, X3
	PSHUFD $0x93, X0, X0
	PADDD X3, X2
	PSHUFD $0x4e, X3, X3
	PXOR X2, X1
	PSHUFD $0x39, X2, X2
	MOVO X1, X4
	PSLLL $7, X1
	PSRLL $25, X4
	PXOR X4, X1
	SUBQ $2, CX
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X5, X3
	PADDD X3, X2
	PXOR X2, X1
	MOVO X1, X4
	PSLLL $12, X1
	PSRLL $20, X4
	PXOR X4, X1
	PADDD X1, X0
	PXOR X0, X3
	PSHUFB X6, X3
	PSHUFD $0x39, X0, X0
	PADDD X3, X2
	PSHUFD $0x4e, X3, X3
	PXOR X2, X1
	PSHUFD $0x93, X2, X2
	MOVO X1, X4
	PSLLL $7, X1
	PSRLL $25, X4
	PXOR X4, X1
	JA hchacha_ssse3_mainloop
	MOVOU X0, 0(DX)
	MOVOU X3, 16(DX)
	RET
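// HChaCha keeps only the first and last rows of the final state (X0 and X3
// above), with no feed-forward addition, which is why both hChaCha routines
// store just 32 bytes to dst. A hedged sketch of how the Go side might
// declare and call these stubs (the actual declarations live in this
// package's Go sources, and the names here mirror the `// func` comments):
//
//	//go:noescape
//	func hChaChaSSSE3(key, nonce []byte, dst *byte)
//
//	var subKey [32]byte
//	hChaChaSSSE3(key[:], nonce[:16], &subKey[0])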