#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to chacha20, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# https://creativecommons.org/publicdomain/zero/1.0/ for full details.
#
# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
# The least terrible option is to use a Python code generator, so that's
# what I did.
#
# Code based on Ted Krovetz's vec128 C implementation, with corrections
# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
# output pointers.
#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

x = Argument(ptr(uint32_t))
inp = Argument(ptr(const_uint8_t))
outp = Argument(ptr(uint8_t))
nrBlocks = Argument(ptr(size_t))

#
# SSE2 helper functions.  A temporary register is explicitly passed in because
# the main fast loop uses every single register (and even spills) so manual
# control is needed.
#
# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
# in the C code, but the C code has the luxury of an optimizer reordering
# everything, while this does not.
#

def ROTW16_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 16)
    PSRLD(d, 16)
    PXOR(d, tmp)

def ROTW12_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 12)
    PSRLD(b, 20)
    PXOR(b, tmp)

def ROTW8_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 8)
    PSRLD(d, 24)
    PXOR(d, tmp)

def ROTW7_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 7)
    PSRLD(b, 25)
    PXOR(b, tmp)

def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
    MOVDQU(tmp, [inp+d])
    PXOR(tmp, v0)
    MOVDQU([outp+d], tmp)
    MOVDQU(tmp, [inp+d+16])
    PXOR(tmp, v1)
    MOVDQU([outp+d+16], tmp)
    MOVDQU(tmp, [inp+d+32])
    PXOR(tmp, v2)
    MOVDQU([outp+d+32], tmp)
    MOVDQU(tmp, [inp+d+48])
    PXOR(tmp, v3)
    MOVDQU([outp+d+48], tmp)

# SSE2 ChaCha20 (aka vec128).  Does not handle partial blocks, and will
# process 4/2/1 blocks at a time.
with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 0x20)

    # Build the counter increment vector on the stack, and allocate the
    # scratch space.
    xmm_v0 = XMMRegister()
    PXOR(xmm_v0, xmm_v0)
    SUB(registers.rsp, 16+16)
    MOVDQA([registers.rsp], xmm_v0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp], reg_tmp)
    mem_one = [registers.rsp]      # (Stack) Counter increment vector
    mem_tmp0 = [registers.rsp+16]  # (Stack) Scratch space.

    mem_s0 = [reg_x]           # (Memory) Cipher state [0..3]
    mem_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
    mem_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
    mem_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]

    # xmm_v0 allocated above...
    xmm_v1 = XMMRegister()
    xmm_v2 = XMMRegister()
    xmm_v3 = XMMRegister()

    xmm_v4 = XMMRegister()
    xmm_v5 = XMMRegister()
    xmm_v6 = XMMRegister()
    xmm_v7 = XMMRegister()

    xmm_v8 = XMMRegister()
    xmm_v9 = XMMRegister()
    xmm_v10 = XMMRegister()
    xmm_v11 = XMMRegister()

    xmm_v12 = XMMRegister()
    xmm_v13 = XMMRegister()
    xmm_v14 = XMMRegister()
    xmm_v15 = XMMRegister()

    xmm_tmp = xmm_v12

    #
    # 4 blocks at a time.
    #
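    # For reference, the scalar quarter round that the PADDD/PXOR/ROTW*
    # sequences below vectorize (a sketch, not used by the generator):
    #
    #   def quarter_round(a, b, c, d):
    #       a = (a + b) & 0xffffffff; d ^= a; d = ((d << 16) | (d >> 16)) & 0xffffffff
    #       c = (c + d) & 0xffffffff; b ^= c; b = ((b << 12) | (b >> 20)) & 0xffffffff
    #       a = (a + b) & 0xffffffff; d ^= a; d = ((d << 8) | (d >> 24)) & 0xffffffff
    #       c = (c + d) & 0xffffffff; b ^= c; b = ((b << 7) | (b >> 25)) & 0xffffffff
    #       return a, b, c, d
    #
    # Each group xmm_v0..v3 / v4..v7 / v8..v11 / v12..v15 holds the a/b/c/d
    # rows of one block, so the same arithmetic runs on four blocks at once.
    # Note that xmm_tmp aliases xmm_v12, which is why the loop below spills
    # it to mem_tmp0 around the ROTW16/ROTW8 steps.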
    reg_rounds = GeneralPurposeRegister64()

    vector_loop4 = Loop()
    SUB(reg_blocks, 4)
    JB(vector_loop4.end)
    with vector_loop4:
        MOVDQU(xmm_v0, mem_s0)
        MOVDQU(xmm_v1, mem_s1)
        MOVDQU(xmm_v2, mem_s2)
        MOVDQU(xmm_v3, mem_s3)

        MOVDQA(xmm_v4, xmm_v0)
        MOVDQA(xmm_v5, xmm_v1)
        MOVDQA(xmm_v6, xmm_v2)
        MOVDQA(xmm_v7, xmm_v3)
        PADDQ(xmm_v7, mem_one)

        MOVDQA(xmm_v8, xmm_v0)
        MOVDQA(xmm_v9, xmm_v1)
        MOVDQA(xmm_v10, xmm_v2)
        MOVDQA(xmm_v11, xmm_v7)
        PADDQ(xmm_v11, mem_one)

        MOVDQA(xmm_v12, xmm_v0)
        MOVDQA(xmm_v13, xmm_v1)
        MOVDQA(xmm_v14, xmm_v2)
        MOVDQA(xmm_v15, xmm_v11)
        PADDQ(xmm_v15, mem_one)

        MOV(reg_rounds, 20)
        rounds_loop4 = Loop()
        with rounds_loop4:
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x39)
            PSHUFD(xmm_v5, xmm_v5, 0x39)
            PSHUFD(xmm_v9, xmm_v9, 0x39)
            PSHUFD(xmm_v13, xmm_v13, 0x39)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x93)
            PSHUFD(xmm_v7, xmm_v7, 0x93)
            PSHUFD(xmm_v11, xmm_v11, 0x93)
            PSHUFD(xmm_v15, xmm_v15, 0x93)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)
            MOVDQA(mem_tmp0, xmm_tmp) # Save
            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)
            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x93)
            PSHUFD(xmm_v5, xmm_v5, 0x93)
            PSHUFD(xmm_v9, xmm_v9, 0x93)
            PSHUFD(xmm_v13, xmm_v13, 0x93)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x39)
            PSHUFD(xmm_v7, xmm_v7, 0x39)
            PSHUFD(xmm_v11, xmm_v11, 0x39)
            PSHUFD(xmm_v15, xmm_v15, 0x39)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop4.begin)

        MOVDQA(mem_tmp0, xmm_tmp)

        PADDD(xmm_v0, mem_s0)
        PADDD(xmm_v1, mem_s1)
        PADDD(xmm_v2, mem_s2)
        PADDD(xmm_v3, mem_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
        MOVDQU(xmm_v3, mem_s3)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v4, mem_s0)
        PADDD(xmm_v5, mem_s1)
        PADDD(xmm_v6, mem_s2)
        PADDD(xmm_v7, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v8, mem_s0)
        PADDD(xmm_v9, mem_s1)
        PADDD(xmm_v10, mem_s2)
        PADDD(xmm_v11, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
        PADDQ(xmm_v3, mem_one)

        MOVDQA(xmm_tmp, mem_tmp0)

        PADDD(xmm_v12, mem_s0)
        PADDD(xmm_v13, mem_s1)
        PADDD(xmm_v14, mem_s2)
        PADDD(xmm_v15, xmm_v3)
        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
        PADDQ(xmm_v3, mem_one)

        MOVDQU(mem_s3, xmm_v3)

        ADD(reg_inp, 4 * 64)
        ADD(reg_outp, 4 * 64)

        SUB(reg_blocks, 4)
        JAE(vector_loop4.begin)

    ADD(reg_blocks, 4)
    out = Label()
    JZ(out)

    # Past this point, we no longer need to use every single register to hold
    # the in progress state.
    xmm_s0 = xmm_v8
    xmm_s1 = xmm_v9
    xmm_s2 = xmm_v10
    xmm_s3 = xmm_v11
    xmm_one = xmm_v13
    MOVDQU(xmm_s0, mem_s0)
    MOVDQU(xmm_s1, mem_s1)
    MOVDQU(xmm_s2, mem_s2)
    MOVDQU(xmm_s3, mem_s3)
    MOVDQA(xmm_one, mem_one)

    #
    # 2 blocks at a time.
    #

    process_1_block = Label()
    SUB(reg_blocks, 2)
    JB(process_1_block) # < 2 blocks remaining.
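    # Two independent copies of the state are run side by side: xmm_v0..v3
    # for block n and xmm_v4..v7 for block n+1, whose 64 bit counter is
    # bumped with the PADDQ below.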
    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)
    MOVDQA(xmm_v4, xmm_v0)
    MOVDQA(xmm_v5, xmm_v1)
    MOVDQA(xmm_v6, xmm_v2)
    MOVDQA(xmm_v7, xmm_v3)
    PADDQ(xmm_v7, xmm_one)

    MOV(reg_rounds, 20)
    rounds_loop2 = Loop()
    with rounds_loop2:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v5, xmm_v5, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)
        PSHUFD(xmm_v7, xmm_v7, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v5, xmm_v5, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)
        PSHUFD(xmm_v7, xmm_v7, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop2.begin)

    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    PADDD(xmm_v4, xmm_s0)
    PADDD(xmm_v5, xmm_s1)
    PADDD(xmm_v6, xmm_s2)
    PADDD(xmm_v7, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
    PADDQ(xmm_s3, xmm_one)

    ADD(reg_inp, 2 * 64)
    ADD(reg_outp, 2 * 64)
    SUB(reg_blocks, 2)

    LABEL(process_1_block)
    ADD(reg_blocks, 2)
    out_serial = Label()
    JZ(out_serial)

    #
    # 1 block at a time.  Only executed once, because if there was > 1,
    # the parallel code would have processed it already.
    #
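    # Dispatch example: with nrBlocks == 7 the 4-block loop handles the first
    # four blocks, the 2-block path above handles two more, and the single
    # remaining block falls through to here.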
    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)

    MOV(reg_rounds, 20)
    rounds_loop1 = Loop()
    with rounds_loop1:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop1.begin)

    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    LABEL(out_serial)

    # Write back the updated counter.  Stopping at 2^70 bytes is the user's
    # problem, not mine.  (Skipped if there's exactly a multiple of 4 blocks
    # because the counter is incremented in memory while looping.)
    MOVDQU(mem_s3, xmm_s3)

    LABEL(out)

    # Paranoia, cleanse the scratch space.
    PXOR(xmm_v0, xmm_v0)
    MOVDQA(mem_tmp0, xmm_v0)

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# AVX2 helpers.  Like the SSE2 equivalents, the scratch register is explicit,
# and more helpers are used to increase readability for destructive operations.
#
# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
#

def ADD_avx2(dst, src):
    VPADDD(dst, dst, src)

def XOR_avx2(dst, src):
    VPXOR(dst, dst, src)

def ROTW16_avx2(tmp, d):
    VPSLLD(tmp, d, 16)
    VPSRLD(d, d, 16)
    XOR_avx2(d, tmp)

def ROTW12_avx2(tmp, b):
    VPSLLD(tmp, b, 12)
    VPSRLD(b, b, 20)
    XOR_avx2(b, tmp)

def ROTW8_avx2(tmp, d):
    VPSLLD(tmp, d, 8)
    VPSRLD(d, d, 24)
    XOR_avx2(d, tmp)

def ROTW7_avx2(tmp, b):
    VPSLLD(tmp, b, 7)
    VPSRLD(b, b, 25)
    XOR_avx2(b, tmp)

def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
    VPERM2I128(tmp, v0, v1, 0x20)
    VPXOR(tmp, tmp, [inp+d])
    VMOVDQU([outp+d], tmp)
    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
    VPERM2I128(tmp, v2, v3, 0x20)
    VPXOR(tmp, tmp, [inp+d+32])
    VMOVDQU([outp+d+32], tmp)
    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
    VPERM2I128(tmp, v0, v1, 0x31)
    VPXOR(tmp, tmp, [inp+d+64])
    VMOVDQU([outp+d+64], tmp)
    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
    VPERM2I128(tmp, v2, v3, 0x31)
    VPXOR(tmp, tmp, [inp+d+96])
    VMOVDQU([outp+d+96], tmp)
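# A minimal plain-Python model of the lane shuffling that WriteXor_avx2 relies
# on (illustrative only, not used by the code generator).  In the AVX2 path
# each ymm register holds the same row of two consecutive blocks: block n in
# the low 128 bit lane and block n+1 in the high lane.  VPERM2I128 with
# 0x20/0x31 picks the low/high lanes of a register pair, re-assembling the
# keystream into sequential block order before it is XORed with the input.
def _writexor_lane_model(v0, v1, v2, v3):
    # Each argument is a (low_lane, high_lane) pair of 16 byte rows.
    block_n = v0[0] + v1[0] + v2[0] + v3[0]    # bytes   0..63  (block n)
    block_n1 = v0[1] + v1[1] + v2[1] + v3[1]   # bytes  64..127 (block n+1)
    return block_n + block_n1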
with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell): reg_x = GeneralPurposeRegister64() reg_inp = GeneralPurposeRegister64() reg_outp = GeneralPurposeRegister64() reg_blocks = GeneralPurposeRegister64() reg_sp_save = GeneralPurposeRegister64() LOAD.ARGUMENT(reg_x, x) LOAD.ARGUMENT(reg_inp, inp) LOAD.ARGUMENT(reg_outp, outp) LOAD.ARGUMENT(reg_blocks, nrBlocks) # Align the stack to a 32 byte boundary. MOV(reg_sp_save, registers.rsp) AND(registers.rsp, 0xffffffffffffffe0) SUB(registers.rsp, 0x20) x_s0 = [reg_x] # (Memory) Cipher state [0..3] x_s1 = [reg_x+16] # (Memory) Cipher state [4..7] x_s2 = [reg_x+32] # (Memory) Cipher state [8..11] x_s3 = [reg_x+48] # (Memory) Cipher state [12..15] ymm_v0 = YMMRegister() ymm_v1 = YMMRegister() ymm_v2 = YMMRegister() ymm_v3 = YMMRegister() ymm_v4 = YMMRegister() ymm_v5 = YMMRegister() ymm_v6 = YMMRegister() ymm_v7 = YMMRegister() ymm_v8 = YMMRegister() ymm_v9 = YMMRegister() ymm_v10 = YMMRegister() ymm_v11 = YMMRegister() ymm_v12 = YMMRegister() ymm_v13 = YMMRegister() ymm_v14 = YMMRegister() ymm_v15 = YMMRegister() ymm_tmp0 = ymm_v12 # Allocate the neccecary stack space for the counter vector and two ymm # registers that we will spill. SUB(registers.rsp, 96) mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space. mem_s3 = [registers.rsp+32] # (Stack) Working copy of s3. (8x) mem_inc = [registers.rsp] # (Stack) Counter increment vector. # Increment the counter for one side of the state vector. VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0) VMOVDQU(mem_inc, ymm_tmp0) reg_tmp = GeneralPurposeRegister32() MOV(reg_tmp, 0x00000001) MOV([registers.rsp+16], reg_tmp) VBROADCASTI128(ymm_v3, x_s3) VPADDQ(ymm_v3, ymm_v3, [registers.rsp]) VMOVDQA(mem_s3, ymm_v3) # As we process 2xN blocks at a time, so the counter increment for both # sides of the state vector is 2. MOV(reg_tmp, 0x00000002) MOV([registers.rsp], reg_tmp) MOV([registers.rsp+16], reg_tmp) out_write_even = Label() out_write_odd = Label() # # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's # a decent gain despite all the pain... 
    reg_rounds = GeneralPurposeRegister64()

    vector_loop8 = Loop()
    SUB(reg_blocks, 8)
    JB(vector_loop8.end)
    with vector_loop8:
        VBROADCASTI128(ymm_v0, x_s0)
        VBROADCASTI128(ymm_v1, x_s1)
        VBROADCASTI128(ymm_v2, x_s2)
        VMOVDQA(ymm_v3, mem_s3)

        VMOVDQA(ymm_v4, ymm_v0)
        VMOVDQA(ymm_v5, ymm_v1)
        VMOVDQA(ymm_v6, ymm_v2)
        VPADDQ(ymm_v7, ymm_v3, mem_inc)

        VMOVDQA(ymm_v8, ymm_v0)
        VMOVDQA(ymm_v9, ymm_v1)
        VMOVDQA(ymm_v10, ymm_v2)
        VPADDQ(ymm_v11, ymm_v7, mem_inc)

        VMOVDQA(ymm_v12, ymm_v0)
        VMOVDQA(ymm_v13, ymm_v1)
        VMOVDQA(ymm_v14, ymm_v2)
        VPADDQ(ymm_v15, ymm_v11, mem_inc)

        MOV(reg_rounds, 20)
        rounds_loop8 = Loop()
        with rounds_loop8:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v5, ymm_v5, 0x39)
            VPSHUFD(ymm_v9, ymm_v9, 0x39)
            VPSHUFD(ymm_v13, ymm_v13, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            VPSHUFD(ymm_v7, ymm_v7, 0x93)
            VPSHUFD(ymm_v11, ymm_v11, 0x93)
            VPSHUFD(ymm_v15, ymm_v15, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)
            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v5, ymm_v5, 0x93)
            VPSHUFD(ymm_v9, ymm_v9, 0x93)
            VPSHUFD(ymm_v13, ymm_v13, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            VPSHUFD(ymm_v7, ymm_v7, 0x39)
            VPSHUFD(ymm_v11, ymm_v11, 0x39)
            VPSHUFD(ymm_v15, ymm_v15, 0x39)

            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop8.begin)

        # ymm_v12 is in mem_tmp0 and is current....
        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
        VBROADCASTI128(ymm_tmp0, x_s0)
        ADD_avx2(ymm_v0, ymm_tmp0)
        ADD_avx2(ymm_v4, ymm_tmp0)
        ADD_avx2(ymm_v8, ymm_tmp0)
        ADD_avx2(ymm_tmp0, mem_tmp0)
        VMOVDQA(mem_tmp0, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s1)
        ADD_avx2(ymm_v1, ymm_tmp0)
        ADD_avx2(ymm_v5, ymm_tmp0)
        ADD_avx2(ymm_v9, ymm_tmp0)
        ADD_avx2(ymm_v13, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s2)
        ADD_avx2(ymm_v2, ymm_tmp0)
        ADD_avx2(ymm_v6, ymm_tmp0)
        ADD_avx2(ymm_v10, ymm_tmp0)
        ADD_avx2(ymm_v14, ymm_tmp0)

        ADD_avx2(ymm_v3, mem_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
        VMOVDQA(ymm_v3, mem_s3)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v7, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v11, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(ymm_v12, mem_tmp0)
        ADD_avx2(ymm_v15, ymm_v3)
        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(mem_s3, ymm_v3)

        ADD(reg_inp, 8 * 64)
        ADD(reg_outp, 8 * 64)

        SUB(reg_blocks, 8)
        JAE(vector_loop8.begin)

    # ymm_v3 contains a current copy of mem_s3 either from when it was built,
    # or because the loop updates it.  Copy this before we mess with the block
    # counter in case we need to write it back and return.
    ymm_s3 = ymm_v11
    VMOVDQA(ymm_s3, ymm_v3)

    ADD(reg_blocks, 8)
    JZ(out_write_even)

    # We now actually can do everything in registers.
    ymm_s0 = ymm_v8
    VBROADCASTI128(ymm_s0, x_s0)
    ymm_s1 = ymm_v9
    VBROADCASTI128(ymm_s1, x_s1)
    ymm_s2 = ymm_v10
    VBROADCASTI128(ymm_s2, x_s2)
    ymm_inc = ymm_v14
    VMOVDQA(ymm_inc, mem_inc)

    #
    # 4 blocks at a time.
    #

    process_2_blocks = Label()
    SUB(reg_blocks, 4)
    JB(process_2_blocks) # < 4 blocks remaining.
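    # Each ymm register covers the same row of two blocks, so this
    # "4 blocks at a time" path is two interleaved copies of the state
    # (ymm_v0..v3 and ymm_v4..v7), mirroring the SSE2 2-block path.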
    VMOVDQA(ymm_v0, ymm_s0)
    VMOVDQA(ymm_v1, ymm_s1)
    VMOVDQA(ymm_v2, ymm_s2)
    VMOVDQA(ymm_v3, ymm_s3)
    VMOVDQA(ymm_v4, ymm_v0)
    VMOVDQA(ymm_v5, ymm_v1)
    VMOVDQA(ymm_v6, ymm_v2)
    VPADDQ(ymm_v7, ymm_v3, ymm_inc)

    MOV(reg_rounds, 20)
    rounds_loop4 = Loop()
    with rounds_loop4:
        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x39)
        VPSHUFD(ymm_v5, ymm_v5, 0x39)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x93)
        VPSHUFD(ymm_v7, ymm_v7, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x93)
        VPSHUFD(ymm_v5, ymm_v5, 0x93)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x39)
        VPSHUFD(ymm_v7, ymm_v7, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop4.begin)

    ADD_avx2(ymm_v0, ymm_s0)
    ADD_avx2(ymm_v1, ymm_s1)
    ADD_avx2(ymm_v2, ymm_s2)
    ADD_avx2(ymm_v3, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
    ADD_avx2(ymm_s3, ymm_inc)

    ADD_avx2(ymm_v4, ymm_s0)
    ADD_avx2(ymm_v5, ymm_s1)
    ADD_avx2(ymm_v6, ymm_s2)
    ADD_avx2(ymm_v7, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
    ADD_avx2(ymm_s3, ymm_inc)

    ADD(reg_inp, 4 * 64)
    ADD(reg_outp, 4 * 64)
    SUB(reg_blocks, 4)

    LABEL(process_2_blocks)
    ADD(reg_blocks, 4)
    JZ(out_write_even) # 0 blocks left.

    #
    # 2/1 blocks at a time.  The two codepaths are unified because
    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
    # much.
    #
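    # If a single block remains after a pass below, only the low lanes (the
    # first 64 bytes, i.e. one block) are written; the stores at out+64 and
    # out+96 are skipped, and out_write_odd swaps the lanes of ymm_s3 so the
    # counter for the next unused block ends up in the low lane before it is
    # written back to the state.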
    vector_loop2 = Loop()
    with vector_loop2:
        VMOVDQA(ymm_v0, ymm_s0)
        VMOVDQA(ymm_v1, ymm_s1)
        VMOVDQA(ymm_v2, ymm_s2)
        VMOVDQA(ymm_v3, ymm_s3)

        MOV(reg_rounds, 20)
        rounds_loop2 = Loop()
        with rounds_loop2:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)

            SUB(reg_rounds, 2)
            JNZ(rounds_loop2.begin)

        ADD_avx2(ymm_v0, ymm_s0)
        ADD_avx2(ymm_v1, ymm_s1)
        ADD_avx2(ymm_v2, ymm_s2)
        ADD_avx2(ymm_v3, ymm_s3)

        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
        VMOVDQU([reg_outp], ymm_tmp0)
        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
        VMOVDQU([reg_outp+32], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_odd)

        ADD_avx2(ymm_s3, ymm_inc)

        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
        VMOVDQU([reg_outp+64], ymm_tmp0)
        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
        VMOVDQU([reg_outp+96], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_even)

        ADD(reg_inp, 2 * 64)
        ADD(reg_outp, 2 * 64)

        JMP(vector_loop2.begin)

    LABEL(out_write_odd)
    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.

    LABEL(out_write_even)
    VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_s3.

    # Paranoia, cleanse the scratch space.
    VPXOR(ymm_v0, ymm_v0, ymm_v0)
    VMOVDQA(mem_tmp0, ymm_v0)
    VMOVDQA(mem_s3, ymm_v0)

    # Clear all YMM (and XMM) registers.
    VZEROALL()

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# CPUID
#
cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAmd64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()

#
# XGETBV (ECX = 0)
#
xcrVec = Argument(ptr(uint32_t))

with Function("xgetbv0Amd64", (xcrVec,)):
    reg_vec = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_vec, xcrVec)

    XOR(registers.ecx, registers.ecx)

    XGETBV()

    MOV([reg_vec], registers.eax)
    MOV([reg_vec+4], registers.edx)

    RETURN()
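#
# For reference only: a compact pure-Python ChaCha20 block function that the
# generated code can be sanity-checked against.  This is a sketch; it is not
# used by the generator and is not called from the Go side.
#
def _chacha20_ref_block(state16):
    def rotl32(v, n):
        return ((v << n) | (v >> (32 - n))) & 0xffffffff

    def qr(x, a, b, c, d):
        x[a] = (x[a] + x[b]) & 0xffffffff; x[d] = rotl32(x[d] ^ x[a], 16)
        x[c] = (x[c] + x[d]) & 0xffffffff; x[b] = rotl32(x[b] ^ x[c], 12)
        x[a] = (x[a] + x[b]) & 0xffffffff; x[d] = rotl32(x[d] ^ x[a], 8)
        x[c] = (x[c] + x[d]) & 0xffffffff; x[b] = rotl32(x[b] ^ x[c], 7)

    x = list(state16)
    for _ in range(10):  # 20 rounds, two per iteration.
        qr(x, 0, 4, 8, 12); qr(x, 1, 5, 9, 13); qr(x, 2, 6, 10, 14); qr(x, 3, 7, 11, 15)
        qr(x, 0, 5, 10, 15); qr(x, 1, 6, 11, 12); qr(x, 2, 7, 8, 13); qr(x, 3, 4, 9, 14)
    return [(a + b) & 0xffffffff for a, b in zip(x, state16)]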