// +build !noasm,go1.10

// hwaccel_amd64.s - AMD64 optimized routines.
//
// To the extent possible under law, Yawning Angel has waived all copyright
// and related or neighboring rights to the software, using the Creative
// Commons "CC0" public domain dedication. See LICENSE or
// <https://creativecommons.org/publicdomain/zero/1.0/> for full details.

#include "textflag.h"

// func cpuidAmd64(cpuidParams *uint32)
TEXT ·cpuidAmd64(SB), NOSPLIT, $0-8
	MOVQ cpuidParams+0(FP), R15
	MOVL 0(R15), AX
	MOVL 8(R15), CX
	CPUID
	MOVL AX, 0(R15)
	MOVL BX, 4(R15)
	MOVL CX, 8(R15)
	MOVL DX, 12(R15)
	RET

// func xgetbv0Amd64(xcrVec *uint32)
TEXT ·xgetbv0Amd64(SB), NOSPLIT, $0-8
	MOVQ xcrVec+0(FP), BX
	XORL CX, CX
	XGETBV
	MOVL AX, 0(BX)
	MOVL DX, 4(BX)
	RET

// Routines taken from the `avx2` implementation, converted to Go's assembly
// dialect. I do this in lieu of cutting myself to see if I still can feel
// pain.
//
// The conversion is mostly direct except:
//  * Instead of aligned loads, unaligned loads are used, as there is no
//    meaningful difference on modern Intel systems, and it's not immediately
//    obvious to me how Go will align global data.
//  * The polyvec_pointwise_acc family of routines take vectors of pointers
//    due to the different internal memory layout of a polyvec.
//  * The constants are renamed slightly.

// Note:
//  * These must be kept in sync with the values in params.go.
//    Currently assumes Q = 7681, Q_INV = 57857.
//  * Caution, Little endian so things will look different from avx2/consts.c.

DATA ·vpshufb_idx<>+0x00(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x08(SB)/8, $0x0d0c0f0e09080b0a
DATA ·vpshufb_idx<>+0x10(SB)/8, $0x0504070601000302
DATA ·vpshufb_idx<>+0x18(SB)/8, $0x0d0c0f0e09080b0a
GLOBL ·vpshufb_idx<>(SB), (NOPTR+RODATA), $32

DATA ·low_mask<>+0x00(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x08(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x10(SB)/8, $0x1fff1fff1fff1fff
DATA ·low_mask<>+0x18(SB)/8, $0x1fff1fff1fff1fff
GLOBL ·low_mask<>(SB), (NOPTR+RODATA), $32

DATA ·lowdword<>+0x00(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x08(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x10(SB)/8, $0x0000ffff0000ffff
DATA ·lowdword<>+0x18(SB)/8, $0x0000ffff0000ffff
GLOBL ·lowdword<>(SB), (NOPTR+RODATA), $32

DATA ·q_x16<>+0x00(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x08(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x10(SB)/8, $0x1e011e011e011e01
DATA ·q_x16<>+0x18(SB)/8, $0x1e011e011e011e01
GLOBL ·q_x16<>(SB), (NOPTR+RODATA), $32

DATA ·q2_x16<>+0x00(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x08(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x10(SB)/8, $0x3c023c023c023c02
DATA ·q2_x16<>+0x18(SB)/8, $0x3c023c023c023c02
GLOBL ·q2_x16<>(SB), (NOPTR+RODATA), $32

DATA ·qinv_x16<>+0x00(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x08(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x10(SB)/8, $0xe201e201e201e201
DATA ·qinv_x16<>+0x18(SB)/8, $0xe201e201e201e201
GLOBL ·qinv_x16<>(SB), (NOPTR+RODATA), $32

DATA ·f_x16<>+0x00(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x08(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x10(SB)/8, $0x0100010001000100
DATA ·f_x16<>+0x18(SB)/8, $0x0100010001000100
GLOBL ·f_x16<>(SB), (NOPTR+RODATA), $32

DATA ·v_x16<>+0x00(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x08(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x10(SB)/8, $0x4442444244424442
DATA ·v_x16<>+0x18(SB)/8, $0x4442444244424442
GLOBL ·v_x16<>(SB), (NOPTR+RODATA), $32

DATA ·montsq_x16<>+0x00(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x08(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x10(SB)/8, $0x15c115c115c115c1
DATA ·montsq_x16<>+0x18(SB)/8, $0x15c115c115c115c1
GLOBL
·montsq_x16<>(SB), (NOPTR+RODATA), $32 DATA ·mask11<>+0x00(SB)/8, $0x1111111111111111 DATA ·mask11<>+0x08(SB)/8, $0x1111111111111111 DATA ·mask11<>+0x10(SB)/8, $0x1111111111111111 DATA ·mask11<>+0x18(SB)/8, $0x1111111111111111 GLOBL ·mask11<>(SB), (NOPTR+RODATA), $32 DATA ·mask0f<>+0x00(SB)/8, $0x0f0f0f0f0f0f0f0f DATA ·mask0f<>+0x08(SB)/8, $0x0f0f0f0f0f0f0f0f DATA ·mask0f<>+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f DATA ·mask0f<>+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f GLOBL ·mask0f<>(SB), (NOPTR+RODATA), $32 // func nttAVX2(inout, zetas *uint16) TEXT ·nttAVX2(SB), NOSPLIT, $0-16 MOVQ inout+0(FP), DI MOVQ zetas+8(FP), SI VMOVDQU ·qinv_x16<>(SB), Y0 VMOVDQU ·q_x16<>(SB), Y1 VMOVDQU ·low_mask<>(SB), Y2 // zetas VMOVDQU (SI), Y3 // first round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 256(DI), Y8 VMOVDQU 288(DI), Y9 VMOVDQU 320(DI), Y10 VMOVDQU 352(DI), Y11 // level 0 // mul VPMULLW Y3, Y8, Y12 VPMULHW Y3, Y8, Y8 VPMULLW Y3, Y9, Y13 VPMULHW Y3, Y9, Y9 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y12 VPSUBW Y13, Y9, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y8 VPSUBW Y13, Y5, Y9 VPSUBW Y14, Y6, Y10 VPSUBW Y15, Y7, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y6, Y6 VPADDW Y15, Y7, Y7 // store VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) VMOVDQU Y6, 64(DI) VMOVDQU Y7, 96(DI) VMOVDQU Y8, 256(DI) VMOVDQU Y9, 288(DI) VMOVDQU Y10, 320(DI) VMOVDQU Y11, 352(DI) ADDQ $128, DI // second round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 256(DI), Y8 VMOVDQU 288(DI), Y9 VMOVDQU 320(DI), Y10 VMOVDQU 352(DI), Y11 // level 0 // mul VPMULLW Y3, Y8, Y12 VPMULHW Y3, Y8, Y8 VPMULLW Y3, Y9, Y13 VPMULHW Y3, Y9, Y9 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y12 VPSUBW Y13, Y9, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y8 VPSUBW Y13, Y5, Y9 VPSUBW Y14, Y6, Y10 VPSUBW Y15, Y7, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y6, Y6 VPADDW Y15, Y7, Y7 // store VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) VMOVDQU Y6, 64(DI) VMOVDQU Y7, 96(DI) VMOVDQU Y8, 256(DI) VMOVDQU Y9, 288(DI) VMOVDQU Y10, 320(DI) VMOVDQU Y11, 352(DI) SUBQ $128, DI // first round // zetas VMOVDQU 32(SI), Y3 // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 128(DI), Y8 VMOVDQU 160(DI), Y9 VMOVDQU 192(DI), Y10 VMOVDQU 224(DI), Y11 // level 1 // mul VPMULLW Y3, Y8, Y12 VPMULHW Y3, Y8, Y8 VPMULLW Y3, Y9, Y13 VPMULHW Y3, Y9, Y9 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y12 VPSUBW Y13, Y9, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y8 VPSUBW Y13, Y5, Y9 VPSUBW Y14, Y6, Y10 VPSUBW Y15, Y7, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y6, Y6 VPADDW Y15, Y7, Y7 // level 2 // zetas VMOVDQU 96(SI), Y15 VMOVDQU 128(SI), Y3 // mul VPMULLW Y15, Y6, Y12 VPMULHW Y15, 
Y6, Y6 VPMULLW Y15, Y7, Y13 VPMULHW Y15, Y7, Y7 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y6, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y6 VPSUBW Y13, Y5, Y7 VPSUBW Y14, Y8, Y10 VPSUBW Y15, Y9, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y8, Y8 VPADDW Y15, Y9, Y9 // level 3 // zetas VMOVDQU 224(SI), Y13 VMOVDQU 256(SI), Y14 VMOVDQU 288(SI), Y15 VMOVDQU 320(SI), Y3 // mul VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 VPSUBW Y15, Y11, Y15 // reduce 2 VPSRAW $13, Y4, Y5 VPSRAW $13, Y6, Y7 VPSRAW $13, Y8, Y9 VPSRAW $13, Y10, Y11 VPAND Y2, Y4, Y4 VPAND Y2, Y6, Y6 VPAND Y2, Y8, Y8 VPAND Y2, Y10, Y10 VPSUBW Y5, Y4, Y4 VPSUBW Y7, Y6, Y6 VPSUBW Y9, Y8, Y8 VPSUBW Y11, Y10, Y10 VPSLLW $9, Y5, Y5 VPSLLW $9, Y7, Y7 VPSLLW $9, Y9, Y9 VPSLLW $9, Y11, Y11 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 VPADDW Y11, Y10, Y10 // update VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPSUBW Y15, Y10, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 VPADDW Y15, Y10, Y10 // level 4 // zetas VMOVDQU 480(SI), Y12 VMOVDQU 512(SI), Y13 VMOVDQU 544(SI), Y14 VMOVDQU 576(SI), Y15 // shuffle VPERM2I128 $0x02, Y4, Y5, Y3 VPERM2I128 $0x13, Y4, Y5, Y4 VPERM2I128 $0x02, Y6, Y7, Y5 VPERM2I128 $0x13, Y6, Y7, Y6 VPERM2I128 $0x02, Y8, Y9, Y7 VPERM2I128 $0x13, Y8, Y9, Y8 VPERM2I128 $0x02, Y10, Y11, Y9 VPERM2I128 $0x13, Y10, Y11, Y10 // mul VPMULLW Y12, Y4, Y11 VPMULHW Y12, Y4, Y4 VPMULLW Y13, Y6, Y12 VPMULHW Y13, Y6, Y6 VPMULLW Y14, Y8, Y13 VPMULHW Y14, Y8, Y8 VPMULLW Y15, Y10, Y14 VPMULHW Y15, Y10, Y10 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y4, Y11 VPSUBW Y12, Y6, Y12 VPSUBW Y13, Y8, Y13 VPSUBW Y14, Y10, Y14 // update VPSUBW Y11, Y3, Y4 VPSUBW Y12, Y5, Y6 VPSUBW Y13, Y7, Y8 VPSUBW Y14, Y9, Y10 VPADDW Y11, Y3, Y3 VPADDW Y12, Y5, Y5 VPADDW Y13, Y7, Y7 VPADDW Y14, Y9, Y9 // level 5 // zetas VMOVDQU 736(SI), Y12 VMOVDQU 768(SI), Y13 VMOVDQU 800(SI), Y14 VMOVDQU 832(SI), Y15 // shuffle VSHUFPD $0x00, Y4, Y3, Y11 VSHUFPD $0x0F, Y4, Y3, Y3 VSHUFPD $0x00, Y6, Y5, Y4 VSHUFPD $0x0F, Y6, Y5, Y5 VSHUFPD $0x00, Y8, Y7, Y6 VSHUFPD $0x0F, Y8, Y7, Y7 VSHUFPD $0x00, Y10, Y9, Y8 VSHUFPD $0x0F, Y10, Y9, Y9 // mul VPMULLW Y12, Y3, Y10 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y10, Y10 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y10, Y10 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y10, Y3, Y10 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // update VPSUBW Y10, Y11, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y10, Y11, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // level 6 // shuffle VPSHUFD $0xB1, Y10, Y12 
VPSHUFD $0xB1, Y3, Y13 VPSHUFD $0xB1, Y4, Y14 VPSHUFD $0xB1, Y5, Y15 VPBLENDD $0x55, Y10, Y13, Y10 VPBLENDD $0xAA, Y3, Y12, Y3 VPBLENDD $0x55, Y4, Y15, Y4 VPBLENDD $0xAA, Y5, Y14, Y5 VPSHUFD $0xB1, Y6, Y12 VPSHUFD $0xB1, Y7, Y13 VPSHUFD $0xB1, Y8, Y14 VPSHUFD $0xB1, Y9, Y15 VPBLENDD $0x55, Y6, Y13, Y6 VPBLENDD $0xAA, Y7, Y12, Y7 VPBLENDD $0x55, Y8, Y15, Y8 VPBLENDD $0xAA, Y9, Y14, Y9 // zetas VMOVDQU 992(SI), Y12 VMOVDQU 1024(SI), Y13 VMOVDQU 1056(SI), Y14 VMOVDQU 1088(SI), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // reduce 2 VPSRAW $13, Y10, Y3 VPSRAW $13, Y4, Y5 VPSRAW $13, Y6, Y7 VPSRAW $13, Y8, Y9 VPAND Y2, Y10, Y10 VPAND Y2, Y4, Y4 VPAND Y2, Y6, Y6 VPAND Y2, Y8, Y8 VPSUBW Y3, Y10, Y10 VPSUBW Y5, Y4, Y4 VPSUBW Y7, Y6, Y6 VPSUBW Y9, Y8, Y8 VPSLLW $9, Y3, Y3 VPSLLW $9, Y5, Y5 VPSLLW $9, Y7, Y7 VPSLLW $9, Y9, Y9 VPADDW Y3, Y10, Y10 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 // update VPSUBW Y11, Y10, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y11, Y10, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // level 7 // shuffle VMOVDQU ·vpshufb_idx<>(SB), Y15 VPSHUFB Y15, Y10, Y11 VPSHUFB Y15, Y3, Y12 VPSHUFB Y15, Y4, Y13 VPSHUFB Y15, Y5, Y14 VPBLENDW $0x55, Y10, Y12, Y10 VPBLENDW $0xAA, Y3, Y11, Y3 VPBLENDW $0x55, Y4, Y14, Y4 VPBLENDW $0xAA, Y5, Y13, Y5 VPSHUFB Y15, Y6, Y11 VPSHUFB Y15, Y7, Y12 VPSHUFB Y15, Y8, Y13 VPSHUFB Y15, Y9, Y14 VPBLENDW $0x55, Y6, Y12, Y6 VPBLENDW $0xAA, Y7, Y11, Y7 VPBLENDW $0x55, Y8, Y14, Y8 VPBLENDW $0xAA, Y9, Y13, Y9 // zetas VMOVDQU 1248(SI), Y12 VMOVDQU 1280(SI), Y13 VMOVDQU 1312(SI), Y14 VMOVDQU 1344(SI), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // reduce 3 VMOVDQU ·q2_x16<>(SB), Y15 VPSRAW $15, Y10, Y3 VPSRAW $15, Y4, Y5 VPSRAW $15, Y6, Y7 VPSRAW $15, Y8, Y9 VPAND Y15, Y3, Y3 VPAND Y15, Y5, Y5 VPAND Y15, Y7, Y7 VPAND Y15, Y9, Y9 VPADDW Y1, Y10, Y10 VPADDW Y1, Y4, Y4 VPADDW Y1, Y6, Y6 VPADDW Y1, Y8, Y8 VPADDW Y3, Y10, Y10 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 // update VPSUBW Y11, Y10, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y11, Y10, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // reorder VPUNPCKLWD Y3, Y10, Y12 VPUNPCKHWD Y3, Y10, Y13 VPUNPCKLWD Y5, Y4, Y14 VPUNPCKHWD Y5, Y4, Y15 VPUNPCKLWD Y7, Y6, Y3 VPUNPCKHWD Y7, Y6, Y4 VPUNPCKLWD Y9, Y8, Y5 VPUNPCKHWD Y9, Y8, Y6 VPERM2I128 $0x20, Y13, Y12, Y11 VPERM2I128 $0x31, Y13, Y12, Y12 VPERM2I128 $0x20, Y15, Y14, Y13 VPERM2I128 $0x31, Y15, Y14, Y14 VPERM2I128 $0x20, Y4, Y3, Y15 VPERM2I128 $0x31, Y4, Y3, Y3 VPERM2I128 $0x20, Y6, Y5, Y4 VPERM2I128 $0x31, Y6, Y5, Y5 // store VMOVDQU Y11, (DI) VMOVDQU Y12, 32(DI) VMOVDQU Y13, 64(DI) VMOVDQU Y14, 96(DI) VMOVDQU Y15, 128(DI) VMOVDQU Y3, 160(DI) VMOVDQU Y4, 192(DI) VMOVDQU Y5, 224(DI) ADDQ $256, DI // 
second round // zetas VMOVDQU 64(SI), Y3 // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 128(DI), Y8 VMOVDQU 160(DI), Y9 VMOVDQU 192(DI), Y10 VMOVDQU 224(DI), Y11 // level 1 // mul VPMULLW Y3, Y8, Y12 VPMULHW Y3, Y8, Y8 VPMULLW Y3, Y9, Y13 VPMULHW Y3, Y9, Y9 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y12 VPSUBW Y13, Y9, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y8 VPSUBW Y13, Y5, Y9 VPSUBW Y14, Y6, Y10 VPSUBW Y15, Y7, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y6, Y6 VPADDW Y15, Y7, Y7 // level 2 // zetas VMOVDQU 160(SI), Y15 VMOVDQU 192(SI), Y3 // mul VPMULLW Y15, Y6, Y12 VPMULHW Y15, Y6, Y6 VPMULLW Y15, Y7, Y13 VPMULHW Y15, Y7, Y7 VPMULLW Y3, Y10, Y14 VPMULHW Y3, Y10, Y10 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y6, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y10, Y14 VPSUBW Y15, Y11, Y15 // update VPSUBW Y12, Y4, Y6 VPSUBW Y13, Y5, Y7 VPSUBW Y14, Y8, Y10 VPSUBW Y15, Y9, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y5, Y5 VPADDW Y14, Y8, Y8 VPADDW Y15, Y9, Y9 // level 3 // zetas VMOVDQU 352(SI), Y13 VMOVDQU 384(SI), Y14 VMOVDQU 416(SI), Y15 VMOVDQU 448(SI), Y3 // mul VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 VPSUBW Y15, Y11, Y15 // reduce 2 VPSRAW $13, Y4, Y5 VPSRAW $13, Y6, Y7 VPSRAW $13, Y8, Y9 VPSRAW $13, Y10, Y11 VPAND Y2, Y4, Y4 VPAND Y2, Y6, Y6 VPAND Y2, Y8, Y8 VPAND Y2, Y10, Y10 VPSUBW Y5, Y4, Y4 VPSUBW Y7, Y6, Y6 VPSUBW Y9, Y8, Y8 VPSUBW Y11, Y10, Y10 VPSLLW $9, Y5, Y5 VPSLLW $9, Y7, Y7 VPSLLW $9, Y9, Y9 VPSLLW $9, Y11, Y11 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 VPADDW Y11, Y10, Y10 // update VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPSUBW Y15, Y10, Y11 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 VPADDW Y15, Y10, Y10 // level 4 // zetas VMOVDQU 608(SI), Y12 VMOVDQU 640(SI), Y13 VMOVDQU 672(SI), Y14 VMOVDQU 704(SI), Y15 // shuffle VPERM2I128 $0x02, Y4, Y5, Y3 VPERM2I128 $0x13, Y4, Y5, Y4 VPERM2I128 $0x02, Y6, Y7, Y5 VPERM2I128 $0x13, Y6, Y7, Y6 VPERM2I128 $0x02, Y8, Y9, Y7 VPERM2I128 $0x13, Y8, Y9, Y8 VPERM2I128 $0x02, Y10, Y11, Y9 VPERM2I128 $0x13, Y10, Y11, Y10 // mul VPMULLW Y12, Y4, Y11 VPMULHW Y12, Y4, Y4 VPMULLW Y13, Y6, Y12 VPMULHW Y13, Y6, Y6 VPMULLW Y14, Y8, Y13 VPMULHW Y14, Y8, Y8 VPMULLW Y15, Y10, Y14 VPMULHW Y15, Y10, Y10 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y4, Y11 VPSUBW Y12, Y6, Y12 VPSUBW Y13, Y8, Y13 VPSUBW Y14, Y10, Y14 // update VPSUBW Y11, Y3, Y4 VPSUBW Y12, Y5, Y6 VPSUBW Y13, Y7, Y8 VPSUBW Y14, Y9, Y10 VPADDW Y11, Y3, Y3 VPADDW Y12, Y5, Y5 VPADDW Y13, Y7, Y7 VPADDW Y14, Y9, Y9 // level 5 // zetas VMOVDQU 864(SI), Y12 VMOVDQU 
896(SI), Y13 VMOVDQU 928(SI), Y14 VMOVDQU 960(SI), Y15 // shuffle VSHUFPD $0x00, Y4, Y3, Y11 VSHUFPD $0x0F, Y4, Y3, Y3 VSHUFPD $0x00, Y6, Y5, Y4 VSHUFPD $0x0F, Y6, Y5, Y5 VSHUFPD $0x00, Y8, Y7, Y6 VSHUFPD $0x0F, Y8, Y7, Y7 VSHUFPD $0x00, Y10, Y9, Y8 VSHUFPD $0x0F, Y10, Y9, Y9 // mul VPMULLW Y12, Y3, Y10 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y10, Y10 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y10, Y10 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y10, Y3, Y10 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // update VPSUBW Y10, Y11, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y10, Y11, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // level 6 // shuffle VPSHUFD $0xB1, Y10, Y12 VPSHUFD $0xB1, Y3, Y13 VPSHUFD $0xB1, Y4, Y14 VPSHUFD $0xB1, Y5, Y15 VPBLENDD $0x55, Y10, Y13, Y10 VPBLENDD $0xAA, Y3, Y12, Y3 VPBLENDD $0x55, Y4, Y15, Y4 VPBLENDD $0xAA, Y5, Y14, Y5 VPSHUFD $0xB1, Y6, Y12 VPSHUFD $0xB1, Y7, Y13 VPSHUFD $0xB1, Y8, Y14 VPSHUFD $0xB1, Y9, Y15 VPBLENDD $0x55, Y6, Y13, Y6 VPBLENDD $0xAA, Y7, Y12, Y7 VPBLENDD $0x55, Y8, Y15, Y8 VPBLENDD $0xAA, Y9, Y14, Y9 // zetas VMOVDQU 1120(SI), Y12 VMOVDQU 1152(SI), Y13 VMOVDQU 1184(SI), Y14 VMOVDQU 1216(SI), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // reduce 2 VPSRAW $13, Y10, Y3 VPSRAW $13, Y4, Y5 VPSRAW $13, Y6, Y7 VPSRAW $13, Y8, Y9 VPAND Y2, Y10, Y10 VPAND Y2, Y4, Y4 VPAND Y2, Y6, Y6 VPAND Y2, Y8, Y8 VPSUBW Y3, Y10, Y10 VPSUBW Y5, Y4, Y4 VPSUBW Y7, Y6, Y6 VPSUBW Y9, Y8, Y8 VPSLLW $9, Y3, Y3 VPSLLW $9, Y5, Y5 VPSLLW $9, Y7, Y7 VPSLLW $9, Y9, Y9 VPADDW Y3, Y10, Y10 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 // update VPSUBW Y11, Y10, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y11, Y10, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // level 7 // shuffle VMOVDQU ·vpshufb_idx<>(SB), Y15 VPSHUFB Y15, Y10, Y11 VPSHUFB Y15, Y3, Y12 VPSHUFB Y15, Y4, Y13 VPSHUFB Y15, Y5, Y14 VPBLENDW $0x55, Y10, Y12, Y10 VPBLENDW $0xAA, Y3, Y11, Y3 VPBLENDW $0x55, Y4, Y14, Y4 VPBLENDW $0xAA, Y5, Y13, Y5 VPSHUFB Y15, Y6, Y11 VPSHUFB Y15, Y7, Y12 VPSHUFB Y15, Y8, Y13 VPSHUFB Y15, Y9, Y14 VPBLENDW $0x55, Y6, Y12, Y6 VPBLENDW $0xAA, Y7, Y11, Y7 VPBLENDW $0x55, Y8, Y14, Y8 VPBLENDW $0xAA, Y9, Y13, Y9 // zetas VMOVDQU 1376(SI), Y12 VMOVDQU 1408(SI), Y13 VMOVDQU 1440(SI), Y14 VMOVDQU 1472(SI), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y11 VPSUBW Y12, Y5, Y12 VPSUBW Y13, Y7, Y13 VPSUBW Y14, Y9, Y14 // reduce 3 VMOVDQU ·q2_x16<>(SB), Y15 VPSRAW $15, Y10, Y3 VPSRAW $15, Y4, Y5 VPSRAW $15, Y6, Y7 VPSRAW $15, Y8, Y9 VPAND Y15, Y3, Y3 VPAND Y15, Y5, Y5 VPAND Y15, Y7, Y7 VPAND Y15, Y9, Y9 VPADDW Y1, Y10, 
Y10 VPADDW Y1, Y4, Y4 VPADDW Y1, Y6, Y6 VPADDW Y1, Y8, Y8 VPADDW Y3, Y10, Y10 VPADDW Y5, Y4, Y4 VPADDW Y7, Y6, Y6 VPADDW Y9, Y8, Y8 // update VPSUBW Y11, Y10, Y3 VPSUBW Y12, Y4, Y5 VPSUBW Y13, Y6, Y7 VPSUBW Y14, Y8, Y9 VPADDW Y11, Y10, Y10 VPADDW Y12, Y4, Y4 VPADDW Y13, Y6, Y6 VPADDW Y14, Y8, Y8 // reorder VPUNPCKLWD Y3, Y10, Y12 VPUNPCKHWD Y3, Y10, Y13 VPUNPCKLWD Y5, Y4, Y14 VPUNPCKHWD Y5, Y4, Y15 VPUNPCKLWD Y7, Y6, Y3 VPUNPCKHWD Y7, Y6, Y4 VPUNPCKLWD Y9, Y8, Y5 VPUNPCKHWD Y9, Y8, Y6 VPERM2I128 $0x20, Y13, Y12, Y11 VPERM2I128 $0x31, Y13, Y12, Y12 VPERM2I128 $0x20, Y15, Y14, Y13 VPERM2I128 $0x31, Y15, Y14, Y14 VPERM2I128 $0x20, Y4, Y3, Y15 VPERM2I128 $0x31, Y4, Y3, Y3 VPERM2I128 $0x20, Y6, Y5, Y4 VPERM2I128 $0x31, Y6, Y5, Y5 // store VMOVDQU Y11, (DI) VMOVDQU Y12, 32(DI) VMOVDQU Y13, 64(DI) VMOVDQU Y14, 96(DI) VMOVDQU Y15, 128(DI) VMOVDQU Y3, 160(DI) VMOVDQU Y4, 192(DI) VMOVDQU Y5, 224(DI) VZEROUPPER RET // Go 1.10's VPERMQ support expects the imm8 to be a `int8`, instead of a // `uint8`. While this is fixed in master, use the signed representation // for now till it's reasonable to expect versions with the fix to be widely // available. // // See: https://github.com/golang/go/issues/24378 #define invntt_VPERMQ_IDX $-40 // $0xd8 // func invnttAVX2(inout, omegas *uint16) TEXT ·invnttAVX2(SB), NOSPLIT, $0-16 MOVQ inout+0(FP), DI MOVQ omegas+8(FP), SI VMOVDQU ·qinv_x16<>(SB), Y0 VMOVDQU ·q_x16<>(SB), Y1 VMOVDQU ·v_x16<>(SB), Y2 MOVQ SI, R8 // first round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 128(DI), Y8 VMOVDQU 160(DI), Y9 VMOVDQU 192(DI), Y10 VMOVDQU 224(DI), Y11 // reorder VMOVDQU ·lowdword<>(SB), Y3 VPAND Y3, Y4, Y12 VPAND Y3, Y5, Y13 VPAND Y3, Y6, Y14 VPAND Y3, Y7, Y15 VPSRLD $16, Y4, Y4 VPSRLD $16, Y5, Y5 VPSRLD $16, Y6, Y6 VPSRLD $16, Y7, Y7 VPACKUSDW Y5, Y4, Y5 VPACKUSDW Y13, Y12, Y4 VPACKUSDW Y7, Y6, Y7 VPACKUSDW Y15, Y14, Y6 VPERMQ invntt_VPERMQ_IDX, Y4, Y4 VPERMQ invntt_VPERMQ_IDX, Y5, Y5 VPERMQ invntt_VPERMQ_IDX, Y6, Y6 VPERMQ invntt_VPERMQ_IDX, Y7, Y7 VPAND Y3, Y8, Y12 VPAND Y3, Y9, Y13 VPAND Y3, Y10, Y14 VPAND Y3, Y11, Y15 VPSRLD $16, Y8, Y8 VPSRLD $16, Y9, Y9 VPSRLD $16, Y10, Y10 VPSRLD $16, Y11, Y11 VPACKUSDW Y9, Y8, Y9 VPACKUSDW Y13, Y12, Y8 VPACKUSDW Y11, Y10, Y11 VPACKUSDW Y15, Y14, Y10 VPERMQ invntt_VPERMQ_IDX, Y8, Y8 VPERMQ invntt_VPERMQ_IDX, Y9, Y9 VPERMQ invntt_VPERMQ_IDX, Y10, Y10 VPERMQ invntt_VPERMQ_IDX, Y11, Y11 // level 0 // update VPSUBW Y5, Y4, Y12 VPSUBW Y7, Y6, Y13 VPSUBW Y9, Y8, Y14 VPSUBW Y11, Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 // zetas VMOVDQU (R8), Y7 VMOVDQU 32(R8), Y9 VMOVDQU 64(R8), Y11 VMOVDQU 96(R8), Y3 // mul VPMULLW Y7, Y12, Y5 VPMULHW Y7, Y12, Y12 VPMULLW Y9, Y13, Y7 VPMULHW Y9, Y13, Y13 VPMULLW Y11, Y14, Y9 VPMULHW Y11, Y14, Y14 VPMULLW Y3, Y15, Y11 VPMULHW Y3, Y15, Y15 // reduce VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y9, Y9 VPMULLW Y0, Y11, Y11 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y9, Y9 VPMULHW Y1, Y11, Y11 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // level 1 // shuffle VMOVDQU ·vpshufb_idx<>(SB), Y3 VPSHUFB Y3, Y4, Y12 VPSHUFB Y3, Y5, Y13 VPSHUFB Y3, Y6, Y14 VPSHUFB Y3, Y7, Y15 VPBLENDW $0x55, Y4, Y13, Y4 VPBLENDW $0xAA, Y5, Y12, Y5 VPBLENDW $0x55, Y6, Y15, Y6 VPBLENDW $0xAA, Y7, Y14, Y7 VPSHUFB Y3, Y8, Y12 VPSHUFB Y3, Y9, Y13 VPSHUFB Y3, Y10, Y14 VPSHUFB Y3, Y11, Y15 VPBLENDW $0x55, Y8, Y13, Y8 VPBLENDW $0xAA, Y9, Y12, Y9 VPBLENDW $0x55, Y10, Y15, Y10 VPBLENDW $0xAA, Y11, Y14, Y11 // 
update VPSUBW Y5, Y4, Y12 VPSUBW Y7, Y6, Y13 VPSUBW Y9, Y8, Y14 VPSUBW Y11, Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 // zetas VMOVDQU 256(R8), Y7 VMOVDQU 288(R8), Y9 VMOVDQU 320(R8), Y11 VMOVDQU 352(R8), Y3 // mul VPMULLW Y7, Y12, Y5 VPMULHW Y7, Y12, Y12 VPMULLW Y9, Y13, Y7 VPMULHW Y9, Y13, Y13 VPMULLW Y11, Y14, Y9 VPMULHW Y11, Y14, Y14 VPMULLW Y3, Y15, Y11 VPMULHW Y3, Y15, Y15 // reduce VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y9, Y9 VPMULLW Y0, Y11, Y11 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y9, Y9 VPMULHW Y1, Y11, Y11 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // reduce 2 VPMULHW Y2, Y4, Y12 VPMULHW Y2, Y6, Y13 VPMULHW Y2, Y8, Y14 VPMULHW Y2, Y10, Y15 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPSRAW $11, Y14, Y14 VPSRAW $11, Y15, Y15 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPMULLW Y1, Y14, Y14 VPMULLW Y1, Y15, Y15 VPSUBW Y12, Y4, Y4 VPSUBW Y13, Y6, Y6 VPSUBW Y14, Y8, Y8 VPSUBW Y15, Y10, Y10 // level 2 // shuffle VPSHUFD $0xB1, Y4, Y12 VPSHUFD $0xB1, Y5, Y13 VPSHUFD $0xB1, Y6, Y14 VPSHUFD $0xB1, Y7, Y15 VPBLENDD $0x55, Y4, Y13, Y4 VPBLENDD $0xAA, Y5, Y12, Y5 VPBLENDD $0x55, Y6, Y15, Y6 VPBLENDD $0xAA, Y7, Y14, Y7 VPSHUFD $0xB1, Y8, Y12 VPSHUFD $0xB1, Y9, Y13 VPSHUFD $0xB1, Y10, Y14 VPSHUFD $0xB1, Y11, Y15 VPBLENDD $0x55, Y8, Y13, Y8 VPBLENDD $0xAA, Y9, Y12, Y9 VPBLENDD $0x55, Y10, Y15, Y10 VPBLENDD $0xAA, Y11, Y14, Y11 // update VPSUBW Y5, Y4, Y12 VPSUBW Y7, Y6, Y13 VPSUBW Y9, Y8, Y14 VPSUBW Y11, Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 // zetas VMOVDQU 512(R8), Y7 VMOVDQU 544(R8), Y9 VMOVDQU 576(R8), Y11 VMOVDQU 608(R8), Y3 // mul VPMULLW Y7, Y12, Y5 VPMULHW Y7, Y12, Y12 VPMULLW Y9, Y13, Y7 VPMULHW Y9, Y13, Y13 VPMULLW Y11, Y14, Y9 VPMULHW Y11, Y14, Y14 VPMULLW Y3, Y15, Y11 VPMULHW Y3, Y15, Y15 // reduce VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y9, Y9 VPMULLW Y0, Y11, Y11 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y9, Y9 VPMULHW Y1, Y11, Y11 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // level 3 // shuffle VSHUFPD $0x00, Y5, Y4, Y3 VSHUFPD $0x0F, Y5, Y4, Y4 VSHUFPD $0x00, Y7, Y6, Y5 VSHUFPD $0x0F, Y7, Y6, Y6 VSHUFPD $0x00, Y9, Y8, Y7 VSHUFPD $0x0F, Y9, Y8, Y8 VSHUFPD $0x00, Y11, Y10, Y9 VSHUFPD $0x0F, Y11, Y10, Y10 // update VPSUBW Y4, Y3, Y12 VPSUBW Y6, Y5, Y13 VPSUBW Y8, Y7, Y14 VPSUBW Y10, Y9, Y15 VPADDW Y3, Y4, Y3 VPADDW Y5, Y6, Y5 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 // zetas VMOVDQU 768(R8), Y6 VMOVDQU 800(R8), Y8 VMOVDQU 832(R8), Y10 VMOVDQU 864(R8), Y11 // mul VPMULLW Y6, Y12, Y4 VPMULHW Y6, Y12, Y12 VPMULLW Y8, Y13, Y6 VPMULHW Y8, Y13, Y13 VPMULLW Y10, Y14, Y8 VPMULHW Y10, Y14, Y14 VPMULLW Y11, Y15, Y10 VPMULHW Y11, Y15, Y15 // reduce VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y8, Y8 VPMULLW Y0, Y10, Y10 VPMULHW Y1, Y4, Y4 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y8, Y8 VPMULHW Y1, Y10, Y10 VPSUBW Y4, Y12, Y4 VPSUBW Y6, Y13, Y6 VPSUBW Y8, Y14, Y8 VPSUBW Y10, Y15, Y10 // reduce 2 VPMULHW Y2, Y3, Y12 VPMULHW Y2, Y5, Y13 VPMULHW Y2, Y7, Y14 VPMULHW Y2, Y9, Y15 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPSRAW $11, Y14, Y14 VPSRAW $11, Y15, Y15 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPMULLW Y1, Y14, Y14 VPMULLW Y1, Y15, Y15 VPSUBW Y12, Y3, Y3 VPSUBW Y13, Y5, Y5 VPSUBW Y14, Y7, Y7 VPSUBW Y15, Y9, Y9 // level 4 // shuffle VPERM2I128 $0x02, Y3, Y4, Y11 VPERM2I128 $0x13, Y3, Y4, Y3 VPERM2I128 $0x02, Y5, Y6, Y4 VPERM2I128 $0x13, Y5, Y6, Y5 VPERM2I128 $0x02, Y7, Y8, Y6 VPERM2I128 $0x13, Y7, Y8, Y7 
VPERM2I128 $0x02, Y9, Y10, Y8 VPERM2I128 $0x13, Y9, Y10, Y9 // update VMOVDQA Y11, Y12 VMOVDQA Y4, Y13 VMOVDQA Y6, Y14 VMOVDQA Y8, Y15 VPADDW Y11, Y3, Y10 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPSUBW Y3, Y12, Y3 VPSUBW Y5, Y13, Y5 VPSUBW Y7, Y14, Y7 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1024(R8), Y12 VMOVDQU 1056(R8), Y13 VMOVDQU 1088(R8), Y14 VMOVDQU 1120(R8), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y3 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y7, Y7 VPSUBW Y14, Y9, Y9 // level 5 // update VMOVDQA Y10, Y12 VMOVDQA Y3, Y13 VMOVDQA Y6, Y14 VMOVDQA Y7, Y15 VPADDW Y10, Y4, Y10 VPADDW Y3, Y5, Y3 VPADDW Y6, Y8, Y6 VPADDW Y7, Y9, Y7 VPSUBW Y4, Y12, Y4 VPSUBW Y5, Y13, Y5 VPSUBW Y8, Y14, Y8 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1280(SI), Y14 VMOVDQU 1312(SI), Y15 // mul VPMULLW Y14, Y4, Y11 VPMULLW Y14, Y5, Y12 VPMULLW Y15, Y8, Y13 VPMULHW Y14, Y4, Y4 VPMULHW Y14, Y5, Y5 VPMULHW Y15, Y8, Y8 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y4, Y4 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y8, Y8 VPSUBW Y14, Y9, Y9 // reduce 2 VPMULHW Y2, Y10, Y12 VPMULHW Y2, Y6, Y13 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPSUBW Y12, Y10, Y10 VPSUBW Y13, Y6, Y6 // level 6 // update VMOVDQA Y10, Y12 VMOVDQA Y3, Y13 VMOVDQA Y4, Y14 VMOVDQA Y5, Y15 VPADDW Y10, Y6, Y10 VPADDW Y3, Y7, Y3 VPADDW Y4, Y8, Y4 VPADDW Y5, Y9, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y7, Y13, Y7 VPSUBW Y8, Y14, Y8 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1408(SI), Y15 // mul VPMULLW Y15, Y6, Y11 VPMULLW Y15, Y7, Y12 VPMULLW Y15, Y8, Y13 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y6, Y6 VPMULHW Y15, Y7, Y7 VPMULHW Y15, Y8, Y8 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y6, Y6 VPSUBW Y12, Y7, Y7 VPSUBW Y13, Y8, Y8 VPSUBW Y14, Y9, Y9 // reduce 2 VPMULHW Y2, Y3, Y12 VPSRAW $11, Y12, Y12 VPMULLW Y1, Y12, Y12 VPSUBW Y12, Y3, Y3 // store VMOVDQU Y10, (DI) VMOVDQU Y3, 32(DI) VMOVDQU Y4, 64(DI) VMOVDQU Y5, 96(DI) VMOVDQU Y6, 128(DI) VMOVDQU Y7, 160(DI) VMOVDQU Y8, 192(DI) VMOVDQU Y9, 224(DI) ADDQ $256, DI ADDQ $128, R8 // second round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 128(DI), Y8 VMOVDQU 160(DI), Y9 VMOVDQU 192(DI), Y10 VMOVDQU 224(DI), Y11 // reorder VMOVDQU ·lowdword<>(SB), Y3 VPAND Y3, Y4, Y12 VPAND Y3, Y5, Y13 VPAND Y3, Y6, Y14 VPAND Y3, Y7, Y15 VPSRLD $16, Y4, Y4 VPSRLD $16, Y5, Y5 VPSRLD $16, Y6, Y6 VPSRLD $16, Y7, Y7 VPACKUSDW Y5, Y4, Y5 VPACKUSDW Y13, Y12, Y4 VPACKUSDW Y7, Y6, Y7 VPACKUSDW Y15, Y14, Y6 VPERMQ invntt_VPERMQ_IDX, Y4, Y4 VPERMQ invntt_VPERMQ_IDX, Y5, Y5 VPERMQ invntt_VPERMQ_IDX, Y6, Y6 VPERMQ invntt_VPERMQ_IDX, Y7, Y7 VPAND Y3, Y8, Y12 VPAND Y3, Y9, Y13 VPAND Y3, Y10, Y14 VPAND Y3, Y11, Y15 VPSRLD $16, Y8, Y8 VPSRLD $16, Y9, Y9 VPSRLD $16, Y10, Y10 VPSRLD $16, Y11, Y11 VPACKUSDW Y9, Y8, Y9 VPACKUSDW Y13, Y12, Y8 VPACKUSDW Y11, Y10, Y11 VPACKUSDW Y15, Y14, Y10 VPERMQ invntt_VPERMQ_IDX, Y8, Y8 VPERMQ 
invntt_VPERMQ_IDX, Y9, Y9 VPERMQ invntt_VPERMQ_IDX, Y10, Y10 VPERMQ invntt_VPERMQ_IDX, Y11, Y11 // level 0 // update VMOVDQA Y4, Y12 VMOVDQA Y6, Y13 VMOVDQA Y8, Y14 VMOVDQA Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // zetas VMOVDQU (R8), Y13 VMOVDQU 32(R8), Y14 VMOVDQU 64(R8), Y15 VMOVDQU 96(R8), Y3 // mul VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y7, Y7 VPSUBW Y14, Y9, Y9 VPSUBW Y15, Y11, Y11 // level 1 // shuffle VMOVDQU ·vpshufb_idx<>(SB), Y3 VPSHUFB Y3, Y4, Y12 VPSHUFB Y3, Y5, Y13 VPSHUFB Y3, Y6, Y14 VPSHUFB Y3, Y7, Y15 VPBLENDW $0x55, Y4, Y13, Y4 VPBLENDW $0xAA, Y5, Y12, Y5 VPBLENDW $0x55, Y6, Y15, Y6 VPBLENDW $0xAA, Y7, Y14, Y7 VPSHUFB Y3, Y8, Y12 VPSHUFB Y3, Y9, Y13 VPSHUFB Y3, Y10, Y14 VPSHUFB Y3, Y11, Y15 VPBLENDW $0x55, Y8, Y13, Y8 VPBLENDW $0xAA, Y9, Y12, Y9 VPBLENDW $0x55, Y10, Y15, Y10 VPBLENDW $0xAA, Y11, Y14, Y11 // update VMOVDQA Y4, Y12 VMOVDQA Y6, Y13 VMOVDQA Y8, Y14 VMOVDQA Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // zetas VMOVDQU 256(R8), Y13 VMOVDQU 288(R8), Y14 VMOVDQU 320(R8), Y15 VMOVDQU 352(R8), Y3 // mul VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y7, Y7 VPSUBW Y14, Y9, Y9 VPSUBW Y15, Y11, Y11 // reduce 2 VPMULHW Y2, Y4, Y12 VPMULHW Y2, Y6, Y13 VPMULHW Y2, Y8, Y14 VPMULHW Y2, Y10, Y15 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPSRAW $11, Y14, Y14 VPSRAW $11, Y15, Y15 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPMULLW Y1, Y14, Y14 VPMULLW Y1, Y15, Y15 VPSUBW Y12, Y4, Y4 VPSUBW Y13, Y6, Y6 VPSUBW Y14, Y8, Y8 VPSUBW Y15, Y10, Y10 // level 2 // shuffle VPSHUFD $0xB1, Y4, Y12 VPSHUFD $0xB1, Y5, Y13 VPSHUFD $0xB1, Y6, Y14 VPSHUFD $0xB1, Y7, Y15 VPBLENDD $0x55, Y4, Y13, Y4 VPBLENDD $0xAA, Y5, Y12, Y5 VPBLENDD $0x55, Y6, Y15, Y6 VPBLENDD $0xAA, Y7, Y14, Y7 VPSHUFD $0xB1, Y8, Y12 VPSHUFD $0xB1, Y9, Y13 VPSHUFD $0xB1, Y10, Y14 VPSHUFD $0xB1, Y11, Y15 VPBLENDD $0x55, Y8, Y13, Y8 VPBLENDD $0xAA, Y9, Y12, Y9 VPBLENDD $0x55, Y10, Y15, Y10 VPBLENDD $0xAA, Y11, Y14, Y11 // update VMOVDQA Y4, Y12 VMOVDQA Y6, Y13 VMOVDQA Y8, Y14 VMOVDQA Y10, Y15 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPADDW Y10, Y11, Y10 VPSUBW Y5, Y12, Y5 VPSUBW Y7, Y13, Y7 VPSUBW Y9, Y14, Y9 VPSUBW Y11, Y15, Y11 // zetas VMOVDQU 512(R8), Y13 VMOVDQU 544(R8), Y14 VMOVDQU 576(R8), Y15 VMOVDQU 608(R8), Y3 // mul VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y7, Y7 VPSUBW Y14, Y9, Y9 VPSUBW Y15, 
Y11, Y11 // level 3 // shuffle VSHUFPD $0x00, Y5, Y4, Y3 VSHUFPD $0x0F, Y5, Y4, Y4 VSHUFPD $0x00, Y7, Y6, Y5 VSHUFPD $0x0F, Y7, Y6, Y6 VSHUFPD $0x00, Y9, Y8, Y7 VSHUFPD $0x0F, Y9, Y8, Y8 VSHUFPD $0x00, Y11, Y10, Y9 VSHUFPD $0x0F, Y11, Y10, Y10 // update VMOVDQA Y3, Y12 VMOVDQA Y5, Y13 VMOVDQA Y7, Y14 VMOVDQA Y9, Y15 VPADDW Y3, Y4, Y3 VPADDW Y5, Y6, Y5 VPADDW Y7, Y8, Y7 VPADDW Y9, Y10, Y9 VPSUBW Y4, Y12, Y4 VPSUBW Y6, Y13, Y6 VPSUBW Y8, Y14, Y8 VPSUBW Y10, Y15, Y10 // zetas VMOVDQU 768(R8), Y12 VMOVDQU 800(R8), Y13 VMOVDQU 832(R8), Y14 VMOVDQU 864(R8), Y15 // mul VPMULLW Y12, Y4, Y11 VPMULHW Y12, Y4, Y4 VPMULLW Y13, Y6, Y12 VPMULHW Y13, Y6, Y6 VPMULLW Y14, Y8, Y13 VPMULHW Y14, Y8, Y8 VPMULLW Y15, Y10, Y14 VPMULHW Y15, Y10, Y10 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y4, Y4 VPSUBW Y12, Y6, Y6 VPSUBW Y13, Y8, Y8 VPSUBW Y14, Y10, Y10 // reduce 2 VPMULHW Y2, Y3, Y12 VPMULHW Y2, Y5, Y13 VPMULHW Y2, Y7, Y14 VPMULHW Y2, Y9, Y15 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPSRAW $11, Y14, Y14 VPSRAW $11, Y15, Y15 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPMULLW Y1, Y14, Y14 VPMULLW Y1, Y15, Y15 VPSUBW Y12, Y3, Y3 VPSUBW Y13, Y5, Y5 VPSUBW Y14, Y7, Y7 VPSUBW Y15, Y9, Y9 // level 4 // shuffle VPERM2I128 $0x02, Y3, Y4, Y11 VPERM2I128 $0x13, Y3, Y4, Y3 VPERM2I128 $0x02, Y5, Y6, Y4 VPERM2I128 $0x13, Y5, Y6, Y5 VPERM2I128 $0x02, Y7, Y8, Y6 VPERM2I128 $0x13, Y7, Y8, Y7 VPERM2I128 $0x02, Y9, Y10, Y8 VPERM2I128 $0x13, Y9, Y10, Y9 // update VMOVDQA Y11, Y12 VMOVDQA Y4, Y13 VMOVDQA Y6, Y14 VMOVDQA Y8, Y15 VPADDW Y11, Y3, Y10 VPADDW Y4, Y5, Y4 VPADDW Y6, Y7, Y6 VPADDW Y8, Y9, Y8 VPSUBW Y3, Y12, Y3 VPSUBW Y5, Y13, Y5 VPSUBW Y7, Y14, Y7 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1024(R8), Y12 VMOVDQU 1056(R8), Y13 VMOVDQU 1088(R8), Y14 VMOVDQU 1120(R8), Y15 // mul VPMULLW Y12, Y3, Y11 VPMULHW Y12, Y3, Y3 VPMULLW Y13, Y5, Y12 VPMULHW Y13, Y5, Y5 VPMULLW Y14, Y7, Y13 VPMULHW Y14, Y7, Y7 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y3, Y3 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y7, Y7 VPSUBW Y14, Y9, Y9 // level 5 // update VMOVDQA Y10, Y12 VMOVDQA Y3, Y13 VMOVDQA Y6, Y14 VMOVDQA Y7, Y15 VPADDW Y10, Y4, Y10 VPADDW Y3, Y5, Y3 VPADDW Y6, Y8, Y6 VPADDW Y7, Y9, Y7 VPSUBW Y4, Y12, Y4 VPSUBW Y5, Y13, Y5 VPSUBW Y8, Y14, Y8 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1344(SI), Y14 VMOVDQU 1376(SI), Y15 // mul VPMULLW Y14, Y4, Y11 VPMULLW Y14, Y5, Y12 VPMULLW Y15, Y8, Y13 VPMULHW Y14, Y4, Y4 VPMULHW Y14, Y5, Y5 VPMULHW Y15, Y8, Y8 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y4, Y4 VPSUBW Y12, Y5, Y5 VPSUBW Y13, Y8, Y8 VPSUBW Y14, Y9, Y9 // reduce 2 VPMULHW Y2, Y10, Y12 VPMULHW Y2, Y6, Y13 VPSRAW $11, Y12, Y12 VPSRAW $11, Y13, Y13 VPMULLW Y1, Y12, Y12 VPMULLW Y1, Y13, Y13 VPSUBW Y12, Y10, Y10 VPSUBW Y13, Y6, Y6 // level 6 // update VMOVDQA Y10, Y12 VMOVDQA Y3, Y13 VMOVDQA Y4, Y14 VMOVDQA Y5, Y15 VPADDW Y10, Y6, Y10 VPADDW Y3, Y7, Y3 VPADDW Y4, Y8, Y4 VPADDW Y5, Y9, Y5 VPSUBW Y6, Y12, Y6 VPSUBW Y7, Y13, Y7 VPSUBW Y8, Y14, Y8 VPSUBW Y9, Y15, Y9 // zetas VMOVDQU 1440(SI), Y15 // mul VPMULLW Y15, Y6, Y11 VPMULLW Y15, Y7, Y12 VPMULLW Y15, 
Y8, Y13 VPMULLW Y15, Y9, Y14 VPMULHW Y15, Y6, Y6 VPMULHW Y15, Y7, Y7 VPMULHW Y15, Y8, Y8 VPMULHW Y15, Y9, Y9 // reduce VPMULLW Y0, Y11, Y11 VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULHW Y1, Y11, Y11 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPSUBW Y11, Y6, Y6 VPSUBW Y12, Y7, Y7 VPSUBW Y13, Y8, Y8 VPSUBW Y14, Y9, Y9 // reduce 2 VPMULHW Y2, Y3, Y12 VPSRAW $11, Y12, Y12 VPMULLW Y1, Y12, Y12 VPSUBW Y12, Y3, Y3 // store VMOVDQU Y10, (DI) VMOVDQU Y3, 32(DI) VMOVDQU Y4, 64(DI) VMOVDQU Y5, 96(DI) VMOVDQU Y6, 128(DI) VMOVDQU Y7, 160(DI) VMOVDQU Y8, 192(DI) VMOVDQU Y9, 224(DI) SUBQ $256, DI // f VMOVDQU ·f_x16<>(SB), Y2 // first round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 256(DI), Y8 VMOVDQU 288(DI), Y9 VMOVDQU 320(DI), Y10 VMOVDQU 352(DI), Y11 // level 7 // update VMOVDQA Y4, Y12 VMOVDQA Y5, Y13 VMOVDQA Y6, Y14 VMOVDQA Y7, Y15 VPADDW Y4, Y8, Y4 VPADDW Y5, Y9, Y5 VPADDW Y6, Y10, Y6 VPADDW Y7, Y11, Y7 VPSUBW Y8, Y12, Y8 VPSUBW Y9, Y13, Y9 VPSUBW Y10, Y14, Y10 VPSUBW Y11, Y15, Y11 // zeta VMOVDQU 1472(SI), Y3 // mul VPMULLW Y3, Y8, Y12 VPMULLW Y3, Y9, Y13 VPMULLW Y3, Y10, Y14 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y8, Y8 VPMULHW Y3, Y9, Y9 VPMULHW Y3, Y10, Y10 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y8 VPSUBW Y13, Y9, Y9 VPSUBW Y14, Y10, Y10 VPSUBW Y15, Y11, Y11 VPADDW Y1, Y8, Y8 VPADDW Y1, Y9, Y9 VPADDW Y1, Y10, Y10 VPADDW Y1, Y11, Y11 // mul VPMULLW Y2, Y4, Y12 VPMULLW Y2, Y5, Y13 VPMULLW Y2, Y6, Y14 VPMULLW Y2, Y7, Y15 VPMULHW Y2, Y4, Y4 VPMULHW Y2, Y5, Y5 VPMULHW Y2, Y6, Y6 VPMULHW Y2, Y7, Y7 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y4, Y4 VPSUBW Y13, Y5, Y5 VPSUBW Y14, Y6, Y6 VPSUBW Y15, Y7, Y7 VPADDW Y1, Y4, Y4 VPADDW Y1, Y5, Y5 VPADDW Y1, Y6, Y6 VPADDW Y1, Y7, Y7 // store VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) VMOVDQU Y6, 64(DI) VMOVDQU Y7, 96(DI) VMOVDQU Y8, 256(DI) VMOVDQU Y9, 288(DI) VMOVDQU Y10, 320(DI) VMOVDQU Y11, 352(DI) ADDQ $128, DI // second round // load VMOVDQU (DI), Y4 VMOVDQU 32(DI), Y5 VMOVDQU 64(DI), Y6 VMOVDQU 96(DI), Y7 VMOVDQU 256(DI), Y8 VMOVDQU 288(DI), Y9 VMOVDQU 320(DI), Y10 VMOVDQU 352(DI), Y11 // zeta VMOVDQU 1472(SI), Y3 // level 7 // update VMOVDQA Y4, Y12 VMOVDQA Y5, Y13 VMOVDQA Y6, Y14 VMOVDQA Y7, Y15 VPADDW Y4, Y8, Y4 VPADDW Y5, Y9, Y5 VPADDW Y6, Y10, Y6 VPADDW Y7, Y11, Y7 VPSUBW Y8, Y12, Y8 VPSUBW Y9, Y13, Y9 VPSUBW Y10, Y14, Y10 VPSUBW Y11, Y15, Y11 // mul VPMULLW Y3, Y8, Y12 VPMULLW Y3, Y9, Y13 VPMULLW Y3, Y10, Y14 VPMULLW Y3, Y11, Y15 VPMULHW Y3, Y8, Y8 VPMULHW Y3, Y9, Y9 VPMULHW Y3, Y10, Y10 VPMULHW Y3, Y11, Y11 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y8, Y8 VPSUBW Y13, Y9, Y9 VPSUBW Y14, Y10, Y10 VPSUBW Y15, Y11, Y11 VPADDW Y1, Y8, Y8 VPADDW Y1, Y9, Y9 VPADDW Y1, Y10, Y10 VPADDW Y1, Y11, Y11 // mul VPMULLW Y2, Y4, Y12 VPMULLW Y2, Y5, Y13 VPMULLW Y2, Y6, Y14 VPMULLW Y2, Y7, Y15 VPMULHW Y2, Y4, Y4 VPMULHW Y2, Y5, Y5 VPMULHW Y2, Y6, Y6 VPMULHW Y2, Y7, Y7 // reduce VPMULLW Y0, Y12, Y12 VPMULLW Y0, Y13, Y13 VPMULLW Y0, Y14, Y14 VPMULLW Y0, Y15, Y15 VPMULHW Y1, Y12, Y12 VPMULHW Y1, Y13, Y13 VPMULHW Y1, Y14, Y14 
VPMULHW Y1, Y15, Y15 VPSUBW Y12, Y4, Y4 VPSUBW Y13, Y5, Y5 VPSUBW Y14, Y6, Y6 VPSUBW Y15, Y7, Y7 VPADDW Y1, Y4, Y4 VPADDW Y1, Y5, Y5 VPADDW Y1, Y6, Y6 VPADDW Y1, Y7, Y7 // store VMOVDQU Y4, (DI) VMOVDQU Y5, 32(DI) VMOVDQU Y6, 64(DI) VMOVDQU Y7, 96(DI) VMOVDQU Y8, 256(DI) VMOVDQU Y9, 288(DI) VMOVDQU Y10, 320(DI) VMOVDQU Y11, 352(DI) VZEROUPPER RET // func pointwiseAccK2AVX2(dst *uint16, a, b **uint16) TEXT ·pointwiseAccK2AVX2(SB), NOSPLIT, $0-24 MOVQ dst+0(FP), DI MOVQ a+8(FP), SI MOVQ b+16(FP), DX VMOVDQU ·qinv_x16<>(SB), Y0 VMOVDQU ·q_x16<>(SB), Y1 VMOVDQU ·montsq_x16<>(SB), Y2 XORQ AX, AX XORQ BX, BX MOVQ 8(SI), R8 // a[1] MOVQ (SI), SI // a[0] MOVQ 8(DX), R11 // b[1] MOVQ (DX), DX // b[0] looptop2: // load a VMOVDQU (SI)(BX*1), Y4 VMOVDQU 32(SI)(BX*1), Y5 VMOVDQU 64(SI)(BX*1), Y6 VMOVDQU (R8)(BX*1), Y7 VMOVDQU 32(R8)(BX*1), Y8 VMOVDQU 64(R8)(BX*1), Y9 // mul montsq VPMULLW Y2, Y4, Y3 VPMULHW Y2, Y4, Y10 VPMULLW Y2, Y5, Y4 VPMULHW Y2, Y5, Y11 VPMULLW Y2, Y6, Y5 VPMULHW Y2, Y6, Y12 VPMULLW Y2, Y7, Y6 VPMULHW Y2, Y7, Y13 VPMULLW Y2, Y8, Y7 VPMULHW Y2, Y8, Y14 VPMULLW Y2, Y9, Y8 VPMULHW Y2, Y9, Y15 // reduce VPMULLW Y0, Y3, Y3 VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y8, Y8 VPMULHW Y1, Y3, Y3 VPMULHW Y1, Y4, Y4 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y8, Y8 VPSUBW Y3, Y10, Y3 VPSUBW Y4, Y11, Y4 VPSUBW Y5, Y12, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y7, Y14, Y7 VPSUBW Y8, Y15, Y8 // load b VMOVDQU (DX)(BX*1), Y9 VMOVDQU 32(DX)(BX*1), Y10 VMOVDQU 64(DX)(BX*1), Y11 VMOVDQU (R11)(BX*1), Y12 VMOVDQU 32(R11)(BX*1), Y13 VMOVDQU 64(R11)(BX*1), Y14 // mul VPMULLW Y3, Y9, Y15 VPMULHW Y3, Y9, Y9 VPMULLW Y4, Y10, Y3 VPMULHW Y4, Y10, Y10 VPMULLW Y5, Y11, Y4 VPMULHW Y5, Y11, Y11 VPMULLW Y6, Y12, Y5 VPMULHW Y6, Y12, Y12 VPMULLW Y7, Y13, Y6 VPMULHW Y7, Y13, Y13 VPMULLW Y8, Y14, Y7 VPMULHW Y8, Y14, Y14 // reduce VPMULLW Y0, Y15, Y15 VPMULLW Y0, Y3, Y3 VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 VPMULHW Y1, Y15, Y15 VPMULHW Y1, Y3, Y3 VPMULHW Y1, Y4, Y4 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y7, Y7 VPSUBW Y15, Y9, Y15 VPSUBW Y3, Y10, Y3 VPSUBW Y4, Y11, Y4 VPSUBW Y5, Y12, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y7, Y14, Y7 // add VPADDW Y15, Y5, Y5 VPADDW Y3, Y6, Y6 VPADDW Y4, Y7, Y7 // reduce 2 VMOVDQU ·v_x16<>(SB), Y3 VPMULHW Y3, Y5, Y8 VPMULHW Y3, Y6, Y9 VPMULHW Y3, Y7, Y10 VPSRAW $11, Y8, Y8 VPSRAW $11, Y9, Y9 VPSRAW $11, Y10, Y10 VPMULLW Y1, Y8, Y8 VPMULLW Y1, Y9, Y9 VPMULLW Y1, Y10, Y10 VPSUBW Y8, Y5, Y5 VPSUBW Y9, Y6, Y6 VPSUBW Y10, Y7, Y7 // store VMOVDQU Y5, (DI)(BX*1) VMOVDQU Y6, 32(DI)(BX*1) VMOVDQU Y7, 64(DI)(BX*1) ADDQ $1, AX ADDQ $96, BX CMPQ AX, $5 JB looptop2 // load VMOVDQU (SI)(BX*1), Y4 VMOVDQU (R8)(BX*1), Y7 VMOVDQU (DX)(BX*1), Y9 VMOVDQU (R11)(BX*1), Y12 // mul montsq VPMULLW Y2, Y4, Y3 VPMULHW Y2, Y4, Y10 VPMULLW Y2, Y7, Y6 VPMULHW Y2, Y7, Y13 // reduce VPMULLW Y0, Y3, Y3 VPMULLW Y0, Y6, Y6 VPMULHW Y1, Y3, Y3 VPMULHW Y1, Y6, Y6 VPSUBW Y3, Y10, Y3 VPSUBW Y6, Y13, Y6 // mul VPMULLW Y3, Y9, Y15 VPMULHW Y3, Y9, Y9 VPMULLW Y6, Y12, Y5 VPMULHW Y6, Y12, Y12 // reduce VPMULLW Y0, Y15, Y15 VPMULLW Y0, Y5, Y5 VPMULHW Y1, Y15, Y15 VPMULHW Y1, Y5, Y5 VPSUBW Y15, Y9, Y15 VPSUBW Y5, Y12, Y5 // add VPADDW Y15, Y5, Y5 // reduce 2 VMOVDQU ·v_x16<>(SB), Y3 VPMULHW Y3, Y5, Y8 VPSRAW $11, Y8, Y8 VPMULLW Y1, Y8, Y8 VPSUBW Y8, Y5, Y5 // store VMOVDQU Y5, (DI)(BX*1) VZEROUPPER RET // func pointwiseAccK2AVX2(dst *uint16, a, b **uint16) TEXT ·pointwiseAccK3AVX2(SB), NOSPLIT, $0-24 MOVQ dst+0(FP), DI MOVQ a+8(FP), SI 
MOVQ b+16(FP), DX VMOVDQU ·qinv_x16<>(SB), Y0 VMOVDQU ·q_x16<>(SB), Y1 VMOVDQU ·montsq_x16<>(SB), Y2 XORQ AX, AX XORQ BX, BX MOVQ (16)(SI), R9 // a[2] MOVQ 8(SI), R8 // a[1] MOVQ (SI), SI // a[0] MOVQ 16(DX), R12 // b[2] MOVQ 8(DX), R11 // b[1] MOVQ (DX), DX // b[0] looptop3: // load a VMOVDQU (SI)(BX*1), Y4 VMOVDQU 32(SI)(BX*1), Y5 VMOVDQU (R8)(BX*1), Y6 VMOVDQU 32(R8)(BX*1), Y7 VMOVDQU (R9)(BX*1), Y8 VMOVDQU 32(R9)(BX*1), Y9 // mul montsq VPMULLW Y2, Y4, Y3 VPMULHW Y2, Y4, Y10 VPMULLW Y2, Y5, Y4 VPMULHW Y2, Y5, Y11 VPMULLW Y2, Y6, Y5 VPMULHW Y2, Y6, Y12 VPMULLW Y2, Y7, Y6 VPMULHW Y2, Y7, Y13 VPMULLW Y2, Y8, Y7 VPMULHW Y2, Y8, Y14 VPMULLW Y2, Y9, Y8 VPMULHW Y2, Y9, Y15 // reduce VPMULLW Y0, Y3, Y3 VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y8, Y8 VPMULHW Y1, Y3, Y3 VPMULHW Y1, Y4, Y4 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y8, Y8 VPSUBW Y3, Y10, Y3 VPSUBW Y4, Y11, Y4 VPSUBW Y5, Y12, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y7, Y14, Y7 VPSUBW Y8, Y15, Y8 // load b VMOVDQU (DX)(BX*1), Y9 VMOVDQU 32(DX)(BX*1), Y10 VMOVDQU (R11)(BX*1), Y11 VMOVDQU 32(R11)(BX*1), Y12 VMOVDQU (R12)(BX*1), Y13 VMOVDQU 32(R12)(BX*1), Y14 // mul VPMULLW Y3, Y9, Y15 VPMULHW Y3, Y9, Y9 VPMULLW Y4, Y10, Y3 VPMULHW Y4, Y10, Y10 VPMULLW Y5, Y11, Y4 VPMULHW Y5, Y11, Y11 VPMULLW Y6, Y12, Y5 VPMULHW Y6, Y12, Y12 VPMULLW Y7, Y13, Y6 VPMULHW Y7, Y13, Y13 VPMULLW Y8, Y14, Y7 VPMULHW Y8, Y14, Y14 // reduce VPMULLW Y0, Y15, Y15 VPMULLW Y0, Y3, Y3 VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 VPMULHW Y1, Y15, Y15 VPMULHW Y1, Y3, Y3 VPMULHW Y1, Y4, Y4 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y7, Y7 VPSUBW Y15, Y9, Y15 VPSUBW Y3, Y10, Y3 VPSUBW Y4, Y11, Y4 VPSUBW Y5, Y12, Y5 VPSUBW Y6, Y13, Y6 VPSUBW Y7, Y14, Y7 // add VPADDW Y15, Y4, Y4 VPADDW Y3, Y5, Y5 VPADDW Y4, Y6, Y6 VPADDW Y5, Y7, Y7 // reduce 2 VMOVDQU ·v_x16<>(SB), Y3 VPMULHW Y3, Y6, Y8 VPMULHW Y3, Y7, Y9 VPSRAW $11, Y8, Y8 VPSRAW $11, Y9, Y9 VPMULLW Y1, Y8, Y8 VPMULLW Y1, Y9, Y9 VPSUBW Y8, Y6, Y6 VPSUBW Y9, Y7, Y7 // store VMOVDQU Y6, (DI)(BX*1) VMOVDQU Y7, 32(DI)(BX*1) ADDQ $1, AX ADDQ $64, BX CMPQ AX, $8 JB looptop3 VZEROUPPER RET // func pointwiseAccK2AVX2(dst *uint16, a, b **uint16) TEXT ·pointwiseAccK4AVX2(SB), NOSPLIT, $0-24 MOVQ dst+0(FP), DI MOVQ a+8(FP), SI MOVQ b+16(FP), DX VMOVDQU ·qinv_x16<>(SB), Y0 VMOVDQU ·q_x16<>(SB), Y1 VMOVDQU ·montsq_x16<>(SB), Y2 VMOVDQU ·v_x16<>(SB), Y3 XORQ AX, AX XORQ BX, BX MOVQ 24(SI), R10 // a[3] MOVQ 16(SI), R9 // a[2] MOVQ 8(SI), R8 // a[1] MOVQ (SI), SI // a[0] MOVQ 24(DX), R13 // b[3] MOVQ 16(DX), R12 // b[2] MOVQ 8(DX), R11 // b[1] MOVQ (DX), DX // b[0] looptop4: // load a VMOVDQU (SI)(BX*1), Y6 VMOVDQU (R8)(BX*1), Y7 VMOVDQU (R9)(BX*1), Y8 VMOVDQU (R10)(BX*1), Y9 // mul montsq VPMULLW Y2, Y6, Y5 VPMULHW Y2, Y6, Y10 VPMULLW Y2, Y7, Y6 VPMULHW Y2, Y7, Y11 VPMULLW Y2, Y8, Y7 VPMULHW Y2, Y8, Y12 VPMULLW Y2, Y9, Y8 VPMULHW Y2, Y9, Y13 // reduce VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 VPMULLW Y0, Y8, Y8 VPMULHW Y1, Y5, Y5 VPMULHW Y1, Y6, Y6 VPMULHW Y1, Y7, Y7 VPMULHW Y1, Y8, Y8 VPSUBW Y5, Y10, Y5 VPSUBW Y6, Y11, Y6 VPSUBW Y7, Y12, Y7 VPSUBW Y8, Y13, Y8 // load b VMOVDQU (DX)(BX*1), Y9 VMOVDQU (R11)(BX*1), Y10 VMOVDQU (R12)(BX*1), Y11 VMOVDQU (R13)(BX*1), Y12 // mul VPMULLW Y5, Y9, Y4 VPMULHW Y5, Y9, Y9 VPMULLW Y6, Y10, Y5 VPMULHW Y6, Y10, Y10 VPMULLW Y7, Y11, Y6 VPMULHW Y7, Y11, Y11 VPMULLW Y8, Y12, Y7 VPMULHW Y8, Y12, Y12 // reduce VPMULLW Y0, Y4, Y4 VPMULLW Y0, Y5, Y5 VPMULLW Y0, Y6, Y6 VPMULLW Y0, Y7, Y7 
	VPMULHW Y1, Y4, Y4
	VPMULHW Y1, Y5, Y5
	VPMULHW Y1, Y6, Y6
	VPMULHW Y1, Y7, Y7
	VPSUBW  Y4, Y9, Y4
	VPSUBW  Y5, Y10, Y5
	VPSUBW  Y6, Y11, Y6
	VPSUBW  Y7, Y12, Y7

	// add
	VPADDW Y4, Y5, Y5
	VPADDW Y5, Y6, Y6
	VPADDW Y6, Y7, Y7

	// reduce 2
	VPMULHW Y3, Y7, Y8
	VPSRAW  $11, Y8, Y8
	VPMULLW Y1, Y8, Y8
	VPSUBW  Y8, Y7, Y8

	// store
	VMOVDQU Y8, (DI)(BX*1)

	ADDQ $1, AX
	ADDQ $32, BX
	CMPQ AX, $16
	JB   looptop4

	VZEROUPPER
	RET

// func cbdEta4AVX2(dst *uint16, b *byte)
TEXT ·cbdEta4AVX2(SB), NOSPLIT, $0-16
	MOVQ dst+0(FP), DI
	MOVQ b+8(FP), SI

	VMOVDQU ·mask11<>(SB), Y0
	VMOVDQU ·mask0f<>(SB), Y1
	VMOVDQU ·q_x16<>(SB), Y2

	MOVQ $256, DX

looptop:
	VMOVUPD 0(SI), Y3
	VPAND   Y3, Y0, Y4
	VPSRLW  $1, Y3, Y3
	VPAND   Y3, Y0, Y5
	VPADDB  Y5, Y4, Y4
	VPSRLW  $1, Y3, Y3
	VPAND   Y3, Y0, Y5
	VPADDB  Y5, Y4, Y4
	VPSRLW  $1, Y3, Y3
	VPAND   Y3, Y0, Y3
	VPADDB  Y3, Y4, Y3
	VPSRLW  $4, Y3, Y4
	VPAND   Y3, Y1, Y3
	VPAND   Y4, Y1, Y4
	VPSUBB  Y4, Y3, Y3

	VPMOVSXBW X3, Y4
	VPADDW    Y2, Y4, Y4
	VMOVUPD   Y4, 0(DI)

	VPERM2F128 $0x21, Y3, Y3, Y3
	VPMOVSXBW  X3, Y4
	VPADDW     Y2, Y4, Y4
	VMOVUPD    Y4, 32(DI)

	ADDQ $64, DI
	ADDQ $32, SI
	SUBQ $32, DX
	JA   looptop

	VZEROUPPER
	RET
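
// Editorial note (exposition only, not referenced by the assembler): the
// "reduce" blocks above are a 16-lane Montgomery reduction (VPMULLW with
// qinv_x16, VPMULHW with q_x16, then VPSUBW), and the "reduce 2" blocks in
// invntt/pointwise use a Barrett-style step with v_x16 = 0x4442, roughly
// 2^27/Q. A minimal scalar Go sketch of what each lane computes, assuming
// Q = 7681 and Q_INV = 57857 as stated in the params note above (the
// function names below are illustrative, not part of the package):
//
//	// montgomeryReduce maps a 32-bit product a to a*R^-1 mod Q, R = 2^16,
//	// mirroring VPMULLW(qinv_x16), VPMULHW(q_x16), VPSUBW on the halves.
//	func montgomeryReduce(a int32) int16 {
//		t := int16(uint16(a) * 57857)                    // low half * Q_INV mod 2^16
//		return int16(a>>16) - int16((int32(t)*7681)>>16) // hi - mulhi(t, Q)
//	}
//
//	// barrettReduce mirrors VPMULHW(v_x16), VPSRAW $11, VPMULLW(q_x16), VPSUBW.
//	func barrettReduce(x int16) int16 {
//		u := int16((int32(x) * 0x4442) >> 16 >> 11) // mulhi(x, V) >> 11
//		return x - u*7681
//	}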