// Source: cloudflared-mirror/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.s
// (2355 lines, 59 KiB. Go assembler, Plan 9 syntax, for amd64 -- not ARM assembly.)

// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.

// +build amd64

#include "textflag.h"
// func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// addAVX2 stores the lane-wise 16-bit sum a[i] + b[i] into p[i] for all 256
// coefficients (512 bytes). VPADDW wraps on overflow and no modular
// reduction is performed here.
//
// The data is processed as two batches of eight 32-byte vectors. Within a
// batch every load from a and b is issued before the first store to p.
//
// Registers: AX = p, CX = a, DX = b, Y0-Y15 scratch.
TEXT ·addAVX2(SB), NOSPLIT, $0-24
	MOVQ p+0(FP), AX
	MOVQ a+8(FP), CX
	MOVQ b+16(FP), DX

	// Batch 1: coefficients 0..127. Even registers hold a, odd registers b.
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15
	VPADDW Y0, Y1, Y1
	VPADDW Y2, Y3, Y3
	VPADDW Y4, Y5, Y5
	VPADDW Y6, Y7, Y7
	VPADDW Y8, Y9, Y9
	VPADDW Y10, Y11, Y11
	VPADDW Y12, Y13, Y13
	VPADDW Y14, Y15, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Batch 2: coefficients 128..255.
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPADDW Y0, Y1, Y1
	VPADDW Y2, Y3, Y3
	VPADDW Y4, Y5, Y5
	VPADDW Y6, Y7, Y7
	VPADDW Y8, Y9, Y9
	VPADDW Y10, Y11, Y11
	VPADDW Y12, Y13, Y13
	VPADDW Y14, Y15, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves so that legacy-SSE code executed after the
	// return does not pay the AVX/SSE transition penalty.
	// NOTE(review): this file is generated; the generator (src.go) should
	// emit VZEROUPPER as well, otherwise regeneration drops it.
	VZEROUPPER
	RET
// func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// subAVX2 stores the lane-wise 16-bit difference a[i] - b[i] into p[i] for
// all 256 coefficients (512 bytes). VPSUBW wraps on overflow and no modular
// reduction is performed here.
//
// The data is processed as two batches of eight 32-byte vectors. Within a
// batch every load from a and b is issued before the first store to p.
//
// Registers: AX = p, CX = a, DX = b, Y0-Y15 scratch.
TEXT ·subAVX2(SB), NOSPLIT, $0-24
	MOVQ p+0(FP), AX
	MOVQ a+8(FP), CX
	MOVQ b+16(FP), DX

	// Batch 1: coefficients 0..127. Even registers hold a, odd registers b.
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15

	// Go operand order: VPSUBW x, y, dst computes dst = y - x, i.e. a - b.
	VPSUBW Y1, Y0, Y1
	VPSUBW Y3, Y2, Y3
	VPSUBW Y5, Y4, Y5
	VPSUBW Y7, Y6, Y7
	VPSUBW Y9, Y8, Y9
	VPSUBW Y11, Y10, Y11
	VPSUBW Y13, Y12, Y13
	VPSUBW Y15, Y14, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Batch 2: coefficients 128..255.
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPSUBW Y1, Y0, Y1
	VPSUBW Y3, Y2, Y3
	VPSUBW Y5, Y4, Y5
	VPSUBW Y7, Y6, Y7
	VPSUBW Y9, Y8, Y9
	VPSUBW Y11, Y10, Y11
	VPSUBW Y13, Y12, Y13
	VPSUBW Y15, Y14, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves so that legacy-SSE code executed after the
	// return does not pay the AVX/SSE transition penalty.
	// NOTE(review): this file is generated; the generator (src.go) should
	// emit VZEROUPPER as well, otherwise regeneration drops it.
	VZEROUPPER
	RET
// func nttAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// nttAVX2 transforms the 256 16-bit coefficients at p in place using seven
// levels of butterflies whose twiddle factors are read from ·ZetasAVX2.
//
// Every butterfly on a pair (x, y) has the same shape:
//
//	lo = VPMULLW(y, zl)      low 16 bits of y*zl
//	hi = VPMULHW(y, zh)      high 16 bits of y*zh
//	t  = hi - VPMULHW(lo, q) q = 3329, broadcast in Y15
//	y' = x - t
//	x' = x + t
//
// zl and zh are consecutive table entries. NOTE(review): presumably
// zl = zeta * q^-1 mod 2^16 and zh = zeta, which makes t a Montgomery
// product; confirm against the table generator.
//
// Level 1 pairs vectors 256 bytes apart over the whole array. Levels 2-7 are
// then run entirely in registers, once for bytes 0..255 and once for bytes
// 256..511. Between levels 3..7 the vectors are interleaved at successively
// finer granularity (128-bit lanes, 64-bit, 32-bit, 16-bit) so that butterfly
// partners always sit in different registers. The output is stored in the
// resulting shuffled order.
//
// Registers: AX = p, CX = &ZetasAVX2, Y15 = q, Y7-Y14 data, Y0-Y6 zetas/temps.
TEXT ·nttAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	LEAQ ·ZetasAVX2+0(SB), CX

	// Y15 = 0x0d01 (3329) in every 16-bit lane.
	MOVL $0x00000d01, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y15

	// Level 1: one zeta pair, broadcast to all lanes.
	VPBROADCASTW (CX), Y0
	VPBROADCASTW 2(CX), Y1

	// Level 1, bytes 0..127 against bytes 256..383.
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 256(AX), Y11
	VMOVDQU 288(AX), Y12
	VMOVDQU 320(AX), Y13
	VMOVDQU 352(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 256(AX)
	VMOVDQU Y12, 288(AX)
	VMOVDQU Y13, 320(AX)
	VMOVDQU Y14, 352(AX)

	// Level 1, bytes 128..255 against bytes 384..511.
	VMOVDQU 128(AX), Y7
	VMOVDQU 160(AX), Y8
	VMOVDQU 192(AX), Y9
	VMOVDQU 224(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10
	VMOVDQU Y7, 128(AX)
	VMOVDQU Y8, 160(AX)
	VMOVDQU Y9, 192(AX)
	VMOVDQU Y10, 224(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// ---- First half (bytes 0..255), levels 2-7 ----

	// Level 2: Y7-Y10 against Y11-Y14, one broadcast zeta pair.
	VPBROADCASTW 4(CX), Y0
	VPBROADCASTW 6(CX), Y1
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 128(AX), Y11
	VMOVDQU 160(AX), Y12
	VMOVDQU 192(AX), Y13
	VMOVDQU 224(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10

	// Level 3: (Y7,Y8) against (Y9,Y10) with the first zeta pair and
	// (Y11,Y12) against (Y13,Y14) with the second. Y0 is reused as a
	// temporary once its zeta has been consumed.
	VPBROADCASTW 12(CX), Y0
	VPBROADCASTW 14(CX), Y1
	VPBROADCASTW 16(CX), Y2
	VPBROADCASTW 18(CX), Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Level 4: per-lane zeta vectors; interleave 128-bit lanes first.
	VMOVDQU 32(CX), Y0
	VMOVDQU 64(CX), Y1
	VMOVDQU 96(CX), Y2
	VMOVDQU 128(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Level 5: interleave 64-bit quadwords.
	VMOVDQU 288(CX), Y0
	VMOVDQU 320(CX), Y1
	VMOVDQU 352(CX), Y2
	VMOVDQU 384(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Level 6: interleave 32-bit doublewords.
	VMOVDQU 544(CX), Y0
	VMOVDQU 576(CX), Y1
	VMOVDQU 608(CX), Y2
	VMOVDQU 640(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Level 7: interleave 16-bit words.
	VMOVDQU 800(CX), Y0
	VMOVDQU 832(CX), Y1
	VMOVDQU 864(CX), Y2
	VMOVDQU 896(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 128(AX)
	VMOVDQU Y12, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y14, 224(AX)

	// ---- Second half (bytes 256..511), levels 2-7 ----
	// Same instruction sequence as the first half with the next zeta
	// entries of every level.

	// Level 2.
	VPBROADCASTW 8(CX), Y0
	VPBROADCASTW 10(CX), Y1
	VMOVDQU 256(AX), Y7
	VMOVDQU 288(AX), Y8
	VMOVDQU 320(AX), Y9
	VMOVDQU 352(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10

	// Level 3.
	VPBROADCASTW 20(CX), Y0
	VPBROADCASTW 22(CX), Y1
	VPBROADCASTW 24(CX), Y2
	VPBROADCASTW 26(CX), Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Level 4: interleave 128-bit lanes.
	VMOVDQU 160(CX), Y0
	VMOVDQU 192(CX), Y1
	VMOVDQU 224(CX), Y2
	VMOVDQU 256(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Level 5: interleave 64-bit quadwords.
	VMOVDQU 416(CX), Y0
	VMOVDQU 448(CX), Y1
	VMOVDQU 480(CX), Y2
	VMOVDQU 512(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Level 6: interleave 32-bit doublewords.
	VMOVDQU 672(CX), Y0
	VMOVDQU 704(CX), Y1
	VMOVDQU 736(CX), Y2
	VMOVDQU 768(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Level 7: interleave 16-bit words.
	VMOVDQU 928(CX), Y0
	VMOVDQU 960(CX), Y1
	VMOVDQU 992(CX), Y2
	VMOVDQU 1024(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12
	VMOVDQU Y7, 256(AX)
	VMOVDQU Y8, 288(AX)
	VMOVDQU Y9, 320(AX)
	VMOVDQU Y10, 352(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// Clear the upper YMM halves so that legacy-SSE code executed after the
	// return does not pay the AVX/SSE transition penalty.
	// NOTE(review): this file is generated; the generator (src.go) should
	// emit VZEROUPPER as well, otherwise regeneration drops it.
	VZEROUPPER
	RET
// func invNttAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// invNttAVX2 applies seven levels of inverse butterflies to the 256 16-bit
// coefficients at p, in place, followed by a constant scaling of every
// coefficient. Twiddle factors are read from ·ZetasAVX2 starting at byte
// offset 1056.
//
// Every butterfly on a pair (x, y) has the same shape:
//
//	d  = y - x
//	x' = x + y
//	lo = VPMULLW(d, zl)           low 16 bits of d*zl
//	hi = VPMULHW(d, zh)           high 16 bits of d*zh
//	y' = hi - VPMULHW(lo, q)      q = 3329, broadcast in Y15
//
// zl and zh are adjacent table vectors/entries. NOTE(review): presumably
// zl = zeta * q^-1 mod 2^16 and zh = zeta, which makes y' a Montgomery
// product; confirm against the table generator.
//
// Levels 1-6 run in registers, once for bytes 0..255 and once for bytes
// 256..511. Between levels the vectors are re-interleaved from 16-bit
// granularity up to 128-bit lanes. After levels 3 and 5, Y7 and Y11 (which
// have only accumulated sums) are Barrett-reduced with the constant
// 0x4ebf = 20159 = round(2^26 / 3329). Level 7 pairs vectors 256 bytes
// apart and then multiplies every coefficient by the constant pair
// (0xd8a1, 0x05a1): 0x05a1 = 1441 and 0xd8a1 = 1441 * 3329^-1 mod 2^16.
//
// Registers: AX = p, CX = &ZetasAVX2 (reused as scratch at the very end),
// Y15 = q, Y7-Y14 data, Y0-Y6 zetas/temps.
TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	LEAQ ·ZetasAVX2+0(SB), CX

	// Y15 = 0x0d01 (3329) in every 16-bit lane.
	MOVL $0x00000d01, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y15

	// ---- First half (bytes 0..255), levels 1-6 ----
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 128(AX), Y11
	VMOVDQU 160(AX), Y12
	VMOVDQU 192(AX), Y13
	VMOVDQU 224(AX), Y14

	// Level 1: data is used as loaded, no shuffle. Y0 is reused as a
	// temporary once its zeta has been consumed.
	VMOVDQU 1056(CX), Y0
	VMOVDQU 1088(CX), Y1
	VMOVDQU 1120(CX), Y2
	VMOVDQU 1152(CX), Y3
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Level 2: interleave 16-bit words.
	VMOVDQU 1312(CX), Y0
	VMOVDQU 1344(CX), Y1
	VMOVDQU 1376(CX), Y2
	VMOVDQU 1408(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPSUBW Y7, Y8, Y4
	VPSUBW Y9, Y10, Y5
	VPSUBW Y11, Y12, Y6
	VPADDW Y7, Y8, Y7
	VPADDW Y9, Y10, Y9
	VPADDW Y11, Y12, Y11
	VPMULLW Y4, Y0, Y8
	VPMULLW Y5, Y0, Y10
	VPSUBW Y13, Y14, Y0
	VPMULLW Y6, Y2, Y12
	VPADDW Y13, Y14, Y13
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y8, Y15, Y8
	VPMULHW Y10, Y15, Y10
	VPMULHW Y12, Y15, Y12
	VPMULHW Y14, Y15, Y14
	VPSUBW Y8, Y4, Y8
	VPSUBW Y10, Y5, Y10
	VPSUBW Y12, Y6, Y12
	VPSUBW Y14, Y0, Y14

	// Level 3: interleave 32-bit doublewords.
	VMOVDQU 1568(CX), Y0
	VMOVDQU 1600(CX), Y1
	VMOVDQU 1632(CX), Y2
	VMOVDQU 1664(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Barrett-reduce Y7 and Y11: x -= ((x * 20159) >> 26) * q.
	MOVL $0x00004ebf, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW Y4, Y7, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y7, Y7
	VPMULHW Y4, Y11, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y11, Y11

	// Level 4: interleave 64-bit quadwords.
	VMOVDQU 1824(CX), Y0
	VMOVDQU 1856(CX), Y1
	VMOVDQU 1888(CX), Y2
	VMOVDQU 1920(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPSUBW Y7, Y8, Y4
	VPSUBW Y9, Y10, Y5
	VPSUBW Y11, Y12, Y6
	VPADDW Y7, Y8, Y7
	VPADDW Y9, Y10, Y9
	VPADDW Y11, Y12, Y11
	VPMULLW Y4, Y0, Y8
	VPMULLW Y5, Y0, Y10
	VPSUBW Y13, Y14, Y0
	VPMULLW Y6, Y2, Y12
	VPADDW Y13, Y14, Y13
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y8, Y15, Y8
	VPMULHW Y10, Y15, Y10
	VPMULHW Y12, Y15, Y12
	VPMULHW Y14, Y15, Y14
	VPSUBW Y8, Y4, Y8
	VPSUBW Y10, Y5, Y10
	VPSUBW Y12, Y6, Y12
	VPSUBW Y14, Y0, Y14

	// Level 5: broadcast zetas; interleave 128-bit lanes.
	VPBROADCASTW 2080(CX), Y0
	VPBROADCASTW 2082(CX), Y1
	VPBROADCASTW 2084(CX), Y2
	VPBROADCASTW 2086(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Barrett-reduce Y7 and Y11 again.
	MOVL $0x00004ebf, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW Y4, Y7, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y7, Y7
	VPMULHW Y4, Y11, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y11, Y11

	// Level 6: Y7-Y10 against Y11-Y14, one broadcast zeta pair.
	VPBROADCASTW 2096(CX), Y0
	VPBROADCASTW 2098(CX), Y1
	VPSUBW Y7, Y11, Y4
	VPSUBW Y8, Y12, Y5
	VPSUBW Y9, Y13, Y6
	VPADDW Y7, Y11, Y7
	VPADDW Y8, Y12, Y8
	VPADDW Y9, Y13, Y9
	VPMULLW Y4, Y0, Y11
	VPMULLW Y5, Y0, Y12
	VPSUBW Y10, Y14, Y2
	VPMULLW Y6, Y0, Y13
	VPADDW Y10, Y14, Y10
	VPMULLW Y2, Y0, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y1, Y6
	VPMULHW Y2, Y1, Y2
	VPMULHW Y11, Y15, Y11
	VPMULHW Y12, Y15, Y12
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y11, Y4, Y11
	VPSUBW Y12, Y5, Y12
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y2, Y14
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 128(AX)
	VMOVDQU Y12, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y14, 224(AX)

	// ---- Second half (bytes 256..511), levels 1-6 ----
	// Same instruction sequence as the first half with the next zeta
	// entries of every level.
	VMOVDQU 256(AX), Y7
	VMOVDQU 288(AX), Y8
	VMOVDQU 320(AX), Y9
	VMOVDQU 352(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14

	// Level 1.
	VMOVDQU 1184(CX), Y0
	VMOVDQU 1216(CX), Y1
	VMOVDQU 1248(CX), Y2
	VMOVDQU 1280(CX), Y3
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Level 2: interleave 16-bit words.
	VMOVDQU 1440(CX), Y0
	VMOVDQU 1472(CX), Y1
	VMOVDQU 1504(CX), Y2
	VMOVDQU 1536(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPSUBW Y7, Y8, Y4
	VPSUBW Y9, Y10, Y5
	VPSUBW Y11, Y12, Y6
	VPADDW Y7, Y8, Y7
	VPADDW Y9, Y10, Y9
	VPADDW Y11, Y12, Y11
	VPMULLW Y4, Y0, Y8
	VPMULLW Y5, Y0, Y10
	VPSUBW Y13, Y14, Y0
	VPMULLW Y6, Y2, Y12
	VPADDW Y13, Y14, Y13
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y8, Y15, Y8
	VPMULHW Y10, Y15, Y10
	VPMULHW Y12, Y15, Y12
	VPMULHW Y14, Y15, Y14
	VPSUBW Y8, Y4, Y8
	VPSUBW Y10, Y5, Y10
	VPSUBW Y12, Y6, Y12
	VPSUBW Y14, Y0, Y14

	// Level 3: interleave 32-bit doublewords.
	VMOVDQU 1696(CX), Y0
	VMOVDQU 1728(CX), Y1
	VMOVDQU 1760(CX), Y2
	VMOVDQU 1792(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Barrett-reduce Y7 and Y11.
	MOVL $0x00004ebf, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW Y4, Y7, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y7, Y7
	VPMULHW Y4, Y11, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y11, Y11

	// Level 4: interleave 64-bit quadwords.
	VMOVDQU 1952(CX), Y0
	VMOVDQU 1984(CX), Y1
	VMOVDQU 2016(CX), Y2
	VMOVDQU 2048(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPSUBW Y7, Y8, Y4
	VPSUBW Y9, Y10, Y5
	VPSUBW Y11, Y12, Y6
	VPADDW Y7, Y8, Y7
	VPADDW Y9, Y10, Y9
	VPADDW Y11, Y12, Y11
	VPMULLW Y4, Y0, Y8
	VPMULLW Y5, Y0, Y10
	VPSUBW Y13, Y14, Y0
	VPMULLW Y6, Y2, Y12
	VPADDW Y13, Y14, Y13
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y8, Y15, Y8
	VPMULHW Y10, Y15, Y10
	VPMULHW Y12, Y15, Y12
	VPMULHW Y14, Y15, Y14
	VPSUBW Y8, Y4, Y8
	VPSUBW Y10, Y5, Y10
	VPSUBW Y12, Y6, Y12
	VPSUBW Y14, Y0, Y14

	// Level 5: broadcast zetas; interleave 128-bit lanes.
	VPBROADCASTW 2088(CX), Y0
	VPBROADCASTW 2090(CX), Y1
	VPBROADCASTW 2092(CX), Y2
	VPBROADCASTW 2094(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPSUBW Y7, Y9, Y4
	VPSUBW Y8, Y10, Y5
	VPSUBW Y11, Y13, Y6
	VPADDW Y7, Y9, Y7
	VPADDW Y8, Y10, Y8
	VPADDW Y11, Y13, Y11
	VPMULLW Y4, Y0, Y9
	VPMULLW Y5, Y0, Y10
	VPSUBW Y12, Y14, Y0
	VPMULLW Y6, Y2, Y13
	VPADDW Y12, Y14, Y12
	VPMULLW Y0, Y2, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y3, Y6
	VPMULHW Y0, Y3, Y0
	VPMULHW Y9, Y15, Y9
	VPMULHW Y10, Y15, Y10
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y9, Y4, Y9
	VPSUBW Y10, Y5, Y10
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y0, Y14

	// Barrett-reduce Y7 and Y11 again.
	MOVL $0x00004ebf, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW Y4, Y7, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y7, Y7
	VPMULHW Y4, Y11, Y5
	VPSRAW $0x0a, Y5, Y5
	VPMULLW Y15, Y5, Y5
	VPSUBW Y5, Y11, Y11

	// Level 6: Y7-Y10 against Y11-Y14, one broadcast zeta pair.
	VPBROADCASTW 2100(CX), Y0
	VPBROADCASTW 2102(CX), Y1
	VPSUBW Y7, Y11, Y4
	VPSUBW Y8, Y12, Y5
	VPSUBW Y9, Y13, Y6
	VPADDW Y7, Y11, Y7
	VPADDW Y8, Y12, Y8
	VPADDW Y9, Y13, Y9
	VPMULLW Y4, Y0, Y11
	VPMULLW Y5, Y0, Y12
	VPSUBW Y10, Y14, Y2
	VPMULLW Y6, Y0, Y13
	VPADDW Y10, Y14, Y10
	VPMULLW Y2, Y0, Y14
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y6, Y1, Y6
	VPMULHW Y2, Y1, Y2
	VPMULHW Y11, Y15, Y11
	VPMULHW Y12, Y15, Y12
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y11, Y4, Y11
	VPSUBW Y12, Y5, Y12
	VPSUBW Y13, Y6, Y13
	VPSUBW Y14, Y2, Y14
	VMOVDQU Y7, 256(AX)
	VMOVDQU Y8, 288(AX)
	VMOVDQU Y9, 320(AX)
	VMOVDQU Y10, 352(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// ---- Level 7 and final scaling, bytes 0..127 against 256..383 ----
	VPBROADCASTW 2104(CX), Y0
	VPBROADCASTW 2106(CX), Y1
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 256(AX), Y11
	VMOVDQU 288(AX), Y12
	VMOVDQU 320(AX), Y13
	VMOVDQU 352(AX), Y14
	VPSUBW Y7, Y11, Y2
	VPSUBW Y8, Y12, Y3
	VPSUBW Y9, Y13, Y4
	VPADDW Y7, Y11, Y7
	VPADDW Y8, Y12, Y8
	VPADDW Y9, Y13, Y9
	VPMULLW Y2, Y0, Y11
	VPMULLW Y3, Y0, Y12
	VPSUBW Y10, Y14, Y5
	VPMULLW Y4, Y0, Y13
	VPADDW Y10, Y14, Y10
	VPMULLW Y5, Y0, Y14
	VPMULHW Y2, Y1, Y2
	VPMULHW Y3, Y1, Y3
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y11, Y15, Y11
	VPMULHW Y12, Y15, Y12
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y11, Y2, Y11
	VPSUBW Y12, Y3, Y12
	VPSUBW Y13, Y4, Y13
	VPSUBW Y14, Y5, Y14

	// Scale all eight vectors: Y0 = 0xd8a1 (low multiplier),
	// Y1 = 0x05a1 = 1441 (high multiplier).
	MOVL $0xffffd8a1, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y0
	MOVL $0x000005a1, DX
	VMOVD DX, X1
	VPBROADCASTW X1, Y1
	VPMULLW Y7, Y0, Y2
	VPMULLW Y8, Y0, Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULHW Y7, Y1, Y7
	VPMULHW Y8, Y1, Y8
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y7, Y7
	VPSUBW Y3, Y8, Y8
	VPSUBW Y4, Y9, Y9
	VPSUBW Y5, Y10, Y10
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y11
	VPSUBW Y3, Y12, Y12
	VPSUBW Y4, Y13, Y13
	VPSUBW Y5, Y14, Y14
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 256(AX)
	VMOVDQU Y12, 288(AX)
	VMOVDQU Y13, 320(AX)
	VMOVDQU Y14, 352(AX)

	// ---- Level 7 and final scaling, bytes 128..255 against 384..511 ----
	VPBROADCASTW 2104(CX), Y0
	VPBROADCASTW 2106(CX), Y1
	VMOVDQU 128(AX), Y7
	VMOVDQU 160(AX), Y8
	VMOVDQU 192(AX), Y9
	VMOVDQU 224(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14
	VPSUBW Y7, Y11, Y2
	VPSUBW Y8, Y12, Y3
	VPSUBW Y9, Y13, Y4
	VPADDW Y7, Y11, Y7
	VPADDW Y8, Y12, Y8
	VPADDW Y9, Y13, Y9
	VPMULLW Y2, Y0, Y11
	VPMULLW Y3, Y0, Y12
	VPSUBW Y10, Y14, Y5
	VPMULLW Y4, Y0, Y13
	VPADDW Y10, Y14, Y10
	VPMULLW Y5, Y0, Y14
	VPMULHW Y2, Y1, Y2
	VPMULHW Y3, Y1, Y3
	VPMULHW Y4, Y1, Y4
	VPMULHW Y5, Y1, Y5
	VPMULHW Y11, Y15, Y11
	VPMULHW Y12, Y15, Y12
	VPMULHW Y13, Y15, Y13
	VPMULHW Y14, Y15, Y14
	VPSUBW Y11, Y2, Y11
	VPSUBW Y12, Y3, Y12
	VPSUBW Y13, Y4, Y13
	VPSUBW Y14, Y5, Y14

	// The zeta table is no longer read past this point, so CX is reused
	// as the scratch register for materialising the scaling constants.
	MOVL $0xffffd8a1, CX
	VMOVD CX, X0
	VPBROADCASTW X0, Y0
	MOVL $0x000005a1, CX
	VMOVD CX, X1
	VPBROADCASTW X1, Y1
	VPMULLW Y7, Y0, Y2
	VPMULLW Y8, Y0, Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULHW Y7, Y1, Y7
	VPMULHW Y8, Y1, Y8
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y7, Y7
	VPSUBW Y3, Y8, Y8
	VPSUBW Y4, Y9, Y9
	VPSUBW Y5, Y10, Y10
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y11
	VPSUBW Y3, Y12, Y12
	VPSUBW Y4, Y13, Y13
	VPSUBW Y5, Y14, Y14
	VMOVDQU Y7, 128(AX)
	VMOVDQU Y8, 160(AX)
	VMOVDQU Y9, 192(AX)
	VMOVDQU Y10, 224(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// Clear the upper YMM halves so that legacy-SSE code executed after the
	// return does not pay the AVX/SSE transition penalty.
	// NOTE(review): this file is generated; the generator (src.go) should
	// emit VZEROUPPER as well, otherwise regeneration drops it.
	VZEROUPPER
	RET
// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// Writes to p a combination of lane-wise 16-bit products of a and b. The
// 512 bytes of each array are processed in four 128-byte chunks; per chunk,
// four YMM registers are loaded from a (Y0..Y3), four from b (Y4..Y7), and
// four results (Y4..Y7) are stored to p at the same offsets.
//
// Within a chunk the registers form two pairs, (x0,x1) from a and (y0,y1)
// from b. Writing R(u*v) for the reduced product described below and z for
// the value read from ·ZetasAVX2, the stored results are:
//   first pair  (Y0,Y1 / Y4,Y5): r0 = R(x0*y0) + z*R(x1*y1), r1 = R(x0*y1) + R(x1*y0)
//   second pair (Y2,Y3 / Y6,Y7): r0 = R(x0*y0) - z*R(x1*y1), r1 = R(x0*y1) + R(x1*y0)
// (the multiplication by z is itself a reduced multiply).
//
// Reduced product R(u*v), per 16-bit lane:
//   lo = low16(u*v); t = low16(lo*Y14); hi = high16(u*v) (signed)
//   R  = hi - high16(t*Y15) (signed)
// Y14 holds 0xf301 and Y15 holds 0x0d01 (3329) in every lane; since
// 0xf301 * 0x0d01 = 1 (mod 2^16), this has the form of a signed Montgomery
// reduction with radix 2^16 and modulus 3329.
//
// Register roles: AX = p, CX = a, DX = b, BX = address of ·ZetasAVX2,
// BP = scratch for materialising constants, Y8..Y13 = temporaries.
// NOTE(review): the 8-byte local frame is presumably declared because BP is
// used as a scratch register - confirm against the generator.
TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24
MOVQ p+0(FP), AX
MOVQ a+8(FP), CX
MOVQ b+16(FP), DX
LEAQ ·ZetasAVX2+0(SB), BX
// Y14 = low 16 bits of the constant (0xf301) broadcast to all lanes.
MOVL $0xfffff301, BP
VMOVD BP, X0
VPBROADCASTW X0, Y14
// Y15 = 0x0d01 (3329) broadcast to all lanes.
MOVL $0x00000d01, BP
VMOVD BP, X0
VPBROADCASTW X0, Y15
// ---- Chunk 0: bytes 0..127 of a, b and p; table offsets 800 and 832 ----
VMOVDQU (CX), Y0
VMOVDQU 32(CX), Y1
VMOVDQU 64(CX), Y2
VMOVDQU 96(CX), Y3
VMOVDQU (DX), Y4
VMOVDQU 32(DX), Y5
VMOVDQU 64(DX), Y6
VMOVDQU 96(DX), Y7
// First pair: low halves of the four cross products x1*y1, x0*y0, x0*y1, x1*y0.
VPMULLW Y1, Y5, Y8
VPMULLW Y0, Y4, Y9
VPMULLW Y0, Y5, Y10
VPMULLW Y1, Y4, Y11
// t = low16(lo * Y14) for each product.
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
// Signed high halves of the same four products. Y12/Y13 are used as
// temporaries because Y4/Y5 are still needed as inputs until the last line.
VPMULHW Y1, Y5, Y12
VPMULHW Y0, Y4, Y13
VPMULHW Y0, Y5, Y0
VPMULHW Y1, Y4, Y1
VMOVDQA Y12, Y4
VMOVDQA Y13, Y5
// high16(t * Y15), then subtract from the high halves to finish the reduction:
// Y4 = R(x1*y1), Y5 = R(x0*y0), Y0 = R(x0*y1), Y1 = R(x1*y0).
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y4, Y4
VPSUBW Y9, Y5, Y5
VPSUBW Y10, Y0, Y0
VPSUBW Y11, Y1, Y1
// Reduced multiply of Y4 by the table entry: Y12 feeds the low multiply and
// Y13 the high multiply.
// NOTE(review): presumably Y12 holds the twiddle factor pre-multiplied by the
// 0xf301 inverse and Y13 the twiddle factor itself - confirm against the
// ·ZetasAVX2 table generator.
VMOVDQU 800(BX), Y12
VMOVDQU 832(BX), Y13
VPMULLW Y4, Y12, Y8
VPMULHW Y4, Y13, Y4
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y4, Y4
// r0 = z*R(x1*y1) + R(x0*y0); r1 = R(x0*y1) + R(x1*y0).
VPADDW Y4, Y5, Y4
VPADDW Y0, Y1, Y5
// Second pair (Y2,Y3 with Y6,Y7): same sequence as above.
VPMULLW Y3, Y7, Y8
VPMULLW Y2, Y6, Y9
VPMULLW Y2, Y7, Y10
VPMULLW Y3, Y6, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y3, Y7, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y7, Y2
VPMULHW Y3, Y6, Y3
VMOVDQA Y12, Y6
VMOVDQA Y13, Y7
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y6, Y6
VPSUBW Y9, Y7, Y7
VPSUBW Y10, Y2, Y2
VPSUBW Y11, Y3, Y3
// Reload the table entries: Y12/Y13 were overwritten as temporaries above.
VMOVDQU 800(BX), Y12
VMOVDQU 832(BX), Y13
VPMULLW Y6, Y12, Y8
VPMULHW Y6, Y13, Y6
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y6, Y6
// Note the subtraction here (the first pair adds): r0 = R(x0*y0) - z*R(x1*y1).
VPSUBW Y6, Y7, Y6
VPADDW Y2, Y3, Y7
VMOVDQU Y4, (AX)
VMOVDQU Y5, 32(AX)
VMOVDQU Y6, 64(AX)
VMOVDQU Y7, 96(AX)
// ---- Chunk 1: bytes 128..255; table offsets 864 and 896 ----
VMOVDQU 128(CX), Y0
VMOVDQU 160(CX), Y1
VMOVDQU 192(CX), Y2
VMOVDQU 224(CX), Y3
VMOVDQU 128(DX), Y4
VMOVDQU 160(DX), Y5
VMOVDQU 192(DX), Y6
VMOVDQU 224(DX), Y7
// First pair.
VPMULLW Y1, Y5, Y8
VPMULLW Y0, Y4, Y9
VPMULLW Y0, Y5, Y10
VPMULLW Y1, Y4, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y1, Y5, Y12
VPMULHW Y0, Y4, Y13
VPMULHW Y0, Y5, Y0
VPMULHW Y1, Y4, Y1
VMOVDQA Y12, Y4
VMOVDQA Y13, Y5
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y4, Y4
VPSUBW Y9, Y5, Y5
VPSUBW Y10, Y0, Y0
VPSUBW Y11, Y1, Y1
VMOVDQU 864(BX), Y12
VMOVDQU 896(BX), Y13
VPMULLW Y4, Y12, Y8
VPMULHW Y4, Y13, Y4
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y4, Y4
VPADDW Y4, Y5, Y4
VPADDW Y0, Y1, Y5
// Second pair.
VPMULLW Y3, Y7, Y8
VPMULLW Y2, Y6, Y9
VPMULLW Y2, Y7, Y10
VPMULLW Y3, Y6, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y3, Y7, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y7, Y2
VPMULHW Y3, Y6, Y3
VMOVDQA Y12, Y6
VMOVDQA Y13, Y7
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y6, Y6
VPSUBW Y9, Y7, Y7
VPSUBW Y10, Y2, Y2
VPSUBW Y11, Y3, Y3
VMOVDQU 864(BX), Y12
VMOVDQU 896(BX), Y13
VPMULLW Y6, Y12, Y8
VPMULHW Y6, Y13, Y6
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y6, Y6
VPSUBW Y6, Y7, Y6
VPADDW Y2, Y3, Y7
VMOVDQU Y4, 128(AX)
VMOVDQU Y5, 160(AX)
VMOVDQU Y6, 192(AX)
VMOVDQU Y7, 224(AX)
// ---- Chunk 2: bytes 256..383; table offsets 928 and 960 ----
VMOVDQU 256(CX), Y0
VMOVDQU 288(CX), Y1
VMOVDQU 320(CX), Y2
VMOVDQU 352(CX), Y3
VMOVDQU 256(DX), Y4
VMOVDQU 288(DX), Y5
VMOVDQU 320(DX), Y6
VMOVDQU 352(DX), Y7
// First pair.
VPMULLW Y1, Y5, Y8
VPMULLW Y0, Y4, Y9
VPMULLW Y0, Y5, Y10
VPMULLW Y1, Y4, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y1, Y5, Y12
VPMULHW Y0, Y4, Y13
VPMULHW Y0, Y5, Y0
VPMULHW Y1, Y4, Y1
VMOVDQA Y12, Y4
VMOVDQA Y13, Y5
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y4, Y4
VPSUBW Y9, Y5, Y5
VPSUBW Y10, Y0, Y0
VPSUBW Y11, Y1, Y1
VMOVDQU 928(BX), Y12
VMOVDQU 960(BX), Y13
VPMULLW Y4, Y12, Y8
VPMULHW Y4, Y13, Y4
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y4, Y4
VPADDW Y4, Y5, Y4
VPADDW Y0, Y1, Y5
// Second pair.
VPMULLW Y3, Y7, Y8
VPMULLW Y2, Y6, Y9
VPMULLW Y2, Y7, Y10
VPMULLW Y3, Y6, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y3, Y7, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y7, Y2
VPMULHW Y3, Y6, Y3
VMOVDQA Y12, Y6
VMOVDQA Y13, Y7
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y6, Y6
VPSUBW Y9, Y7, Y7
VPSUBW Y10, Y2, Y2
VPSUBW Y11, Y3, Y3
VMOVDQU 928(BX), Y12
VMOVDQU 960(BX), Y13
VPMULLW Y6, Y12, Y8
VPMULHW Y6, Y13, Y6
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y6, Y6
VPSUBW Y6, Y7, Y6
VPADDW Y2, Y3, Y7
VMOVDQU Y4, 256(AX)
VMOVDQU Y5, 288(AX)
VMOVDQU Y6, 320(AX)
VMOVDQU Y7, 352(AX)
// ---- Chunk 3: bytes 384..511; table offsets 992 and 1024 ----
VMOVDQU 384(CX), Y0
VMOVDQU 416(CX), Y1
VMOVDQU 448(CX), Y2
VMOVDQU 480(CX), Y3
VMOVDQU 384(DX), Y4
VMOVDQU 416(DX), Y5
VMOVDQU 448(DX), Y6
VMOVDQU 480(DX), Y7
// First pair.
VPMULLW Y1, Y5, Y8
VPMULLW Y0, Y4, Y9
VPMULLW Y0, Y5, Y10
VPMULLW Y1, Y4, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y1, Y5, Y12
VPMULHW Y0, Y4, Y13
VPMULHW Y0, Y5, Y0
VPMULHW Y1, Y4, Y1
VMOVDQA Y12, Y4
VMOVDQA Y13, Y5
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y4, Y4
VPSUBW Y9, Y5, Y5
VPSUBW Y10, Y0, Y0
VPSUBW Y11, Y1, Y1
VMOVDQU 992(BX), Y12
VMOVDQU 1024(BX), Y13
VPMULLW Y4, Y12, Y8
VPMULHW Y4, Y13, Y4
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y4, Y4
VPADDW Y4, Y5, Y4
VPADDW Y0, Y1, Y5
// Second pair.
VPMULLW Y3, Y7, Y8
VPMULLW Y2, Y6, Y9
VPMULLW Y2, Y7, Y10
VPMULLW Y3, Y6, Y11
VPMULLW Y8, Y14, Y8
VPMULLW Y9, Y14, Y9
VPMULLW Y10, Y14, Y10
VPMULLW Y11, Y14, Y11
VPMULHW Y3, Y7, Y12
VPMULHW Y2, Y6, Y13
VPMULHW Y2, Y7, Y2
VPMULHW Y3, Y6, Y3
VMOVDQA Y12, Y6
VMOVDQA Y13, Y7
VPMULHW Y8, Y15, Y8
VPMULHW Y9, Y15, Y9
VPMULHW Y10, Y15, Y10
VPMULHW Y11, Y15, Y11
VPSUBW Y8, Y6, Y6
VPSUBW Y9, Y7, Y7
VPSUBW Y10, Y2, Y2
VPSUBW Y11, Y3, Y3
VMOVDQU 992(BX), Y12
VMOVDQU 1024(BX), Y13
VPMULLW Y6, Y12, Y8
VPMULHW Y6, Y13, Y6
VPMULHW Y8, Y15, Y8
VPSUBW Y8, Y6, Y6
VPSUBW Y6, Y7, Y6
VPADDW Y2, Y3, Y7
VMOVDQU Y4, 384(AX)
VMOVDQU Y5, 416(AX)
VMOVDQU Y6, 448(AX)
VMOVDQU Y7, 480(AX)
RET
// func detangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Permutes the 256 int16 values of p in place. Each 256-byte half of p is
// loaded into Y0..Y7, passed through four interleaving stages that exchange
// data between register pairs at 16-, 32-, 64- and 128-bit granularity (in
// that order), and stored back to the same offsets. The two halves are
// processed independently. AX = p; Y8 is a temporary.
TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
MOVQ p+0(FP), AX
// ---- First half: bytes 0..255 ----
VMOVDQU (AX), Y0
VMOVDQU 32(AX), Y1
VMOVDQU 64(AX), Y2
VMOVDQU 96(AX), Y3
VMOVDQU 128(AX), Y4
VMOVDQU 160(AX), Y5
VMOVDQU 192(AX), Y6
VMOVDQU 224(AX), Y7
// Stage 1 (16-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): the first register
// of each pair receives the even-indexed words of both registers (its own in
// even positions, its partner's in odd positions); the second register
// receives the odd-indexed words in the same arrangement.
VPSLLD $0x10, Y1, Y8
VPBLENDW $0xaa, Y8, Y0, Y8
VPSRLD $0x10, Y0, Y0
VPBLENDW $0xaa, Y1, Y0, Y1
VMOVDQA Y8, Y0
VPSLLD $0x10, Y3, Y8
VPBLENDW $0xaa, Y8, Y2, Y8
VPSRLD $0x10, Y2, Y2
VPBLENDW $0xaa, Y3, Y2, Y3
VMOVDQA Y8, Y2
VPSLLD $0x10, Y5, Y8
VPBLENDW $0xaa, Y8, Y4, Y8
VPSRLD $0x10, Y4, Y4
VPBLENDW $0xaa, Y5, Y4, Y5
VMOVDQA Y8, Y4
VPSLLD $0x10, Y7, Y8
VPBLENDW $0xaa, Y8, Y6, Y8
VPSRLD $0x10, Y6, Y6
VPBLENDW $0xaa, Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 2 (32-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): same exchange as
// stage 1 but on doublewords (even doublewords to the first register, odd
// doublewords to the second).
VMOVSLDUP Y2, Y8
VPBLENDD $0xaa, Y8, Y0, Y8
VPSRLQ $0x20, Y0, Y0
VPBLENDD $0xaa, Y2, Y0, Y2
VMOVDQA Y8, Y0
VMOVSLDUP Y3, Y8
VPBLENDD $0xaa, Y8, Y1, Y8
VPSRLQ $0x20, Y1, Y1
VPBLENDD $0xaa, Y3, Y1, Y3
VMOVDQA Y8, Y1
VMOVSLDUP Y6, Y8
VPBLENDD $0xaa, Y8, Y4, Y8
VPSRLQ $0x20, Y4, Y4
VPBLENDD $0xaa, Y6, Y4, Y6
VMOVDQA Y8, Y4
VMOVSLDUP Y7, Y8
VPBLENDD $0xaa, Y8, Y5, Y8
VPSRLQ $0x20, Y5, Y5
VPBLENDD $0xaa, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 3 (64-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): within each
// 128-bit lane, the first register receives the low quadwords of both
// registers and the second receives the high quadwords.
VPUNPCKLQDQ Y1, Y0, Y8
VPUNPCKHQDQ Y1, Y0, Y1
VMOVDQA Y8, Y0
VPUNPCKLQDQ Y3, Y2, Y8
VPUNPCKHQDQ Y3, Y2, Y3
VMOVDQA Y8, Y2
VPUNPCKLQDQ Y5, Y4, Y8
VPUNPCKHQDQ Y5, Y4, Y5
VMOVDQA Y8, Y4
VPUNPCKLQDQ Y7, Y6, Y8
VPUNPCKHQDQ Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 4 (128-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): the first
// register receives the low 128-bit lanes of both registers (imm 0x20), the
// second receives the high lanes (imm 0x31).
VPERM2I128 $0x20, Y2, Y0, Y8
VPERM2I128 $0x31, Y2, Y0, Y2
VMOVDQA Y8, Y0
VPERM2I128 $0x20, Y3, Y1, Y8
VPERM2I128 $0x31, Y3, Y1, Y3
VMOVDQA Y8, Y1
VPERM2I128 $0x20, Y6, Y4, Y8
VPERM2I128 $0x31, Y6, Y4, Y6
VMOVDQA Y8, Y4
VPERM2I128 $0x20, Y7, Y5, Y8
VPERM2I128 $0x31, Y7, Y5, Y7
VMOVDQA Y8, Y5
VMOVDQU Y0, (AX)
VMOVDQU Y1, 32(AX)
VMOVDQU Y2, 64(AX)
VMOVDQU Y3, 96(AX)
VMOVDQU Y4, 128(AX)
VMOVDQU Y5, 160(AX)
VMOVDQU Y6, 192(AX)
VMOVDQU Y7, 224(AX)
// ---- Second half: bytes 256..511, same four stages ----
VMOVDQU 256(AX), Y0
VMOVDQU 288(AX), Y1
VMOVDQU 320(AX), Y2
VMOVDQU 352(AX), Y3
VMOVDQU 384(AX), Y4
VMOVDQU 416(AX), Y5
VMOVDQU 448(AX), Y6
VMOVDQU 480(AX), Y7
// Stage 1 (16-bit).
VPSLLD $0x10, Y1, Y8
VPBLENDW $0xaa, Y8, Y0, Y8
VPSRLD $0x10, Y0, Y0
VPBLENDW $0xaa, Y1, Y0, Y1
VMOVDQA Y8, Y0
VPSLLD $0x10, Y3, Y8
VPBLENDW $0xaa, Y8, Y2, Y8
VPSRLD $0x10, Y2, Y2
VPBLENDW $0xaa, Y3, Y2, Y3
VMOVDQA Y8, Y2
VPSLLD $0x10, Y5, Y8
VPBLENDW $0xaa, Y8, Y4, Y8
VPSRLD $0x10, Y4, Y4
VPBLENDW $0xaa, Y5, Y4, Y5
VMOVDQA Y8, Y4
VPSLLD $0x10, Y7, Y8
VPBLENDW $0xaa, Y8, Y6, Y8
VPSRLD $0x10, Y6, Y6
VPBLENDW $0xaa, Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 2 (32-bit).
VMOVSLDUP Y2, Y8
VPBLENDD $0xaa, Y8, Y0, Y8
VPSRLQ $0x20, Y0, Y0
VPBLENDD $0xaa, Y2, Y0, Y2
VMOVDQA Y8, Y0
VMOVSLDUP Y3, Y8
VPBLENDD $0xaa, Y8, Y1, Y8
VPSRLQ $0x20, Y1, Y1
VPBLENDD $0xaa, Y3, Y1, Y3
VMOVDQA Y8, Y1
VMOVSLDUP Y6, Y8
VPBLENDD $0xaa, Y8, Y4, Y8
VPSRLQ $0x20, Y4, Y4
VPBLENDD $0xaa, Y6, Y4, Y6
VMOVDQA Y8, Y4
VMOVSLDUP Y7, Y8
VPBLENDD $0xaa, Y8, Y5, Y8
VPSRLQ $0x20, Y5, Y5
VPBLENDD $0xaa, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 3 (64-bit).
VPUNPCKLQDQ Y1, Y0, Y8
VPUNPCKHQDQ Y1, Y0, Y1
VMOVDQA Y8, Y0
VPUNPCKLQDQ Y3, Y2, Y8
VPUNPCKHQDQ Y3, Y2, Y3
VMOVDQA Y8, Y2
VPUNPCKLQDQ Y5, Y4, Y8
VPUNPCKHQDQ Y5, Y4, Y5
VMOVDQA Y8, Y4
VPUNPCKLQDQ Y7, Y6, Y8
VPUNPCKHQDQ Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 4 (128-bit).
VPERM2I128 $0x20, Y2, Y0, Y8
VPERM2I128 $0x31, Y2, Y0, Y2
VMOVDQA Y8, Y0
VPERM2I128 $0x20, Y3, Y1, Y8
VPERM2I128 $0x31, Y3, Y1, Y3
VMOVDQA Y8, Y1
VPERM2I128 $0x20, Y6, Y4, Y8
VPERM2I128 $0x31, Y6, Y4, Y6
VMOVDQA Y8, Y4
VPERM2I128 $0x20, Y7, Y5, Y8
VPERM2I128 $0x31, Y7, Y5, Y7
VMOVDQA Y8, Y5
VMOVDQU Y0, 256(AX)
VMOVDQU Y1, 288(AX)
VMOVDQU Y2, 320(AX)
VMOVDQU Y3, 352(AX)
VMOVDQU Y4, 384(AX)
VMOVDQU Y5, 416(AX)
VMOVDQU Y6, 448(AX)
VMOVDQU Y7, 480(AX)
RET
// func tangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Permutes the 256 int16 values of p in place. Each 256-byte half of p is
// loaded into Y0..Y7, passed through four interleaving stages that exchange
// data between register pairs at 128-, 64-, 32- and 16-bit granularity (in
// that order), and stored back to the same offsets. The two halves are
// processed independently. AX = p; Y8 is a temporary.
//
// These are the same four stages, on the same register pairs, that
// detangleAVX2 applies, run in the opposite order. Each stage restores its
// register pair when applied twice, so this routine reverses the permutation
// performed by detangleAVX2.
TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
MOVQ p+0(FP), AX
// ---- First half: bytes 0..255 ----
VMOVDQU (AX), Y0
VMOVDQU 32(AX), Y1
VMOVDQU 64(AX), Y2
VMOVDQU 96(AX), Y3
VMOVDQU 128(AX), Y4
VMOVDQU 160(AX), Y5
VMOVDQU 192(AX), Y6
VMOVDQU 224(AX), Y7
// Stage 1 (128-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): the first
// register receives the low 128-bit lanes of both registers (imm 0x20), the
// second receives the high lanes (imm 0x31).
VPERM2I128 $0x20, Y2, Y0, Y8
VPERM2I128 $0x31, Y2, Y0, Y2
VMOVDQA Y8, Y0
VPERM2I128 $0x20, Y3, Y1, Y8
VPERM2I128 $0x31, Y3, Y1, Y3
VMOVDQA Y8, Y1
VPERM2I128 $0x20, Y6, Y4, Y8
VPERM2I128 $0x31, Y6, Y4, Y6
VMOVDQA Y8, Y4
VPERM2I128 $0x20, Y7, Y5, Y8
VPERM2I128 $0x31, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 2 (64-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): within each
// 128-bit lane, the first register receives the low quadwords of both
// registers and the second receives the high quadwords.
VPUNPCKLQDQ Y1, Y0, Y8
VPUNPCKHQDQ Y1, Y0, Y1
VMOVDQA Y8, Y0
VPUNPCKLQDQ Y3, Y2, Y8
VPUNPCKHQDQ Y3, Y2, Y3
VMOVDQA Y8, Y2
VPUNPCKLQDQ Y5, Y4, Y8
VPUNPCKHQDQ Y5, Y4, Y5
VMOVDQA Y8, Y4
VPUNPCKLQDQ Y7, Y6, Y8
VPUNPCKHQDQ Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 3 (32-bit), pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7): the first register
// receives the even-indexed doublewords of both registers (its own in even
// positions, its partner's in odd positions); the second receives the
// odd-indexed doublewords in the same arrangement.
VMOVSLDUP Y2, Y8
VPBLENDD $0xaa, Y8, Y0, Y8
VPSRLQ $0x20, Y0, Y0
VPBLENDD $0xaa, Y2, Y0, Y2
VMOVDQA Y8, Y0
VMOVSLDUP Y3, Y8
VPBLENDD $0xaa, Y8, Y1, Y8
VPSRLQ $0x20, Y1, Y1
VPBLENDD $0xaa, Y3, Y1, Y3
VMOVDQA Y8, Y1
VMOVSLDUP Y6, Y8
VPBLENDD $0xaa, Y8, Y4, Y8
VPSRLQ $0x20, Y4, Y4
VPBLENDD $0xaa, Y6, Y4, Y6
VMOVDQA Y8, Y4
VMOVSLDUP Y7, Y8
VPBLENDD $0xaa, Y8, Y5, Y8
VPSRLQ $0x20, Y5, Y5
VPBLENDD $0xaa, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 4 (16-bit), pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7): same exchange as
// stage 3 but on 16-bit words.
VPSLLD $0x10, Y1, Y8
VPBLENDW $0xaa, Y8, Y0, Y8
VPSRLD $0x10, Y0, Y0
VPBLENDW $0xaa, Y1, Y0, Y1
VMOVDQA Y8, Y0
VPSLLD $0x10, Y3, Y8
VPBLENDW $0xaa, Y8, Y2, Y8
VPSRLD $0x10, Y2, Y2
VPBLENDW $0xaa, Y3, Y2, Y3
VMOVDQA Y8, Y2
VPSLLD $0x10, Y5, Y8
VPBLENDW $0xaa, Y8, Y4, Y8
VPSRLD $0x10, Y4, Y4
VPBLENDW $0xaa, Y5, Y4, Y5
VMOVDQA Y8, Y4
VPSLLD $0x10, Y7, Y8
VPBLENDW $0xaa, Y8, Y6, Y8
VPSRLD $0x10, Y6, Y6
VPBLENDW $0xaa, Y7, Y6, Y7
VMOVDQA Y8, Y6
VMOVDQU Y0, (AX)
VMOVDQU Y1, 32(AX)
VMOVDQU Y2, 64(AX)
VMOVDQU Y3, 96(AX)
VMOVDQU Y4, 128(AX)
VMOVDQU Y5, 160(AX)
VMOVDQU Y6, 192(AX)
VMOVDQU Y7, 224(AX)
// ---- Second half: bytes 256..511, same four stages ----
VMOVDQU 256(AX), Y0
VMOVDQU 288(AX), Y1
VMOVDQU 320(AX), Y2
VMOVDQU 352(AX), Y3
VMOVDQU 384(AX), Y4
VMOVDQU 416(AX), Y5
VMOVDQU 448(AX), Y6
VMOVDQU 480(AX), Y7
// Stage 1 (128-bit).
VPERM2I128 $0x20, Y2, Y0, Y8
VPERM2I128 $0x31, Y2, Y0, Y2
VMOVDQA Y8, Y0
VPERM2I128 $0x20, Y3, Y1, Y8
VPERM2I128 $0x31, Y3, Y1, Y3
VMOVDQA Y8, Y1
VPERM2I128 $0x20, Y6, Y4, Y8
VPERM2I128 $0x31, Y6, Y4, Y6
VMOVDQA Y8, Y4
VPERM2I128 $0x20, Y7, Y5, Y8
VPERM2I128 $0x31, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 2 (64-bit).
VPUNPCKLQDQ Y1, Y0, Y8
VPUNPCKHQDQ Y1, Y0, Y1
VMOVDQA Y8, Y0
VPUNPCKLQDQ Y3, Y2, Y8
VPUNPCKHQDQ Y3, Y2, Y3
VMOVDQA Y8, Y2
VPUNPCKLQDQ Y5, Y4, Y8
VPUNPCKHQDQ Y5, Y4, Y5
VMOVDQA Y8, Y4
VPUNPCKLQDQ Y7, Y6, Y8
VPUNPCKHQDQ Y7, Y6, Y7
VMOVDQA Y8, Y6
// Stage 3 (32-bit).
VMOVSLDUP Y2, Y8
VPBLENDD $0xaa, Y8, Y0, Y8
VPSRLQ $0x20, Y0, Y0
VPBLENDD $0xaa, Y2, Y0, Y2
VMOVDQA Y8, Y0
VMOVSLDUP Y3, Y8
VPBLENDD $0xaa, Y8, Y1, Y8
VPSRLQ $0x20, Y1, Y1
VPBLENDD $0xaa, Y3, Y1, Y3
VMOVDQA Y8, Y1
VMOVSLDUP Y6, Y8
VPBLENDD $0xaa, Y8, Y4, Y8
VPSRLQ $0x20, Y4, Y4
VPBLENDD $0xaa, Y6, Y4, Y6
VMOVDQA Y8, Y4
VMOVSLDUP Y7, Y8
VPBLENDD $0xaa, Y8, Y5, Y8
VPSRLQ $0x20, Y5, Y5
VPBLENDD $0xaa, Y7, Y5, Y7
VMOVDQA Y8, Y5
// Stage 4 (16-bit).
VPSLLD $0x10, Y1, Y8
VPBLENDW $0xaa, Y8, Y0, Y8
VPSRLD $0x10, Y0, Y0
VPBLENDW $0xaa, Y1, Y0, Y1
VMOVDQA Y8, Y0
VPSLLD $0x10, Y3, Y8
VPBLENDW $0xaa, Y8, Y2, Y8
VPSRLD $0x10, Y2, Y2
VPBLENDW $0xaa, Y3, Y2, Y3
VMOVDQA Y8, Y2
VPSLLD $0x10, Y5, Y8
VPBLENDW $0xaa, Y8, Y4, Y8
VPSRLD $0x10, Y4, Y4
VPBLENDW $0xaa, Y5, Y4, Y5
VMOVDQA Y8, Y4
VPSLLD $0x10, Y7, Y8
VPBLENDW $0xaa, Y8, Y6, Y8
VPSRLD $0x10, Y6, Y6
VPBLENDW $0xaa, Y7, Y6, Y7
VMOVDQA Y8, Y6
VMOVDQU Y0, 256(AX)
VMOVDQU Y1, 288(AX)
VMOVDQU Y2, 320(AX)
VMOVDQU Y3, 352(AX)
VMOVDQU Y4, 384(AX)
VMOVDQU Y5, 416(AX)
VMOVDQU Y6, 448(AX)
VMOVDQU Y7, 480(AX)
RET
// func barrettReduceAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Reduces every int16 lane x of p in place by computing
//   x - ((x * 0x4ebf) >> 26) * 0x0d01
// where the shift is arithmetic (a signed high multiply followed by an
// arithmetic shift right by 10) and the final multiply and subtract wrap at
// 16 bits. 0x0d01 is 3329 and 0x4ebf is 20159.
// The 512 bytes are processed in four 128-byte chunks of four YMM registers.
//
// Register roles: AX = p, CX = scratch, Y9 = 0x0d01 in every lane,
// Y8 = 0x4ebf in every lane, Y0..Y3 = values, Y4..Y7 = quotient estimates.
TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
MOVQ p+0(FP), AX
// Y9 = modulus constant, Y8 = multiplier constant, broadcast to all lanes.
MOVL $0x00000d01, CX
VMOVD CX, X0
VPBROADCASTW X0, Y9
MOVL $0x00004ebf, CX
VMOVD CX, X0
VPBROADCASTW X0, Y8
// ---- Chunk 0: bytes 0..127 ----
VMOVDQU (AX), Y0
VMOVDQU 32(AX), Y1
VMOVDQU 64(AX), Y2
VMOVDQU 96(AX), Y3
// Quotient estimate: high16(x * Y8), then arithmetic shift right by 10.
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
// x -= estimate * Y9.
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
VMOVDQU Y0, (AX)
VMOVDQU Y1, 32(AX)
VMOVDQU Y2, 64(AX)
VMOVDQU Y3, 96(AX)
// ---- Chunk 1: bytes 128..255 ----
VMOVDQU 128(AX), Y0
VMOVDQU 160(AX), Y1
VMOVDQU 192(AX), Y2
VMOVDQU 224(AX), Y3
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
VMOVDQU Y0, 128(AX)
VMOVDQU Y1, 160(AX)
VMOVDQU Y2, 192(AX)
VMOVDQU Y3, 224(AX)
// ---- Chunk 2: bytes 256..383 ----
VMOVDQU 256(AX), Y0
VMOVDQU 288(AX), Y1
VMOVDQU 320(AX), Y2
VMOVDQU 352(AX), Y3
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
VMOVDQU Y0, 256(AX)
VMOVDQU Y1, 288(AX)
VMOVDQU Y2, 320(AX)
VMOVDQU Y3, 352(AX)
// ---- Chunk 3: bytes 384..511 ----
VMOVDQU 384(AX), Y0
VMOVDQU 416(AX), Y1
VMOVDQU 448(AX), Y2
VMOVDQU 480(AX), Y3
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
VMOVDQU Y0, 384(AX)
VMOVDQU Y1, 416(AX)
VMOVDQU Y2, 448(AX)
VMOVDQU Y3, 480(AX)
RET
// func normalizeAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// For every int16 lane x of p, in place:
//   1. x = x - ((x * 0x4ebf) >> 26) * 0x0d01   (arithmetic shift; the same
//      sequence barrettReduceAVX2 uses)
//   2. x = x - 0x0d01
//   3. if x is negative, x = x + 0x0d01   (done branch-free with a sign mask)
// 0x0d01 is 3329 and 0x4ebf is 20159.
// NOTE(review): presumably this leaves every lane in the range [0, 3329) -
// that depends on the output range of step 1; confirm.
// The 512 bytes are processed in four 128-byte chunks of four YMM registers.
//
// Register roles: AX = p, CX = scratch, Y9 = 0x0d01 in every lane,
// Y8 = 0x4ebf in every lane, Y0..Y3 = values, Y4..Y7 = temporaries.
TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
MOVQ p+0(FP), AX
// Y9 = modulus constant, Y8 = multiplier constant, broadcast to all lanes.
MOVL $0x00000d01, CX
VMOVD CX, X0
VPBROADCASTW X0, Y9
MOVL $0x00004ebf, CX
VMOVD CX, X0
VPBROADCASTW X0, Y8
// ---- Chunk 0: bytes 0..127 ----
VMOVDQU (AX), Y0
VMOVDQU 32(AX), Y1
VMOVDQU 64(AX), Y2
VMOVDQU 96(AX), Y3
// Step 1: quotient estimate high16(x * Y8) >> 10, then x -= estimate * Y9.
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
// Step 2: x -= Y9.
VPSUBW Y9, Y0, Y0
VPSUBW Y9, Y1, Y1
VPSUBW Y9, Y2, Y2
VPSUBW Y9, Y3, Y3
// Step 3: arithmetic shift by 15 yields all-ones in negative lanes and zero
// elsewhere; AND with Y9 turns that into "Y9 or 0", which is added back.
VPSRAW $0x0f, Y0, Y4
VPSRAW $0x0f, Y1, Y5
VPSRAW $0x0f, Y2, Y6
VPSRAW $0x0f, Y3, Y7
VPAND Y4, Y9, Y4
VPAND Y5, Y9, Y5
VPAND Y6, Y9, Y6
VPAND Y7, Y9, Y7
VPADDW Y0, Y4, Y0
VPADDW Y1, Y5, Y1
VPADDW Y2, Y6, Y2
VPADDW Y3, Y7, Y3
VMOVDQU Y0, (AX)
VMOVDQU Y1, 32(AX)
VMOVDQU Y2, 64(AX)
VMOVDQU Y3, 96(AX)
// ---- Chunk 1: bytes 128..255 ----
VMOVDQU 128(AX), Y0
VMOVDQU 160(AX), Y1
VMOVDQU 192(AX), Y2
VMOVDQU 224(AX), Y3
// Step 1.
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
// Step 2.
VPSUBW Y9, Y0, Y0
VPSUBW Y9, Y1, Y1
VPSUBW Y9, Y2, Y2
VPSUBW Y9, Y3, Y3
// Step 3.
VPSRAW $0x0f, Y0, Y4
VPSRAW $0x0f, Y1, Y5
VPSRAW $0x0f, Y2, Y6
VPSRAW $0x0f, Y3, Y7
VPAND Y4, Y9, Y4
VPAND Y5, Y9, Y5
VPAND Y6, Y9, Y6
VPAND Y7, Y9, Y7
VPADDW Y0, Y4, Y0
VPADDW Y1, Y5, Y1
VPADDW Y2, Y6, Y2
VPADDW Y3, Y7, Y3
VMOVDQU Y0, 128(AX)
VMOVDQU Y1, 160(AX)
VMOVDQU Y2, 192(AX)
VMOVDQU Y3, 224(AX)
// ---- Chunk 2: bytes 256..383 ----
VMOVDQU 256(AX), Y0
VMOVDQU 288(AX), Y1
VMOVDQU 320(AX), Y2
VMOVDQU 352(AX), Y3
// Step 1.
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
// Step 2.
VPSUBW Y9, Y0, Y0
VPSUBW Y9, Y1, Y1
VPSUBW Y9, Y2, Y2
VPSUBW Y9, Y3, Y3
// Step 3.
VPSRAW $0x0f, Y0, Y4
VPSRAW $0x0f, Y1, Y5
VPSRAW $0x0f, Y2, Y6
VPSRAW $0x0f, Y3, Y7
VPAND Y4, Y9, Y4
VPAND Y5, Y9, Y5
VPAND Y6, Y9, Y6
VPAND Y7, Y9, Y7
VPADDW Y0, Y4, Y0
VPADDW Y1, Y5, Y1
VPADDW Y2, Y6, Y2
VPADDW Y3, Y7, Y3
VMOVDQU Y0, 256(AX)
VMOVDQU Y1, 288(AX)
VMOVDQU Y2, 320(AX)
VMOVDQU Y3, 352(AX)
// ---- Chunk 3: bytes 384..511 ----
VMOVDQU 384(AX), Y0
VMOVDQU 416(AX), Y1
VMOVDQU 448(AX), Y2
VMOVDQU 480(AX), Y3
// Step 1.
VPMULHW Y8, Y0, Y4
VPMULHW Y8, Y1, Y5
VPMULHW Y8, Y2, Y6
VPMULHW Y8, Y3, Y7
VPSRAW $0x0a, Y4, Y4
VPSRAW $0x0a, Y5, Y5
VPSRAW $0x0a, Y6, Y6
VPSRAW $0x0a, Y7, Y7
VPMULLW Y9, Y4, Y4
VPMULLW Y9, Y5, Y5
VPMULLW Y9, Y6, Y6
VPMULLW Y9, Y7, Y7
VPSUBW Y4, Y0, Y0
VPSUBW Y5, Y1, Y1
VPSUBW Y6, Y2, Y2
VPSUBW Y7, Y3, Y3
// Step 2.
VPSUBW Y9, Y0, Y0
VPSUBW Y9, Y1, Y1
VPSUBW Y9, Y2, Y2
VPSUBW Y9, Y3, Y3
// Step 3.
VPSRAW $0x0f, Y0, Y4
VPSRAW $0x0f, Y1, Y5
VPSRAW $0x0f, Y2, Y6
VPSRAW $0x0f, Y3, Y7
VPAND Y4, Y9, Y4
VPAND Y5, Y9, Y5
VPAND Y6, Y9, Y6
VPAND Y7, Y9, Y7
VPADDW Y0, Y4, Y0
VPADDW Y1, Y5, Y1
VPADDW Y2, Y6, Y2
VPADDW Y3, Y7, Y3
VMOVDQU Y0, 384(AX)
VMOVDQU Y1, 416(AX)
VMOVDQU Y2, 448(AX)
VMOVDQU Y3, 480(AX)
RET