2355 lines
59 KiB
ArmAsm
2355 lines
59 KiB
ArmAsm
|
// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
|
||
|
|
||
|
// +build amd64
|
||
|
|
||
|
#include "textflag.h"
|
||
|
|
||
|
// func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// addAVX2 stores the lane-wise 16-bit sum of a and b into p:
//
//	p[i] = a[i] + b[i]   for i in 0..255
//
// VPADDW is a wrapping add; no modular reduction is performed here.
// The 512 bytes of each operand are handled as two batches of eight
// 32-byte vectors: a goes to the even registers (Y0, Y2, ..., Y14), b to the
// odd registers (Y1, Y3, ..., Y15), and the sums are produced in the odd
// registers before being written to p. All accesses use VMOVDQU, so no
// alignment is required of p, a or b.
//
// Frame: $0-24 (no locals, three pointer arguments). Uses AX, CX, DX and
// Y0-Y15.
//
// NOTE: this file carries a "Code generated ... DO NOT EDIT" header; the
// VZEROUPPER added before RET should also be carried into the generator.
TEXT ·addAVX2(SB), NOSPLIT, $0-24
	MOVQ p+0(FP), AX // AX = destination p
	MOVQ a+8(FP), CX // CX = first operand a
	MOVQ b+16(FP), DX // DX = second operand b

	// Batch 1: bytes 0..255 (coefficients 0..127).
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15
	VPADDW Y0, Y1, Y1
	VPADDW Y2, Y3, Y3
	VPADDW Y4, Y5, Y5
	VPADDW Y6, Y7, Y7
	VPADDW Y8, Y9, Y9
	VPADDW Y10, Y11, Y11
	VPADDW Y12, Y13, Y13
	VPADDW Y14, Y15, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Batch 2: bytes 256..511 (coefficients 128..255).
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPADDW Y0, Y1, Y1
	VPADDW Y2, Y3, Y3
	VPADDW Y4, Y5, Y5
	VPADDW Y6, Y7, Y7
	VPADDW Y8, Y9, Y9
	VPADDW Y10, Y11, Y11
	VPADDW Y12, Y13, Y13
	VPADDW Y14, Y15, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves so that SSE-encoded code executed after
	// the return does not incur AVX/SSE transition penalties. All results
	// are already in memory, so this does not affect the output.
	VZEROUPPER
	RET
|
||
|
|
||
|
// func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// subAVX2 stores the lane-wise 16-bit difference of a and b into p:
//
//	p[i] = a[i] - b[i]   for i in 0..255
//
// VPSUBW is a wrapping subtract; no modular reduction is performed here.
// The 512 bytes of each operand are handled as two batches of eight
// 32-byte vectors: a goes to the even registers (Y0, Y2, ..., Y14), b to the
// odd registers (Y1, Y3, ..., Y15), and each difference (even - odd)
// overwrites the odd register before being written to p. All accesses use
// VMOVDQU, so no alignment is required of p, a or b.
//
// Frame: $0-24 (no locals, three pointer arguments). Uses AX, CX, DX and
// Y0-Y15.
//
// NOTE: this file carries a "Code generated ... DO NOT EDIT" header; the
// VZEROUPPER added before RET should also be carried into the generator.
TEXT ·subAVX2(SB), NOSPLIT, $0-24
	MOVQ p+0(FP), AX // AX = destination p
	MOVQ a+8(FP), CX // CX = minuend a
	MOVQ b+16(FP), DX // DX = subtrahend b

	// Batch 1: bytes 0..255 (coefficients 0..127).
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15
	VPSUBW Y1, Y0, Y1 // Y1 = Y0 - Y1 (a - b)
	VPSUBW Y3, Y2, Y3
	VPSUBW Y5, Y4, Y5
	VPSUBW Y7, Y6, Y7
	VPSUBW Y9, Y8, Y9
	VPSUBW Y11, Y10, Y11
	VPSUBW Y13, Y12, Y13
	VPSUBW Y15, Y14, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Batch 2: bytes 256..511 (coefficients 128..255).
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPSUBW Y1, Y0, Y1
	VPSUBW Y3, Y2, Y3
	VPSUBW Y5, Y4, Y5
	VPSUBW Y7, Y6, Y7
	VPSUBW Y9, Y8, Y9
	VPSUBW Y11, Y10, Y11
	VPSUBW Y13, Y12, Y13
	VPSUBW Y15, Y14, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves so that SSE-encoded code executed after
	// the return does not incur AVX/SSE transition penalties. All results
	// are already in memory, so this does not affect the output.
	VZEROUPPER
	RET
|
||
|
|
||
|
// func nttAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// nttAVX2 transforms the 256 coefficients of p in place through seven
// butterfly stages, reading its multipliers from the ·ZetasAVX2 table.
//
// Register roles:
//	AX      = p
//	CX      = address of ·ZetasAVX2 (all table offsets below are in bytes)
//	Y15     = 0x0d01 (3329) broadcast to every 16-bit lane
//	Y0..Y3  = multiplier pairs for the current stage
//	Y7..Y14 = coefficient vectors; Y2..Y6 (and Y0) are temporaries
//
// Every butterfly on a pair (lo, hi) with multiplier pair (m0, m1) does:
//	t  = VPMULHW(hi, m1) - VPMULHW(VPMULLW(hi, m0), Y15)
//	hi = lo - t
//	lo = lo + t
// NOTE(review): this has the shape of a Montgomery multiplication by a
// twiddle factor modulo 3329, with m0 presumably the twiddle pre-multiplied
// by the inverse of the modulus mod 2^16 — confirm against the generator.
//
// Stage 1 pairs vectors 256 bytes apart and is run over the whole array.
// Stages 2..7 are then run separately on bytes 0..255 and on bytes 256..511,
// keeping eight vectors in registers; between stages the registers are
// regrouped at 128-, 64-, 32- and 16-bit granularity so that the elements
// to be paired always sit in different registers.
//
// Frame: $0-8 (no locals, one pointer argument).
//
// NOTE: this file carries a "Code generated ... DO NOT EDIT" header; the
// VZEROUPPER added before RET should also be carried into the generator.
TEXT ·nttAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX
	LEAQ ·ZetasAVX2+0(SB), CX
	MOVL $0x00000d01, DX
	VMOVD DX, X0
	VPBROADCASTW X0, Y15

	// Stage 1, part 1: bytes 0..127 paired with bytes 256..383.
	VPBROADCASTW (CX), Y0
	VPBROADCASTW 2(CX), Y1
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 256(AX), Y11
	VMOVDQU 288(AX), Y12
	VMOVDQU 320(AX), Y13
	VMOVDQU 352(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 256(AX)
	VMOVDQU Y12, 288(AX)
	VMOVDQU Y13, 320(AX)
	VMOVDQU Y14, 352(AX)

	// Stage 1, part 2: bytes 128..255 paired with bytes 384..511
	// (same multipliers, still in Y0/Y1).
	VMOVDQU 128(AX), Y7
	VMOVDQU 160(AX), Y8
	VMOVDQU 192(AX), Y9
	VMOVDQU 224(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10
	VMOVDQU Y7, 128(AX)
	VMOVDQU Y8, 160(AX)
	VMOVDQU Y9, 192(AX)
	VMOVDQU Y10, 224(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// ---- First half: bytes 0..255 ----

	// Stage 2: pairs (Y7..Y10, Y11..Y14), one multiplier pair.
	VPBROADCASTW 4(CX), Y0
	VPBROADCASTW 6(CX), Y1
	VMOVDQU (AX), Y7
	VMOVDQU 32(AX), Y8
	VMOVDQU 64(AX), Y9
	VMOVDQU 96(AX), Y10
	VMOVDQU 128(AX), Y11
	VMOVDQU 160(AX), Y12
	VMOVDQU 192(AX), Y13
	VMOVDQU 224(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10

	// Stage 3: pairs (Y7,Y9) (Y8,Y10) use Y0/Y1; (Y11,Y13) (Y12,Y14) use Y2/Y3.
	// Y0 is reused as the fourth temporary once its last multiply is issued.
	VPBROADCASTW 12(CX), Y0
	VPBROADCASTW 14(CX), Y1
	VPBROADCASTW 16(CX), Y2
	VPBROADCASTW 18(CX), Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Stage 4: per-lane multipliers; regroup 128-bit lanes, then pairs
	// (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14).
	VMOVDQU 32(CX), Y0
	VMOVDQU 64(CX), Y1
	VMOVDQU 96(CX), Y2
	VMOVDQU 128(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Stage 5: regroup 64-bit elements, then pairs
	// (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14).
	VMOVDQU 288(CX), Y0
	VMOVDQU 320(CX), Y1
	VMOVDQU 352(CX), Y2
	VMOVDQU 384(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Stage 6: regroup 32-bit elements, then pairs
	// (Y7,Y8) (Y9,Y10) (Y11,Y12) (Y13,Y14).
	VMOVDQU 544(CX), Y0
	VMOVDQU 576(CX), Y1
	VMOVDQU 608(CX), Y2
	VMOVDQU 640(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Stage 7: regroup 16-bit elements, then pairs
	// (Y7,Y9) (Y8,Y10) (Y11,Y13) (Y12,Y14).
	VMOVDQU 800(CX), Y0
	VMOVDQU 832(CX), Y1
	VMOVDQU 864(CX), Y2
	VMOVDQU 896(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Write back the first half in its regrouped register order.
	VMOVDQU Y7, (AX)
	VMOVDQU Y8, 32(AX)
	VMOVDQU Y9, 64(AX)
	VMOVDQU Y10, 96(AX)
	VMOVDQU Y11, 128(AX)
	VMOVDQU Y12, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y14, 224(AX)

	// ---- Second half: bytes 256..511 (same stages, different table entries) ----

	// Stage 2.
	VPBROADCASTW 8(CX), Y0
	VPBROADCASTW 10(CX), Y1
	VMOVDQU 256(AX), Y7
	VMOVDQU 288(AX), Y8
	VMOVDQU 320(AX), Y9
	VMOVDQU 352(AX), Y10
	VMOVDQU 384(AX), Y11
	VMOVDQU 416(AX), Y12
	VMOVDQU 448(AX), Y13
	VMOVDQU 480(AX), Y14
	VPMULLW Y11, Y0, Y2
	VPMULLW Y12, Y0, Y3
	VPMULLW Y13, Y0, Y4
	VPMULLW Y14, Y0, Y5
	VPMULHW Y11, Y1, Y11
	VPMULHW Y12, Y1, Y12
	VPMULHW Y13, Y1, Y13
	VPMULHW Y14, Y1, Y14
	VPMULHW Y2, Y15, Y2
	VPMULHW Y3, Y15, Y3
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPSUBW Y2, Y11, Y2
	VPSUBW Y3, Y12, Y3
	VPSUBW Y4, Y13, Y4
	VPSUBW Y5, Y14, Y5
	VPSUBW Y2, Y7, Y11
	VPSUBW Y3, Y8, Y12
	VPSUBW Y4, Y9, Y13
	VPSUBW Y5, Y10, Y14
	VPADDW Y2, Y7, Y7
	VPADDW Y3, Y8, Y8
	VPADDW Y4, Y9, Y9
	VPADDW Y5, Y10, Y10

	// Stage 3.
	VPBROADCASTW 20(CX), Y0
	VPBROADCASTW 22(CX), Y1
	VPBROADCASTW 24(CX), Y2
	VPBROADCASTW 26(CX), Y3
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Stage 4 (128-bit regroup).
	VMOVDQU 160(CX), Y0
	VMOVDQU 192(CX), Y1
	VMOVDQU 224(CX), Y2
	VMOVDQU 256(CX), Y3
	VPERM2I128 $0x20, Y9, Y7, Y4
	VPERM2I128 $0x31, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VPERM2I128 $0x20, Y10, Y8, Y4
	VPERM2I128 $0x31, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VPERM2I128 $0x20, Y13, Y11, Y4
	VPERM2I128 $0x31, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VPERM2I128 $0x20, Y14, Y12, Y4
	VPERM2I128 $0x31, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Stage 5 (64-bit regroup).
	VMOVDQU 416(CX), Y0
	VMOVDQU 448(CX), Y1
	VMOVDQU 480(CX), Y2
	VMOVDQU 512(CX), Y3
	VPUNPCKLQDQ Y8, Y7, Y4
	VPUNPCKHQDQ Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPUNPCKLQDQ Y10, Y9, Y4
	VPUNPCKHQDQ Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPUNPCKLQDQ Y12, Y11, Y4
	VPUNPCKHQDQ Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPUNPCKLQDQ Y14, Y13, Y4
	VPUNPCKHQDQ Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Stage 6 (32-bit regroup).
	VMOVDQU 672(CX), Y0
	VMOVDQU 704(CX), Y1
	VMOVDQU 736(CX), Y2
	VMOVDQU 768(CX), Y3
	VMOVSLDUP Y9, Y4
	VPBLENDD $0xaa, Y4, Y7, Y4
	VPSRLQ $0x20, Y7, Y7
	VPBLENDD $0xaa, Y9, Y7, Y9
	VMOVDQA Y4, Y7
	VMOVSLDUP Y10, Y4
	VPBLENDD $0xaa, Y4, Y8, Y4
	VPSRLQ $0x20, Y8, Y8
	VPBLENDD $0xaa, Y10, Y8, Y10
	VMOVDQA Y4, Y8
	VMOVSLDUP Y13, Y4
	VPBLENDD $0xaa, Y4, Y11, Y4
	VPSRLQ $0x20, Y11, Y11
	VPBLENDD $0xaa, Y13, Y11, Y13
	VMOVDQA Y4, Y11
	VMOVSLDUP Y14, Y4
	VPBLENDD $0xaa, Y4, Y12, Y4
	VPSRLQ $0x20, Y12, Y12
	VPBLENDD $0xaa, Y14, Y12, Y14
	VMOVDQA Y4, Y12
	VPMULLW Y8, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y12, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y8, Y1, Y8
	VPMULHW Y10, Y1, Y10
	VPMULHW Y12, Y3, Y12
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y8, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y12, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y8
	VPSUBW Y5, Y9, Y10
	VPSUBW Y6, Y11, Y12
	VPSUBW Y0, Y13, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y9, Y9
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y13, Y13

	// Stage 7 (16-bit regroup).
	VMOVDQU 928(CX), Y0
	VMOVDQU 960(CX), Y1
	VMOVDQU 992(CX), Y2
	VMOVDQU 1024(CX), Y3
	VPSLLD $0x10, Y8, Y4
	VPBLENDW $0xaa, Y4, Y7, Y4
	VPSRLD $0x10, Y7, Y7
	VPBLENDW $0xaa, Y8, Y7, Y8
	VMOVDQA Y4, Y7
	VPSLLD $0x10, Y10, Y4
	VPBLENDW $0xaa, Y4, Y9, Y4
	VPSRLD $0x10, Y9, Y9
	VPBLENDW $0xaa, Y10, Y9, Y10
	VMOVDQA Y4, Y9
	VPSLLD $0x10, Y12, Y4
	VPBLENDW $0xaa, Y4, Y11, Y4
	VPSRLD $0x10, Y11, Y11
	VPBLENDW $0xaa, Y12, Y11, Y12
	VMOVDQA Y4, Y11
	VPSLLD $0x10, Y14, Y4
	VPBLENDW $0xaa, Y4, Y13, Y4
	VPSRLD $0x10, Y13, Y13
	VPBLENDW $0xaa, Y14, Y13, Y14
	VMOVDQA Y4, Y13
	VPMULLW Y9, Y0, Y4
	VPMULLW Y10, Y0, Y5
	VPMULLW Y13, Y2, Y6
	VPMULLW Y14, Y2, Y0
	VPMULHW Y9, Y1, Y9
	VPMULHW Y10, Y1, Y10
	VPMULHW Y13, Y3, Y13
	VPMULHW Y14, Y3, Y14
	VPMULHW Y4, Y15, Y4
	VPMULHW Y5, Y15, Y5
	VPMULHW Y6, Y15, Y6
	VPMULHW Y0, Y15, Y0
	VPSUBW Y4, Y9, Y4
	VPSUBW Y5, Y10, Y5
	VPSUBW Y6, Y13, Y6
	VPSUBW Y0, Y14, Y0
	VPSUBW Y4, Y7, Y9
	VPSUBW Y5, Y8, Y10
	VPSUBW Y6, Y11, Y13
	VPSUBW Y0, Y12, Y14
	VPADDW Y4, Y7, Y7
	VPADDW Y5, Y8, Y8
	VPADDW Y6, Y11, Y11
	VPADDW Y0, Y12, Y12

	// Write back the second half in its regrouped register order.
	VMOVDQU Y7, 256(AX)
	VMOVDQU Y8, 288(AX)
	VMOVDQU Y9, 320(AX)
	VMOVDQU Y10, 352(AX)
	VMOVDQU Y11, 384(AX)
	VMOVDQU Y12, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y14, 480(AX)

	// Clear the upper YMM halves so that SSE-encoded code executed after
	// the return does not incur AVX/SSE transition penalties. All results
	// are already in memory, so this does not affect the output.
	VZEROUPPER
	RET
|
||
|
|
||
|
// func invNttAVX2(p *[256]int16)
|
||
|
// Requires: AVX, AVX2
|
||
|
TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
|
||
|
MOVQ p+0(FP), AX
|
||
|
LEAQ ·ZetasAVX2+0(SB), CX
|
||
|
MOVL $0x00000d01, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y15
|
||
|
VMOVDQU (AX), Y7
|
||
|
VMOVDQU 32(AX), Y8
|
||
|
VMOVDQU 64(AX), Y9
|
||
|
VMOVDQU 96(AX), Y10
|
||
|
VMOVDQU 128(AX), Y11
|
||
|
VMOVDQU 160(AX), Y12
|
||
|
VMOVDQU 192(AX), Y13
|
||
|
VMOVDQU 224(AX), Y14
|
||
|
VMOVDQU 1056(CX), Y0
|
||
|
VMOVDQU 1088(CX), Y1
|
||
|
VMOVDQU 1120(CX), Y2
|
||
|
VMOVDQU 1152(CX), Y3
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VMOVDQU 1312(CX), Y0
|
||
|
VMOVDQU 1344(CX), Y1
|
||
|
VMOVDQU 1376(CX), Y2
|
||
|
VMOVDQU 1408(CX), Y3
|
||
|
VPSLLD $0x10, Y8, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y7, Y4
|
||
|
VPSRLD $0x10, Y7, Y7
|
||
|
VPBLENDW $0xaa, Y8, Y7, Y8
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPSLLD $0x10, Y10, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y9, Y4
|
||
|
VPSRLD $0x10, Y9, Y9
|
||
|
VPBLENDW $0xaa, Y10, Y9, Y10
|
||
|
VMOVDQA Y4, Y9
|
||
|
VPSLLD $0x10, Y12, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y11, Y4
|
||
|
VPSRLD $0x10, Y11, Y11
|
||
|
VPBLENDW $0xaa, Y12, Y11, Y12
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPSLLD $0x10, Y14, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y13, Y4
|
||
|
VPSRLD $0x10, Y13, Y13
|
||
|
VPBLENDW $0xaa, Y14, Y13, Y14
|
||
|
VMOVDQA Y4, Y13
|
||
|
VPSUBW Y7, Y8, Y4
|
||
|
VPSUBW Y9, Y10, Y5
|
||
|
VPSUBW Y11, Y12, Y6
|
||
|
VPADDW Y7, Y8, Y7
|
||
|
VPADDW Y9, Y10, Y9
|
||
|
VPADDW Y11, Y12, Y11
|
||
|
VPMULLW Y4, Y0, Y8
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y13, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y12
|
||
|
VPADDW Y13, Y14, Y13
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y8, Y4, Y8
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y12, Y6, Y12
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VMOVDQU 1568(CX), Y0
|
||
|
VMOVDQU 1600(CX), Y1
|
||
|
VMOVDQU 1632(CX), Y2
|
||
|
VMOVDQU 1664(CX), Y3
|
||
|
VMOVSLDUP Y9, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y7, Y4
|
||
|
VPSRLQ $0x20, Y7, Y7
|
||
|
VPBLENDD $0xaa, Y9, Y7, Y9
|
||
|
VMOVDQA Y4, Y7
|
||
|
VMOVSLDUP Y10, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y8, Y4
|
||
|
VPSRLQ $0x20, Y8, Y8
|
||
|
VPBLENDD $0xaa, Y10, Y8, Y10
|
||
|
VMOVDQA Y4, Y8
|
||
|
VMOVSLDUP Y13, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y11, Y4
|
||
|
VPSRLQ $0x20, Y11, Y11
|
||
|
VPBLENDD $0xaa, Y13, Y11, Y13
|
||
|
VMOVDQA Y4, Y11
|
||
|
VMOVSLDUP Y14, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y12, Y4
|
||
|
VPSRLQ $0x20, Y12, Y12
|
||
|
VPBLENDD $0xaa, Y14, Y12, Y14
|
||
|
VMOVDQA Y4, Y12
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
MOVL $0x00004ebf, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y4
|
||
|
VPMULHW Y4, Y7, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y7, Y7
|
||
|
VPMULHW Y4, Y11, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y11, Y11
|
||
|
VMOVDQU 1824(CX), Y0
|
||
|
VMOVDQU 1856(CX), Y1
|
||
|
VMOVDQU 1888(CX), Y2
|
||
|
VMOVDQU 1920(CX), Y3
|
||
|
VPUNPCKLQDQ Y8, Y7, Y4
|
||
|
VPUNPCKHQDQ Y8, Y7, Y8
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPUNPCKLQDQ Y10, Y9, Y4
|
||
|
VPUNPCKHQDQ Y10, Y9, Y10
|
||
|
VMOVDQA Y4, Y9
|
||
|
VPUNPCKLQDQ Y12, Y11, Y4
|
||
|
VPUNPCKHQDQ Y12, Y11, Y12
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPUNPCKLQDQ Y14, Y13, Y4
|
||
|
VPUNPCKHQDQ Y14, Y13, Y14
|
||
|
VMOVDQA Y4, Y13
|
||
|
VPSUBW Y7, Y8, Y4
|
||
|
VPSUBW Y9, Y10, Y5
|
||
|
VPSUBW Y11, Y12, Y6
|
||
|
VPADDW Y7, Y8, Y7
|
||
|
VPADDW Y9, Y10, Y9
|
||
|
VPADDW Y11, Y12, Y11
|
||
|
VPMULLW Y4, Y0, Y8
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y13, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y12
|
||
|
VPADDW Y13, Y14, Y13
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y8, Y4, Y8
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y12, Y6, Y12
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VPBROADCASTW 2080(CX), Y0
|
||
|
VPBROADCASTW 2082(CX), Y1
|
||
|
VPBROADCASTW 2084(CX), Y2
|
||
|
VPBROADCASTW 2086(CX), Y3
|
||
|
VPERM2I128 $0x20, Y9, Y7, Y4
|
||
|
VPERM2I128 $0x31, Y9, Y7, Y9
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPERM2I128 $0x20, Y10, Y8, Y4
|
||
|
VPERM2I128 $0x31, Y10, Y8, Y10
|
||
|
VMOVDQA Y4, Y8
|
||
|
VPERM2I128 $0x20, Y13, Y11, Y4
|
||
|
VPERM2I128 $0x31, Y13, Y11, Y13
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPERM2I128 $0x20, Y14, Y12, Y4
|
||
|
VPERM2I128 $0x31, Y14, Y12, Y14
|
||
|
VMOVDQA Y4, Y12
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
MOVL $0x00004ebf, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y4
|
||
|
VPMULHW Y4, Y7, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y7, Y7
|
||
|
VPMULHW Y4, Y11, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y11, Y11
|
||
|
VPBROADCASTW 2096(CX), Y0
|
||
|
VPBROADCASTW 2098(CX), Y1
|
||
|
VPSUBW Y7, Y11, Y4
|
||
|
VPSUBW Y8, Y12, Y5
|
||
|
VPSUBW Y9, Y13, Y6
|
||
|
VPADDW Y7, Y11, Y7
|
||
|
VPADDW Y8, Y12, Y8
|
||
|
VPADDW Y9, Y13, Y9
|
||
|
VPMULLW Y4, Y0, Y11
|
||
|
VPMULLW Y5, Y0, Y12
|
||
|
VPSUBW Y10, Y14, Y2
|
||
|
VPMULLW Y6, Y0, Y13
|
||
|
VPADDW Y10, Y14, Y10
|
||
|
VPMULLW Y2, Y0, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y1, Y6
|
||
|
VPMULHW Y2, Y1, Y2
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y11, Y4, Y11
|
||
|
VPSUBW Y12, Y5, Y12
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y2, Y14
|
||
|
VMOVDQU Y7, (AX)
|
||
|
VMOVDQU Y8, 32(AX)
|
||
|
VMOVDQU Y9, 64(AX)
|
||
|
VMOVDQU Y10, 96(AX)
|
||
|
VMOVDQU Y11, 128(AX)
|
||
|
VMOVDQU Y12, 160(AX)
|
||
|
VMOVDQU Y13, 192(AX)
|
||
|
VMOVDQU Y14, 224(AX)
|
||
|
VMOVDQU 256(AX), Y7
|
||
|
VMOVDQU 288(AX), Y8
|
||
|
VMOVDQU 320(AX), Y9
|
||
|
VMOVDQU 352(AX), Y10
|
||
|
VMOVDQU 384(AX), Y11
|
||
|
VMOVDQU 416(AX), Y12
|
||
|
VMOVDQU 448(AX), Y13
|
||
|
VMOVDQU 480(AX), Y14
|
||
|
VMOVDQU 1184(CX), Y0
|
||
|
VMOVDQU 1216(CX), Y1
|
||
|
VMOVDQU 1248(CX), Y2
|
||
|
VMOVDQU 1280(CX), Y3
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VMOVDQU 1440(CX), Y0
|
||
|
VMOVDQU 1472(CX), Y1
|
||
|
VMOVDQU 1504(CX), Y2
|
||
|
VMOVDQU 1536(CX), Y3
|
||
|
VPSLLD $0x10, Y8, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y7, Y4
|
||
|
VPSRLD $0x10, Y7, Y7
|
||
|
VPBLENDW $0xaa, Y8, Y7, Y8
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPSLLD $0x10, Y10, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y9, Y4
|
||
|
VPSRLD $0x10, Y9, Y9
|
||
|
VPBLENDW $0xaa, Y10, Y9, Y10
|
||
|
VMOVDQA Y4, Y9
|
||
|
VPSLLD $0x10, Y12, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y11, Y4
|
||
|
VPSRLD $0x10, Y11, Y11
|
||
|
VPBLENDW $0xaa, Y12, Y11, Y12
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPSLLD $0x10, Y14, Y4
|
||
|
VPBLENDW $0xaa, Y4, Y13, Y4
|
||
|
VPSRLD $0x10, Y13, Y13
|
||
|
VPBLENDW $0xaa, Y14, Y13, Y14
|
||
|
VMOVDQA Y4, Y13
|
||
|
VPSUBW Y7, Y8, Y4
|
||
|
VPSUBW Y9, Y10, Y5
|
||
|
VPSUBW Y11, Y12, Y6
|
||
|
VPADDW Y7, Y8, Y7
|
||
|
VPADDW Y9, Y10, Y9
|
||
|
VPADDW Y11, Y12, Y11
|
||
|
VPMULLW Y4, Y0, Y8
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y13, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y12
|
||
|
VPADDW Y13, Y14, Y13
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y8, Y4, Y8
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y12, Y6, Y12
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VMOVDQU 1696(CX), Y0
|
||
|
VMOVDQU 1728(CX), Y1
|
||
|
VMOVDQU 1760(CX), Y2
|
||
|
VMOVDQU 1792(CX), Y3
|
||
|
VMOVSLDUP Y9, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y7, Y4
|
||
|
VPSRLQ $0x20, Y7, Y7
|
||
|
VPBLENDD $0xaa, Y9, Y7, Y9
|
||
|
VMOVDQA Y4, Y7
|
||
|
VMOVSLDUP Y10, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y8, Y4
|
||
|
VPSRLQ $0x20, Y8, Y8
|
||
|
VPBLENDD $0xaa, Y10, Y8, Y10
|
||
|
VMOVDQA Y4, Y8
|
||
|
VMOVSLDUP Y13, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y11, Y4
|
||
|
VPSRLQ $0x20, Y11, Y11
|
||
|
VPBLENDD $0xaa, Y13, Y11, Y13
|
||
|
VMOVDQA Y4, Y11
|
||
|
VMOVSLDUP Y14, Y4
|
||
|
VPBLENDD $0xaa, Y4, Y12, Y4
|
||
|
VPSRLQ $0x20, Y12, Y12
|
||
|
VPBLENDD $0xaa, Y14, Y12, Y14
|
||
|
VMOVDQA Y4, Y12
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
MOVL $0x00004ebf, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y4
|
||
|
VPMULHW Y4, Y7, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y7, Y7
|
||
|
VPMULHW Y4, Y11, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y11, Y11
|
||
|
VMOVDQU 1952(CX), Y0
|
||
|
VMOVDQU 1984(CX), Y1
|
||
|
VMOVDQU 2016(CX), Y2
|
||
|
VMOVDQU 2048(CX), Y3
|
||
|
VPUNPCKLQDQ Y8, Y7, Y4
|
||
|
VPUNPCKHQDQ Y8, Y7, Y8
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPUNPCKLQDQ Y10, Y9, Y4
|
||
|
VPUNPCKHQDQ Y10, Y9, Y10
|
||
|
VMOVDQA Y4, Y9
|
||
|
VPUNPCKLQDQ Y12, Y11, Y4
|
||
|
VPUNPCKHQDQ Y12, Y11, Y12
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPUNPCKLQDQ Y14, Y13, Y4
|
||
|
VPUNPCKHQDQ Y14, Y13, Y14
|
||
|
VMOVDQA Y4, Y13
|
||
|
VPSUBW Y7, Y8, Y4
|
||
|
VPSUBW Y9, Y10, Y5
|
||
|
VPSUBW Y11, Y12, Y6
|
||
|
VPADDW Y7, Y8, Y7
|
||
|
VPADDW Y9, Y10, Y9
|
||
|
VPADDW Y11, Y12, Y11
|
||
|
VPMULLW Y4, Y0, Y8
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y13, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y12
|
||
|
VPADDW Y13, Y14, Y13
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y8, Y4, Y8
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y12, Y6, Y12
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
VPBROADCASTW 2088(CX), Y0
|
||
|
VPBROADCASTW 2090(CX), Y1
|
||
|
VPBROADCASTW 2092(CX), Y2
|
||
|
VPBROADCASTW 2094(CX), Y3
|
||
|
VPERM2I128 $0x20, Y9, Y7, Y4
|
||
|
VPERM2I128 $0x31, Y9, Y7, Y9
|
||
|
VMOVDQA Y4, Y7
|
||
|
VPERM2I128 $0x20, Y10, Y8, Y4
|
||
|
VPERM2I128 $0x31, Y10, Y8, Y10
|
||
|
VMOVDQA Y4, Y8
|
||
|
VPERM2I128 $0x20, Y13, Y11, Y4
|
||
|
VPERM2I128 $0x31, Y13, Y11, Y13
|
||
|
VMOVDQA Y4, Y11
|
||
|
VPERM2I128 $0x20, Y14, Y12, Y4
|
||
|
VPERM2I128 $0x31, Y14, Y12, Y14
|
||
|
VMOVDQA Y4, Y12
|
||
|
VPSUBW Y7, Y9, Y4
|
||
|
VPSUBW Y8, Y10, Y5
|
||
|
VPSUBW Y11, Y13, Y6
|
||
|
VPADDW Y7, Y9, Y7
|
||
|
VPADDW Y8, Y10, Y8
|
||
|
VPADDW Y11, Y13, Y11
|
||
|
VPMULLW Y4, Y0, Y9
|
||
|
VPMULLW Y5, Y0, Y10
|
||
|
VPSUBW Y12, Y14, Y0
|
||
|
VPMULLW Y6, Y2, Y13
|
||
|
VPADDW Y12, Y14, Y12
|
||
|
VPMULLW Y0, Y2, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y3, Y6
|
||
|
VPMULHW Y0, Y3, Y0
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y9, Y4, Y9
|
||
|
VPSUBW Y10, Y5, Y10
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y0, Y14
|
||
|
MOVL $0x00004ebf, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y4
|
||
|
VPMULHW Y4, Y7, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y7, Y7
|
||
|
VPMULHW Y4, Y11, Y5
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPMULLW Y15, Y5, Y5
|
||
|
VPSUBW Y5, Y11, Y11
|
||
|
VPBROADCASTW 2100(CX), Y0
|
||
|
VPBROADCASTW 2102(CX), Y1
|
||
|
VPSUBW Y7, Y11, Y4
|
||
|
VPSUBW Y8, Y12, Y5
|
||
|
VPSUBW Y9, Y13, Y6
|
||
|
VPADDW Y7, Y11, Y7
|
||
|
VPADDW Y8, Y12, Y8
|
||
|
VPADDW Y9, Y13, Y9
|
||
|
VPMULLW Y4, Y0, Y11
|
||
|
VPMULLW Y5, Y0, Y12
|
||
|
VPSUBW Y10, Y14, Y2
|
||
|
VPMULLW Y6, Y0, Y13
|
||
|
VPADDW Y10, Y14, Y10
|
||
|
VPMULLW Y2, Y0, Y14
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y6, Y1, Y6
|
||
|
VPMULHW Y2, Y1, Y2
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y11, Y4, Y11
|
||
|
VPSUBW Y12, Y5, Y12
|
||
|
VPSUBW Y13, Y6, Y13
|
||
|
VPSUBW Y14, Y2, Y14
|
||
|
VMOVDQU Y7, 256(AX)
|
||
|
VMOVDQU Y8, 288(AX)
|
||
|
VMOVDQU Y9, 320(AX)
|
||
|
VMOVDQU Y10, 352(AX)
|
||
|
VMOVDQU Y11, 384(AX)
|
||
|
VMOVDQU Y12, 416(AX)
|
||
|
VMOVDQU Y13, 448(AX)
|
||
|
VMOVDQU Y14, 480(AX)
|
||
|
VPBROADCASTW 2104(CX), Y0
|
||
|
VPBROADCASTW 2106(CX), Y1
|
||
|
VMOVDQU (AX), Y7
|
||
|
VMOVDQU 32(AX), Y8
|
||
|
VMOVDQU 64(AX), Y9
|
||
|
VMOVDQU 96(AX), Y10
|
||
|
VMOVDQU 256(AX), Y11
|
||
|
VMOVDQU 288(AX), Y12
|
||
|
VMOVDQU 320(AX), Y13
|
||
|
VMOVDQU 352(AX), Y14
|
||
|
VPSUBW Y7, Y11, Y2
|
||
|
VPSUBW Y8, Y12, Y3
|
||
|
VPSUBW Y9, Y13, Y4
|
||
|
VPADDW Y7, Y11, Y7
|
||
|
VPADDW Y8, Y12, Y8
|
||
|
VPADDW Y9, Y13, Y9
|
||
|
VPMULLW Y2, Y0, Y11
|
||
|
VPMULLW Y3, Y0, Y12
|
||
|
VPSUBW Y10, Y14, Y5
|
||
|
VPMULLW Y4, Y0, Y13
|
||
|
VPADDW Y10, Y14, Y10
|
||
|
VPMULLW Y5, Y0, Y14
|
||
|
VPMULHW Y2, Y1, Y2
|
||
|
VPMULHW Y3, Y1, Y3
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y11, Y2, Y11
|
||
|
VPSUBW Y12, Y3, Y12
|
||
|
VPSUBW Y13, Y4, Y13
|
||
|
VPSUBW Y14, Y5, Y14
|
||
|
MOVL $0xffffd8a1, DX
|
||
|
VMOVD DX, X0
|
||
|
VPBROADCASTW X0, Y0
|
||
|
MOVL $0x000005a1, DX
|
||
|
VMOVD DX, X1
|
||
|
VPBROADCASTW X1, Y1
|
||
|
VPMULLW Y7, Y0, Y2
|
||
|
VPMULLW Y8, Y0, Y3
|
||
|
VPMULLW Y9, Y0, Y4
|
||
|
VPMULLW Y10, Y0, Y5
|
||
|
VPMULHW Y7, Y1, Y7
|
||
|
VPMULHW Y8, Y1, Y8
|
||
|
VPMULHW Y9, Y1, Y9
|
||
|
VPMULHW Y10, Y1, Y10
|
||
|
VPMULHW Y2, Y15, Y2
|
||
|
VPMULHW Y3, Y15, Y3
|
||
|
VPMULHW Y4, Y15, Y4
|
||
|
VPMULHW Y5, Y15, Y5
|
||
|
VPSUBW Y2, Y7, Y7
|
||
|
VPSUBW Y3, Y8, Y8
|
||
|
VPSUBW Y4, Y9, Y9
|
||
|
VPSUBW Y5, Y10, Y10
|
||
|
VPMULLW Y11, Y0, Y2
|
||
|
VPMULLW Y12, Y0, Y3
|
||
|
VPMULLW Y13, Y0, Y4
|
||
|
VPMULLW Y14, Y0, Y5
|
||
|
VPMULHW Y11, Y1, Y11
|
||
|
VPMULHW Y12, Y1, Y12
|
||
|
VPMULHW Y13, Y1, Y13
|
||
|
VPMULHW Y14, Y1, Y14
|
||
|
VPMULHW Y2, Y15, Y2
|
||
|
VPMULHW Y3, Y15, Y3
|
||
|
VPMULHW Y4, Y15, Y4
|
||
|
VPMULHW Y5, Y15, Y5
|
||
|
VPSUBW Y2, Y11, Y11
|
||
|
VPSUBW Y3, Y12, Y12
|
||
|
VPSUBW Y4, Y13, Y13
|
||
|
VPSUBW Y5, Y14, Y14
|
||
|
VMOVDQU Y7, (AX)
|
||
|
VMOVDQU Y8, 32(AX)
|
||
|
VMOVDQU Y9, 64(AX)
|
||
|
VMOVDQU Y10, 96(AX)
|
||
|
VMOVDQU Y11, 256(AX)
|
||
|
VMOVDQU Y12, 288(AX)
|
||
|
VMOVDQU Y13, 320(AX)
|
||
|
VMOVDQU Y14, 352(AX)
|
||
|
VPBROADCASTW 2104(CX), Y0
|
||
|
VPBROADCASTW 2106(CX), Y1
|
||
|
VMOVDQU 128(AX), Y7
|
||
|
VMOVDQU 160(AX), Y8
|
||
|
VMOVDQU 192(AX), Y9
|
||
|
VMOVDQU 224(AX), Y10
|
||
|
VMOVDQU 384(AX), Y11
|
||
|
VMOVDQU 416(AX), Y12
|
||
|
VMOVDQU 448(AX), Y13
|
||
|
VMOVDQU 480(AX), Y14
|
||
|
VPSUBW Y7, Y11, Y2
|
||
|
VPSUBW Y8, Y12, Y3
|
||
|
VPSUBW Y9, Y13, Y4
|
||
|
VPADDW Y7, Y11, Y7
|
||
|
VPADDW Y8, Y12, Y8
|
||
|
VPADDW Y9, Y13, Y9
|
||
|
VPMULLW Y2, Y0, Y11
|
||
|
VPMULLW Y3, Y0, Y12
|
||
|
VPSUBW Y10, Y14, Y5
|
||
|
VPMULLW Y4, Y0, Y13
|
||
|
VPADDW Y10, Y14, Y10
|
||
|
VPMULLW Y5, Y0, Y14
|
||
|
VPMULHW Y2, Y1, Y2
|
||
|
VPMULHW Y3, Y1, Y3
|
||
|
VPMULHW Y4, Y1, Y4
|
||
|
VPMULHW Y5, Y1, Y5
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPMULHW Y12, Y15, Y12
|
||
|
VPMULHW Y13, Y15, Y13
|
||
|
VPMULHW Y14, Y15, Y14
|
||
|
VPSUBW Y11, Y2, Y11
|
||
|
VPSUBW Y12, Y3, Y12
|
||
|
VPSUBW Y13, Y4, Y13
|
||
|
VPSUBW Y14, Y5, Y14
|
||
|
MOVL $0xffffd8a1, CX
|
||
|
VMOVD CX, X0
|
||
|
VPBROADCASTW X0, Y0
|
||
|
MOVL $0x000005a1, CX
|
||
|
VMOVD CX, X1
|
||
|
VPBROADCASTW X1, Y1
|
||
|
VPMULLW Y7, Y0, Y2
|
||
|
VPMULLW Y8, Y0, Y3
|
||
|
VPMULLW Y9, Y0, Y4
|
||
|
VPMULLW Y10, Y0, Y5
|
||
|
VPMULHW Y7, Y1, Y7
|
||
|
VPMULHW Y8, Y1, Y8
|
||
|
VPMULHW Y9, Y1, Y9
|
||
|
VPMULHW Y10, Y1, Y10
|
||
|
VPMULHW Y2, Y15, Y2
|
||
|
VPMULHW Y3, Y15, Y3
|
||
|
VPMULHW Y4, Y15, Y4
|
||
|
VPMULHW Y5, Y15, Y5
|
||
|
VPSUBW Y2, Y7, Y7
|
||
|
VPSUBW Y3, Y8, Y8
|
||
|
VPSUBW Y4, Y9, Y9
|
||
|
VPSUBW Y5, Y10, Y10
|
||
|
VPMULLW Y11, Y0, Y2
|
||
|
VPMULLW Y12, Y0, Y3
|
||
|
VPMULLW Y13, Y0, Y4
|
||
|
VPMULLW Y14, Y0, Y5
|
||
|
VPMULHW Y11, Y1, Y11
|
||
|
VPMULHW Y12, Y1, Y12
|
||
|
VPMULHW Y13, Y1, Y13
|
||
|
VPMULHW Y14, Y1, Y14
|
||
|
VPMULHW Y2, Y15, Y2
|
||
|
VPMULHW Y3, Y15, Y3
|
||
|
VPMULHW Y4, Y15, Y4
|
||
|
VPMULHW Y5, Y15, Y5
|
||
|
VPSUBW Y2, Y11, Y11
|
||
|
VPSUBW Y3, Y12, Y12
|
||
|
VPSUBW Y4, Y13, Y13
|
||
|
VPSUBW Y5, Y14, Y14
|
||
|
VMOVDQU Y7, 128(AX)
|
||
|
VMOVDQU Y8, 160(AX)
|
||
|
VMOVDQU Y9, 192(AX)
|
||
|
VMOVDQU Y10, 224(AX)
|
||
|
VMOVDQU Y11, 384(AX)
|
||
|
VMOVDQU Y12, 416(AX)
|
||
|
VMOVDQU Y13, 448(AX)
|
||
|
VMOVDQU Y14, 480(AX)
|
||
|
RET
|
||
|
|
||
|
// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
|
||
|
// Requires: AVX, AVX2
//
// Works through a and b in four groups of four 32-byte vectors (offsets 0,
// 128, 256 and 384) and stores four result vectors per group into p.
// Every 16x16-bit product is Montgomery-reduced: with lo/hi the low/high
// halves of the 32-bit product, the result is hi - mulhi(lo*qinv, q).
// Per group, with a0..a3 / b0..b3 the loaded vectors and z the table entry
// read through BX, the stored vectors are (all products reduced as above):
//   p0 = a0*b0 + z*(a1*b1)      p1 = a0*b1 + a1*b0
//   p2 = a2*b2 - z*(a3*b3)      p3 = a2*b3 + a3*b2
// Register roles: AX = p, CX = a, DX = b, BX = &ZetasAVX2,
//   Y14 = 0xf301 in every 16-bit lane (inverse of 3329 modulo 2^16),
//   Y15 = 0x0d01 = 3329 in every 16-bit lane,
//   Y8..Y13 are scratch.
// The two table vectors per group are used differently: the first only as a
// VPMULLW operand, the second only as a VPMULHW operand, so the first
// presumably holds z*qinv and the second z - TODO confirm against the
// ZetasAVX2 layout.
// NOTE(review): the function returns without VZEROUPPER; consider whether one
// is wanted to avoid SSE/AVX transition penalties in callers.
|
||
|
TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24
|
||
|
MOVQ p+0(FP), AX
|
||
|
MOVQ a+8(FP), CX
|
||
|
MOVQ b+16(FP), DX
|
||
|
LEAQ ·ZetasAVX2+0(SB), BX
|
||
|
// Broadcast the two reduction constants; only the low 16 bits of BP are used.
MOVL $0xfffff301, BP
|
||
|
VMOVD BP, X0
|
||
|
VPBROADCASTW X0, Y14
|
||
|
MOVL $0x00000d01, BP
|
||
|
VMOVD BP, X0
|
||
|
VPBROADCASTW X0, Y15
|
||
|
// Group 0: a0..a3 -> Y0..Y3, b0..b3 -> Y4..Y7.
VMOVDQU (CX), Y0
|
||
|
VMOVDQU 32(CX), Y1
|
||
|
VMOVDQU 64(CX), Y2
|
||
|
VMOVDQU 96(CX), Y3
|
||
|
VMOVDQU (DX), Y4
|
||
|
VMOVDQU 32(DX), Y5
|
||
|
VMOVDQU 64(DX), Y6
|
||
|
VMOVDQU 96(DX), Y7
|
||
|
// Low halves of a1*b1, a0*b0, a0*b1, a1*b0, then multiplied by qinv.
VPMULLW Y1, Y5, Y8
|
||
|
VPMULLW Y0, Y4, Y9
|
||
|
VPMULLW Y0, Y5, Y10
|
||
|
VPMULLW Y1, Y4, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
// High halves of the same four products; Y4/Y5 are only overwritten after
// their last use as inputs.
VPMULHW Y1, Y5, Y12
|
||
|
VPMULHW Y0, Y4, Y13
|
||
|
VPMULHW Y0, Y5, Y0
|
||
|
VPMULHW Y1, Y4, Y1
|
||
|
VMOVDQA Y12, Y4
|
||
|
VMOVDQA Y13, Y5
|
||
|
// Finish the reduction: hi - mulhi(lo*qinv, q).
// Afterwards Y4 = a1*b1, Y5 = a0*b0, Y0 = a0*b1, Y1 = a1*b0.
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPSUBW Y9, Y5, Y5
|
||
|
VPSUBW Y10, Y0, Y0
|
||
|
VPSUBW Y11, Y1, Y1
|
||
|
// Multiply a1*b1 by the table entry (same reduction), then form p0 and p1.
VMOVDQU 800(BX), Y12
|
||
|
VMOVDQU 832(BX), Y13
|
||
|
VPMULLW Y4, Y12, Y8
|
||
|
VPMULHW Y4, Y13, Y4
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPADDW Y4, Y5, Y4
|
||
|
VPADDW Y0, Y1, Y5
|
||
|
// Same computation for (a2, a3) x (b2, b3); results land in Y6, Y7, Y2, Y3.
VPMULLW Y3, Y7, Y8
|
||
|
VPMULLW Y2, Y6, Y9
|
||
|
VPMULLW Y2, Y7, Y10
|
||
|
VPMULLW Y3, Y6, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y3, Y7, Y12
|
||
|
VPMULHW Y2, Y6, Y13
|
||
|
VPMULHW Y2, Y7, Y2
|
||
|
VPMULHW Y3, Y6, Y3
|
||
|
VMOVDQA Y12, Y6
|
||
|
VMOVDQA Y13, Y7
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y9, Y7, Y7
|
||
|
VPSUBW Y10, Y2, Y2
|
||
|
VPSUBW Y11, Y3, Y3
|
||
|
// The table vectors are reloaded because Y12/Y13 were reused as scratch above.
VMOVDQU 800(BX), Y12
|
||
|
VMOVDQU 832(BX), Y13
|
||
|
VPMULLW Y6, Y12, Y8
|
||
|
VPMULHW Y6, Y13, Y6
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
// Note the subtraction here (Y7 - Y6), where the first pair used an addition.
VPSUBW Y6, Y7, Y6
|
||
|
VPADDW Y2, Y3, Y7
|
||
|
VMOVDQU Y4, (AX)
|
||
|
VMOVDQU Y5, 32(AX)
|
||
|
VMOVDQU Y6, 64(AX)
|
||
|
VMOVDQU Y7, 96(AX)
|
||
|
// Group 1: byte offsets 128..224, table vectors at 864(BX) and 896(BX).
VMOVDQU 128(CX), Y0
|
||
|
VMOVDQU 160(CX), Y1
|
||
|
VMOVDQU 192(CX), Y2
|
||
|
VMOVDQU 224(CX), Y3
|
||
|
VMOVDQU 128(DX), Y4
|
||
|
VMOVDQU 160(DX), Y5
|
||
|
VMOVDQU 192(DX), Y6
|
||
|
VMOVDQU 224(DX), Y7
|
||
|
VPMULLW Y1, Y5, Y8
|
||
|
VPMULLW Y0, Y4, Y9
|
||
|
VPMULLW Y0, Y5, Y10
|
||
|
VPMULLW Y1, Y4, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y1, Y5, Y12
|
||
|
VPMULHW Y0, Y4, Y13
|
||
|
VPMULHW Y0, Y5, Y0
|
||
|
VPMULHW Y1, Y4, Y1
|
||
|
VMOVDQA Y12, Y4
|
||
|
VMOVDQA Y13, Y5
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPSUBW Y9, Y5, Y5
|
||
|
VPSUBW Y10, Y0, Y0
|
||
|
VPSUBW Y11, Y1, Y1
|
||
|
VMOVDQU 864(BX), Y12
|
||
|
VMOVDQU 896(BX), Y13
|
||
|
VPMULLW Y4, Y12, Y8
|
||
|
VPMULHW Y4, Y13, Y4
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPADDW Y4, Y5, Y4
|
||
|
VPADDW Y0, Y1, Y5
|
||
|
VPMULLW Y3, Y7, Y8
|
||
|
VPMULLW Y2, Y6, Y9
|
||
|
VPMULLW Y2, Y7, Y10
|
||
|
VPMULLW Y3, Y6, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y3, Y7, Y12
|
||
|
VPMULHW Y2, Y6, Y13
|
||
|
VPMULHW Y2, Y7, Y2
|
||
|
VPMULHW Y3, Y6, Y3
|
||
|
VMOVDQA Y12, Y6
|
||
|
VMOVDQA Y13, Y7
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y9, Y7, Y7
|
||
|
VPSUBW Y10, Y2, Y2
|
||
|
VPSUBW Y11, Y3, Y3
|
||
|
VMOVDQU 864(BX), Y12
|
||
|
VMOVDQU 896(BX), Y13
|
||
|
VPMULLW Y6, Y12, Y8
|
||
|
VPMULHW Y6, Y13, Y6
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y6, Y7, Y6
|
||
|
VPADDW Y2, Y3, Y7
|
||
|
VMOVDQU Y4, 128(AX)
|
||
|
VMOVDQU Y5, 160(AX)
|
||
|
VMOVDQU Y6, 192(AX)
|
||
|
VMOVDQU Y7, 224(AX)
|
||
|
// Group 2: byte offsets 256..352, table vectors at 928(BX) and 960(BX).
VMOVDQU 256(CX), Y0
|
||
|
VMOVDQU 288(CX), Y1
|
||
|
VMOVDQU 320(CX), Y2
|
||
|
VMOVDQU 352(CX), Y3
|
||
|
VMOVDQU 256(DX), Y4
|
||
|
VMOVDQU 288(DX), Y5
|
||
|
VMOVDQU 320(DX), Y6
|
||
|
VMOVDQU 352(DX), Y7
|
||
|
VPMULLW Y1, Y5, Y8
|
||
|
VPMULLW Y0, Y4, Y9
|
||
|
VPMULLW Y0, Y5, Y10
|
||
|
VPMULLW Y1, Y4, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y1, Y5, Y12
|
||
|
VPMULHW Y0, Y4, Y13
|
||
|
VPMULHW Y0, Y5, Y0
|
||
|
VPMULHW Y1, Y4, Y1
|
||
|
VMOVDQA Y12, Y4
|
||
|
VMOVDQA Y13, Y5
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPSUBW Y9, Y5, Y5
|
||
|
VPSUBW Y10, Y0, Y0
|
||
|
VPSUBW Y11, Y1, Y1
|
||
|
VMOVDQU 928(BX), Y12
|
||
|
VMOVDQU 960(BX), Y13
|
||
|
VPMULLW Y4, Y12, Y8
|
||
|
VPMULHW Y4, Y13, Y4
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPADDW Y4, Y5, Y4
|
||
|
VPADDW Y0, Y1, Y5
|
||
|
VPMULLW Y3, Y7, Y8
|
||
|
VPMULLW Y2, Y6, Y9
|
||
|
VPMULLW Y2, Y7, Y10
|
||
|
VPMULLW Y3, Y6, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y3, Y7, Y12
|
||
|
VPMULHW Y2, Y6, Y13
|
||
|
VPMULHW Y2, Y7, Y2
|
||
|
VPMULHW Y3, Y6, Y3
|
||
|
VMOVDQA Y12, Y6
|
||
|
VMOVDQA Y13, Y7
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y9, Y7, Y7
|
||
|
VPSUBW Y10, Y2, Y2
|
||
|
VPSUBW Y11, Y3, Y3
|
||
|
VMOVDQU 928(BX), Y12
|
||
|
VMOVDQU 960(BX), Y13
|
||
|
VPMULLW Y6, Y12, Y8
|
||
|
VPMULHW Y6, Y13, Y6
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y6, Y7, Y6
|
||
|
VPADDW Y2, Y3, Y7
|
||
|
VMOVDQU Y4, 256(AX)
|
||
|
VMOVDQU Y5, 288(AX)
|
||
|
VMOVDQU Y6, 320(AX)
|
||
|
VMOVDQU Y7, 352(AX)
|
||
|
// Group 3: byte offsets 384..480, table vectors at 992(BX) and 1024(BX).
VMOVDQU 384(CX), Y0
|
||
|
VMOVDQU 416(CX), Y1
|
||
|
VMOVDQU 448(CX), Y2
|
||
|
VMOVDQU 480(CX), Y3
|
||
|
VMOVDQU 384(DX), Y4
|
||
|
VMOVDQU 416(DX), Y5
|
||
|
VMOVDQU 448(DX), Y6
|
||
|
VMOVDQU 480(DX), Y7
|
||
|
VPMULLW Y1, Y5, Y8
|
||
|
VPMULLW Y0, Y4, Y9
|
||
|
VPMULLW Y0, Y5, Y10
|
||
|
VPMULLW Y1, Y4, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y1, Y5, Y12
|
||
|
VPMULHW Y0, Y4, Y13
|
||
|
VPMULHW Y0, Y5, Y0
|
||
|
VPMULHW Y1, Y4, Y1
|
||
|
VMOVDQA Y12, Y4
|
||
|
VMOVDQA Y13, Y5
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPSUBW Y9, Y5, Y5
|
||
|
VPSUBW Y10, Y0, Y0
|
||
|
VPSUBW Y11, Y1, Y1
|
||
|
VMOVDQU 992(BX), Y12
|
||
|
VMOVDQU 1024(BX), Y13
|
||
|
VPMULLW Y4, Y12, Y8
|
||
|
VPMULHW Y4, Y13, Y4
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y4, Y4
|
||
|
VPADDW Y4, Y5, Y4
|
||
|
VPADDW Y0, Y1, Y5
|
||
|
VPMULLW Y3, Y7, Y8
|
||
|
VPMULLW Y2, Y6, Y9
|
||
|
VPMULLW Y2, Y7, Y10
|
||
|
VPMULLW Y3, Y6, Y11
|
||
|
VPMULLW Y8, Y14, Y8
|
||
|
VPMULLW Y9, Y14, Y9
|
||
|
VPMULLW Y10, Y14, Y10
|
||
|
VPMULLW Y11, Y14, Y11
|
||
|
VPMULHW Y3, Y7, Y12
|
||
|
VPMULHW Y2, Y6, Y13
|
||
|
VPMULHW Y2, Y7, Y2
|
||
|
VPMULHW Y3, Y6, Y3
|
||
|
VMOVDQA Y12, Y6
|
||
|
VMOVDQA Y13, Y7
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPMULHW Y9, Y15, Y9
|
||
|
VPMULHW Y10, Y15, Y10
|
||
|
VPMULHW Y11, Y15, Y11
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y9, Y7, Y7
|
||
|
VPSUBW Y10, Y2, Y2
|
||
|
VPSUBW Y11, Y3, Y3
|
||
|
VMOVDQU 992(BX), Y12
|
||
|
VMOVDQU 1024(BX), Y13
|
||
|
VPMULLW Y6, Y12, Y8
|
||
|
VPMULHW Y6, Y13, Y6
|
||
|
VPMULHW Y8, Y15, Y8
|
||
|
VPSUBW Y8, Y6, Y6
|
||
|
VPSUBW Y6, Y7, Y6
|
||
|
VPADDW Y2, Y3, Y7
|
||
|
VMOVDQU Y4, 384(AX)
|
||
|
VMOVDQU Y5, 416(AX)
|
||
|
VMOVDQU Y6, 448(AX)
|
||
|
VMOVDQU Y7, 480(AX)
|
||
|
RET
|
||
|
|
||
|
// func detangleAVX2(p *[256]int16)
|
||
|
// Requires: AVX, AVX2
//
// Permutes p in place. Each 256-byte half of p is loaded into Y0..Y7, run
// through four pairwise exchange stages and stored back to the same offsets:
//   1. 16-bit lanes  (VPSLLD/VPSRLD + VPBLENDW)    pairs (0,1) (2,3) (4,5) (6,7)
//   2. 32-bit lanes  (VMOVSLDUP/VPSRLQ + VPBLENDD) pairs (0,2) (1,3) (4,6) (5,7)
//   3. 64-bit lanes  (VPUNPCKLQDQ/VPUNPCKHQDQ)     pairs (0,1) (2,3) (4,5) (6,7)
//   4. 128-bit lanes (VPERM2I128 $0x20/$0x31)      pairs (0,2) (1,3) (4,6) (5,7)
// AX = p; Y8 is the only scratch register.
// tangleAVX2 runs the same four stages in the opposite order, so it
// presumably undoes this permutation - TODO confirm.
|
||
|
TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
|
||
|
MOVQ p+0(FP), AX
|
||
|
// First half: bytes 0..255 of p.
VMOVDQU (AX), Y0
|
||
|
VMOVDQU 32(AX), Y1
|
||
|
VMOVDQU 64(AX), Y2
|
||
|
VMOVDQU 96(AX), Y3
|
||
|
VMOVDQU 128(AX), Y4
|
||
|
VMOVDQU 160(AX), Y5
|
||
|
VMOVDQU 192(AX), Y6
|
||
|
VMOVDQU 224(AX), Y7
|
||
|
// Stage 1: 16-bit exchange. For a pair (A, B): A keeps its even words and
// takes B's even words in the odd slots; B keeps its odd words and takes A's
// odd words in the even slots.
VPSLLD $0x10, Y1, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y0, Y8
|
||
|
VPSRLD $0x10, Y0, Y0
|
||
|
VPBLENDW $0xaa, Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPSLLD $0x10, Y3, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y2, Y8
|
||
|
VPSRLD $0x10, Y2, Y2
|
||
|
VPBLENDW $0xaa, Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPSLLD $0x10, Y5, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y4, Y8
|
||
|
VPSRLD $0x10, Y4, Y4
|
||
|
VPBLENDW $0xaa, Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPSLLD $0x10, Y7, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y6, Y8
|
||
|
VPSRLD $0x10, Y6, Y6
|
||
|
VPBLENDW $0xaa, Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
// Stage 2: the same exchange on 32-bit lanes.
VMOVSLDUP Y2, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y0, Y8
|
||
|
VPSRLQ $0x20, Y0, Y0
|
||
|
VPBLENDD $0xaa, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VMOVSLDUP Y3, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y1, Y8
|
||
|
VPSRLQ $0x20, Y1, Y1
|
||
|
VPBLENDD $0xaa, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VMOVSLDUP Y6, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y4, Y8
|
||
|
VPSRLQ $0x20, Y4, Y4
|
||
|
VPBLENDD $0xaa, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VMOVSLDUP Y7, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y5, Y8
|
||
|
VPSRLQ $0x20, Y5, Y5
|
||
|
VPBLENDD $0xaa, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
// Stage 3: exchange 64-bit lanes via low/high quadword unpack.
VPUNPCKLQDQ Y1, Y0, Y8
|
||
|
VPUNPCKHQDQ Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPUNPCKLQDQ Y3, Y2, Y8
|
||
|
VPUNPCKHQDQ Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPUNPCKLQDQ Y5, Y4, Y8
|
||
|
VPUNPCKHQDQ Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPUNPCKLQDQ Y7, Y6, Y8
|
||
|
VPUNPCKHQDQ Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
// Stage 4: exchange 128-bit lanes; $0x20 gathers the two low lanes, $0x31 the
// two high lanes.
VPERM2I128 $0x20, Y2, Y0, Y8
|
||
|
VPERM2I128 $0x31, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPERM2I128 $0x20, Y3, Y1, Y8
|
||
|
VPERM2I128 $0x31, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VPERM2I128 $0x20, Y6, Y4, Y8
|
||
|
VPERM2I128 $0x31, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPERM2I128 $0x20, Y7, Y5, Y8
|
||
|
VPERM2I128 $0x31, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
VMOVDQU Y0, (AX)
|
||
|
VMOVDQU Y1, 32(AX)
|
||
|
VMOVDQU Y2, 64(AX)
|
||
|
VMOVDQU Y3, 96(AX)
|
||
|
VMOVDQU Y4, 128(AX)
|
||
|
VMOVDQU Y5, 160(AX)
|
||
|
VMOVDQU Y6, 192(AX)
|
||
|
VMOVDQU Y7, 224(AX)
|
||
|
// Second half: bytes 256..511 of p, same four stages.
VMOVDQU 256(AX), Y0
|
||
|
VMOVDQU 288(AX), Y1
|
||
|
VMOVDQU 320(AX), Y2
|
||
|
VMOVDQU 352(AX), Y3
|
||
|
VMOVDQU 384(AX), Y4
|
||
|
VMOVDQU 416(AX), Y5
|
||
|
VMOVDQU 448(AX), Y6
|
||
|
VMOVDQU 480(AX), Y7
|
||
|
VPSLLD $0x10, Y1, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y0, Y8
|
||
|
VPSRLD $0x10, Y0, Y0
|
||
|
VPBLENDW $0xaa, Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPSLLD $0x10, Y3, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y2, Y8
|
||
|
VPSRLD $0x10, Y2, Y2
|
||
|
VPBLENDW $0xaa, Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPSLLD $0x10, Y5, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y4, Y8
|
||
|
VPSRLD $0x10, Y4, Y4
|
||
|
VPBLENDW $0xaa, Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPSLLD $0x10, Y7, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y6, Y8
|
||
|
VPSRLD $0x10, Y6, Y6
|
||
|
VPBLENDW $0xaa, Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
VMOVSLDUP Y2, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y0, Y8
|
||
|
VPSRLQ $0x20, Y0, Y0
|
||
|
VPBLENDD $0xaa, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VMOVSLDUP Y3, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y1, Y8
|
||
|
VPSRLQ $0x20, Y1, Y1
|
||
|
VPBLENDD $0xaa, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VMOVSLDUP Y6, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y4, Y8
|
||
|
VPSRLQ $0x20, Y4, Y4
|
||
|
VPBLENDD $0xaa, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VMOVSLDUP Y7, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y5, Y8
|
||
|
VPSRLQ $0x20, Y5, Y5
|
||
|
VPBLENDD $0xaa, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
VPUNPCKLQDQ Y1, Y0, Y8
|
||
|
VPUNPCKHQDQ Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPUNPCKLQDQ Y3, Y2, Y8
|
||
|
VPUNPCKHQDQ Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPUNPCKLQDQ Y5, Y4, Y8
|
||
|
VPUNPCKHQDQ Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPUNPCKLQDQ Y7, Y6, Y8
|
||
|
VPUNPCKHQDQ Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
VPERM2I128 $0x20, Y2, Y0, Y8
|
||
|
VPERM2I128 $0x31, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPERM2I128 $0x20, Y3, Y1, Y8
|
||
|
VPERM2I128 $0x31, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VPERM2I128 $0x20, Y6, Y4, Y8
|
||
|
VPERM2I128 $0x31, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPERM2I128 $0x20, Y7, Y5, Y8
|
||
|
VPERM2I128 $0x31, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
VMOVDQU Y0, 256(AX)
|
||
|
VMOVDQU Y1, 288(AX)
|
||
|
VMOVDQU Y2, 320(AX)
|
||
|
VMOVDQU Y3, 352(AX)
|
||
|
VMOVDQU Y4, 384(AX)
|
||
|
VMOVDQU Y5, 416(AX)
|
||
|
VMOVDQU Y6, 448(AX)
|
||
|
VMOVDQU Y7, 480(AX)
|
||
|
RET
|
||
|
|
||
|
// func tangleAVX2(p *[256]int16)
|
||
|
// Requires: AVX, AVX2
//
// Permutes p in place. Each 256-byte half of p is loaded into Y0..Y7, run
// through four pairwise exchange stages and stored back to the same offsets:
//   1. 128-bit lanes (VPERM2I128 $0x20/$0x31)      pairs (0,2) (1,3) (4,6) (5,7)
//   2. 64-bit lanes  (VPUNPCKLQDQ/VPUNPCKHQDQ)     pairs (0,1) (2,3) (4,5) (6,7)
//   3. 32-bit lanes  (VMOVSLDUP/VPSRLQ + VPBLENDD) pairs (0,2) (1,3) (4,6) (5,7)
//   4. 16-bit lanes  (VPSLLD/VPSRLD + VPBLENDW)    pairs (0,1) (2,3) (4,5) (6,7)
// AX = p; Y8 is the only scratch register.
// These are the stages of detangleAVX2 in the opposite order, so this
// presumably undoes that permutation - TODO confirm.
|
||
|
TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
|
||
|
MOVQ p+0(FP), AX
|
||
|
// First half: bytes 0..255 of p.
VMOVDQU (AX), Y0
|
||
|
VMOVDQU 32(AX), Y1
|
||
|
VMOVDQU 64(AX), Y2
|
||
|
VMOVDQU 96(AX), Y3
|
||
|
VMOVDQU 128(AX), Y4
|
||
|
VMOVDQU 160(AX), Y5
|
||
|
VMOVDQU 192(AX), Y6
|
||
|
VMOVDQU 224(AX), Y7
|
||
|
// Stage 1: exchange 128-bit lanes; $0x20 gathers the two low lanes, $0x31 the
// two high lanes.
VPERM2I128 $0x20, Y2, Y0, Y8
|
||
|
VPERM2I128 $0x31, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPERM2I128 $0x20, Y3, Y1, Y8
|
||
|
VPERM2I128 $0x31, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VPERM2I128 $0x20, Y6, Y4, Y8
|
||
|
VPERM2I128 $0x31, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPERM2I128 $0x20, Y7, Y5, Y8
|
||
|
VPERM2I128 $0x31, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
// Stage 2: exchange 64-bit lanes via low/high quadword unpack.
VPUNPCKLQDQ Y1, Y0, Y8
|
||
|
VPUNPCKHQDQ Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPUNPCKLQDQ Y3, Y2, Y8
|
||
|
VPUNPCKHQDQ Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPUNPCKLQDQ Y5, Y4, Y8
|
||
|
VPUNPCKHQDQ Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPUNPCKLQDQ Y7, Y6, Y8
|
||
|
VPUNPCKHQDQ Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
// Stage 3: 32-bit exchange. For a pair (A, B): A keeps its even dwords and
// takes B's even dwords in the odd slots; B keeps its odd dwords and takes A's
// odd dwords in the even slots.
VMOVSLDUP Y2, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y0, Y8
|
||
|
VPSRLQ $0x20, Y0, Y0
|
||
|
VPBLENDD $0xaa, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VMOVSLDUP Y3, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y1, Y8
|
||
|
VPSRLQ $0x20, Y1, Y1
|
||
|
VPBLENDD $0xaa, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VMOVSLDUP Y6, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y4, Y8
|
||
|
VPSRLQ $0x20, Y4, Y4
|
||
|
VPBLENDD $0xaa, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VMOVSLDUP Y7, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y5, Y8
|
||
|
VPSRLQ $0x20, Y5, Y5
|
||
|
VPBLENDD $0xaa, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
// Stage 4: the same exchange on 16-bit lanes.
VPSLLD $0x10, Y1, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y0, Y8
|
||
|
VPSRLD $0x10, Y0, Y0
|
||
|
VPBLENDW $0xaa, Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPSLLD $0x10, Y3, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y2, Y8
|
||
|
VPSRLD $0x10, Y2, Y2
|
||
|
VPBLENDW $0xaa, Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPSLLD $0x10, Y5, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y4, Y8
|
||
|
VPSRLD $0x10, Y4, Y4
|
||
|
VPBLENDW $0xaa, Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPSLLD $0x10, Y7, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y6, Y8
|
||
|
VPSRLD $0x10, Y6, Y6
|
||
|
VPBLENDW $0xaa, Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
VMOVDQU Y0, (AX)
|
||
|
VMOVDQU Y1, 32(AX)
|
||
|
VMOVDQU Y2, 64(AX)
|
||
|
VMOVDQU Y3, 96(AX)
|
||
|
VMOVDQU Y4, 128(AX)
|
||
|
VMOVDQU Y5, 160(AX)
|
||
|
VMOVDQU Y6, 192(AX)
|
||
|
VMOVDQU Y7, 224(AX)
|
||
|
// Second half: bytes 256..511 of p, same four stages.
VMOVDQU 256(AX), Y0
|
||
|
VMOVDQU 288(AX), Y1
|
||
|
VMOVDQU 320(AX), Y2
|
||
|
VMOVDQU 352(AX), Y3
|
||
|
VMOVDQU 384(AX), Y4
|
||
|
VMOVDQU 416(AX), Y5
|
||
|
VMOVDQU 448(AX), Y6
|
||
|
VMOVDQU 480(AX), Y7
|
||
|
VPERM2I128 $0x20, Y2, Y0, Y8
|
||
|
VPERM2I128 $0x31, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPERM2I128 $0x20, Y3, Y1, Y8
|
||
|
VPERM2I128 $0x31, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VPERM2I128 $0x20, Y6, Y4, Y8
|
||
|
VPERM2I128 $0x31, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPERM2I128 $0x20, Y7, Y5, Y8
|
||
|
VPERM2I128 $0x31, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
VPUNPCKLQDQ Y1, Y0, Y8
|
||
|
VPUNPCKHQDQ Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPUNPCKLQDQ Y3, Y2, Y8
|
||
|
VPUNPCKHQDQ Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPUNPCKLQDQ Y5, Y4, Y8
|
||
|
VPUNPCKHQDQ Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPUNPCKLQDQ Y7, Y6, Y8
|
||
|
VPUNPCKHQDQ Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
VMOVSLDUP Y2, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y0, Y8
|
||
|
VPSRLQ $0x20, Y0, Y0
|
||
|
VPBLENDD $0xaa, Y2, Y0, Y2
|
||
|
VMOVDQA Y8, Y0
|
||
|
VMOVSLDUP Y3, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y1, Y8
|
||
|
VPSRLQ $0x20, Y1, Y1
|
||
|
VPBLENDD $0xaa, Y3, Y1, Y3
|
||
|
VMOVDQA Y8, Y1
|
||
|
VMOVSLDUP Y6, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y4, Y8
|
||
|
VPSRLQ $0x20, Y4, Y4
|
||
|
VPBLENDD $0xaa, Y6, Y4, Y6
|
||
|
VMOVDQA Y8, Y4
|
||
|
VMOVSLDUP Y7, Y8
|
||
|
VPBLENDD $0xaa, Y8, Y5, Y8
|
||
|
VPSRLQ $0x20, Y5, Y5
|
||
|
VPBLENDD $0xaa, Y7, Y5, Y7
|
||
|
VMOVDQA Y8, Y5
|
||
|
VPSLLD $0x10, Y1, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y0, Y8
|
||
|
VPSRLD $0x10, Y0, Y0
|
||
|
VPBLENDW $0xaa, Y1, Y0, Y1
|
||
|
VMOVDQA Y8, Y0
|
||
|
VPSLLD $0x10, Y3, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y2, Y8
|
||
|
VPSRLD $0x10, Y2, Y2
|
||
|
VPBLENDW $0xaa, Y3, Y2, Y3
|
||
|
VMOVDQA Y8, Y2
|
||
|
VPSLLD $0x10, Y5, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y4, Y8
|
||
|
VPSRLD $0x10, Y4, Y4
|
||
|
VPBLENDW $0xaa, Y5, Y4, Y5
|
||
|
VMOVDQA Y8, Y4
|
||
|
VPSLLD $0x10, Y7, Y8
|
||
|
VPBLENDW $0xaa, Y8, Y6, Y8
|
||
|
VPSRLD $0x10, Y6, Y6
|
||
|
VPBLENDW $0xaa, Y7, Y6, Y7
|
||
|
VMOVDQA Y8, Y6
|
||
|
VMOVDQU Y0, 256(AX)
|
||
|
VMOVDQU Y1, 288(AX)
|
||
|
VMOVDQU Y2, 320(AX)
|
||
|
VMOVDQU Y3, 352(AX)
|
||
|
VMOVDQU Y4, 384(AX)
|
||
|
VMOVDQU Y5, 416(AX)
|
||
|
VMOVDQU Y6, 448(AX)
|
||
|
VMOVDQU Y7, 480(AX)
|
||
|
RET
|
||
|
|
||
|
// func barrettReduceAVX2(p *[256]int16)
|
||
|
// Requires: AVX, AVX2
//
// Reduces every 16-bit lane x of p in place:
//   x -= ((x * 20159) >> 26) * 3329
// where the shift is arithmetic: VPMULHW supplies the signed >> 16 and VPSRAW
// the remaining >> 10. 20159 is 2^26/3329 rounded up.
// The 512 bytes of p are handled as four groups of four 32-byte vectors.
// Register roles: AX = p, Y8 = 20159 (0x4ebf) in every lane,
//   Y9 = 3329 (0x0d01) in every lane, Y0..Y3 = data, Y4..Y7 = quotients.
|
||
|
TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
|
||
|
MOVQ p+0(FP), AX
|
||
|
// Broadcast the two constants; only the low 16 bits of CX are used.
MOVL $0x00000d01, CX
|
||
|
VMOVD CX, X0
|
||
|
VPBROADCASTW X0, Y9
|
||
|
MOVL $0x00004ebf, CX
|
||
|
VMOVD CX, X0
|
||
|
VPBROADCASTW X0, Y8
|
||
|
// Group 0: bytes 0..127.
VMOVDQU (AX), Y0
|
||
|
VMOVDQU 32(AX), Y1
|
||
|
VMOVDQU 64(AX), Y2
|
||
|
VMOVDQU 96(AX), Y3
|
||
|
// Quotient estimate: (x * 20159) >> 26.
VPMULHW Y8, Y0, Y4
|
||
|
VPMULHW Y8, Y1, Y5
|
||
|
VPMULHW Y8, Y2, Y6
|
||
|
VPMULHW Y8, Y3, Y7
|
||
|
VPSRAW $0x0a, Y4, Y4
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPSRAW $0x0a, Y6, Y6
|
||
|
VPSRAW $0x0a, Y7, Y7
|
||
|
// Subtract quotient * 3329 from the input.
VPMULLW Y9, Y4, Y4
|
||
|
VPMULLW Y9, Y5, Y5
|
||
|
VPMULLW Y9, Y6, Y6
|
||
|
VPMULLW Y9, Y7, Y7
|
||
|
VPSUBW Y4, Y0, Y0
|
||
|
VPSUBW Y5, Y1, Y1
|
||
|
VPSUBW Y6, Y2, Y2
|
||
|
VPSUBW Y7, Y3, Y3
|
||
|
VMOVDQU Y0, (AX)
|
||
|
VMOVDQU Y1, 32(AX)
|
||
|
VMOVDQU Y2, 64(AX)
|
||
|
VMOVDQU Y3, 96(AX)
|
||
|
// Group 1: bytes 128..255.
VMOVDQU 128(AX), Y0
|
||
|
VMOVDQU 160(AX), Y1
|
||
|
VMOVDQU 192(AX), Y2
|
||
|
VMOVDQU 224(AX), Y3
|
||
|
VPMULHW Y8, Y0, Y4
|
||
|
VPMULHW Y8, Y1, Y5
|
||
|
VPMULHW Y8, Y2, Y6
|
||
|
VPMULHW Y8, Y3, Y7
|
||
|
VPSRAW $0x0a, Y4, Y4
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPSRAW $0x0a, Y6, Y6
|
||
|
VPSRAW $0x0a, Y7, Y7
|
||
|
VPMULLW Y9, Y4, Y4
|
||
|
VPMULLW Y9, Y5, Y5
|
||
|
VPMULLW Y9, Y6, Y6
|
||
|
VPMULLW Y9, Y7, Y7
|
||
|
VPSUBW Y4, Y0, Y0
|
||
|
VPSUBW Y5, Y1, Y1
|
||
|
VPSUBW Y6, Y2, Y2
|
||
|
VPSUBW Y7, Y3, Y3
|
||
|
VMOVDQU Y0, 128(AX)
|
||
|
VMOVDQU Y1, 160(AX)
|
||
|
VMOVDQU Y2, 192(AX)
|
||
|
VMOVDQU Y3, 224(AX)
|
||
|
// Group 2: bytes 256..383.
VMOVDQU 256(AX), Y0
|
||
|
VMOVDQU 288(AX), Y1
|
||
|
VMOVDQU 320(AX), Y2
|
||
|
VMOVDQU 352(AX), Y3
|
||
|
VPMULHW Y8, Y0, Y4
|
||
|
VPMULHW Y8, Y1, Y5
|
||
|
VPMULHW Y8, Y2, Y6
|
||
|
VPMULHW Y8, Y3, Y7
|
||
|
VPSRAW $0x0a, Y4, Y4
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPSRAW $0x0a, Y6, Y6
|
||
|
VPSRAW $0x0a, Y7, Y7
|
||
|
VPMULLW Y9, Y4, Y4
|
||
|
VPMULLW Y9, Y5, Y5
|
||
|
VPMULLW Y9, Y6, Y6
|
||
|
VPMULLW Y9, Y7, Y7
|
||
|
VPSUBW Y4, Y0, Y0
|
||
|
VPSUBW Y5, Y1, Y1
|
||
|
VPSUBW Y6, Y2, Y2
|
||
|
VPSUBW Y7, Y3, Y3
|
||
|
VMOVDQU Y0, 256(AX)
|
||
|
VMOVDQU Y1, 288(AX)
|
||
|
VMOVDQU Y2, 320(AX)
|
||
|
VMOVDQU Y3, 352(AX)
|
||
|
// Group 3: bytes 384..511.
VMOVDQU 384(AX), Y0
|
||
|
VMOVDQU 416(AX), Y1
|
||
|
VMOVDQU 448(AX), Y2
|
||
|
VMOVDQU 480(AX), Y3
|
||
|
VPMULHW Y8, Y0, Y4
|
||
|
VPMULHW Y8, Y1, Y5
|
||
|
VPMULHW Y8, Y2, Y6
|
||
|
VPMULHW Y8, Y3, Y7
|
||
|
VPSRAW $0x0a, Y4, Y4
|
||
|
VPSRAW $0x0a, Y5, Y5
|
||
|
VPSRAW $0x0a, Y6, Y6
|
||
|
VPSRAW $0x0a, Y7, Y7
|
||
|
VPMULLW Y9, Y4, Y4
|
||
|
VPMULLW Y9, Y5, Y5
|
||
|
VPMULLW Y9, Y6, Y6
|
||
|
VPMULLW Y9, Y7, Y7
|
||
|
VPSUBW Y4, Y0, Y0
|
||
|
VPSUBW Y5, Y1, Y1
|
||
|
VPSUBW Y6, Y2, Y2
|
||
|
VPSUBW Y7, Y3, Y3
|
||
|
VMOVDQU Y0, 384(AX)
|
||
|
VMOVDQU Y1, 416(AX)
|
||
|
VMOVDQU Y2, 448(AX)
|
||
|
VMOVDQU Y3, 480(AX)
|
||
|
RET
|
||
|
|
||
|
// NORMALIZE_64(off) normalizes the 32 int16 coefficients stored in the 64
// bytes at off(AX)..off+127(AX), i.e. four YMM vectors, in place.
//
// Register contract (set up by normalizeAVX2 below):
//   AX     = base pointer of the coefficient array
//   Y8     = 20159 (0x4ebf) in every 16-bit lane, about 2^26 / q
//   Y9     = q = 3329 (0x0d01) in every 16-bit lane
//   Y0-Y3  = coefficients being processed (clobbered)
//   Y4-Y7  = temporaries (clobbered)
//
// Per 16-bit lane x the macro computes, using wrapping 16-bit arithmetic:
//   t  = ((x * 20159) >> 16) >> 10     signed high multiply, then arithmetic
//                                      shift: t = (x * 20159) >> 26
//   x -= t * q                         Barrett step, leaves 0 <= x <= q
//   x -= q                             now -q <= x <= 0
//   x += q & (x >> 15)                 add q back only where x went negative
// so every lane ends up in [0, q).
//
// The instruction order is the same interleaved four-vector schedule as the
// fully unrolled form, so the expansion assembles to the same sequence.
#define NORMALIZE_64(off) \
	VMOVDQU (off+0)(AX), Y0; \
	VMOVDQU (off+32)(AX), Y1; \
	VMOVDQU (off+64)(AX), Y2; \
	VMOVDQU (off+96)(AX), Y3; \
	VPMULHW Y8, Y0, Y4; \
	VPMULHW Y8, Y1, Y5; \
	VPMULHW Y8, Y2, Y6; \
	VPMULHW Y8, Y3, Y7; \
	VPSRAW  $10, Y4, Y4; \
	VPSRAW  $10, Y5, Y5; \
	VPSRAW  $10, Y6, Y6; \
	VPSRAW  $10, Y7, Y7; \
	VPMULLW Y9, Y4, Y4; \
	VPMULLW Y9, Y5, Y5; \
	VPMULLW Y9, Y6, Y6; \
	VPMULLW Y9, Y7, Y7; \
	VPSUBW  Y4, Y0, Y0; \
	VPSUBW  Y5, Y1, Y1; \
	VPSUBW  Y6, Y2, Y2; \
	VPSUBW  Y7, Y3, Y3; \
	VPSUBW  Y9, Y0, Y0; \
	VPSUBW  Y9, Y1, Y1; \
	VPSUBW  Y9, Y2, Y2; \
	VPSUBW  Y9, Y3, Y3; \
	VPSRAW  $15, Y0, Y4; \
	VPSRAW  $15, Y1, Y5; \
	VPSRAW  $15, Y2, Y6; \
	VPSRAW  $15, Y3, Y7; \
	VPAND   Y4, Y9, Y4; \
	VPAND   Y5, Y9, Y5; \
	VPAND   Y6, Y9, Y6; \
	VPAND   Y7, Y9, Y7; \
	VPADDW  Y0, Y4, Y0; \
	VPADDW  Y1, Y5, Y1; \
	VPADDW  Y2, Y6, Y2; \
	VPADDW  Y3, Y7, Y3; \
	VMOVDQU Y0, (off+0)(AX); \
	VMOVDQU Y1, (off+32)(AX); \
	VMOVDQU Y2, (off+64)(AX); \
	VMOVDQU Y3, (off+96)(AX)

// func normalizeAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Replaces, in place, each of the 256 signed 16-bit coefficients at p with
// its representative modulo q = 3329 in the range [0, q).
//
// In:    p+0(FP) = pointer to 256 int16 values (512 bytes); unaligned
//                  loads/stores are used, so no alignment is required.
// Out:   none (the array at p is overwritten).
// Clobb: AX, CX, X0, Y0-Y9.
// Stack: no frame, 8 bytes of arguments; NOSPLIT leaf.
TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
	MOVQ p+0(FP), AX

	// Y9 = q broadcast to all sixteen 16-bit lanes.
	MOVL         $0x00000d01, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y9

	// Y8 = Barrett multiplier 20159 broadcast to all sixteen 16-bit lanes.
	MOVL         $0x00004ebf, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y8

	// 512 bytes = 4 groups of 4 YMM vectors (64 coefficients per group).
	NORMALIZE_64(0)
	NORMALIZE_64(128)
	NORMALIZE_64(256)
	NORMALIZE_64(384)

	// Clear the upper YMM halves before returning so that non-VEX SSE code
	// executed afterwards does not pay AVX/SSE transition penalties.
	VZEROUPPER
	RET

#undef NORMALIZE_64
|