// Source path: xs/vendor/github.com/templexxx/xor/avx2_amd64.s
// Listing metadata: 439 lines, 7.3 KiB, Go assembly.

#include "textflag.h"
// Register aliases shared by every function in this file.
//
// Buffer base addresses.
// DST  - base of the destination buffer (all functions).
// SRC  - base of the array of source slice headers (matrix* functions).
// SRC0 - base of the first source buffer (bytes* functions); aliases TMP4.
// SRC1 - base of the second source buffer (bytes* functions); aliases TMP5.
//
// NOTE: the hand-encoded VMOVNTDQ byte sequences in the *big functions
// address memory as (BX)(R8*1), so DST must stay BX and POS must stay R8.
#define DST BX
#define SRC SI
#define SRC0 TMP4
#define SRC1 TMP5
// Loop arguments.
// VECT - number of source vectors (matrix* functions).
// LEN  - number of bytes still to be processed.
#define VECT CX
#define LEN DX
// POS - byte offset of the current step inside the wide (32/128-byte) loop.
#define POS R8
// Scratch registers. Their role changes from loop to loop:
// TMP1 - remaining-vector counter in matrix* loops, tail byte counter or
//        byte operand in bytes* loops.
#define TMP1 R9
// TMP2 - byte offset into the array of source slice headers in matrix* loops,
//        data operand in bytes* loops.
#define TMP2 R10
// TMP3 - data pointer of the current source vector in matrix* loops,
//        data operand in bytes* loops.
#define TMP3 R11
// TMP4/TMP5 double as SRC0/SRC1 (see above); in matrix* functions TMP4 is a
// second copy of the vector pointer or the 8-byte tail counter, and TMP5/TMP6
// hold the running XOR value and the freshly loaded operand in the tail loops.
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
//
// Writes src0[i] ^ src1[i] to dst[i] for every i in [0, size).
// Only the base pointers of the three slices and size are read; the slice
// lengths are not consulted, so the caller must make sure all three buffers
// hold at least size bytes.
//
// Strategy: if size is not a multiple of 32, the tail is peeled off from the
// END of the buffers -- single bytes until the remaining length is a multiple
// of 8, then 8-byte words until it is a multiple of 32. The remaining prefix
// [0, LEN) is then processed 32 bytes per iteration with AVX2.
//
// Registers: DST/SRC0/SRC1 = base pointers, LEN = bytes still to process,
// POS = offset inside the 32-byte loop, TMP1-TMP3 = scratch.
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret                       // nothing to do
	MOVQ  dst_base+0(FP), DST
	MOVQ  src0_base+24(FP), SRC0
	MOVQ  src1_base+48(FP), SRC1
	TESTQ $31, LEN
	JNZ   not_aligned               // size is not a multiple of 32

// Here LEN is a non-zero multiple of 32.
aligned:
	MOVQ $0, POS

loop32b:
	VMOVDQU (SRC0)(POS*1), Y0
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VMOVDQU Y0, (DST)(POS*1)
	ADDQ    $32, POS
	CMPQ    LEN, POS
	JNE     loop32b
	VZEROUPPER                      // leave no dirty upper YMM state behind
	RET

// Peel single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret                       // the whole buffer was shorter than 8 bytes
	TESTQ $31, LEN
	JZ    aligned                   // remaining prefix is a multiple of 32

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $31, TMP1                 // TMP1 = tail bytes to handle 8 at a time

// Peel 8-byte words off the end until LEN is a multiple of 32.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $32
	JGE  aligned                    // at least one full 32-byte step remains
	RET                             // no AVX register was touched on this path

ret:
	RET
// func bytesAVX2small(dst, src0, src1 []byte, size int)
//
// Writes src0[i] ^ src1[i] to dst[i] for every i in [0, size), using ordinary
// (cached) stores. Only the base pointers of the three slices and size are
// read; the slice lengths are not consulted, so the caller must make sure all
// three buffers hold at least size bytes.
//
// Strategy: if size is not a multiple of 128, the tail is peeled off from the
// END of the buffers -- single bytes until the remaining length is a multiple
// of 8, then 8-byte words until it is a multiple of 128. The remaining prefix
// [0, LEN) is then processed 128 bytes (4 YMM registers) per iteration.
//
// Registers: DST/SRC0/SRC1 = base pointers, LEN = bytes still to process,
// POS = offset inside the 128-byte loop, TMP1-TMP3 = scratch.
TEXT ·bytesAVX2small(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret                       // nothing to do
	MOVQ  dst_base+0(FP), DST
	MOVQ  src0_base+24(FP), SRC0
	MOVQ  src1_base+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned               // size is not a multiple of 128

// Here LEN is a non-zero multiple of 128.
aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)
	ADDQ    $128, POS
	CMPQ    LEN, POS
	JNE     loop128b
	VZEROUPPER                      // leave no dirty upper YMM state behind
	RET

// Peel single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret                       // the whole buffer was shorter than 8 bytes
	TESTQ $127, LEN
	JZ    aligned                   // remaining prefix is a multiple of 128

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1                // TMP1 = tail bytes to handle 8 at a time

// Peel 8-byte words off the end until LEN is a multiple of 128.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $128
	JGE  aligned                    // at least one full 128-byte step remains
	RET                             // no AVX register was touched on this path

ret:
	RET
// func bytesAVX2big(dst, src0, src1 []byte, size int)
//
// Writes src0[i] ^ src1[i] to dst[i] for every i in [0, size). The 128-byte
// main loop writes dst with non-temporal stores (VMOVNTDQ) and is followed by
// an SFENCE; the tail (fewer than 128 bytes) uses ordinary stores.
// Only the base pointers of the three slices and size are read; the slice
// lengths are not consulted, so the caller must make sure all three buffers
// hold at least size bytes.
//
// NOTE(review): VMOVNTDQ with a YMM source requires a 32-byte-aligned memory
// operand. The main loop stores to DST+POS with POS a multiple of 128, so dst
// itself must be 32-byte aligned -- confirm the Go caller guarantees this.
//
// Strategy: if size is not a multiple of 128, the tail is peeled off from the
// END of the buffers -- single bytes until the remaining length is a multiple
// of 8, then 8-byte words until it is a multiple of 128. The remaining prefix
// [0, LEN) is then processed 128 bytes (4 YMM registers) per iteration.
//
// Registers: DST/SRC0/SRC1 = base pointers, LEN = bytes still to process,
// POS = offset inside the 128-byte loop, TMP1-TMP3 = scratch.
TEXT ·bytesAVX2big(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret                       // nothing to do
	MOVQ  dst_base+0(FP), DST
	MOVQ  src0_base+24(FP), SRC0
	MOVQ  src1_base+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned               // size is not a multiple of 128

// Here LEN is a non-zero multiple of 128.
aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3

	// Hand-encoded non-temporal stores. The bytes hard-code BX (DST) as base
	// and R8 (POS) as index, so they must track the #define aliases.
	// VMOVNTDQ Y0, (BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x0304
	// VMOVNTDQ Y1, 32(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
	// VMOVNTDQ Y2, 64(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
	// VMOVNTDQ Y3, 96(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE                          // order the non-temporal stores before returning
	VZEROUPPER                      // leave no dirty upper YMM state behind
	RET

// Peel single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret                       // the whole buffer was shorter than 8 bytes
	TESTQ $127, LEN
	JZ    aligned                   // remaining prefix is a multiple of 128

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1                // TMP1 = tail bytes to handle 8 at a time

// Peel 8-byte words off the end until LEN is a multiple of 128.
loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $128
	JGE  aligned                    // at least one full 128-byte step remains
	RET                             // no AVX register was touched on this path

ret:
	RET
// func matrixAVX2small(dst []byte, src [][]byte)
//
// Writes src[0][i] ^ src[1][i] ^ ... ^ src[len(src)-1][i] to dst[i] for every
// i in [0, len(dst)), using ordinary (cached) stores.
//
// Preconditions (not checked here):
//   - len(src) >= 2: every inner loop loads src[0] and then unconditionally
//     src[1] before it tests the vector counter.
//   - every src[k] holds at least len(dst) bytes; the lengths of the inner
//     slices are never read.
//
// A zero-length dst returns immediately. Without that guard a length of 0
// would pass the "multiple of 128" test and enter the 128-byte loop.
//
// Strategy: if len(dst) is not a multiple of 128, the tail is peeled off from
// the END -- single bytes until the remaining length is a multiple of 8, then
// 8-byte words until it is a multiple of 128. The remaining prefix [0, LEN) is
// then processed 128 bytes (4 YMM registers) per iteration.
//
// Registers: DST = dst base, SRC = base of the array of 24-byte slice headers,
// VECT = len(src), LEN = bytes still to process, POS = offset inside the
// 128-byte loop, TMP1 = vectors left, TMP2 = byte offset of the current slice
// header, TMP3/TMP4 = data pointer of the current vector.
TEXT ·matrixAVX2small(SB), NOSPLIT, $0-48
	MOVQ  dst_base+0(FP), DST
	MOVQ  src_base+24(FP), SRC
	MOVQ  src_len+32(FP), VECT
	MOVQ  dst_len+8(FP), LEN
	CMPQ  LEN, $0
	JE    ret                       // empty dst: nothing to do
	TESTQ $127, LEN
	JNZ   not_aligned               // length is not a multiple of 128

// Here LEN is a non-zero multiple of 128.
aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1                // next_vect runs TMP1+1 = VECT-1 times
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3     // data pointer of src[0]
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2               // advance to the next slice header
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect
	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)
	ADDQ    $128, POS
	CMPQ    LEN, POS
	JNE     loop128b
	VZEROUPPER                      // leave no dirty upper YMM state behind
	RET

// Peel single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5      // running XOR starts with src[0]

next_vect_1b:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVB  -1(TMP3)(LEN*1), TMP6
	XORB  TMP6, TMP5
	SUBQ  $1, TMP1
	JGE   next_vect_1b
	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret                       // the whole buffer was shorter than 8 bytes
	TESTQ $127, LEN
	JZ    aligned                   // remaining prefix is a multiple of 128

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4                // TMP4 = tail bytes to handle 8 at a time

// Peel 8-byte words off the end until LEN is a multiple of 128.
// TMP4 is reused by the 128-byte loop afterwards, which is safe because this
// loop is never re-entered once control reaches "aligned".
loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5      // running XOR starts with src[0]

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b
	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b
	CMPQ LEN, $128
	JGE  aligned                    // at least one full 128-byte step remains
	RET                             // no AVX register was touched on this path

ret:
	RET
// func matrixAVX2big(dst []byte, src [][]byte)
//
// Writes src[0][i] ^ src[1][i] ^ ... ^ src[len(src)-1][i] to dst[i] for every
// i in [0, len(dst)). The 128-byte main loop writes dst with non-temporal
// stores (VMOVNTDQ) and is followed by an SFENCE, matching bytesAVX2big; the
// tail (fewer than 128 bytes) uses ordinary stores.
//
// Preconditions (not checked here):
//   - len(src) >= 2: every inner loop loads src[0] and then unconditionally
//     src[1] before it tests the vector counter.
//   - every src[k] holds at least len(dst) bytes; the lengths of the inner
//     slices are never read.
//
// NOTE(review): VMOVNTDQ with a YMM source requires a 32-byte-aligned memory
// operand. The main loop stores to DST+POS with POS a multiple of 128, so dst
// itself must be 32-byte aligned -- confirm the Go caller guarantees this.
//
// A zero-length dst returns immediately. Without that guard a length of 0
// would pass the "multiple of 128" test and enter the 128-byte loop.
//
// Strategy: if len(dst) is not a multiple of 128, the tail is peeled off from
// the END -- single bytes until the remaining length is a multiple of 8, then
// 8-byte words until it is a multiple of 128. The remaining prefix [0, LEN) is
// then processed 128 bytes (4 YMM registers) per iteration.
//
// Registers: DST = dst base, SRC = base of the array of 24-byte slice headers,
// VECT = len(src), LEN = bytes still to process, POS = offset inside the
// 128-byte loop, TMP1 = vectors left, TMP2 = byte offset of the current slice
// header, TMP3/TMP4 = data pointer of the current vector.
TEXT ·matrixAVX2big(SB), NOSPLIT, $0-48
	MOVQ  dst_base+0(FP), DST
	MOVQ  src_base+24(FP), SRC
	MOVQ  src_len+32(FP), VECT
	MOVQ  dst_len+8(FP), LEN
	CMPQ  LEN, $0
	JE    ret                       // empty dst: nothing to do
	TESTQ $127, LEN
	JNZ   not_aligned               // length is not a multiple of 128

// Here LEN is a non-zero multiple of 128.
aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1                // next_vect runs TMP1+1 = VECT-1 times
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3     // data pointer of src[0]
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2               // advance to the next slice header
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect

	// Hand-encoded non-temporal stores. The bytes hard-code BX (DST) as base
	// and R8 (POS) as index, so they must track the #define aliases.
	// VMOVNTDQ Y0, (BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x0304
	// VMOVNTDQ Y1, 32(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
	// VMOVNTDQ Y2, 64(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
	// VMOVNTDQ Y3, 96(BX)(R8*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60

	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE                          // order the non-temporal stores before returning
	VZEROUPPER                      // leave no dirty upper YMM state behind
	RET

// Peel single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5      // running XOR starts with src[0]

next_vect_1b:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVB  -1(TMP3)(LEN*1), TMP6
	XORB  TMP6, TMP5
	SUBQ  $1, TMP1
	JGE   next_vect_1b
	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret                       // the whole buffer was shorter than 8 bytes
	TESTQ $127, LEN
	JZ    aligned                   // remaining prefix is a multiple of 128

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4                // TMP4 = tail bytes to handle 8 at a time

// Peel 8-byte words off the end until LEN is a multiple of 128.
// TMP4 is reused by the 128-byte loop afterwards, which is safe because this
// loop is never re-entered once control reaches "aligned".
loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5      // running XOR starts with src[0]

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b
	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b
	CMPQ LEN, $128
	JGE  aligned                    // at least one full 128-byte step remains
	RET                             // no AVX register was touched on this path

ret:
	RET