cloudflared-mirror/vendor/github.com/cloudflare/circl/pke/kyber/internal/common/amd64.go

303 lines
11 KiB
Go

//go:build amd64
// +build amd64
package common
import (
"golang.org/x/sys/cpu"
)
// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which is used in
// Montgomery reduction. There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
var ZetasAVX2 = [...]int16{
	// level 1: int16(Zetas[1]*62209) and Zetas[1]
	31499, 2571,
	// level 2
	//
	// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
	14746, 2970, 788, 1812,
	// level 3, like level 2.
	13525, 1493, -12402, 1422, 28191, 287, -16694, 202,
	0, 0, // padding so the next group starts at a 16-word boundary
	// layer 4. offset: 1*16
	//
	// The precomputed multiplication and zetas are grouped by 16 at a
	// time as used in the set of butterflies, etc.
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
	622, 622, 622, 622, 622, 622, 622, 622,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	182, 182, 182, 182, 182, 182, 182, 182,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	962, 962, 962, 962, 962, 962, 962, 962,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
	// layer 5. offset: 9*16
	-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
	-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
	573, 573, 573, 573, 2004, 2004, 2004, 2004,
	264, 264, 264, 264, 383, 383, 383, 383,
	5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
	21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
	2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
	1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
	-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
	-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
	2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
	732, 732, 732, 732, 608, 608, 608, 608,
	18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
	26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
	1787, 1787, 1787, 1787, 411, 411, 411, 411,
	3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,
	// layer 6. offset: 17*16
	-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
	-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
	1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
	2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
	-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
	9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
	516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
	1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
	19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
	-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
	2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
	107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
	13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
	16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
	2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
	448, 448, 2264, 2264, 677, 677, 2054, 2054,
	// layer 7. offset: 25*16
	-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
	-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
	2226, 430, 555, 843, 2078, 871, 1550, 105,
	422, 587, 177, 3094, 3038, 2869, 1574, 1653,
	32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
	-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
	3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
	1739, 644, 2457, 349, 418, 329, 3173, 3254,
	-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
	-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
	817, 1097, 603, 610, 1322, 2044, 1864, 384,
	2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
	10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
	31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
	2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
	3221, 3021, 996, 991, 958, 1869, 1522, 1628,
	// layer 1 inverse
	23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
	-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
	1628, 1522, 1869, 958, 991, 996, 3021, 3221,
	478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
	14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
	-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
	1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
	384, 1864, 2044, 1322, 610, 603, 1097, 817,
	-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
	12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
	3254, 3173, 329, 418, 349, 2457, 644, 1739,
	1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
	5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
	-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
	1653, 1574, 2869, 3038, 3094, 177, 587, 422,
	105, 1550, 871, 2078, 843, 555, 430, 2226,
	// layer 2 inverse
	-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
	-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
	2054, 2054, 677, 677, 2264, 2264, 448, 448,
	2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
	18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
	-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
	2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
	830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
	27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
	-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
	1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
	2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
	25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
	30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
	1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
	1015, 1015, 2777, 2777, 652, 652, 1223, 1223,
	// layer 3 inverse
	-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
	8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
	1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
	411, 411, 411, 411, 1787, 1787, 1787, 1787,
	8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
	24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
	608, 608, 608, 608, 732, 732, 732, 732,
	1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
	-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
	-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
	3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
	1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
	-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
	17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
	383, 383, 383, 383, 264, 264, 264, 264,
	2004, 2004, 2004, 2004, 573, 573, 573, 573,
	// layer 4 inverse
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	962, 962, 962, 962, 962, 962, 962, 962,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	182, 182, 182, 182, 182, 182, 182, 182,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	622, 622, 622, 622, 622, 622, 622, 622,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
	// layer 5 inverse
	-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,
	// layer 6 inverse
	788, 1812, 14746, 2970,
	// layer 7 inverse
	31499, 2571,
}
// Add sets p to a + b. Does not normalize coefficients.
func (p *Poly) Add(a, b *Poly) {
	if !cpu.X86.HasAVX2 {
		p.addGeneric(a, b)
		return
	}
	addAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}
// Sub sets p to a - b. Does not normalize coefficients.
func (p *Poly) Sub(a, b *Poly) {
	if !cpu.X86.HasAVX2 {
		p.subGeneric(a, b)
		return
	}
	subAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}
// NTT executes an in-place forward "NTT" on p.
//
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤7q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity of the NTT)
// if the input is in regular form, then the result is also in regular form.
// The order of coefficients will be "tangled". These can be put back into
// their proper order by calling Detangle().
func (p *Poly) NTT() {
	if !cpu.X86.HasAVX2 {
		p.nttGeneric()
		return
	}
	nttAVX2((*[N]int16)(p))
}
// InvNTT executes an in-place inverse "NTT" on p and multiplies by the
// Montgomery factor R.
//
// Requires coefficients to be in "tangled" order, see Tangle().
// Assumes the coefficients are in absolute value ≤q. The resulting
// coefficients are in absolute value ≤q. If the input is in Montgomery
// form, then the result is in Montgomery form and so (by linearity)
// if the input is in regular form, then the result is also in regular form.
func (p *Poly) InvNTT() {
	if !cpu.X86.HasAVX2 {
		p.invNTTGeneric()
		return
	}
	invNttAVX2((*[N]int16)(p))
}
// MulHat sets p to the "pointwise" multiplication of a and b.
//
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
// Montgomery form. Products between coefficients of a and b must be strictly
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
// bounded in absolute value by 2q.
//
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
// tangled order as well.
func (p *Poly) MulHat(a, b *Poly) {
	if !cpu.X86.HasAVX2 {
		p.mulHatGeneric(a, b)
		return
	}
	mulHatAVX2((*[N]int16)(p), (*[N]int16)(a), (*[N]int16)(b))
}
// Tangle puts p into the right form to be used with (among others) InvNTT().
func (p *Poly) Tangle() {
	if !cpu.X86.HasAVX2 {
		// Without AVX2 the standard order is used; nothing to do.
		return
	}
	tangleAVX2((*[N]int16)(p))
}
// Detangle puts p back into standard form.
func (p *Poly) Detangle() {
	if !cpu.X86.HasAVX2 {
		// Without AVX2 the standard order is used; nothing to do.
		return
	}
	detangleAVX2((*[N]int16)(p))
}
// BarrettReduce almost normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q}.
func (p *Poly) BarrettReduce() {
	if !cpu.X86.HasAVX2 {
		p.barrettReduceGeneric()
		return
	}
	barrettReduceAVX2((*[N]int16)(p))
}
// Normalize normalizes coefficients.
//
// Ensures each coefficient is in {0, …, q-1}.
func (p *Poly) Normalize() {
	if !cpu.X86.HasAVX2 {
		p.normalizeGeneric()
		return
	}
	normalizeAVX2((*[N]int16)(p))
}