//go:build amd64
|
|
// +build amd64
|
|
|
|
package common
|
|
|
|
import (
|
|
"golang.org/x/sys/cpu"
|
|
)
|
|
|
|
// ZetasAVX2 contains all ζ used in NTT (like the Zetas array), but also
// the values int16(zeta * 62209) for each zeta, which is used in
// Montgomery reduction. There is some duplication and reordering as
// compared to Zetas to make it more convenient for use with AVX2.
//
// Layout: each group below interleaves the Montgomery-precomputed values
// int16(zeta*62209) with the plain zetas, in the order the AVX2 assembly
// consumes them for each butterfly level/layer of the (inverse) NTT.
var ZetasAVX2 = [...]int16{
	// level 1: int16(Zetas[1]*62209) and Zetas[1]
	31499, 2571,

	// level 2
	//
	// int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
	14746, 2970, 788, 1812,

	// level 3, like level 2.
	13525, 1493, -12402, 1422, 28191, 287, -16694, 202,

	0, 0, // padding

	// layer 4. offset: 1*16
	//
	// The precomputed multiplication and zetas are grouped by 16 at a
	// time as used in the set of butterflies, etc.
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
	622, 622, 622, 622, 622, 622, 622, 622,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	182, 182, 182, 182, 182, 182, 182, 182,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	962, 962, 962, 962, 962, 962, 962, 962,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,

	// layer 5. offset: 9*16
	-5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
	-26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
	573, 573, 573, 573, 2004, 2004, 2004, 2004,
	264, 264, 264, 264, 383, 383, 383, 383,
	5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
	21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
	2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
	1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
	-28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
	-10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
	2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
	732, 732, 732, 732, 608, 608, 608, 608,
	18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
	26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
	1787, 1787, 1787, 1787, 411, 411, 411, 411,
	3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,

	// layer 6. offset: 17*16
	-5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
	-23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
	1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
	2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
	-12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
	9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
	516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
	1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
	19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
	-28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
	2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
	107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
	13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
	16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
	2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
	448, 448, 2264, 2264, 677, 677, 2054, 2054,

	// layer 7. offset: 25*16
	-334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
	-27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
	2226, 430, 555, 843, 2078, 871, 1550, 105,
	422, 587, 177, 3094, 3038, 2869, 1574, 1653,
	32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
	-18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
	3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
	1739, 644, 2457, 349, 418, 329, 3173, 3254,
	-31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
	-20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
	817, 1097, 603, 610, 1322, 2044, 1864, 384,
	2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
	10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
	31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
	2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
	3221, 3021, 996, 991, 958, 1869, 1522, 1628,

	// layer 1 inverse
	23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
	-17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
	1628, 1522, 1869, 958, 991, 996, 3021, 3221,
	478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
	14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
	-32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
	1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
	384, 1864, 2044, 1322, 610, 603, 1097, 817,
	-12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
	12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
	3254, 3173, 329, 418, 349, 2457, 644, 1739,
	1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
	5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
	-21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
	1653, 1574, 2869, 3038, 3094, 177, 587, 422,
	105, 1550, 871, 2078, 843, 555, 430, 2226,

	// layer 2 inverse
	-17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
	-12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
	2054, 2054, 677, 677, 2264, 2264, 448, 448,
	2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
	18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
	-8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
	2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
	830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
	27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
	-12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
	1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
	2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
	25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
	30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
	1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
	1015, 1015, 2777, 2777, 652, 652, 1223, 1223,

	// layer 3 inverse
	-16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
	8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
	1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
	411, 411, 411, 411, 1787, 1787, 1787, 1787,
	8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
	24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
	608, 608, 608, 608, 732, 732, 732, 732,
	1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
	-26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
	-1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
	3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
	1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
	-29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
	17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
	383, 383, 383, 383, 264, 264, 264, 264,
	2004, 2004, 2004, 2004, 573, 573, 573, 573,

	// layer 4 inverse
	31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
	-11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
	1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
	1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
	1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
	10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
	2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
	962, 962, 962, 962, 962, 962, 962, 962,
	-15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
	-3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
	182, 182, 182, 182, 182, 182, 182, 182,
	1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
	27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
	-20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
	622, 622, 622, 622, 622, 622, 622, 622,
	3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,

	// layer 5 inverse
	-16694, 202, 28191, 287, -12402, 1422, 13525, 1493,

	// layer 6 inverse
	788, 1812, 14746, 2970,

	// layer 7 inverse
	31499, 2571,
}
|
|
|
|
// Sets p to a + b. Does not normalize coefficients.
|
|
func (p *Poly) Add(a, b *Poly) {
|
|
if cpu.X86.HasAVX2 {
|
|
addAVX2(
|
|
(*[N]int16)(p),
|
|
(*[N]int16)(a),
|
|
(*[N]int16)(b),
|
|
)
|
|
} else {
|
|
p.addGeneric(a, b)
|
|
}
|
|
}
|
|
|
|
// Sets p to a - b. Does not normalize coefficients.
|
|
func (p *Poly) Sub(a, b *Poly) {
|
|
if cpu.X86.HasAVX2 {
|
|
subAVX2(
|
|
(*[N]int16)(p),
|
|
(*[N]int16)(a),
|
|
(*[N]int16)(b),
|
|
)
|
|
} else {
|
|
p.subGeneric(a, b)
|
|
}
|
|
}
|
|
|
|
// Executes an in-place forward "NTT" on p.
|
|
//
|
|
// Assumes the coefficients are in absolute value ≤q. The resulting
|
|
// coefficients are in absolute value ≤7q. If the input is in Montgomery
|
|
// form, then the result is in Montgomery form and so (by linearity of the NTT)
|
|
// if the input is in regular form, then the result is also in regular form.
|
|
// The order of coefficients will be "tangled". These can be put back into
|
|
// their proper order by calling Detangle().
|
|
func (p *Poly) NTT() {
|
|
if cpu.X86.HasAVX2 {
|
|
nttAVX2((*[N]int16)(p))
|
|
} else {
|
|
p.nttGeneric()
|
|
}
|
|
}
|
|
|
|
// Executes an in-place inverse "NTT" on p and multiply by the Montgomery
|
|
// factor R.
|
|
//
|
|
// Requires coefficients to be in "tangled" order, see Tangle().
|
|
// Assumes the coefficients are in absolute value ≤q. The resulting
|
|
// coefficients are in absolute value ≤q. If the input is in Montgomery
|
|
// form, then the result is in Montgomery form and so (by linearity)
|
|
// if the input is in regular form, then the result is also in regular form.
|
|
func (p *Poly) InvNTT() {
|
|
if cpu.X86.HasAVX2 {
|
|
invNttAVX2((*[N]int16)(p))
|
|
} else {
|
|
p.invNTTGeneric()
|
|
}
|
|
}
|
|
|
|
// Sets p to the "pointwise" multiplication of a and b.
|
|
//
|
|
// That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
|
|
// Montgomery form. Products between coefficients of a and b must be strictly
|
|
// bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
|
|
// bounded in absolute value by 2q.
|
|
//
|
|
// Requires a and b to be in "tangled" order, see Tangle(). p will be in
|
|
// tangled order as well.
|
|
func (p *Poly) MulHat(a, b *Poly) {
|
|
if cpu.X86.HasAVX2 {
|
|
mulHatAVX2(
|
|
(*[N]int16)(p),
|
|
(*[N]int16)(a),
|
|
(*[N]int16)(b),
|
|
)
|
|
} else {
|
|
p.mulHatGeneric(a, b)
|
|
}
|
|
}
|
|
|
|
// Puts p into the right form to be used with (among others) InvNTT().
|
|
func (p *Poly) Tangle() {
|
|
if cpu.X86.HasAVX2 {
|
|
tangleAVX2((*[N]int16)(p))
|
|
}
|
|
|
|
// When AVX2 is not available, we use the standard order.
|
|
}
|
|
|
|
// Puts p back into standard form.
|
|
func (p *Poly) Detangle() {
|
|
if cpu.X86.HasAVX2 {
|
|
detangleAVX2((*[N]int16)(p))
|
|
}
|
|
|
|
// When AVX2 is not available, we use the standard order.
|
|
}
|
|
|
|
// Almost normalizes coefficients.
|
|
//
|
|
// Ensures each coefficient is in {0, …, q}.
|
|
func (p *Poly) BarrettReduce() {
|
|
if cpu.X86.HasAVX2 {
|
|
barrettReduceAVX2((*[N]int16)(p))
|
|
} else {
|
|
p.barrettReduceGeneric()
|
|
}
|
|
}
|
|
|
|
// Normalizes coefficients.
|
|
//
|
|
// Ensures each coefficient is in {0, …, q-1}.
|
|
func (p *Poly) Normalize() {
|
|
if cpu.X86.HasAVX2 {
|
|
normalizeAVX2((*[N]int16)(p))
|
|
} else {
|
|
p.normalizeGeneric()
|
|
}
|
|
}
|