mirror of https://gogs.blitter.com/RLabs/xs
257 lines
12 KiB
Go
257 lines
12 KiB
Go
// hwaccel_amd64.go - AMD64 optimized routines.
|
|
//
|
|
// To the extent possible under law, Yawning Angel has waived all copyright
|
|
// and related or neighboring rights to the software, using the Creative
|
|
// Commons "CC0" public domain dedication. See LICENSE or
|
|
// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
|
|
|
|
// +build amd64,!gccgo,!noasm,go1.10
|
|
|
|
package kyber
|
|
|
|
// zetasExp is the 752-entry constant table that nttYMM passes to the
// nttAVX2 assembly routine as its zetas argument.
//
// Layout, as visible in the literal below: 15 values each repeated 16
// times, then 16 values each repeated 8 times, 32 values repeated 4 times,
// 64 values repeated twice, and finally 128 single values
// (15*16 + 16*8 + 32*4 + 64*2 + 128 = 752).
//
// NOTE(review): the repetition presumably lets the vector code load
// ready-made broadcast lanes; confirm against the assembly before
// regenerating or reordering this table.
var zetasExp = [752]uint16{
	3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
	3777, 3777, 3777, 3777, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
	4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 3625, 3625, 3625, 3625,
	3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625,
	3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985,
	3985, 3985, 3985, 3985, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581,
	6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 2456, 2456, 2456, 2456,
	2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456,
	2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194,
	2194, 2194, 2194, 2194, 121, 121, 121, 121, 121, 121, 121, 121, 121,
	121, 121, 121, 121, 121, 121, 121, 5431, 5431, 5431, 5431, 5431, 5431,
	5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 834, 834,
	834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834,
	5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186,
	5186, 5186, 5186, 5186, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362,
	5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 2876, 2876, 2876, 2876,
	2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876,
	5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980,
	5980, 5980, 5980, 5980, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414,
	1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 2816, 2816, 2816, 2816,
	2816, 2816, 2816, 2816, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5593,
	5444, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 1986, 1986, 1986, 1986,
	1986, 1986, 1986, 1986, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 6082,
	1993, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 3706, 3706, 3706, 3706,
	3706, 3706, 3706, 3706, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 5675,
	6156, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 5124, 5124, 5124, 5124,
	5124, 5124, 5124, 5124, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1296,
	1483, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 4851, 4851, 4851, 4851,
	4851, 4851, 4851, 4851, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 3364,
	617, 617, 617, 617, 617, 617, 617, 617, 1921, 1921, 1921, 1921, 1921,
	1921, 1921, 1921, 3992, 3992, 3992, 3992, 5943, 5943, 5943, 5943, 3266,
	3266, 3266, 3266, 4081, 4081, 4081, 4081, 810, 810, 810, 810, 1887,
	1887, 1887, 1887, 7043, 7043, 7043, 7043, 7674, 7674, 7674, 7674, 7243,
	7243, 7243, 7243, 7002, 7002, 7002, 7002, 6376, 6376, 6376, 6376, 5921,
	5921, 5921, 5921, 396, 396, 396, 396, 4507, 4507, 4507, 4507, 4126,
	4126, 4126, 4126, 5800, 5800, 5800, 5800, 3772, 3772, 3772, 3772, 5146,
	5146, 5146, 5146, 5241, 5241, 5241, 5241, 5126, 5126, 5126, 5126, 1535,
	1535, 1535, 1535, 7132, 7132, 7132, 7132, 3153, 3153, 3153, 3153, 2310,
	2310, 2310, 2310, 6282, 6282, 6282, 6282, 1321, 1321, 1321, 1321, 514,
	514, 514, 514, 4725, 4725, 4725, 4725, 7578, 7578, 7578, 7578, 2804,
	2804, 2804, 2804, 5638, 5638, 5638, 5638, 6250, 6250, 6250, 6250, 6627,
	6627, 1698, 1698, 4225, 4225, 1166, 1166, 2426, 2426, 3831, 3831, 915,
	915, 7679, 7679, 4264, 4264, 7487, 7487, 2919, 2919, 2789, 2789, 3405,
	3405, 2385, 2385, 5568, 5568, 4949, 4949, 2175, 2175, 373, 373, 3692,
	3692, 6951, 6951, 5925, 5925, 3135, 3135, 5290, 5290, 660, 660, 6184,
	6184, 2572, 2572, 4536, 4536, 1350, 1350, 5457, 5457, 4093, 4093, 6000,
	6000, 2883, 2883, 6291, 6291, 1598, 1598, 3750, 3750, 2762, 2762, 2835,
	2835, 2764, 2764, 5448, 5448, 3816, 3816, 6148, 6148, 1464, 1464, 6954,
	6954, 1521, 1521, 1386, 1386, 4253, 4253, 6760, 6760, 4938, 4938, 5521,
	5521, 2649, 2649, 6822, 6822, 2579, 2579, 1532, 1532, 1919, 1919, 7195,
	7195, 404, 404, 6625, 6625, 783, 783, 1799, 1799, 5016, 5016, 3480,
	3480, 2133, 2133, 4371, 4371, 6513, 6513, 7664, 3744, 2422, 2001, 1278,
	929, 6333, 5451, 7502, 6439, 5622, 6611, 2161, 1649, 2072, 3177, 5610,
	1121, 7245, 236, 715, 670, 7023, 6205, 5303, 2767, 3542, 7455, 1203,
	1181, 7530, 3887, 1712, 7459, 2786, 7230, 4134, 1779, 6530, 7247, 3568,
	3988, 3581, 6095, 1509, 2918, 2339, 6274, 3434, 4131, 2340, 2891, 2998,
	4367, 3461, 4962, 5434, 5092, 1144, 1072, 1295, 4866, 3911, 3450, 3781,
	5423, 796, 3163, 4473, 7092, 2963, 7557, 3214, 3334, 4315, 3936, 3723,
	1931, 7252, 7279, 4273, 83, 6155, 826, 6343, 2345, 5378, 2515, 7039,
	5844, 4716, 6890, 370, 293, 3312, 2083, 5992, 6904, 2070, 2262, 6788,
	2386, 7493, 6162, 4807, 6277, 1012, 2130, 1441, 2532, 4346, 6597, 4338,
	2937, 509, 6278, 2812, 3763, 592, 2005, 3657, 2460, 4004, 3752, 692,
	1669, 2167, 4394,
}
|
|
|
|
// zetasInvExp is the 752-entry constant table that invnttYMM passes to the
// invnttAVX2 assembly routine as its omegas argument.
//
// Layout, as visible in the literal below (the mirror image of zetasExp's
// grouping): 128 single values, then 64 values each repeated twice, 32
// values repeated 4 times, 16 values repeated 8 times, and finally 15
// values each repeated 16 times (128 + 64*2 + 32*4 + 16*8 + 15*16 = 752).
//
// NOTE(review): the repetition presumably lets the vector code load
// ready-made broadcast lanes; confirm against the assembly before
// regenerating or reordering this table.
var zetasInvExp = [752]uint16{
	3287, 5514, 6012, 6989, 3929, 3677, 5221, 4024, 5676, 7089, 3918, 4869,
	1403, 7172, 4744, 3343, 1084, 3335, 5149, 6240, 5551, 6669, 1404, 2874,
	1519, 188, 5295, 893, 5419, 5611, 777, 1689, 5598, 4369, 7388, 7311,
	791, 2965, 1837, 642, 5166, 2303, 5336, 1338, 6855, 1526, 7598, 3408,
	402, 429, 5750, 3958, 3745, 3366, 4347, 4467, 124, 4718, 589, 3208,
	4518, 6885, 2258, 3900, 4231, 3770, 2815, 6386, 6609, 6537, 2589, 2247,
	2719, 4220, 3314, 4683, 4790, 5341, 3550, 4247, 1407, 5342, 4763, 6172,
	1586, 4100, 3693, 4113, 434, 1151, 5902, 3547, 451, 4895, 222, 5969,
	3794, 151, 6500, 6478, 226, 4139, 4914, 2378, 1476, 658, 7011, 6966,
	7445, 436, 6560, 2071, 4504, 5609, 6032, 5520, 1070, 2059, 1242, 179,
	2230, 1348, 6752, 6403, 5680, 5259, 3937, 17, 1168, 1168, 3310, 3310,
	5548, 5548, 4201, 4201, 2665, 2665, 5882, 5882, 6898, 6898, 1056, 1056,
	7277, 7277, 486, 486, 5762, 5762, 6149, 6149, 5102, 5102, 859, 859,
	5032, 5032, 2160, 2160, 2743, 2743, 921, 921, 3428, 3428, 6295, 6295,
	6160, 6160, 727, 727, 6217, 6217, 1533, 1533, 3865, 3865, 2233, 2233,
	4917, 4917, 4846, 4846, 4919, 4919, 3931, 3931, 6083, 6083, 1390, 1390,
	4798, 4798, 1681, 1681, 3588, 3588, 2224, 2224, 6331, 6331, 3145, 3145,
	5109, 5109, 1497, 1497, 7021, 7021, 2391, 2391, 4546, 4546, 1756, 1756,
	730, 730, 3989, 3989, 7308, 7308, 5506, 5506, 2732, 2732, 2113, 2113,
	5296, 5296, 4276, 4276, 4892, 4892, 4762, 4762, 194, 194, 3417, 3417, 2,
	2, 6766, 6766, 3850, 3850, 5255, 5255, 6515, 6515, 3456, 3456, 5983,
	5983, 1054, 1054, 1431, 1431, 1431, 1431, 2043, 2043, 2043, 2043, 4877,
	4877, 4877, 4877, 103, 103, 103, 103, 2956, 2956, 2956, 2956, 7167,
	7167, 7167, 7167, 6360, 6360, 6360, 6360, 1399, 1399, 1399, 1399, 5371,
	5371, 5371, 5371, 4528, 4528, 4528, 4528, 549, 549, 549, 549, 6146,
	6146, 6146, 6146, 2555, 2555, 2555, 2555, 2440, 2440, 2440, 2440, 2535,
	2535, 2535, 2535, 3909, 3909, 3909, 3909, 1881, 1881, 1881, 1881, 3555,
	3555, 3555, 3555, 3174, 3174, 3174, 3174, 7285, 7285, 7285, 7285, 1760,
	1760, 1760, 1760, 1305, 1305, 1305, 1305, 679, 679, 679, 679, 438, 438,
	438, 438, 7, 7, 7, 7, 638, 638, 638, 638, 5794, 5794, 5794, 5794, 6871,
	6871, 6871, 6871, 3600, 3600, 3600, 3600, 4415, 4415, 4415, 4415, 1738,
	1738, 1738, 1738, 3689, 3689, 3689, 3689, 5760, 5760, 5760, 5760, 5760,
	5760, 5760, 5760, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 4317,
	4317, 4317, 4317, 4317, 4317, 4317, 4317, 2830, 2830, 2830, 2830, 2830,
	2830, 2830, 2830, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6385,
	6385, 6385, 6385, 6385, 6385, 6385, 6385, 2557, 2557, 2557, 2557, 2557,
	2557, 2557, 2557, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 2006,
	2006, 2006, 2006, 2006, 2006, 2006, 2006, 3975, 3975, 3975, 3975, 3975,
	3975, 3975, 3975, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 1599,
	1599, 1599, 1599, 1599, 1599, 1599, 1599, 5695, 5695, 5695, 5695, 5695,
	5695, 5695, 5695, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2088,
	2088, 2088, 2088, 2088, 2088, 2088, 2088, 4865, 4865, 4865, 4865, 4865,
	4865, 4865, 4865, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267,
	6267, 6267, 6267, 6267, 6267, 6267, 6267, 1701, 1701, 1701, 1701, 1701,
	1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 4805,
	4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805,
	4805, 4805, 4805, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319,
	2319, 2319, 2319, 2319, 2319, 2319, 2319, 2495, 2495, 2495, 2495, 2495,
	2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 6847,
	6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847,
	6847, 6847, 6847, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250,
	2250, 2250, 2250, 2250, 2250, 2250, 2250, 7560, 7560, 7560, 7560, 7560,
	7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 5487,
	5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487,
	5487, 5487, 5487, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225,
	5225, 5225, 5225, 5225, 5225, 5225, 5225, 1100, 1100, 1100, 1100, 1100,
	1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 3696,
	3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696,
	3696, 3696, 3696, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056,
	4056, 4056, 4056, 4056, 4056, 4056, 4056, 3182, 3182, 3182, 3182, 3182,
	3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 5776,
	5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776,
	5776, 5776, 5776,
}
|
|
|
|
// cpuidAmd64 is declared here without a body; its implementation lives
// outside this file (assembly).
//
// As used by supportsAVX2, cpuidParams points at the first element of a
// [4]uint32: the caller stores the requested CPUID leaf in element 0 (the
// remaining elements are zero) and, after the call, reads elements 0, 1 and
// 2 as EAX, EBX and ECX respectively.
//
// NOTE(review): element 3 is presumably EDX and element 2 presumably also
// supplies the ECX sub-leaf on input; confirm against the assembly.
//
//go:noescape
func cpuidAmd64(cpuidParams *uint32)
|
|
|
|
// xgetbv0Amd64 is declared here without a body; its implementation lives
// outside this file (assembly).
//
// As used by supportsAVX2, xcrVec points at the first element of a
// [2]uint32 that the routine fills in; element 0 is then tested for the
// XMM/YMM state-enable bits.
//
// NOTE(review): the name suggests XGETBV with ECX=0 (i.e. XCR0), low half
// in element 0 and high half in element 1; confirm against the assembly.
//
//go:noescape
func xgetbv0Amd64(xcrVec *uint32)
|
|
|
|
// nttAVX2 is declared here without a body; its implementation lives outside
// this file (assembly).
//
// nttYMM calls it with inout pointing at the first coefficient of a
// [kyberN]uint16 and zetas pointing at the first entry of zetasExp.
//
// NOTE(review): presumably the transform is performed in place on inout and
// zetas is only read; confirm against the assembly.
//
//go:noescape
func nttAVX2(inout, zetas *uint16)
|
|
|
|
// invnttAVX2 is declared here without a body; its implementation lives
// outside this file (assembly).
//
// invnttYMM calls it with inout pointing at the first coefficient of a
// [kyberN]uint16 and omegas pointing at the first entry of zetasInvExp.
//
// NOTE(review): presumably the inverse transform is performed in place on
// inout and omegas is only read; confirm against the assembly.
//
//go:noescape
func invnttAVX2(inout, omegas *uint16)
|
|
|
|
// pointwiseAccK2AVX2 is declared here without a body; its implementation
// lives outside this file (assembly).
//
// pointwiseAccYMM calls it when the input vectors hold 2 polynomials: dst
// points at the first coefficient of the output poly, and a and b each
// point at the first element of an array of per-polynomial coefficient
// pointers.
//
// NOTE(review): presumably only the first 2 pointers of a and b are read;
// confirm against the assembly.
//
//go:noescape
func pointwiseAccK2AVX2(dst *uint16, a, b **uint16)
|
|
|
|
// pointwiseAccK3AVX2 is declared here without a body; its implementation
// lives outside this file (assembly).
//
// pointwiseAccYMM calls it when the input vectors hold 3 polynomials: dst
// points at the first coefficient of the output poly, and a and b each
// point at the first element of an array of per-polynomial coefficient
// pointers.
//
// NOTE(review): presumably only the first 3 pointers of a and b are read;
// confirm against the assembly.
//
//go:noescape
func pointwiseAccK3AVX2(dst *uint16, a, b **uint16)
|
|
|
|
// pointwiseAccK4AVX2 is declared here without a body; its implementation
// lives outside this file (assembly).
//
// pointwiseAccYMM calls it when the input vectors hold 4 polynomials: dst
// points at the first coefficient of the output poly, and a and b each
// point at the first element of an array of per-polynomial coefficient
// pointers.
//
// NOTE(review): presumably all 4 pointers of a and b are read; confirm
// against the assembly.
//
//go:noescape
func pointwiseAccK4AVX2(dst *uint16, a, b **uint16)
|
|
|
|
// cbdEta4AVX2 is declared here without a body; its implementation lives
// outside this file (assembly).
//
// cbdYMM calls it only when eta == 4, with dst pointing at the first
// coefficient of the output poly and buf pointing at the first byte of the
// input buffer.
//
// NOTE(review): no length is passed, so the routine presumably reads a
// fixed number of bytes from buf; confirm the required buffer size against
// the assembly.
//
//go:noescape
func cbdEta4AVX2(dst *uint16, buf *byte)
|
|
|
|
func supportsAVX2() bool {
|
|
// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
|
|
const (
|
|
osXsaveBit = 1 << 27
|
|
avx2Bit = 1 << 5
|
|
)
|
|
|
|
// Check to see if CPUID actually supports the leaf that indicates AVX2.
|
|
// CPUID.(EAX=0H, ECX=0H) >= 7
|
|
regs := [4]uint32{0x00}
|
|
cpuidAmd64(®s[0])
|
|
if regs[0] < 7 {
|
|
return false
|
|
}
|
|
|
|
// Check to see if the OS knows how to save/restore XMM/YMM state.
|
|
// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
|
|
regs = [4]uint32{0x01}
|
|
cpuidAmd64(®s[0])
|
|
if regs[2]&osXsaveBit == 0 {
|
|
return false
|
|
}
|
|
xcrRegs := [2]uint32{}
|
|
xgetbv0Amd64(&xcrRegs[0])
|
|
if xcrRegs[0]&6 != 6 {
|
|
return false
|
|
}
|
|
|
|
// Check for AVX2 support.
|
|
// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
|
|
regs = [4]uint32{0x07}
|
|
cpuidAmd64(®s[0])
|
|
return regs[1]&avx2Bit != 0
|
|
}
|
|
|
|
var implAVX2 = &hwaccelImpl{
|
|
name: "AVX2",
|
|
nttFn: nttYMM,
|
|
invnttFn: invnttYMM,
|
|
pointwiseAccFn: pointwiseAccYMM,
|
|
cbdFn: cbdYMM,
|
|
}
|
|
|
|
func nttYMM(p *[kyberN]uint16) {
|
|
nttAVX2(&p[0], &zetasExp[0])
|
|
}
|
|
|
|
func invnttYMM(a *[kyberN]uint16) {
|
|
invnttAVX2(&a[0], &zetasInvExp[0])
|
|
}
|
|
|
|
func pointwiseAccYMM(p *poly, a, b *polyVec) {
|
|
// Unlike the C code, a polyVec won't have the polys in contigious
|
|
// memory. So each assembly function takes vectors of pointers to
|
|
// each polyvec's polys.
|
|
//
|
|
// Kind of ugly, but it's the price to pay for flexibility...
|
|
|
|
var aVec, bVec [4]*uint16 // k is in {2,3,4}.
|
|
for i := range a.vec {
|
|
aVec[i] = &a.vec[i].coeffs[0]
|
|
bVec[i] = &b.vec[i].coeffs[0]
|
|
}
|
|
|
|
switch len(a.vec) {
|
|
case 2:
|
|
pointwiseAccK2AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
|
|
case 3:
|
|
pointwiseAccK3AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
|
|
case 4:
|
|
pointwiseAccK4AVX2(&p.coeffs[0], &aVec[0], &bVec[0])
|
|
}
|
|
}
|
|
|
|
func cbdYMM(p *poly, buf []byte, eta int) {
|
|
switch eta {
|
|
case 4:
|
|
cbdEta4AVX2(&p.coeffs[0], &buf[0])
|
|
default:
|
|
cbdRef(p, buf, eta)
|
|
}
|
|
}
|
|
|
|
func initHardwareAcceleration() {
|
|
if supportsAVX2() {
|
|
isHardwareAccelerated = true
|
|
hardwareAccelImpl = implAVX2
|
|
}
|
|
}
|