// hwaccel_amd64.go - AMD64 optimized routines. // // To the extent possible under law, Yawning Angel has waived all copyright // and related or neighboring rights to the software, using the Creative // Commons "CC0" public domain dedication. See LICENSE or // for full details. // +build amd64,!gccgo,!noasm,go1.10 package kyber var zetasExp = [752]uint16{ 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3625, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 3985, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 6581, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2456, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 2194, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 5431, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 834, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5186, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 5362, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 2876, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 5980, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 2816, 2816, 2816, 2816, 2816, 2816, 2816, 2816, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5593, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 5444, 1986, 1986, 1986, 1986, 1986, 1986, 1986, 1986, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 6082, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 5675, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 6156, 5124, 5124, 5124, 5124, 5124, 5124, 5124, 5124, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1296, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 1483, 4851, 4851, 4851, 4851, 4851, 4851, 4851, 4851, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 3364, 617, 617, 617, 617, 617, 617, 617, 617, 1921, 1921, 1921, 1921, 1921, 1921, 1921, 1921, 3992, 3992, 3992, 3992, 5943, 5943, 5943, 5943, 3266, 3266, 3266, 3266, 4081, 4081, 4081, 4081, 810, 810, 810, 810, 1887, 1887, 1887, 1887, 7043, 7043, 7043, 7043, 7674, 7674, 7674, 7674, 7243, 7243, 7243, 7243, 7002, 7002, 7002, 7002, 6376, 6376, 6376, 6376, 5921, 5921, 5921, 5921, 396, 396, 396, 396, 4507, 4507, 4507, 4507, 4126, 4126, 4126, 4126, 5800, 5800, 5800, 5800, 3772, 3772, 3772, 3772, 5146, 5146, 5146, 5146, 5241, 5241, 5241, 5241, 5126, 5126, 5126, 5126, 1535, 1535, 1535, 1535, 7132, 7132, 7132, 7132, 3153, 3153, 3153, 3153, 2310, 2310, 2310, 2310, 6282, 6282, 6282, 6282, 1321, 1321, 1321, 1321, 514, 514, 514, 514, 4725, 4725, 4725, 4725, 7578, 7578, 7578, 7578, 2804, 2804, 2804, 2804, 5638, 5638, 5638, 5638, 6250, 6250, 6250, 6250, 6627, 6627, 1698, 1698, 4225, 4225, 1166, 1166, 2426, 2426, 3831, 3831, 915, 915, 7679, 7679, 4264, 4264, 7487, 7487, 2919, 2919, 2789, 2789, 3405, 3405, 2385, 2385, 5568, 5568, 4949, 4949, 2175, 2175, 373, 373, 3692, 3692, 6951, 6951, 5925, 5925, 3135, 3135, 5290, 5290, 660, 660, 6184, 6184, 2572, 2572, 4536, 4536, 1350, 1350, 5457, 5457, 4093, 4093, 6000, 6000, 2883, 2883, 6291, 6291, 1598, 1598, 3750, 3750, 2762, 2762, 2835, 2835, 2764, 2764, 5448, 5448, 3816, 3816, 6148, 6148, 1464, 1464, 6954, 6954, 1521, 1521, 1386, 1386, 4253, 4253, 6760, 6760, 4938, 4938, 5521, 5521, 2649, 2649, 6822, 6822, 2579, 2579, 1532, 1532, 1919, 1919, 7195, 7195, 404, 404, 6625, 6625, 783, 783, 1799, 1799, 5016, 5016, 3480, 3480, 2133, 2133, 4371, 4371, 6513, 6513, 7664, 3744, 2422, 2001, 1278, 929, 6333, 5451, 7502, 6439, 5622, 6611, 2161, 1649, 2072, 3177, 5610, 1121, 7245, 236, 715, 670, 7023, 6205, 5303, 2767, 3542, 7455, 1203, 1181, 7530, 3887, 1712, 7459, 2786, 7230, 4134, 1779, 6530, 7247, 3568, 3988, 3581, 6095, 1509, 2918, 2339, 6274, 3434, 4131, 2340, 2891, 2998, 4367, 3461, 4962, 5434, 5092, 1144, 1072, 1295, 4866, 3911, 3450, 3781, 5423, 796, 3163, 4473, 7092, 2963, 7557, 3214, 3334, 4315, 3936, 3723, 1931, 7252, 7279, 4273, 83, 6155, 826, 6343, 2345, 5378, 2515, 7039, 5844, 4716, 6890, 370, 293, 3312, 2083, 5992, 6904, 2070, 2262, 6788, 2386, 7493, 6162, 4807, 6277, 1012, 2130, 1441, 2532, 4346, 6597, 4338, 2937, 509, 6278, 2812, 3763, 592, 2005, 3657, 2460, 4004, 3752, 692, 1669, 2167, 4394, } var zetasInvExp = [752]uint16{ 3287, 5514, 6012, 6989, 3929, 3677, 5221, 4024, 5676, 7089, 3918, 4869, 1403, 7172, 4744, 3343, 1084, 3335, 5149, 6240, 5551, 6669, 1404, 2874, 1519, 188, 5295, 893, 5419, 5611, 777, 1689, 5598, 4369, 7388, 7311, 791, 2965, 1837, 642, 5166, 2303, 5336, 1338, 6855, 1526, 7598, 3408, 402, 429, 5750, 3958, 3745, 3366, 4347, 4467, 124, 4718, 589, 3208, 4518, 6885, 2258, 3900, 4231, 3770, 2815, 6386, 6609, 6537, 2589, 2247, 2719, 4220, 3314, 4683, 4790, 5341, 3550, 4247, 1407, 5342, 4763, 6172, 1586, 4100, 3693, 4113, 434, 1151, 5902, 3547, 451, 4895, 222, 5969, 3794, 151, 6500, 6478, 226, 4139, 4914, 2378, 1476, 658, 7011, 6966, 7445, 436, 6560, 2071, 4504, 5609, 6032, 5520, 1070, 2059, 1242, 179, 2230, 1348, 6752, 6403, 5680, 5259, 3937, 17, 1168, 1168, 3310, 3310, 5548, 5548, 4201, 4201, 2665, 2665, 5882, 5882, 6898, 6898, 1056, 1056, 7277, 7277, 486, 486, 5762, 5762, 6149, 6149, 5102, 5102, 859, 859, 5032, 5032, 2160, 2160, 2743, 2743, 921, 921, 3428, 3428, 6295, 6295, 6160, 6160, 727, 727, 6217, 6217, 1533, 1533, 3865, 3865, 2233, 2233, 4917, 4917, 4846, 4846, 4919, 4919, 3931, 3931, 6083, 6083, 1390, 1390, 4798, 4798, 1681, 1681, 3588, 3588, 2224, 2224, 6331, 6331, 3145, 3145, 5109, 5109, 1497, 1497, 7021, 7021, 2391, 2391, 4546, 4546, 1756, 1756, 730, 730, 3989, 3989, 7308, 7308, 5506, 5506, 2732, 2732, 2113, 2113, 5296, 5296, 4276, 4276, 4892, 4892, 4762, 4762, 194, 194, 3417, 3417, 2, 2, 6766, 6766, 3850, 3850, 5255, 5255, 6515, 6515, 3456, 3456, 5983, 5983, 1054, 1054, 1431, 1431, 1431, 1431, 2043, 2043, 2043, 2043, 4877, 4877, 4877, 4877, 103, 103, 103, 103, 2956, 2956, 2956, 2956, 7167, 7167, 7167, 7167, 6360, 6360, 6360, 6360, 1399, 1399, 1399, 1399, 5371, 5371, 5371, 5371, 4528, 4528, 4528, 4528, 549, 549, 549, 549, 6146, 6146, 6146, 6146, 2555, 2555, 2555, 2555, 2440, 2440, 2440, 2440, 2535, 2535, 2535, 2535, 3909, 3909, 3909, 3909, 1881, 1881, 1881, 1881, 3555, 3555, 3555, 3555, 3174, 3174, 3174, 3174, 7285, 7285, 7285, 7285, 1760, 1760, 1760, 1760, 1305, 1305, 1305, 1305, 679, 679, 679, 679, 438, 438, 438, 438, 7, 7, 7, 7, 638, 638, 638, 638, 5794, 5794, 5794, 5794, 6871, 6871, 6871, 6871, 3600, 3600, 3600, 3600, 4415, 4415, 4415, 4415, 1738, 1738, 1738, 1738, 3689, 3689, 3689, 3689, 5760, 5760, 5760, 5760, 5760, 5760, 5760, 5760, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 7064, 4317, 4317, 4317, 4317, 4317, 4317, 4317, 4317, 2830, 2830, 2830, 2830, 2830, 2830, 2830, 2830, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6198, 6385, 6385, 6385, 6385, 6385, 6385, 6385, 6385, 2557, 2557, 2557, 2557, 2557, 2557, 2557, 2557, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 1525, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 3975, 3975, 3975, 3975, 3975, 3975, 3975, 3975, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 5688, 1599, 1599, 1599, 1599, 1599, 1599, 1599, 1599, 5695, 5695, 5695, 5695, 5695, 5695, 5695, 5695, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2237, 2088, 2088, 2088, 2088, 2088, 2088, 2088, 2088, 4865, 4865, 4865, 4865, 4865, 4865, 4865, 4865, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 6267, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 1701, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 4805, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2319, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 2495, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 6847, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 2250, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 7560, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5487, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 5225, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 1100, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 3696, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 4056, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 3182, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, 5776, } //go:noescape func cpuidAmd64(cpuidParams *uint32) //go:noescape func xgetbv0Amd64(xcrVec *uint32) //go:noescape func nttAVX2(inout, zetas *uint16) //go:noescape func invnttAVX2(inout, omegas *uint16) //go:noescape func pointwiseAccK2AVX2(dst *uint16, a, b **uint16) //go:noescape func pointwiseAccK3AVX2(dst *uint16, a, b **uint16) //go:noescape func pointwiseAccK4AVX2(dst *uint16, a, b **uint16) //go:noescape func cbdEta4AVX2(dst *uint16, buf *byte) func supportsAVX2() bool { // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family const ( osXsaveBit = 1 << 27 avx2Bit = 1 << 5 ) // Check to see if CPUID actually supports the leaf that indicates AVX2. // CPUID.(EAX=0H, ECX=0H) >= 7 regs := [4]uint32{0x00} cpuidAmd64(®s[0]) if regs[0] < 7 { return false } // Check to see if the OS knows how to save/restore XMM/YMM state. // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 regs = [4]uint32{0x01} cpuidAmd64(®s[0]) if regs[2]&osXsaveBit == 0 { return false } xcrRegs := [2]uint32{} xgetbv0Amd64(&xcrRegs[0]) if xcrRegs[0]&6 != 6 { return false } // Check for AVX2 support. // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 regs = [4]uint32{0x07} cpuidAmd64(®s[0]) return regs[1]&avx2Bit != 0 } var implAVX2 = &hwaccelImpl{ name: "AVX2", nttFn: nttYMM, invnttFn: invnttYMM, pointwiseAccFn: pointwiseAccYMM, cbdFn: cbdYMM, } func nttYMM(p *[kyberN]uint16) { nttAVX2(&p[0], &zetasExp[0]) } func invnttYMM(a *[kyberN]uint16) { invnttAVX2(&a[0], &zetasInvExp[0]) } func pointwiseAccYMM(p *poly, a, b *polyVec) { // Unlike the C code, a polyVec won't have the polys in contigious // memory. So each assembly function takes vectors of pointers to // each polyvec's polys. // // Kind of ugly, but it's the price to pay for flexibility... var aVec, bVec [4]*uint16 // k is in {2,3,4}. for i := range a.vec { aVec[i] = &a.vec[i].coeffs[0] bVec[i] = &b.vec[i].coeffs[0] } switch len(a.vec) { case 2: pointwiseAccK2AVX2(&p.coeffs[0], &aVec[0], &bVec[0]) case 3: pointwiseAccK3AVX2(&p.coeffs[0], &aVec[0], &bVec[0]) case 4: pointwiseAccK4AVX2(&p.coeffs[0], &aVec[0], &bVec[0]) } } func cbdYMM(p *poly, buf []byte, eta int) { switch eta { case 4: cbdEta4AVX2(&p.coeffs[0], &buf[0]) default: cbdRef(p, buf, eta) } } func initHardwareAcceleration() { if supportsAVX2() { isHardwareAccelerated = true hardwareAccelImpl = implAVX2 } }