31 #define RESTRICT restrict
34 #include "../internal.h"
37 #if defined(OPENSSL_SSE2)
38 #include <emmintrin.h>
41 #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
42 (defined(__ARM_NEON__) || defined(__ARM_NEON))
69 #if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER))
71 #define HRSS_HAVE_VECTOR_UNIT
72 typedef __m128i vec_t;
// vec_capable reports whether the vector implementation may be used at
// runtime. In this build configuration (selected when OPENSSL_SSE2 is
// defined) the answer is unconditionally yes, so it always returns one.
static int vec_capable(void) {
  const int capable = 1;
  return capable;
}
78 static inline vec_t vec_add(vec_t
a, vec_t
b) {
return _mm_add_epi16(
a,
b); }
81 static inline vec_t vec_sub(vec_t
a, vec_t
b) {
return _mm_sub_epi16(
a,
b); }
// vec_mul multiplies every 16-bit word of |a| by the scalar |b| and returns
// the low 16 bits of each product. |b| is first broadcast to all lanes with
// _mm_set1_epi16 so that the packed multiply can be used.
85 static inline vec_t vec_mul(vec_t
a,
uint16_t b) {
86 return _mm_mullo_epi16(
a, _mm_set1_epi16(
b));
// vec_fma returns |a| + |b| * |c|, computed independently on each 16-bit
// word. |c| is a scalar that is broadcast to every lane; only the low 16 bits
// of each product are kept before the addition. This is done as a separate
// packed multiply followed by a packed add.
91 static inline vec_t vec_fma(vec_t
a, vec_t
b,
uint16_t c) {
92 return _mm_add_epi16(
a, _mm_mullo_epi16(
b, _mm_set1_epi16(
c)));
96 static inline void vec3_rshift_word(vec_t
v[3]) {
101 const __m128i carry0 = _mm_srli_si128(
v[0], 14);
102 v[0] = _mm_slli_si128(
v[0], 2);
104 const __m128i carry1 = _mm_srli_si128(
v[1], 14);
105 v[1] = _mm_slli_si128(
v[1], 2);
108 v[2] = _mm_slli_si128(
v[2], 2);
113 static inline void vec4_rshift_word(vec_t
v[4]) {
118 const __m128i carry0 = _mm_srli_si128(
v[0], 14);
119 v[0] = _mm_slli_si128(
v[0], 2);
121 const __m128i carry1 = _mm_srli_si128(
v[1], 14);
122 v[1] = _mm_slli_si128(
v[1], 2);
125 const __m128i carry2 = _mm_srli_si128(
v[2], 14);
126 v[2] = _mm_slli_si128(
v[2], 2);
129 v[3] = _mm_slli_si128(
v[3], 2);
// vec_merge_3_5 builds a vector from the top three 16-bit words of |left|
// followed by the bottom five 16-bit words of |right|. Shifting |left| right
// by 10 bytes leaves its upper three words in the low positions; shifting
// |right| left by 6 bytes moves its lower five words into the remaining upper
// positions. The two halves do not overlap, so OR combines them.
135 static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
136 return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
141 static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
145 for (
int i = 0;
i < 6;
i++) {
146 vec_t next_carry_s = _mm_srli_epi64(a_s[
i], 63);
147 a_s[
i] = _mm_slli_epi64(a_s[
i], 1);
148 a_s[
i] |= _mm_slli_si128(next_carry_s, 8);
150 carry_s = _mm_srli_si128(next_carry_s, 8);
152 vec_t next_carry_a = _mm_srli_epi64(a_a[
i], 63);
153 a_a[
i] = _mm_slli_epi64(a_a[
i], 1);
154 a_a[
i] |= _mm_slli_si128(next_carry_a, 8);
156 carry_a = _mm_srli_si128(next_carry_a, 8);
162 static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
166 for (
int i = 5;
i >= 0;
i--) {
167 const vec_t next_carry_s = _mm_slli_epi64(a_s[
i], 63);
168 a_s[
i] = _mm_srli_epi64(a_s[
i], 1);
169 a_s[
i] |= _mm_srli_si128(next_carry_s, 8);
171 carry_s = _mm_slli_si128(next_carry_s, 8);
173 const vec_t next_carry_a = _mm_slli_epi64(a_a[
i], 63);
174 a_a[
i] = _mm_srli_epi64(a_a[
i], 1);
175 a_a[
i] |= _mm_srli_si128(next_carry_a, 8);
177 carry_a = _mm_slli_si128(next_carry_a, 8);
183 static inline vec_t vec_broadcast_bit(vec_t
a) {
184 return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(
a, 63), 31),
// vec_get_word evaluates to the 16-bit word held in lane |i| of |v|. It is a
// thin alias for the SSE2 extract intrinsic, so |i| has to be acceptable to
// that intrinsic as its lane selector.
#define vec_get_word(v, i) _mm_extract_epi16((v), (i))
192 #elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
193 (defined(__ARM_NEON__) || defined(__ARM_NEON))
195 #define HRSS_HAVE_VECTOR_UNIT
196 typedef uint16x8_t vec_t;
// vec_capable reports whether the vector implementation may be used at
// runtime. On this platform the decision is delegated to
// CRYPTO_is_NEON_capable and its result is returned unchanged.
static int vec_capable(void) {
  const int neon_available = CRYPTO_is_NEON_capable();
  return neon_available;
}
203 static inline vec_t vec_add(vec_t
a, vec_t
b) {
return a +
b; }
205 static inline vec_t vec_sub(vec_t
a, vec_t
b) {
return a -
b; }
207 static inline vec_t vec_mul(vec_t
a,
uint16_t b) {
return vmulq_n_u16(
a,
b); }
// vec_fma returns |a| + |b| * |c|, computed independently on each 16-bit
// lane, where |c| is a scalar applied to every lane. It maps directly onto
// the NEON multiply-accumulate-by-scalar intrinsic.
209 static inline vec_t vec_fma(vec_t
a, vec_t
b,
uint16_t c) {
210 return vmlaq_n_u16(
a,
b,
c);
// vec3_rshift_word treats |v| as one long array of 16-bit words spread over
// three vectors and moves every word up by one lane position, in place. The
// top word of each vector is carried into lane 0 of the following vector, a
// zero enters lane 0 of v[0], and the top word of v[2] is discarded.
213 static inline void vec3_rshift_word(vec_t
v[3]) {
214 const uint16x8_t
kZero = {0};
// Work from the highest vector down so that each step still reads the
// unmodified contents of the vector below it. vextq_u16(x, y, 7) yields the
// top lane of |x| followed by the low seven lanes of |y|.
215 v[2] = vextq_u16(
v[1],
v[2], 7);
216 v[1] = vextq_u16(
v[0],
v[1], 7);
// Nothing precedes v[0], so a zero word is shifted in.
217 v[0] = vextq_u16(kZero,
v[0], 7);
// vec4_rshift_word treats |v| as one long array of 16-bit words spread over
// four vectors and moves every word up by one lane position, in place. The
// top word of each vector is carried into lane 0 of the following vector, a
// zero enters lane 0 of v[0], and the top word of v[3] is discarded.
220 static inline void vec4_rshift_word(vec_t
v[4]) {
221 const uint16x8_t
kZero = {0};
// Work from the highest vector down so that each step still reads the
// unmodified contents of the vector below it. vextq_u16(x, y, 7) yields the
// top lane of |x| followed by the low seven lanes of |y|.
222 v[3] = vextq_u16(
v[2],
v[3], 7);
223 v[2] = vextq_u16(
v[1],
v[2], 7);
224 v[1] = vextq_u16(
v[0],
v[1], 7);
// Nothing precedes v[0], so a zero word is shifted in.
225 v[0] = vextq_u16(kZero,
v[0], 7);
// vec_merge_3_5 builds a vector from the top three 16-bit words of |left|
// followed by the bottom five 16-bit words of |right|. vextq_u16 with an
// index of 5 starts at lane 5 of |left| and continues into |right| until
// eight lanes have been taken.
228 static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
229 return vextq_u16(left, right, 5);
232 static inline uint16_t vec_get_word(vec_t
v,
unsigned i) {
236 #if !defined(OPENSSL_AARCH64)
// vec_broadcast_bit returns a vector in which every bit is set if the
// least-significant bit of lane 0 of |a| is set, and every bit is clear
// otherwise.
238 static inline vec_t vec_broadcast_bit(vec_t
a) {
// Move bit 0 of each lane into the sign position, then arithmetic-shift it
// back down so that each lane becomes all-ones or all-zeros.
239 a = (vec_t)vshrq_n_s16(((int16x8_t)
a) << 15, 15);
// Replicate lane 0 across the whole vector.
240 return vdupq_lane_u16(vget_low_u16(
a), 0);
243 static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
246 const vec_t
kZero = {0};
248 for (
int i = 0;
i < 6;
i++) {
249 vec_t next_carry_s = a_s[
i] >> 15;
251 a_s[
i] |= vextq_u16(kZero, next_carry_s, 7);
253 carry_s = vextq_u16(next_carry_s, kZero, 7);
255 vec_t next_carry_a = a_a[
i] >> 15;
257 a_a[
i] |= vextq_u16(kZero, next_carry_a, 7);
259 carry_a = vextq_u16(next_carry_a, kZero, 7);
263 static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
266 const vec_t
kZero = {0};
268 for (
int i = 5;
i >= 0;
i--) {
269 vec_t next_carry_s = a_s[
i] << 15;
271 a_s[
i] |= vextq_u16(next_carry_s, kZero, 1);
273 carry_s = vextq_u16(kZero, next_carry_s, 1);
275 vec_t next_carry_a = a_a[
i] << 15;
277 a_a[
i] |= vextq_u16(next_carry_a, kZero, 1);
279 carry_a = vextq_u16(kZero, next_carry_a, 1);
283 #endif // !OPENSSL_AARCH64
285 #endif // (ARM || AARCH64) && NEON
306 for (
size_t i = 0;
i <
len;
i++) {
318 #if defined(OPENSSL_64_BIT)
319 static const crypto_word_t kMasks[6] = {
328 static const crypto_word_t kMasks[5] = {
338 in = ((
in >> (1 <<
i)) & kMasks[
i]) | ((
in & kMasks[
i]) << (1 <<
i));
352 const crypto_word_t
m =
379 const crypto_word_t
sum =
swap & (
a->v[
i] ^
b->v[
i]);
396 crypto_word_t carry = 0;
407 crypto_word_t carry = 0;
409 const crypto_word_t next_carry =
p->v[
i] & 1;
503 const crypto_word_t s1,
const crypto_word_t
a1,
504 const crypto_word_t s2,
const crypto_word_t
a2) {
506 *out_s = (s1 ^ s2) & *out_a;
511 const crypto_word_t s1,
const crypto_word_t
a1,
512 const crypto_word_t s2,
const crypto_word_t
a2) {
513 const crypto_word_t t = s1 ^
a2;
514 *out_s = t & (s2 ^
a1);
515 *out_a = (
a1 ^
a2) | (t ^ s2);
520 const crypto_word_t s1,
const crypto_word_t
a1,
521 const crypto_word_t s2,
const crypto_word_t
a2) {
522 const crypto_word_t t =
a1 ^
a2;
523 *out_s = (s1 ^
a2) & (t ^ s2);
524 *out_a = t | (s1 ^ s2);
542 crypto_word_t product_s, product_a;
546 product_s, product_a);
604 for (
size_t i = 0;
i <
n;
i++) {
612 for (
size_t i = 0;
i <
n;
i++) {
627 crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
628 crypto_word_t b_s =
b->s[0], b_a =
b->a[0];
629 const crypto_word_t a_s =
a->s[0], a_a =
a->a[0];
633 crypto_word_t m_s, m_a;
647 const crypto_word_t m_s_low = m_s <<
i;
649 const crypto_word_t m_a_low = m_a <<
i;
653 poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
654 poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
659 out->s[1] = r_s_high;
661 out->a[1] = r_a_high;
670 const size_t low_len =
n / 2;
671 const size_t high_len =
n - low_len;
672 const struct poly3_span a_high = {&
a->s[low_len], &
a->a[low_len]};
673 const struct poly3_span b_high = {&
b->s[low_len], &
b->a[low_len]};
678 const struct poly3_span b_cross_sum = {&
out->s[high_len], &
out->a[high_len]};
681 if (high_len != low_len) {
682 a_cross_sum.
s[low_len] = a_high.
s[low_len];
683 a_cross_sum.
a[low_len] = a_high.
a[low_len];
684 b_cross_sum.
s[low_len] = b_high.
s[low_len];
685 b_cross_sum.
a[low_len] = b_high.
a[low_len];
692 &
out->a[2 * low_len]};
697 poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
716 const struct poly3_span prod_span = {prod_s, prod_a};
717 const struct poly3_span scratch_span = {scratch_s, scratch_a};
718 const struct poly3_span x_span = {(crypto_word_t *)
x->s.v,
719 (crypto_word_t *)
x->a.v};
720 const struct poly3_span y_span = {(crypto_word_t *)
y->s.v,
721 (crypto_word_t *)
y->a.v};
741 #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
745 static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
746 vec_t b_a[6],
const vec_t
swap) {
747 for (
int i = 0;
i < 6;
i++) {
748 const vec_t sum_s =
swap & (a_s[
i] ^ b_s[
i]);
752 const vec_t sum_a =
swap & (a_a[
i] ^ b_a[
i]);
759 static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
760 vec_t b_a[6],
const vec_t ms,
762 for (
int i = 0;
i < 6;
i++) {
764 const vec_t
s = b_s[
i];
765 const vec_t
a = b_a[
i];
766 const vec_t product_a =
a & ma;
767 const vec_t product_s = (
s ^ ms) & product_a;
769 const vec_t out_s = a_s[
i];
770 const vec_t out_a = a_a[
i];
771 const vec_t
t = out_a ^ product_a;
772 a_s[
i] = (out_s ^ product_a) & (t ^ product_s);
773 a_a[
i] =
t | (out_s ^ product_s);
779 static void poly3_invert_vec(
struct poly3 *
out,
const struct poly3 *
in) {
781 const vec_t
kZero = {0};
782 const vec_t kOne = {1};
783 static const uint8_t kBottomSixtyOne[
sizeof(vec_t)] = {
784 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
786 vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
788 memset(&v_s, 0,
sizeof(v_s));
789 memset(&v_a, 0,
sizeof(v_a));
791 memset(&r_s, 0,
sizeof(r_s));
792 memset(&r_a, 0,
sizeof(r_a));
795 memset(f_s, 0,
sizeof(f_s));
796 memset(f_a, 0xff, 5 *
sizeof(vec_t));
797 memcpy(&f_a[5], kBottomSixtyOne,
sizeof(kBottomSixtyOne));
799 struct poly3 in_reversed;
808 for (
size_t i = 0;
i < (2*(
N-1)) - 1;
i++) {
809 poly3_vec_lshift1(v_s, v_a);
811 const crypto_word_t delta_sign_bit = (delta >> (
sizeof(delta) * 8 - 1)) & 1;
812 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
814 const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]);
816 {delta_is_non_negative & delta_is_non_zero};
817 const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term;
819 const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]);
820 const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a);
825 poly3_vec_cswap(f_s, f_a, g_s, g_a, mask);
826 poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a);
827 poly3_vec_rshift1(g_s, g_a);
829 poly3_vec_cswap(v_s, v_a, r_s, r_a, mask);
830 poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a);
840 #endif // HRSS_HAVE_VECTOR_UNIT
847 #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
849 poly3_invert_vec(
out,
in);
869 for (
size_t i = 0;
i < (2*(
N-1)) - 1;
i++) {
872 const crypto_word_t delta_sign_bit = (delta >> (
sizeof(delta) * 8 - 1)) & 1;
873 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
875 const crypto_word_t g_has_constant_term =
lsb_to_all(
g.a.v[0]);
876 const crypto_word_t mask =
877 g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
879 crypto_word_t c_s, c_a;
// COEFFICIENTS_PER_VEC is the number of uint16_t values that fit in one
// vec_t.
908 #define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
// VECS_PER_POLY is the number of vec_t values needed to hold N uint16_t
// values, rounding up when N is not a multiple of COEFFICIENTS_PER_VEC.
909 #define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
918 #if defined(HRSS_HAVE_VECTOR_UNIT)
934 for (
unsigned i = 0;
i <
N;
i++) {
956 #if defined(HRSS_HAVE_VECTOR_UNIT)
964 #if defined(POLY_RQ_MUL_ASM)
966 uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE];
971 #if defined(HRSS_HAVE_VECTOR_UNIT)
978 static void poly_mul_vec_aux(vec_t *restrict
out, vec_t *restrict
scratch,
979 const vec_t *restrict
a,
const vec_t *restrict
b,
1018 static const vec_t
kZero = {0};
1023 result[0] = vec_mul(vec_a[0], vec_get_word(
b[0], 0));
1024 result[1] = vec_mul(vec_a[1], vec_get_word(
b[0], 0));
1026 result[1] = vec_fma(
result[1], vec_a[0], vec_get_word(
b[1], 0));
1027 result[2] = vec_mul(vec_a[1], vec_get_word(
b[1], 0));
1030 vec3_rshift_word(vec_a);
1032 #define BLOCK(x, y) \
1035 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1037 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1039 vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1045 vec3_rshift_word(vec_a);
1050 vec3_rshift_word(vec_a);
1055 vec3_rshift_word(vec_a);
1060 vec3_rshift_word(vec_a);
1065 vec3_rshift_word(vec_a);
1070 vec3_rshift_word(vec_a);
1084 static const vec_t
kZero = {0};
1090 result[0] = vec_mul(
a[0], vec_get_word(
b[0], 0));
1091 result[1] = vec_mul(
a[1], vec_get_word(
b[0], 0));
1092 result[2] = vec_mul(
a[2], vec_get_word(
b[0], 0));
1094 #define BLOCK_PRE(x, y) \
1097 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1099 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1100 result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1108 vec4_rshift_word(vec_a);
1110 #define BLOCK(x, y) \
1113 vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1115 vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1117 vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1119 vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
1126 vec4_rshift_word(vec_a);
1132 vec4_rshift_word(vec_a);
1138 vec4_rshift_word(vec_a);
1144 vec4_rshift_word(vec_a);
1150 vec4_rshift_word(vec_a);
1156 vec4_rshift_word(vec_a);
1175 const size_t low_len =
n / 2;
1176 const size_t high_len =
n - low_len;
1177 const vec_t *a_high = &
a[low_len];
1178 const vec_t *b_high = &
b[low_len];
1182 for (
size_t i = 0;
i < low_len;
i++) {
1183 out[
i] = vec_add(a_high[
i],
a[
i]);
1184 out[high_len +
i] = vec_add(b_high[
i],
b[
i]);
1186 if (high_len != low_len) {
1187 out[low_len] = a_high[low_len];
1188 out[high_len + low_len] = b_high[low_len];
1191 vec_t *
const child_scratch = &
scratch[2 * high_len];
1193 poly_mul_vec_aux(
scratch, child_scratch,
out, &
out[high_len], high_len);
1195 poly_mul_vec_aux(&
out[low_len * 2], child_scratch, a_high, b_high, high_len);
1197 poly_mul_vec_aux(
out, child_scratch,
a,
b, low_len);
1200 for (
size_t i = 0;
i < low_len * 2;
i++) {
1203 if (low_len != high_len) {
1206 vec_sub(
scratch[low_len * 2 + 1],
out[low_len * 4 + 1]);
1210 for (
size_t i = 0;
i < high_len * 2;
i++) {
1217 const struct poly *
x,
const struct poly *
y) {
1222 "struct poly is the wrong size");
1224 "struct poly has incorrect alignment");
1226 vec_t *
const prod =
scratch->u.vec.prod;
1227 vec_t *
const aux_scratch =
scratch->u.vec.scratch;
1228 poly_mul_vec_aux(prod, aux_scratch,
x->vectors,
y->vectors,
VECS_PER_POLY);
1234 vec_t *out_vecs = (vec_t *)
out->v;
1239 out_vecs[
i] = vec_add(prod[
i], vec_merge_3_5(prev,
this));
1245 #endif // HRSS_HAVE_VECTOR_UNIT
1254 static const size_t kSchoolbookLimit = 64;
1255 if (
n < kSchoolbookLimit) {
1257 for (
size_t i = 0;
i <
n;
i++) {
1258 for (
size_t j = 0; j <
n; j++) {
1259 out[
i + j] += (unsigned)
a[
i] *
b[j];
1271 const size_t low_len =
n / 2;
1272 const size_t high_len =
n - low_len;
1273 const uint16_t *
const a_high = &
a[low_len];
1274 const uint16_t *
const b_high = &
b[low_len];
1276 for (
size_t i = 0;
i < low_len;
i++) {
1278 out[high_len +
i] = b_high[
i] +
b[
i];
1280 if (high_len != low_len) {
1281 out[low_len] = a_high[low_len];
1282 out[high_len + low_len] = b_high[low_len];
1291 for (
size_t i = 0;
i < low_len * 2;
i++) {
1294 if (low_len != high_len) {
1296 assert(
out[low_len * 4 + 1] == 0);
1299 for (
size_t i = 0;
i < high_len * 2;
i++) {
1306 const struct poly *
x,
const struct poly *
y) {
1311 for (
size_t i = 0;
i <
N;
i++) {
1312 out->v[
i] = prod[
i] + prod[
i +
N];
1318 const struct poly *
a,
const struct poly *
b) {
1319 #if defined(POLY_RQ_MUL_ASM)
1327 #if defined(HRSS_HAVE_VECTOR_UNIT)
1328 if (vec_capable()) {
1342 const uint16_t orig_final_coefficient =
p->v[
N - 1];
1344 for (
size_t i =
N - 1;
i > 0;
i--) {
1345 p->v[
i] =
p->v[
i - 1] -
p->v[
i];
1347 p->v[0] = orig_final_coefficient -
p->v[0];
1354 for (
unsigned i = 0;
i <
N;
i++) {
1355 p->v[
i] -= coeff700;
1361 for (
unsigned i = 0;
i <
N;
i++) {
1374 crypto_word_t word = 0;
1376 for (
unsigned i = 0;
i <
N;
i++) {
1404 crypto_word_t *words_s =
out->s.v;
1405 crypto_word_t *words_a =
out->a.v;
1406 crypto_word_t s = 0;
1407 crypto_word_t
a = 0;
1410 for (
unsigned i = 0;
i <
N;
i++) {
1416 const crypto_word_t s_bit = (crypto_word_t)(
v & 2) << (
BITS_PER_WORD - 2);
1443 crypto_word_t *words_s =
out->s.v;
1444 crypto_word_t *words_a =
out->a.v;
1445 crypto_word_t s = 0;
1446 crypto_word_t
a = 0;
1450 for (
unsigned i = 0;
i <
N;
i++) {
1459 const crypto_word_t s_bit = (crypto_word_t)(
mod3 & 2)
1485 const crypto_word_t *
words =
in->v;
1487 crypto_word_t word = *
words;
1489 for (
unsigned i = 0;
i <
N;
i++) {
1490 out->v[
i] = word & 1;
1503 const crypto_word_t *words_s =
in->s.v;
1504 const crypto_word_t *words_a =
in->a.v;
1505 crypto_word_t word_s = ~(*words_s);
1506 crypto_word_t word_a = *words_a;
1509 for (
unsigned i = 0;
i <
N;
i++) {
1511 out->v[
i] |= word_a & 1;
1519 word_s = ~(*words_s);
1550 for (
size_t i = 0;
i < (2*(
N-1)) - 1;
i++) {
1553 const crypto_word_t delta_sign_bit = (delta >> (
sizeof(delta) * 8 - 1)) & 1;
1554 const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
1556 const crypto_word_t g_has_constant_term =
lsb_to_all(
g.v[0]);
1557 const crypto_word_t mask =
1558 g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
1587 for (
unsigned i = 0;
i <
N;
i++) {
1597 for (
unsigned i = 0;
i < 4;
i++) {
1607 #define POLY_BYTES 1138
1613 for (
size_t i = 0;
i <
N / 8;
i++) {
1615 out[1] = (0x1f & (
p[0] >> 8)) | ((
p[1] & 0x07) << 5);
1617 out[3] = (3 & (
p[1] >> 11)) | ((
p[2] & 0x3f) << 2);
1618 out[4] = (0x7f & (
p[2] >> 6)) | ((
p[3] & 0x01) << 7);
1620 out[6] = (0xf & (
p[3] >> 9)) | ((
p[4] & 0x0f) << 4);
1622 out[8] = (1 & (
p[4] >> 12)) | ((
p[5] & 0x7f) << 1);
1623 out[9] = (0x3f & (
p[5] >> 7)) | ((
p[6] & 0x03) << 6);
1624 out[10] =
p[6] >> 2;
1625 out[11] = (7 & (
p[6] >> 10)) | ((
p[7] & 0x1f) << 3);
1626 out[12] =
p[7] >> 5;
1634 out[1] = (0x1f & (
p[0] >> 8)) | ((
p[1] & 0x07) << 5);
1636 out[3] = (3 & (
p[1] >> 11)) | ((
p[2] & 0x3f) << 2);
1637 out[4] = (0x7f & (
p[2] >> 6)) | ((
p[3] & 0x01) << 7);
1639 out[6] = 0xf & (
p[3] >> 9);
1649 for (
size_t i = 0;
i <
N / 8;
i++) {
1675 for (
unsigned i = 0;
i <
N - 1;
i++) {
1680 if ((
in[6] & 0xf0) != 0) {
1686 for (
size_t i = 0;
i <
N - 1;
i++) {
1699 return v ^ (
v >> 1);
1710 assert(coeffs[
N-1] == 0);
1718 out[
i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
1734 "HRSS_SAMPLE_BYTES incorrect");
1735 for (
size_t i = 0;
i <
N - 1;
i++) {
1738 v |= ((
v >> 1) ^ 1) - 1;
1753 for (
unsigned i = 0;
i <
N - 2;
i++) {
1761 for (
unsigned i = 0;
i <
N;
i += 2) {
1837 out->v[0] =
a->v[0] +
a->v[2];
1838 out->v[1] =
a->v[1];
1839 out->v[2] = -
a->v[0] +
a->v[2];
1844 for (
size_t i = 3;
i < 699;
i += 3) {
1845 s0 += -
a->v[
i] +
a->v[
i + 2];
1847 s2 +=
a->v[
i + 1] -
a->v[
i + 2];
1858 out->v[1] -= (s0 + s2);
1864 for (
size_t i = 3;
i <
N;
i++) {
1865 out->v[
i] = (
out->v[
i - 3] - (
a->v[
i - 2] +
a->v[
i - 1] +
a->v[
i]));
1871 const crypto_word_t
v =
out->v[700];
1872 for (
unsigned i = 0;
i <
N;
i++) {
1875 out->v[
i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
1902 "HRSS public key too small");
1914 "HRSS private key too small");
1942 struct poly pg_phi1;
1943 struct poly pfg_phi1;
1944 struct poly pfg_phi1_inverse;
1948 struct vars *
const vars =
malloc_align32(&malloc_ptr,
sizeof(
struct vars));
1967 for (
unsigned i = 0;
i <
N;
i++) {
1968 vars->pg_phi1.v[
i] *= 3;
1972 poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1);
1974 poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1);
1976 poly_mul(&vars->scratch, &pub->
ph, &vars->pfg_phi1_inverse, &vars->pg_phi1);
1977 poly_mul(&vars->scratch, &pub->
ph, &pub->
ph, &vars->pg_phi1);
1999 struct poly m,
r, m_lifted;
2000 struct poly prh_plus_m;
2007 struct vars *
const vars =
malloc_align32(&malloc_ptr,
sizeof(
struct vars));
2021 poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->
ph);
2022 for (
unsigned i = 0;
i <
N;
i++) {
2023 vars->prh_plus_m.v[
i] += vars->m_lifted.v[
i];
2033 SHA256_Update(&vars->hash_ctx, vars->m_bytes,
sizeof(vars->m_bytes));
2034 SHA256_Update(&vars->hash_ctx, vars->r_bytes,
sizeof(vars->r_bytes));
2054 struct poly3 cf3, m3;
2055 struct poly m, m_lifted;
2065 struct vars *
const vars =
malloc_align32(&malloc_ptr,
sizeof(
struct vars));
2078 "HRSS HMAC key larger than SHA-256 block size");
2079 for (
size_t i = 0;
i <
sizeof(priv->
hmac_key);
i++) {
2080 vars->masked_key[
i] = priv->
hmac_key[
i] ^ 0x36;
2083 sizeof(vars->masked_key) -
sizeof(priv->
hmac_key));
2086 SHA256_Update(&vars->hash_ctx, vars->masked_key,
sizeof(vars->masked_key));
2091 for (
size_t i = 0;
i <
sizeof(priv->
hmac_key);
i++) {
2092 vars->masked_key[
i] ^= (0x5c ^ 0x36);
2095 sizeof(vars->masked_key) -
sizeof(priv->
hmac_key));
2098 SHA256_Update(&vars->hash_ctx, vars->masked_key,
sizeof(vars->masked_key));
2099 SHA256_Update(&vars->hash_ctx, inner_digest,
sizeof(inner_digest));
2101 "HRSS shared key length incorrect");
2113 poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f);
2121 for (
unsigned i = 0;
i <
N;
i++) {
2122 vars->r.v[
i] = vars->c.v[
i] - vars->m_lifted.v[
i];
2154 "ciphertext is the wrong size");
2155 assert(ciphertext_len ==
sizeof(vars->expected_ciphertext));
2163 sizeof(vars->expected_ciphertext)));
2167 SHA256_Update(&vars->hash_ctx, vars->m_bytes,
sizeof(vars->m_bytes));
2168 SHA256_Update(&vars->hash_ctx, vars->r_bytes,
sizeof(vars->r_bytes));
2170 sizeof(vars->expected_ciphertext));
2173 for (
unsigned i = 0;
i <
sizeof(vars->shared_key);
i++) {