hrss.c
Go to the documentation of this file.
1 /* Copyright (c) 2018, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 #include <openssl/hrss.h>
16 
17 #include <assert.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 
21 #include <openssl/bn.h>
22 #include <openssl/cpu.h>
23 #include <openssl/hmac.h>
24 #include <openssl/mem.h>
25 #include <openssl/rand.h>
26 #include <openssl/sha.h>
27 
28 #if defined(_MSC_VER)
29 #define RESTRICT
30 #else
31 #define RESTRICT restrict
32 #endif
33 
34 #include "../internal.h"
35 #include "internal.h"
36 
37 #if defined(OPENSSL_SSE2)
38 #include <emmintrin.h>
39 #endif
40 
41 #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
42  (defined(__ARM_NEON__) || defined(__ARM_NEON))
43 #include <arm_neon.h>
44 #endif
45 
46 // This is an implementation of [HRSS], but with a KEM transformation based on
47 // [SXY]. The primary references are:
48 
49 // HRSS: https://eprint.iacr.org/2017/667.pdf
50 // HRSSNIST:
51 // https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
52 // SXY: https://eprint.iacr.org/2017/1005.pdf
53 // NTRUTN14:
54 // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
55 // NTRUCOMP: https://eprint.iacr.org/2018/1174
56 // SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd
57 
58 
59 // Vector operations.
60 //
61 // A couple of functions in this file can use vector operations to meaningful
62 // effect. If we're building for a target that has a supported vector unit,
63 // |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
64 // 128-bit vector. The following functions abstract over the differences between
65 // NEON and SSE2 for implementing some vector operations.
66 
67 // TODO: MSVC can likely also be made to work with vector operations, but ^ must
68 // be replaced with _mm_xor_si128, etc.
69 #if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER))
70 
71 #define HRSS_HAVE_VECTOR_UNIT
72 typedef __m128i vec_t;
73 
74 // vec_capable returns one iff the current platform supports SSE2.
75 static int vec_capable(void) { return 1; }
76 
77 // vec_add performs a pair-wise addition of four uint16s from |a| and |b|.
78 static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
79 
80 // vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|.
81 static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
82 
83 // vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
84 // vector.
85 static inline vec_t vec_mul(vec_t a, uint16_t b) {
86  return _mm_mullo_epi16(a, _mm_set1_epi16(b));
87 }
88 
89 // vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
90 // returns the resulting vector.
91 static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
92  return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
93 }
94 
95 // vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
96 static inline void vec3_rshift_word(vec_t v[3]) {
97  // Intel's left and right shifting is backwards compared to the order in
98  // memory because they're based on little-endian order of words (and not just
99  // bytes). So the shifts in this function will be backwards from what one
100  // might expect.
101  const __m128i carry0 = _mm_srli_si128(v[0], 14);
102  v[0] = _mm_slli_si128(v[0], 2);
103 
104  const __m128i carry1 = _mm_srli_si128(v[1], 14);
105  v[1] = _mm_slli_si128(v[1], 2);
106  v[1] |= carry0;
107 
108  v[2] = _mm_slli_si128(v[2], 2);
109  v[2] |= carry1;
110 }
111 
112 // vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
113 static inline void vec4_rshift_word(vec_t v[4]) {
114  // Intel's left and right shifting is backwards compared to the order in
115  // memory because they're based on little-endian order of words (and not just
116  // bytes). So the shifts in this function will be backwards from what one
117  // might expect.
118  const __m128i carry0 = _mm_srli_si128(v[0], 14);
119  v[0] = _mm_slli_si128(v[0], 2);
120 
121  const __m128i carry1 = _mm_srli_si128(v[1], 14);
122  v[1] = _mm_slli_si128(v[1], 2);
123  v[1] |= carry0;
124 
125  const __m128i carry2 = _mm_srli_si128(v[2], 14);
126  v[2] = _mm_slli_si128(v[2], 2);
127  v[2] |= carry1;
128 
129  v[3] = _mm_slli_si128(v[3], 2);
130  v[3] |= carry2;
131 }
132 
133 // vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first
134 // five from |right|, and returns the resulting vector.
135 static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
136  return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
137 }
138 
139 // poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one
140 // bit.
141 static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
142  vec_t carry_s = {0};
143  vec_t carry_a = {0};
144 
145  for (int i = 0; i < 6; i++) {
146  vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
147  a_s[i] = _mm_slli_epi64(a_s[i], 1);
148  a_s[i] |= _mm_slli_si128(next_carry_s, 8);
149  a_s[i] |= carry_s;
150  carry_s = _mm_srli_si128(next_carry_s, 8);
151 
152  vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
153  a_a[i] = _mm_slli_epi64(a_a[i], 1);
154  a_a[i] |= _mm_slli_si128(next_carry_a, 8);
155  a_a[i] |= carry_a;
156  carry_a = _mm_srli_si128(next_carry_a, 8);
157  }
158 }
159 
160 // poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one
161 // bit.
162 static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
163  vec_t carry_s = {0};
164  vec_t carry_a = {0};
165 
166  for (int i = 5; i >= 0; i--) {
167  const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
168  a_s[i] = _mm_srli_epi64(a_s[i], 1);
169  a_s[i] |= _mm_srli_si128(next_carry_s, 8);
170  a_s[i] |= carry_s;
171  carry_s = _mm_slli_si128(next_carry_s, 8);
172 
173  const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
174  a_a[i] = _mm_srli_epi64(a_a[i], 1);
175  a_a[i] |= _mm_srli_si128(next_carry_a, 8);
176  a_a[i] |= carry_a;
177  carry_a = _mm_slli_si128(next_carry_a, 8);
178  }
179 }
180 
181 // vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in
182 // a vector and returns the result.
183 static inline vec_t vec_broadcast_bit(vec_t a) {
184  return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
185  0b01010101);
186 }
187 
188 // vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the
189 // compiler requires that |i| be a compile-time constant.)
190 #define vec_get_word(v, i) _mm_extract_epi16(v, i)
191 
192 #elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
193  (defined(__ARM_NEON__) || defined(__ARM_NEON))
194 
195 #define HRSS_HAVE_VECTOR_UNIT
196 typedef uint16x8_t vec_t;
197 
198 // These functions perform the same actions as the SSE2 function of the same
199 // name, above.
200 
201 static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
202 
203 static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
204 
205 static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
206 
207 static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
208 
209 static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
210  return vmlaq_n_u16(a, b, c);
211 }
212 
213 static inline void vec3_rshift_word(vec_t v[3]) {
214  const uint16x8_t kZero = {0};
215  v[2] = vextq_u16(v[1], v[2], 7);
216  v[1] = vextq_u16(v[0], v[1], 7);
217  v[0] = vextq_u16(kZero, v[0], 7);
218 }
219 
220 static inline void vec4_rshift_word(vec_t v[4]) {
221  const uint16x8_t kZero = {0};
222  v[3] = vextq_u16(v[2], v[3], 7);
223  v[2] = vextq_u16(v[1], v[2], 7);
224  v[1] = vextq_u16(v[0], v[1], 7);
225  v[0] = vextq_u16(kZero, v[0], 7);
226 }
227 
228 static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
229  return vextq_u16(left, right, 5);
230 }
231 
232 static inline uint16_t vec_get_word(vec_t v, unsigned i) {
233  return v[i];
234 }
235 
236 #if !defined(OPENSSL_AARCH64)
237 
238 static inline vec_t vec_broadcast_bit(vec_t a) {
239  a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
240  return vdupq_lane_u16(vget_low_u16(a), 0);
241 }
242 
243 static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
244  vec_t carry_s = {0};
245  vec_t carry_a = {0};
246  const vec_t kZero = {0};
247 
248  for (int i = 0; i < 6; i++) {
249  vec_t next_carry_s = a_s[i] >> 15;
250  a_s[i] <<= 1;
251  a_s[i] |= vextq_u16(kZero, next_carry_s, 7);
252  a_s[i] |= carry_s;
253  carry_s = vextq_u16(next_carry_s, kZero, 7);
254 
255  vec_t next_carry_a = a_a[i] >> 15;
256  a_a[i] <<= 1;
257  a_a[i] |= vextq_u16(kZero, next_carry_a, 7);
258  a_a[i] |= carry_a;
259  carry_a = vextq_u16(next_carry_a, kZero, 7);
260  }
261 }
262 
263 static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
264  vec_t carry_s = {0};
265  vec_t carry_a = {0};
266  const vec_t kZero = {0};
267 
268  for (int i = 5; i >= 0; i--) {
269  vec_t next_carry_s = a_s[i] << 15;
270  a_s[i] >>= 1;
271  a_s[i] |= vextq_u16(next_carry_s, kZero, 1);
272  a_s[i] |= carry_s;
273  carry_s = vextq_u16(kZero, next_carry_s, 1);
274 
275  vec_t next_carry_a = a_a[i] << 15;
276  a_a[i] >>= 1;
277  a_a[i] |= vextq_u16(next_carry_a, kZero, 1);
278  a_a[i] |= carry_a;
279  carry_a = vextq_u16(kZero, next_carry_a, 1);
280  }
281 }
282 
283 #endif // !OPENSSL_AARCH64
284 
285 #endif // (ARM || AARCH64) && NEON
286 
287 // Polynomials in this scheme have N terms.
288 // #define N 701
289 
290 // Underlying data types and arithmetic operations.
291 // ------------------------------------------------
292 
293 // Binary polynomials.
294 
295 // poly2 represents a degree-N polynomial over GF(2). The words are in little-
296 // endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
297 // final word is only partially used since N is not a multiple of the word size.
298 
299 // Defined in internal.h:
300 // struct poly2 {
301 // crypto_word_t v[WORDS_PER_POLY];
302 // };
303 
304 OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
305  const uint8_t *in = (const uint8_t *)void_in;
306  for (size_t i = 0; i < len; i++) {
307  printf("%02x", in[i]);
308  }
309  printf("\n");
310 }
311 
312 static void poly2_zero(struct poly2 *p) {
313  OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
314 }
315 
316 // word_reverse returns |in| with the bits in reverse order.
317 static crypto_word_t word_reverse(crypto_word_t in) {
318 #if defined(OPENSSL_64_BIT)
319  static const crypto_word_t kMasks[6] = {
320  UINT64_C(0x5555555555555555),
321  UINT64_C(0x3333333333333333),
322  UINT64_C(0x0f0f0f0f0f0f0f0f),
323  UINT64_C(0x00ff00ff00ff00ff),
324  UINT64_C(0x0000ffff0000ffff),
325  UINT64_C(0x00000000ffffffff),
326  };
327 #else
328  static const crypto_word_t kMasks[5] = {
329  0x55555555,
330  0x33333333,
331  0x0f0f0f0f,
332  0x00ff00ff,
333  0x0000ffff,
334  };
335 #endif
336 
337  for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMasks); i++) {
338  in = ((in >> (1 << i)) & kMasks[i]) | ((in & kMasks[i]) << (1 << i));
339  }
340 
341  return in;
342 }
343 
344 // lsb_to_all replicates the least-significant bit of |v| to all bits of the
345 // word. This is used in bit-slicing operations to make a vector from a fixed
346 // value.
347 static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
348 
349 // poly2_mod_phiN reduces |p| by Φ(N).
350 static void poly2_mod_phiN(struct poly2 *p) {
351  // m is the term at x^700, replicated to every bit.
352  const crypto_word_t m =
353  lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1));
354  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
355  p->v[i] ^= m;
356  }
357  p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1;
358 }
359 
360 // poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes
361 // the result to |out|.
362 static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) {
363  struct poly2 t;
364  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
365  t.v[i] = word_reverse(in->v[i]);
366  }
367 
368  static const size_t shift = BITS_PER_WORD - ((N-1) % BITS_PER_WORD);
369  for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
370  out->v[i] = t.v[WORDS_PER_POLY-1-i] >> shift;
371  out->v[i] |= t.v[WORDS_PER_POLY-2-i] << (BITS_PER_WORD - shift);
372  }
373  out->v[WORDS_PER_POLY-1] = t.v[0] >> shift;
374 }
375 
376 // poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
377 static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
378  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
379  const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
380  a->v[i] ^= sum;
381  b->v[i] ^= sum;
382  }
383 }
384 
385 // poly2_fmadd sets |out| to |out| + |in| * m, where m is either
386 // |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
387 static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
388  crypto_word_t m) {
389  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
390  out->v[i] ^= in->v[i] & m;
391  }
392 }
393 
394 // poly2_lshift1 left-shifts |p| by one bit.
395 static void poly2_lshift1(struct poly2 *p) {
396  crypto_word_t carry = 0;
397  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
398  const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
399  p->v[i] <<= 1;
400  p->v[i] |= carry;
401  carry = next_carry;
402  }
403 }
404 
405 // poly2_rshift1 right-shifts |p| by one bit.
406 static void poly2_rshift1(struct poly2 *p) {
407  crypto_word_t carry = 0;
408  for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
409  const crypto_word_t next_carry = p->v[i] & 1;
410  p->v[i] >>= 1;
411  p->v[i] |= carry << (BITS_PER_WORD - 1);
412  carry = next_carry;
413  }
414 }
415 
416 // poly2_clear_top_bits clears the bits in the final word that are only for
417 // alignment.
418 static void poly2_clear_top_bits(struct poly2 *p) {
419  p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
420 }
421 
422 // poly2_top_bits_are_clear returns one iff the extra bits in the final words of
423 // |p| are zero.
424 static int poly2_top_bits_are_clear(const struct poly2 *p) {
425  return (p->v[WORDS_PER_POLY - 1] &
426  ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
427 }
428 
429 // Ternary polynomials.
430 
431 // poly3 represents a degree-N polynomial over GF(3). Each coefficient is
432 // bitsliced across the |s| and |a| arrays, like this:
433 //
434 // s | a | value
435 // -----------------
436 // 0 | 0 | 0
437 // 0 | 1 | 1
438 // 1 | 1 | -1 (aka 2)
439 // 1 | 0 | <invalid>
440 //
441 // ('s' is for sign, and 'a' is the absolute value.)
442 //
443 // Once bitsliced as such, the following circuits can be used to implement
444 // addition and multiplication mod 3:
445 //
446 // (s3, a3) = (s1, a1) × (s2, a2)
447 // a3 = a1 ∧ a2
448 // s3 = (s1 ⊕ s2) ∧ a3
449 //
450 // (s3, a3) = (s1, a1) + (s2, a2)
451 // t = s1 ⊕ a2
452 // s3 = t ∧ (s2 ⊕ a1)
453 // a3 = (a1 ⊕ a2) ∨ (t ⊕ s2)
454 //
455 // (s3, a3) = (s1, a1) - (s2, a2)
456 // t = a1 ⊕ a2
457 // s3 = (s1 ⊕ a2) ∧ (t ⊕ s2)
458 // a3 = t ∨ (s1 ⊕ s2)
459 //
460 // Negating a value just involves XORing s by a.
461 //
462 // struct poly3 {
463 // struct poly2 s, a;
464 // };
465 
466 OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
467  struct poly3 p;
468  OPENSSL_memcpy(&p, in, sizeof(p));
469  p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
470  p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
471 
472  printf("{[");
473  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
474  if (i) {
475  printf(" ");
476  }
477  printf(BN_HEX_FMT2, p.s.v[i]);
478  }
479  printf("] [");
480  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
481  if (i) {
482  printf(" ");
483  }
484  printf(BN_HEX_FMT2, p.a.v[i]);
485  }
486  printf("]}\n");
487 }
488 
489 static void poly3_zero(struct poly3 *p) {
490  poly2_zero(&p->s);
491  poly2_zero(&p->a);
492 }
493 
494 // poly3_reverse_700 reverses the order of the first 700 terms of |in| and
495 // writes them to |out|.
496 static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) {
497  poly2_reverse_700(&out->a, &in->a);
498  poly2_reverse_700(&out->s, &in->s);
499 }
500 
501 // poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|).
502 static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a,
503  const crypto_word_t s1, const crypto_word_t a1,
504  const crypto_word_t s2, const crypto_word_t a2) {
505  *out_a = a1 & a2;
506  *out_s = (s1 ^ s2) & *out_a;
507 }
508 
509 // poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|).
510 static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a,
511  const crypto_word_t s1, const crypto_word_t a1,
512  const crypto_word_t s2, const crypto_word_t a2) {
513  const crypto_word_t t = s1 ^ a2;
514  *out_s = t & (s2 ^ a1);
515  *out_a = (a1 ^ a2) | (t ^ s2);
516 }
517 
518 // poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|).
519 static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a,
520  const crypto_word_t s1, const crypto_word_t a1,
521  const crypto_word_t s2, const crypto_word_t a2) {
522  const crypto_word_t t = a1 ^ a2;
523  *out_s = (s1 ^ a2) & (t ^ s2);
524  *out_a = t | (s1 ^ s2);
525 }
526 
527 // poly3_mul_const sets |p| to |p|×m, where m = (ms, ma).
528 static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
529  crypto_word_t ma) {
530  ms = lsb_to_all(ms);
531  ma = lsb_to_all(ma);
532 
533  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
534  poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma);
535  }
536 }
537 
538 // poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma).
539 static void poly3_fmsub(struct poly3 *RESTRICT out,
540  const struct poly3 *RESTRICT in, crypto_word_t ms,
541  crypto_word_t ma) {
542  crypto_word_t product_s, product_a;
543  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
544  poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma);
545  poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i],
546  product_s, product_a);
547  }
548 }
549 
550 // final_bit_to_all replicates the bit in the final position of the last word to
551 // all the bits in the word.
552 static crypto_word_t final_bit_to_all(crypto_word_t v) {
553  return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
554 }
555 
556 // poly3_top_bits_are_clear returns one iff the extra bits in the final words of
557 // |p| are zero.
558 OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
560 }
561 
562 // poly3_mod_phiN reduces |p| by Φ(N).
563 static void poly3_mod_phiN(struct poly3 *p) {
564  // In order to reduce by Φ(N) we subtract by the value of the greatest
565  // coefficient.
566  const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
567  const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
568 
569  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
570  poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s,
571  factor_a);
572  }
573 
574  poly2_clear_top_bits(&p->s);
575  poly2_clear_top_bits(&p->a);
576 }
577 
578 static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) {
579  poly2_cswap(&a->s, &b->s, swap);
580  poly2_cswap(&a->a, &b->a, swap);
581 }
582 
583 static void poly3_lshift1(struct poly3 *p) {
584  poly2_lshift1(&p->s);
585  poly2_lshift1(&p->a);
586 }
587 
588 static void poly3_rshift1(struct poly3 *p) {
589  poly2_rshift1(&p->s);
590  poly2_rshift1(&p->a);
591 }
592 
593 // poly3_span represents a pointer into a poly3.
594 struct poly3_span {
595  crypto_word_t *s;
596  crypto_word_t *a;
597 };
598 
599 // poly3_span_add adds |n| words of values from |a| and |b| and writes the
600 // result to |out|.
601 static void poly3_span_add(const struct poly3_span *out,
602  const struct poly3_span *a,
603  const struct poly3_span *b, size_t n) {
604  for (size_t i = 0; i < n; i++) {
605  poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
606  }
607 }
608 
609 // poly3_span_sub subtracts |n| words of |b| from |n| words of |a|.
610 static void poly3_span_sub(const struct poly3_span *a,
611  const struct poly3_span *b, size_t n) {
612  for (size_t i = 0; i < n; i++) {
613  poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
614  }
615 }
616 
617 // poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
618 // |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
619 // |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
620 // used and the recursion stops. For |n| in {11, 22}, the transitive total
621 // amount of |scratch| needed happens to be 2n+2.
622 static void poly3_mul_aux(const struct poly3_span *out,
623  const struct poly3_span *scratch,
624  const struct poly3_span *a,
625  const struct poly3_span *b, size_t n) {
626  if (n == 1) {
627  crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
628  crypto_word_t b_s = b->s[0], b_a = b->a[0];
629  const crypto_word_t a_s = a->s[0], a_a = a->a[0];
630 
631  for (size_t i = 0; i < BITS_PER_WORD; i++) {
632  // Multiply (s, a) by the next value from (b_s, b_a).
633  crypto_word_t m_s, m_a;
634  poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a));
635  b_s >>= 1;
636  b_a >>= 1;
637 
638  if (i == 0) {
639  // Special case otherwise the code tries to shift by BITS_PER_WORD
640  // below, which is undefined.
641  r_s_low = m_s;
642  r_a_low = m_a;
643  continue;
644  }
645 
646  // Shift the multiplication result to the correct position.
647  const crypto_word_t m_s_low = m_s << i;
648  const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
649  const crypto_word_t m_a_low = m_a << i;
650  const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
651 
652  // Add into the result.
653  poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
654  poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
655  m_a_high);
656  }
657 
658  out->s[0] = r_s_low;
659  out->s[1] = r_s_high;
660  out->a[0] = r_a_low;
661  out->a[1] = r_a_high;
662  return;
663  }
664 
665  // Karatsuba multiplication.
666  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
667 
668  // When |n| is odd, the two "halves" will have different lengths. The first
669  // is always the smaller.
670  const size_t low_len = n / 2;
671  const size_t high_len = n - low_len;
672  const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
673  const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
674 
675  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
676  // half.
677  const struct poly3_span a_cross_sum = *out;
678  const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
679  poly3_span_add(&a_cross_sum, a, &a_high, low_len);
680  poly3_span_add(&b_cross_sum, b, &b_high, low_len);
681  if (high_len != low_len) {
682  a_cross_sum.s[low_len] = a_high.s[low_len];
683  a_cross_sum.a[low_len] = a_high.a[low_len];
684  b_cross_sum.s[low_len] = b_high.s[low_len];
685  b_cross_sum.a[low_len] = b_high.a[low_len];
686  }
687 
688  const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
689  &scratch->a[2 * high_len]};
690  const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
691  const struct poly3_span out_high = {&out->s[2 * low_len],
692  &out->a[2 * low_len]};
693 
694  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
695  poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
696  // Calculate a_1 × b_1.
697  poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
698  // Calculate a_0 × b_0.
699  poly3_mul_aux(out, &child_scratch, a, b, low_len);
700 
701  // Subtract those last two products from the first.
702  poly3_span_sub(scratch, out, low_len * 2);
703  poly3_span_sub(scratch, &out_high, high_len * 2);
704 
705  // Add the middle product into the output.
706  poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
707 }
708 
709 // HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
710 void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
711  const struct poly3 *y) {
712  crypto_word_t prod_s[WORDS_PER_POLY * 2];
713  crypto_word_t prod_a[WORDS_PER_POLY * 2];
714  crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
715  crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
716  const struct poly3_span prod_span = {prod_s, prod_a};
717  const struct poly3_span scratch_span = {scratch_s, scratch_a};
718  const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
719  (crypto_word_t *)x->a.v};
720  const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
721  (crypto_word_t *)y->a.v};
722 
723  poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
724 
725  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
726  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
727  // BITS_PER_WORD, so the upper-half vectors all have to be shifted before
728  // being added to the lower-half.
729  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
730  crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
731  v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
732  crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
733  v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
734 
735  poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
736  }
737 
739 }
740 
741 #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
742 
743 // poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
744 // |0xff..ff|. Otherwise, |swap| must be zero.
745 static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
746  vec_t b_a[6], const vec_t swap) {
747  for (int i = 0; i < 6; i++) {
748  const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
749  a_s[i] ^= sum_s;
750  b_s[i] ^= sum_s;
751 
752  const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
753  a_a[i] ^= sum_a;
754  b_a[i] ^= sum_a;
755  }
756 }
757 
758 // poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|).
759 static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
760  vec_t b_a[6], const vec_t ms,
761  const vec_t ma) {
762  for (int i = 0; i < 6; i++) {
763  // See the bitslice formula, above.
764  const vec_t s = b_s[i];
765  const vec_t a = b_a[i];
766  const vec_t product_a = a & ma;
767  const vec_t product_s = (s ^ ms) & product_a;
768 
769  const vec_t out_s = a_s[i];
770  const vec_t out_a = a_a[i];
771  const vec_t t = out_a ^ product_a;
772  a_s[i] = (out_s ^ product_a) & (t ^ product_s);
773  a_a[i] = t | (out_s ^ product_s);
774  }
775 }
776 
777 // poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
778 // Φ(N).
779 static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
780  // This algorithm is taken from section 7.1 of [SAFEGCD].
781  const vec_t kZero = {0};
782  const vec_t kOne = {1};
783  static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
784  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
785 
786  vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
787  // v = 0
788  memset(&v_s, 0, sizeof(v_s));
789  memset(&v_a, 0, sizeof(v_a));
790  // r = 1
791  memset(&r_s, 0, sizeof(r_s));
792  memset(&r_a, 0, sizeof(r_a));
793  r_a[0] = kOne;
794  // f = all ones.
795  memset(f_s, 0, sizeof(f_s));
796  memset(f_a, 0xff, 5 * sizeof(vec_t));
797  memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
798  // g is the reversal of |in|.
799  struct poly3 in_reversed;
800  poly3_reverse_700(&in_reversed, in);
801  g_s[5] = kZero;
802  memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
803  g_a[5] = kZero;
804  memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
805 
806  int delta = 1;
807 
808  for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
809  poly3_vec_lshift1(v_s, v_a);
810 
811  const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
812  const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
813  const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
814  const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]);
815  const vec_t mask_w =
816  {delta_is_non_negative & delta_is_non_zero};
817  const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term;
818 
819  const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]);
820  const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a);
821 
822  delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta);
823  delta++;
824 
825  poly3_vec_cswap(f_s, f_a, g_s, g_a, mask);
826  poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a);
827  poly3_vec_rshift1(g_s, g_a);
828 
829  poly3_vec_cswap(v_s, v_a, r_s, r_a, mask);
830  poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a);
831  }
832 
833  assert(delta == 0);
834  memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t));
835  memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t));
836  poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0));
838 }
839 
840 #endif // HRSS_HAVE_VECTOR_UNIT
841 
842 // HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
843 // Φ(N).
844 void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
845  // The vector version of this function seems slightly slower on AArch64, but
846  // is useful on ARMv7 and x86-64.
847 #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
848  if (vec_capable()) {
849  poly3_invert_vec(out, in);
850  return;
851  }
852 #endif
853 
854  // This algorithm is taken from section 7.1 of [SAFEGCD].
855  struct poly3 v, r, f, g;
856  // v = 0
857  poly3_zero(&v);
858  // r = 1
859  poly3_zero(&r);
860  r.a.v[0] = 1;
861  // f = all ones.
862  OPENSSL_memset(&f.s, 0, sizeof(struct poly2));
863  OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2));
865  // g is the reversal of |in|.
867  int delta = 1;
868 
869  for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
870  poly3_lshift1(&v);
871 
872  const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
873  const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
874  const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
875  const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]);
876  const crypto_word_t mask =
877  g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
878 
879  crypto_word_t c_s, c_a;
880  poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]);
881  c_s = lsb_to_all(c_s);
882  c_a = lsb_to_all(c_a);
883 
884  delta = constant_time_select_int(mask, -delta, delta);
885  delta++;
886 
887  poly3_cswap(&f, &g, mask);
888  poly3_fmsub(&g, &f, c_s, c_a);
889  poly3_rshift1(&g);
890 
891  poly3_cswap(&v, &r, mask);
892  poly3_fmsub(&r, &v, c_s, c_a);
893  }
894 
895  assert(delta == 0);
896  poly3_mul_const(&v, f.s.v[0], f.a.v[0]);
898 }
899 
900 // Polynomials in Q.
901 
902 // Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
903 // coefficients do not form a field.)
904 #define Q 8192
905 
906 // VECS_PER_POLY is the number of 128-bit vectors needed to represent a
907 // polynomial.
908 #define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
909 #define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
910 
911 // poly represents a polynomial with coefficients mod Q. Note that, while Q is a
912 // power of two, this does not operate in GF(Q). That would be a binary field
913 // but this is simply mod Q. Thus the coefficients are not a field.
914 //
915 // Coefficients are ordered little-endian, thus the coefficient of x^0 is the
916 // first element of the array.
917 struct poly {
918 #if defined(HRSS_HAVE_VECTOR_UNIT)
919  union {
920  // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
921  // the vector code.
922  uint16_t v[N + 3];
923  vec_t vectors[VECS_PER_POLY];
924  };
925 #else
926  // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that
927  // requires alignment.
928  alignas(16) uint16_t v[N + 3];
929 #endif
930 };
931 
932 OPENSSL_UNUSED static void poly_print(const struct poly *p) {
933  printf("[");
934  for (unsigned i = 0; i < N; i++) {
935  if (i) {
936  printf(" ");
937  }
938  printf("%d", p->v[i]);
939  }
940  printf("]\n");
941 }
942 
943 // POLY_MUL_SCRATCH contains space for the working variables needed by
944 // |poly_mul|. The contents afterwards may be discarded, but the object may also
945 // be reused with future |poly_mul| calls to save heap allocations.
946 //
947 // This object must have 32-byte alignment.
949  union {
950  // This is used by |poly_mul_novec|.
951  struct {
954  } novec;
955 
956 #if defined(HRSS_HAVE_VECTOR_UNIT)
957  // This is used by |poly_mul_vec|.
958  struct {
959  vec_t prod[VECS_PER_POLY * 2];
960  vec_t scratch[172];
961  } vec;
962 #endif
963 
964 #if defined(POLY_RQ_MUL_ASM)
965  // This is the space used by |poly_Rq_mul|.
966  uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE];
967 #endif
968  } u;
969 };
970 
971 #if defined(HRSS_HAVE_VECTOR_UNIT)
972 
973 // poly_mul_vec_aux is a recursive function that multiplies |n| words from |a|
974 // and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements
975 // of |scratch| and the function recurses, except if |n| < 3, when |scratch|
976 // isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch|
977 // needs 172 elements.
978 static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
979  const vec_t *restrict a, const vec_t *restrict b,
980  const size_t n) {
981  // In [HRSS], the technique they used for polynomial multiplication is
982  // described: they start with Toom-4 at the top level and then two layers of
983  // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
984  // decomposition, which splits an input n-ways and produces 2n-1
985  // multiplications of those parts. So, starting with 704 coefficients (rounded
986  // up from 701 to have more factors of two), Toom-4 gives seven
987  // multiplications of degree-174 polynomials. Each round of Karatsuba (which
988  // is Toom-2) increases the number of multiplications by a factor of three
989  // while halving the size of the values being multiplied. So two rounds gives
990  // 63 multiplications of degree-44 polynomials. Then they (I think) form
991  // vectors by gathering all 63 coefficients of each power together, for each
992  // input, and doing more rounds of Karatsuba on the vectors until they bottom-
993  // out somewhere with schoolbook multiplication.
994  //
995  // I tried something like that for NEON. NEON vectors are 128 bits so hold
996  // eight coefficients. I wrote a function that did Karatsuba on eight
997  // multiplications at the same time, using such vectors, and a Go script that
998  // decomposed from degree-704, with Karatsuba in non-transposed form, until it
999  // reached multiplications of degree-44. It batched up those 81
1000  // multiplications into lots of eight with a single one left over (which was
1001  // handled directly).
1002  //
1003  // It worked, but it was significantly slower than the dumb algorithm used
1004  // below. Potentially that was because I misunderstood how [HRSS] did it, or
1005  // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
1006  // (Which is true: the code generated by Clang for the below is pretty crap.)
1007  //
1008  // This algorithm is much simpler. It just does Karatsuba decomposition all
1009  // the way down and never transposes. When it gets down to degree-16 or
1010  // degree-24 values, they are multiplied using schoolbook multiplication and
1011  // vector intrinsics. The vector operations form each of the eight phase-
1012  // shifts of one of the inputs, point-wise multiply, and then add into the
1013  // result at the correct place. This means that 33% (degree-16) or 25%
1014  // (degree-24) of the multiplies and adds are wasted, but it does ok.
1015  if (n == 2) {
1016  vec_t result[4];
1017  vec_t vec_a[3];
1018  static const vec_t kZero = {0};
1019  vec_a[0] = a[0];
1020  vec_a[1] = a[1];
1021  vec_a[2] = kZero;
1022 
1023  result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
1024  result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
1025 
1026  result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
1027  result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
1028  result[3] = kZero;
1029 
1030  vec3_rshift_word(vec_a);
1031 
1032 #define BLOCK(x, y) \
1033  do { \
1034  result[x + 0] = \
1035  vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1036  result[x + 1] = \
1037  vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1038  result[x + 2] = \
1039  vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1040  } while (0)
1041 
1042  BLOCK(0, 1);
1043  BLOCK(1, 9);
1044 
1045  vec3_rshift_word(vec_a);
1046 
1047  BLOCK(0, 2);
1048  BLOCK(1, 10);
1049 
1050  vec3_rshift_word(vec_a);
1051 
1052  BLOCK(0, 3);
1053  BLOCK(1, 11);
1054 
1055  vec3_rshift_word(vec_a);
1056 
1057  BLOCK(0, 4);
1058  BLOCK(1, 12);
1059 
1060  vec3_rshift_word(vec_a);
1061 
1062  BLOCK(0, 5);
1063  BLOCK(1, 13);
1064 
1065  vec3_rshift_word(vec_a);
1066 
1067  BLOCK(0, 6);
1068  BLOCK(1, 14);
1069 
1070  vec3_rshift_word(vec_a);
1071 
1072  BLOCK(0, 7);
1073  BLOCK(1, 15);
1074 
1075 #undef BLOCK
1076 
1077  memcpy(out, result, sizeof(result));
1078  return;
1079  }
1080 
1081  if (n == 3) {
1082  vec_t result[6];
1083  vec_t vec_a[4];
1084  static const vec_t kZero = {0};
1085  vec_a[0] = a[0];
1086  vec_a[1] = a[1];
1087  vec_a[2] = a[2];
1088  vec_a[3] = kZero;
1089 
1090  result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
1091  result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
1092  result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
1093 
1094 #define BLOCK_PRE(x, y) \
1095  do { \
1096  result[x + 0] = \
1097  vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1098  result[x + 1] = \
1099  vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1100  result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1101  } while (0)
1102 
1103  BLOCK_PRE(1, 8);
1104  BLOCK_PRE(2, 16);
1105 
1106  result[5] = kZero;
1107 
1108  vec4_rshift_word(vec_a);
1109 
1110 #define BLOCK(x, y) \
1111  do { \
1112  result[x + 0] = \
1113  vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
1114  result[x + 1] = \
1115  vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
1116  result[x + 2] = \
1117  vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
1118  result[x + 3] = \
1119  vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
1120  } while (0)
1121 
1122  BLOCK(0, 1);
1123  BLOCK(1, 9);
1124  BLOCK(2, 17);
1125 
1126  vec4_rshift_word(vec_a);
1127 
1128  BLOCK(0, 2);
1129  BLOCK(1, 10);
1130  BLOCK(2, 18);
1131 
1132  vec4_rshift_word(vec_a);
1133 
1134  BLOCK(0, 3);
1135  BLOCK(1, 11);
1136  BLOCK(2, 19);
1137 
1138  vec4_rshift_word(vec_a);
1139 
1140  BLOCK(0, 4);
1141  BLOCK(1, 12);
1142  BLOCK(2, 20);
1143 
1144  vec4_rshift_word(vec_a);
1145 
1146  BLOCK(0, 5);
1147  BLOCK(1, 13);
1148  BLOCK(2, 21);
1149 
1150  vec4_rshift_word(vec_a);
1151 
1152  BLOCK(0, 6);
1153  BLOCK(1, 14);
1154  BLOCK(2, 22);
1155 
1156  vec4_rshift_word(vec_a);
1157 
1158  BLOCK(0, 7);
1159  BLOCK(1, 15);
1160  BLOCK(2, 23);
1161 
1162 #undef BLOCK
1163 #undef BLOCK_PRE
1164 
1165  memcpy(out, result, sizeof(result));
1166 
1167  return;
1168  }
1169 
1170  // Karatsuba multiplication.
1171  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
1172 
1173  // When |n| is odd, the two "halves" will have different lengths. The first is
1174  // always the smaller.
1175  const size_t low_len = n / 2;
1176  const size_t high_len = n - low_len;
1177  const vec_t *a_high = &a[low_len];
1178  const vec_t *b_high = &b[low_len];
1179 
1180  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
1181  // half.
1182  for (size_t i = 0; i < low_len; i++) {
1183  out[i] = vec_add(a_high[i], a[i]);
1184  out[high_len + i] = vec_add(b_high[i], b[i]);
1185  }
1186  if (high_len != low_len) {
1187  out[low_len] = a_high[low_len];
1188  out[high_len + low_len] = b_high[low_len];
1189  }
1190 
1191  vec_t *const child_scratch = &scratch[2 * high_len];
1192  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
1193  poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
1194  // Calculate a_1 × b_1.
1195  poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
1196  // Calculate a_0 × b_0.
1197  poly_mul_vec_aux(out, child_scratch, a, b, low_len);
1198 
1199  // Subtract those last two products from the first.
1200  for (size_t i = 0; i < low_len * 2; i++) {
1201  scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
1202  }
1203  if (low_len != high_len) {
1204  scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
1205  scratch[low_len * 2 + 1] =
1206  vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
1207  }
1208 
1209  // Add the middle product into the output.
1210  for (size_t i = 0; i < high_len * 2; i++) {
1211  out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
1212  }
1213 }
1214 
1215 // poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
1216 static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1217  const struct poly *x, const struct poly *y) {
1218  OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t));
1219  OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t));
1220 
1221  OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
1222  "struct poly is the wrong size");
1223  OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t),
1224  "struct poly has incorrect alignment");
1225 
1226  vec_t *const prod = scratch->u.vec.prod;
1227  vec_t *const aux_scratch = scratch->u.vec.scratch;
1228  poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY);
1229 
1230  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
1231  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
1232  // the vector size, so the upper-half vectors all have to be shifted before
1233  // being added to the lower-half.
1234  vec_t *out_vecs = (vec_t *)out->v;
1235 
1236  for (size_t i = 0; i < VECS_PER_POLY; i++) {
1237  const vec_t prev = prod[VECS_PER_POLY - 1 + i];
1238  const vec_t this = prod[VECS_PER_POLY + i];
1239  out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
1240  }
1241 
1242  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
1243 }
1244 
1245 #endif // HRSS_HAVE_VECTOR_UNIT
1246 
1247 // poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
1248 // |scratch| as scratch space. It'll use Karatsuba if the inputs are large
1249 // enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
1250 // the function recurses, except if |n| < 64, when |scratch| isn't used and the
1251 // recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
1253  const uint16_t *a, const uint16_t *b, size_t n) {
1254  static const size_t kSchoolbookLimit = 64;
1255  if (n < kSchoolbookLimit) {
1256  OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
1257  for (size_t i = 0; i < n; i++) {
1258  for (size_t j = 0; j < n; j++) {
1259  out[i + j] += (unsigned) a[i] * b[j];
1260  }
1261  }
1262 
1263  return;
1264  }
1265 
1266  // Karatsuba multiplication.
1267  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
1268 
1269  // When |n| is odd, the two "halves" will have different lengths. The
1270  // first is always the smaller.
1271  const size_t low_len = n / 2;
1272  const size_t high_len = n - low_len;
1273  const uint16_t *const a_high = &a[low_len];
1274  const uint16_t *const b_high = &b[low_len];
1275 
1276  for (size_t i = 0; i < low_len; i++) {
1277  out[i] = a_high[i] + a[i];
1278  out[high_len + i] = b_high[i] + b[i];
1279  }
1280  if (high_len != low_len) {
1281  out[low_len] = a_high[low_len];
1282  out[high_len + low_len] = b_high[low_len];
1283  }
1284 
1285  uint16_t *const child_scratch = &scratch[2 * high_len];
1286  poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
1287  poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
1288  high_len);
1289  poly_mul_novec_aux(out, child_scratch, a, b, low_len);
1290 
1291  for (size_t i = 0; i < low_len * 2; i++) {
1292  scratch[i] -= out[i] + out[low_len * 2 + i];
1293  }
1294  if (low_len != high_len) {
1295  scratch[low_len * 2] -= out[low_len * 4];
1296  assert(out[low_len * 4 + 1] == 0);
1297  }
1298 
1299  for (size_t i = 0; i < high_len * 2; i++) {
1300  out[low_len + i] += scratch[i];
1301  }
1302 }
1303 
1304 // poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
1305 static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1306  const struct poly *x, const struct poly *y) {
1307  uint16_t *const prod = scratch->u.novec.prod;
1308  uint16_t *const aux_scratch = scratch->u.novec.scratch;
1309  poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N);
1310 
1311  for (size_t i = 0; i < N; i++) {
1312  out->v[i] = prod[i] + prod[i + N];
1313  }
1314  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
1315 }
1316 
1317 static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
1318  const struct poly *a, const struct poly *b) {
1319 #if defined(POLY_RQ_MUL_ASM)
1320  const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
1321  if (has_avx2) {
1322  poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
1323  return;
1324  }
1325 #endif
1326 
1327 #if defined(HRSS_HAVE_VECTOR_UNIT)
1328  if (vec_capable()) {
1329  poly_mul_vec(scratch, r, a, b);
1330  return;
1331  }
1332 #endif
1333 
1334  // Fallback, non-vector case.
1335  poly_mul_novec(scratch, r, a, b);
1336 }
1337 
1338 // poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1).
1339 static void poly_mul_x_minus_1(struct poly *p) {
1340  // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
1341  // the value of the previous one.
1342  const uint16_t orig_final_coefficient = p->v[N - 1];
1343 
1344  for (size_t i = N - 1; i > 0; i--) {
1345  p->v[i] = p->v[i - 1] - p->v[i];
1346  }
1347  p->v[0] = orig_final_coefficient - p->v[0];
1348 }
1349 
1350 // poly_mod_phiN sets |p| to |p| mod Φ(N).
1351 static void poly_mod_phiN(struct poly *p) {
1352  const uint16_t coeff700 = p->v[N - 1];
1353 
1354  for (unsigned i = 0; i < N; i++) {
1355  p->v[i] -= coeff700;
1356  }
1357 }
1358 
1359 // poly_clamp reduces each coefficient mod Q.
1360 static void poly_clamp(struct poly *p) {
1361  for (unsigned i = 0; i < N; i++) {
1362  p->v[i] &= Q - 1;
1363  }
1364 }
1365 
1366 
1367 // Conversion functions
1368 // --------------------
1369 
1370 // poly2_from_poly sets |*out| to |in| mod 2.
1371 static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
1372  crypto_word_t *words = out->v;
1373  unsigned shift = 0;
1374  crypto_word_t word = 0;
1375 
1376  for (unsigned i = 0; i < N; i++) {
1377  word >>= 1;
1378  word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
1379  shift++;
1380 
1381  if (shift == BITS_PER_WORD) {
1382  *words = word;
1383  words++;
1384  word = 0;
1385  shift = 0;
1386  }
1387  }
1388 
1389  word >>= BITS_PER_WORD - shift;
1390  *words = word;
1391 }
1392 
1393 // mod3 treats |a| as a signed number and returns |a| mod 3.
1395  const int16_t q = ((int32_t)a * 21845) >> 16;
1396  int16_t ret = a - 3 * q;
1397  // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
1398  // 1, 2, 0}.
1399  return ret & ((ret & (ret >> 1)) - 1);
1400 }
1401 
1402 // poly3_from_poly sets |*out| to |in|.
1403 static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
1404  crypto_word_t *words_s = out->s.v;
1405  crypto_word_t *words_a = out->a.v;
1406  crypto_word_t s = 0;
1407  crypto_word_t a = 0;
1408  unsigned shift = 0;
1409 
1410  for (unsigned i = 0; i < N; i++) {
1411  // This duplicates the 13th bit upwards to the top of the uint16,
1412  // essentially treating it as a sign bit and converting into a signed int16.
1413  // The signed value is reduced mod 3, yielding {0, 1, 2}.
1414  const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
1415  s >>= 1;
1416  const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
1417  s |= s_bit;
1418  a >>= 1;
1419  a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
1420  shift++;
1421 
1422  if (shift == BITS_PER_WORD) {
1423  *words_s = s;
1424  words_s++;
1425  *words_a = a;
1426  words_a++;
1427  s = a = 0;
1428  shift = 0;
1429  }
1430  }
1431 
1432  s >>= BITS_PER_WORD - shift;
1433  a >>= BITS_PER_WORD - shift;
1434  *words_s = s;
1435  *words_a = a;
1436 }
1437 
1438 // poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
1439 // Q-1}. It returns a mask indicating whether all coefficients were found to be
1440 // in that set.
1441 static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
1442  const struct poly *in) {
1443  crypto_word_t *words_s = out->s.v;
1444  crypto_word_t *words_a = out->a.v;
1445  crypto_word_t s = 0;
1446  crypto_word_t a = 0;
1447  unsigned shift = 0;
1448  crypto_word_t ok = CONSTTIME_TRUE_W;
1449 
1450  for (unsigned i = 0; i < N; i++) {
1451  const uint16_t v = in->v[i];
1452  // Maps {0, 1, Q-1} to {0, 1, 2}.
1453  uint16_t mod3 = v & 3;
1454  mod3 ^= mod3 >> 1;
1455  const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
1456  ok &= constant_time_eq_w(v, expected);
1457 
1458  s >>= 1;
1459  const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2)
1460  << (BITS_PER_WORD - 2);
1461  s |= s_bit;
1462  a >>= 1;
1463  a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
1464  shift++;
1465 
1466  if (shift == BITS_PER_WORD) {
1467  *words_s = s;
1468  words_s++;
1469  *words_a = a;
1470  words_a++;
1471  s = a = 0;
1472  shift = 0;
1473  }
1474  }
1475 
1476  s >>= BITS_PER_WORD - shift;
1477  a >>= BITS_PER_WORD - shift;
1478  *words_s = s;
1479  *words_a = a;
1480 
1481  return ok;
1482 }
1483 
1484 static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
1485  const crypto_word_t *words = in->v;
1486  unsigned shift = 0;
1487  crypto_word_t word = *words;
1488 
1489  for (unsigned i = 0; i < N; i++) {
1490  out->v[i] = word & 1;
1491  word >>= 1;
1492  shift++;
1493 
1494  if (shift == BITS_PER_WORD) {
1495  words++;
1496  word = *words;
1497  shift = 0;
1498  }
1499  }
1500 }
1501 
1502 static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
1503  const crypto_word_t *words_s = in->s.v;
1504  const crypto_word_t *words_a = in->a.v;
1505  crypto_word_t word_s = ~(*words_s);
1506  crypto_word_t word_a = *words_a;
1507  unsigned shift = 0;
1508 
1509  for (unsigned i = 0; i < N; i++) {
1510  out->v[i] = (uint16_t)(word_s & 1) - 1;
1511  out->v[i] |= word_a & 1;
1512  word_s >>= 1;
1513  word_a >>= 1;
1514  shift++;
1515 
1516  if (shift == BITS_PER_WORD) {
1517  words_s++;
1518  words_a++;
1519  word_s = ~(*words_s);
1520  word_a = *words_a;
1521  shift = 0;
1522  }
1523  }
1524 }
1525 
1526 // Polynomial inversion
1527 // --------------------
1528 
1529 // poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod
1530 // Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
1531 // mod Q.
1532 static void poly_invert_mod2(struct poly *out, const struct poly *in) {
1533  // This algorithm is taken from section 7.1 of [SAFEGCD].
1534  struct poly2 v, r, f, g;
1535 
1536  // v = 0
1537  poly2_zero(&v);
1538  // r = 1
1539  poly2_zero(&r);
1540  r.v[0] = 1;
1541  // f = all ones.
1542  OPENSSL_memset(&f, 0xff, sizeof(struct poly2));
1544  // g is the reversal of |in|.
1545  poly2_from_poly(&g, in);
1546  poly2_mod_phiN(&g);
1547  poly2_reverse_700(&g, &g);
1548  int delta = 1;
1549 
1550  for (size_t i = 0; i < (2*(N-1)) - 1; i++) {
1551  poly2_lshift1(&v);
1552 
1553  const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1;
1554  const crypto_word_t delta_is_non_negative = delta_sign_bit - 1;
1555  const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta);
1556  const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]);
1557  const crypto_word_t mask =
1558  g_has_constant_term & delta_is_non_negative & delta_is_non_zero;
1559 
1560  const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]);
1561 
1562  delta = constant_time_select_int(mask, -delta, delta);
1563  delta++;
1564 
1565  poly2_cswap(&f, &g, mask);
1566  poly2_fmadd(&g, &f, c);
1567  poly2_rshift1(&g);
1568 
1569  poly2_cswap(&v, &r, mask);
1570  poly2_fmadd(&r, &v, c);
1571  }
1572 
1573  assert(delta == 0);
1574  assert(f.v[0] & 1);
1575  poly2_reverse_700(&v, &v);
1576  poly_from_poly2(out, &v);
1577 }
1578 
1579 // poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
1580 static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out,
1581  const struct poly *in) {
1582  // Inversion mod Q, which is done based on the result of inverting mod
1583  // 2. See [NTRUTN14] paper, bottom of page two.
1584  struct poly a, *b, tmp;
1585 
1586  // a = -in.
1587  for (unsigned i = 0; i < N; i++) {
1588  a.v[i] = -in->v[i];
1589  }
1590 
1591  // b = in^-1 mod 2.
1592  b = out;
1593  poly_invert_mod2(b, in);
1594 
1595  // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
1596  // times, which is four.
1597  for (unsigned i = 0; i < 4; i++) {
1598  poly_mul(scratch, &tmp, &a, b);
1599  tmp.v[0] += 2;
1600  poly_mul(scratch, b, b, &tmp);
1601  }
1602 }
1603 
1604 // Marshal and unmarshal functions for various basic types.
1605 // --------------------------------------------------------
1606 
1607 #define POLY_BYTES 1138
1608 
1609 // poly_marshal serialises all but the final coefficient of |in| to |out|.
1610 static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
1611  const uint16_t *p = in->v;
1612 
1613  for (size_t i = 0; i < N / 8; i++) {
1614  out[0] = p[0];
1615  out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
1616  out[2] = p[1] >> 3;
1617  out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
1618  out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
1619  out[5] = p[3] >> 1;
1620  out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
1621  out[7] = p[4] >> 4;
1622  out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
1623  out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
1624  out[10] = p[6] >> 2;
1625  out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
1626  out[12] = p[7] >> 5;
1627 
1628  p += 8;
1629  out += 13;
1630  }
1631 
1632  // There are four remaining values.
1633  out[0] = p[0];
1634  out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
1635  out[2] = p[1] >> 3;
1636  out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
1637  out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
1638  out[5] = p[3] >> 1;
1639  out[6] = 0xf & (p[3] >> 9);
1640 }
1641 
1642 // poly_unmarshal parses the output of |poly_marshal| and sets |out| such that
1643 // all but the final coefficients match, and the final coefficient is calculated
1644 // such that evaluating |out| at one results in zero. It returns one on success
1645 // or zero if |in| is an invalid encoding.
1646 static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
1647  uint16_t *p = out->v;
1648 
1649  for (size_t i = 0; i < N / 8; i++) {
1650  p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
1651  p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
1652  (uint16_t)(in[3] & 3) << 11;
1653  p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
1654  p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
1655  (uint16_t)(in[6] & 0xf) << 9;
1656  p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
1657  (uint16_t)(in[8] & 1) << 12;
1658  p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
1659  p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
1660  (uint16_t)(in[11] & 7) << 10;
1661  p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
1662 
1663  p += 8;
1664  in += 13;
1665  }
1666 
1667  // There are four coefficients remaining.
1668  p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
1669  p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
1670  (uint16_t)(in[3] & 3) << 11;
1671  p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
1672  p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
1673  (uint16_t)(in[6] & 0xf) << 9;
1674 
1675  for (unsigned i = 0; i < N - 1; i++) {
1676  out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
1677  }
1678 
1679  // There are four unused bits in the last byte. We require them to be zero.
1680  if ((in[6] & 0xf0) != 0) {
1681  return 0;
1682  }
1683 
1684  // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6.
1685  uint32_t sum = 0;
1686  for (size_t i = 0; i < N - 1; i++) {
1687  sum += out->v[i];
1688  }
1689 
1690  out->v[N - 1] = (uint16_t)(0u - sum);
1691 
1692  return 1;
1693 }
1694 
1695 // mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may
1696 // have an invalid value when processing attacker-controlled inputs.
1698  v &= 3;
1699  return v ^ (v >> 1);
1700 }
1701 
1702 // poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
1703 // all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
1704 // have invalid values when processing attacker-controlled inputs.)
1706  const struct poly *in) {
1707  const uint16_t *coeffs = in->v;
1708 
1709  // Only 700 coefficients are marshaled because in[700] must be zero.
1710  assert(coeffs[N-1] == 0);
1711 
1712  for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
1713  const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
1714  const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
1715  const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
1716  const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
1717  const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
1718  out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
1719  coeffs += 5;
1720  }
1721 }
1722 
1723 // HRSS-specific functions
1724 // -----------------------
1725 
1726 // poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}.
1727 // This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but
1728 // with HRSS-SXY the sampling algorithm is now a private detail of the
1729 // implementation (previously it had to match between two parties). This
1730 // function uses that freedom to implement a flatter distribution of values.
1731 static void poly_short_sample(struct poly *out,
1732  const uint8_t in[HRSS_SAMPLE_BYTES]) {
1734  "HRSS_SAMPLE_BYTES incorrect");
1735  for (size_t i = 0; i < N - 1; i++) {
1736  uint16_t v = mod3(in[i]);
1737  // Map {0, 1, 2} -> {0, 1, 0xffff}
1738  v |= ((v >> 1) ^ 1) - 1;
1739  out->v[i] = v;
1740  }
1741  out->v[N - 1] = 0;
1742 }
1743 
1744 // poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
1745 // section 1.8.2.
1746 static void poly_short_sample_plus(struct poly *out,
1747  const uint8_t in[HRSS_SAMPLE_BYTES]) {
1749 
1750  // sum (and the product in the for loop) will overflow. But that's fine
1751  // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out.
1752  uint16_t sum = 0;
1753  for (unsigned i = 0; i < N - 2; i++) {
1754  sum += (unsigned) out->v[i] * out->v[i + 1];
1755  }
1756 
1757  // If the sum is negative, flip the sign of even-positioned coefficients. (See
1758  // page 8 of [HRSS].)
1759  sum = ((int16_t) sum) >> 15;
1760  const uint16_t scale = sum | (~sum & 1);
1761  for (unsigned i = 0; i < N; i += 2) {
1762  out->v[i] = (unsigned) out->v[i] * scale;
1763  }
1764 }
1765 
1766 // poly_lift computes the function discussed in [HRSS], appendix B.
1767 static void poly_lift(struct poly *out, const struct poly *a) {
1768  // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
1769  // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
1770 
1771  // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
1772  //
1773  // R.<x> = PolynomialRing(GF(3)…)
1774  // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
1775  // list(inv)[:15]
1776  // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
1777  //
1778  // This three-element pattern of coefficients repeats for the whole
1779  // polynomial.
1780  //
1781  // Next define the overbar operator such that z̅ = z[0] +
1782  // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
1783  // of the constant term. So index one is the coefficient of 𝑥 and so
1784  // on.)
1785  //
1786  // A less odd way to define this is to see that z̅ negates the indexes,
1787  // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
1788  //
1789  // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
1790  // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
1791  // of the point-wise products.) Although we calculated the inverse mod
1792  // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
1793  // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
1794  //
1795  // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
1796  // of the list of coefficients.
1797  //
1798  // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
1799  //
1800  // def reverse(xs):
1801  // suffix = list(xs[1:])
1802  // suffix.reverse()
1803  // return [xs[0]] + suffix
1804  //
1805  // def rotate(xs):
1806  // return [xs[-1]] + xs[:-1]
1807  //
1808  // zoverbar = reverse(list(inv) + [0])
1809  // xzoverbar = rotate(reverse(list(inv) + [0]))
1810  // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
1811  //
1812  // zoverbar[:15]
1813  // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
1814  // xzoverbar[:15]
1815  // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
1816  // x2zoverbar[:15]
1817  // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
1818  //
1819  // (For a formula for z̅, see lemma two of appendix B.)
1820  //
1821  // After the first three elements have been taken care of, all then have
1822  // a repeating three-element cycle. The next value (𝑥^3z̅) involves
1823  // three rotations of the first pattern, thus the three-element cycle
1824  // lines up. However, the discontinuity in the first three elements
1825  // obviously moves to a different position. Consider the difference
1826  // between 𝑥^3z̅ and z̅:
1827  //
1828  // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
1829  // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1830  //
1831  // This pattern of differences is the same for all elements, although it
1832  // obviously moves right with the rotations.
1833  //
1834  // From this, we reach algorithm eight of appendix B.
1835 
1836  // Handle the first three elements of the inner products.
1837  out->v[0] = a->v[0] + a->v[2];
1838  out->v[1] = a->v[1];
1839  out->v[2] = -a->v[0] + a->v[2];
1840 
1841  // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
1842  // respectively. We do not compute s1 because it's just -(s0 + s1).
1843  uint16_t s0 = 0, s2 = 0;
1844  for (size_t i = 3; i < 699; i += 3) {
1845  s0 += -a->v[i] + a->v[i + 2];
1846  // s1 += a->v[i] - a->v[i + 1];
1847  s2 += a->v[i + 1] - a->v[i + 2];
1848  }
1849 
1850  // Handle the fact that the three-element pattern doesn't fill the
1851  // polynomial exactly (since 701 isn't a multiple of three).
1852  s0 -= a->v[699];
1853  // s1 += a->v[699] - a->v[700];
1854  s2 += a->v[700];
1855 
1856  // Note that s0 + s1 + s2 = 0.
1857  out->v[0] += s0;
1858  out->v[1] -= (s0 + s2); // = s1
1859  out->v[2] += s2;
1860 
1861  // Calculate the remaining inner products by taking advantage of the
1862  // fact that the pattern repeats every three cycles and the pattern of
1863  // differences moves with the rotation.
1864  for (size_t i = 3; i < N; i++) {
1865  out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
1866  }
1867 
1868  // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
1869  // element and convert to mod Q. (See above about adding twice as
1870  // subtraction.)
1871  const crypto_word_t v = out->v[700];
1872  for (unsigned i = 0; i < N; i++) {
1873  const uint16_t vi_mod3 = mod3(out->v[i] - v);
1874  // Map {0, 1, 2} to {0, 1, 0xffff}.
1875  out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
1876  }
1877 
1879 }
1880 
1881 struct public_key {
1882  struct poly ph;
1883 };
1884 
1885 struct private_key {
1886  struct poly3 f, f_inverse;
1889 };
1890 
1891 // public_key_from_external converts an external public key pointer into an
1892 // internal one. Externally the alignment is only specified to be eight bytes
1893 // but we need 16-byte alignment. We could annotate the external struct with
1894 // that alignment but we can only assume that malloced pointers are 8-byte
1895 // aligned in any case. (Even if the underlying malloc returns values with
1896 // 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
1897 // that up.)
1899  struct HRSS_public_key *ext) {
1901  sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
1902  "HRSS public key too small");
1903 
1904  return align_pointer(ext->opaque, 16);
1905 }
1906 
1907 // private_key_from_external does the same thing as |public_key_from_external|,
1908 // but for private keys. See the comment on that function about alignment
1909 // issues.
1911  struct HRSS_private_key *ext) {
1913  sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
1914  "HRSS private key too small");
1915 
1916  return align_pointer(ext->opaque, 16);
1917 }
1918 
1919 // malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and
1920 // sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release
1921 // it. It returns NULL if out of memory.
1922 static void *malloc_align32(void **out_ptr, size_t size) {
1923  void *ptr = OPENSSL_malloc(size + 31);
1924  if (!ptr) {
1925  *out_ptr = NULL;
1926  return NULL;
1927  }
1928 
1929  *out_ptr = ptr;
1930  return align_pointer(ptr, 32);
1931 }
1932 
1934  struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
1936  struct public_key *pub = public_key_from_external(out_pub);
1937  struct private_key *priv = private_key_from_external(out_priv);
1938 
1939  struct vars {
1940  struct POLY_MUL_SCRATCH scratch;
1941  struct poly f;
1942  struct poly pg_phi1;
1943  struct poly pfg_phi1;
1944  struct poly pfg_phi1_inverse;
1945  };
1946 
1947  void *malloc_ptr;
1948  struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
1949  if (!vars) {
1950  // If the caller ignores the return value the output will still be safe.
1951  // The private key output is randomised in case it's later passed to
1952  // |HRSS_encap|.
1953  memset(out_pub, 0, sizeof(struct HRSS_public_key));
1954  RAND_bytes((uint8_t*) out_priv, sizeof(struct HRSS_private_key));
1955  return 0;
1956  }
1957 
1959  sizeof(priv->hmac_key));
1960 
1961  poly_short_sample_plus(&vars->f, in);
1962  poly3_from_poly(&priv->f, &vars->f);
1963  HRSS_poly3_invert(&priv->f_inverse, &priv->f);
1964 
1965  // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
1966  poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES);
1967  for (unsigned i = 0; i < N; i++) {
1968  vars->pg_phi1.v[i] *= 3;
1969  }
1970  poly_mul_x_minus_1(&vars->pg_phi1);
1971 
1972  poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1);
1973 
1974  poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1);
1975 
1976  poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1);
1977  poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1);
1978  poly_clamp(&pub->ph);
1979 
1980  poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse,
1981  &vars->f);
1982  poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f);
1983  poly_clamp(&priv->ph_inverse);
1984 
1985  OPENSSL_free(malloc_ptr);
1986  return 1;
1987 }
1988 
1989 static const char kSharedKey[] = "shared key";
1990 
1991 int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32],
1992  const struct HRSS_public_key *in_pub,
1994  const struct public_key *pub =
1995  public_key_from_external((struct HRSS_public_key *)in_pub);
1996 
1997  struct vars {
1998  struct POLY_MUL_SCRATCH scratch;
1999  struct poly m, r, m_lifted;
2000  struct poly prh_plus_m;
2001  SHA256_CTX hash_ctx;
2002  uint8_t m_bytes[HRSS_POLY3_BYTES];
2003  uint8_t r_bytes[HRSS_POLY3_BYTES];
2004  };
2005 
2006  void *malloc_ptr;
2007  struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
2008  if (!vars) {
2009  // If the caller ignores the return value the output will still be safe.
2010  // The private key output is randomised in case it's used to encrypt and
2011  // transmit something.
2012  memset(out_ciphertext, 0, POLY_BYTES);
2013  RAND_bytes(out_shared_key, 32);
2014  return 0;
2015  }
2016 
2017  poly_short_sample(&vars->m, in);
2018  poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES);
2019  poly_lift(&vars->m_lifted, &vars->m);
2020 
2021  poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph);
2022  for (unsigned i = 0; i < N; i++) {
2023  vars->prh_plus_m.v[i] += vars->m_lifted.v[i];
2024  }
2025 
2026  poly_marshal(out_ciphertext, &vars->prh_plus_m);
2027 
2028  poly_marshal_mod3(vars->m_bytes, &vars->m);
2029  poly_marshal_mod3(vars->r_bytes, &vars->r);
2030 
2031  SHA256_Init(&vars->hash_ctx);
2032  SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
2033  SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
2034  SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
2035  SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES);
2036  SHA256_Final(out_shared_key, &vars->hash_ctx);
2037 
2038  OPENSSL_free(malloc_ptr);
2039  return 1;
2040 }
2041 
2042 int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
2043  const struct HRSS_private_key *in_priv,
2044  const uint8_t *ciphertext, size_t ciphertext_len) {
2045  const struct private_key *priv =
2046  private_key_from_external((struct HRSS_private_key *)in_priv);
2047 
2048  struct vars {
2049  struct POLY_MUL_SCRATCH scratch;
2050  uint8_t masked_key[SHA256_CBLOCK];
2051  SHA256_CTX hash_ctx;
2052  struct poly c;
2053  struct poly f, cf;
2054  struct poly3 cf3, m3;
2055  struct poly m, m_lifted;
2056  struct poly r;
2057  struct poly3 r3;
2058  uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES];
2059  uint8_t m_bytes[HRSS_POLY3_BYTES];
2060  uint8_t r_bytes[HRSS_POLY3_BYTES];
2061  uint8_t shared_key[32];
2062  };
2063 
2064  void *malloc_ptr;
2065  struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars));
2066  if (!vars) {
2067  // If the caller ignores the return value the output will still be safe.
2068  // The private key output is randomised in case it's used to encrypt and
2069  // transmit something.
2070  RAND_bytes(out_shared_key, HRSS_KEY_BYTES);
2071  return 0;
2072  }
2073 
2074  // This is HMAC, expanded inline rather than using the |HMAC| function so that
2075  // we can avoid dealing with possible allocation failures and so keep this
2076  // function infallible.
2077  OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(vars->masked_key),
2078  "HRSS HMAC key larger than SHA-256 block size");
2079  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
2080  vars->masked_key[i] = priv->hmac_key[i] ^ 0x36;
2081  }
2082  OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36,
2083  sizeof(vars->masked_key) - sizeof(priv->hmac_key));
2084 
2085  SHA256_Init(&vars->hash_ctx);
2086  SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
2087  SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len);
2088  uint8_t inner_digest[SHA256_DIGEST_LENGTH];
2089  SHA256_Final(inner_digest, &vars->hash_ctx);
2090 
2091  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
2092  vars->masked_key[i] ^= (0x5c ^ 0x36);
2093  }
2094  OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c,
2095  sizeof(vars->masked_key) - sizeof(priv->hmac_key));
2096 
2097  SHA256_Init(&vars->hash_ctx);
2098  SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key));
2099  SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest));
2101  "HRSS shared key length incorrect");
2102  SHA256_Final(out_shared_key, &vars->hash_ctx);
2103 
2104  // If the ciphertext is publicly invalid then a random shared key is still
2105  // returned to simply the logic of the caller, but this path is not constant
2106  // time.
2107  if (ciphertext_len != HRSS_CIPHERTEXT_BYTES ||
2108  !poly_unmarshal(&vars->c, ciphertext)) {
2109  goto out;
2110  }
2111 
2112  poly_from_poly3(&vars->f, &priv->f);
2113  poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f);
2114  poly3_from_poly(&vars->cf3, &vars->cf);
2115  // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
2116  HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse);
2117 
2118  poly_from_poly3(&vars->m, &vars->m3);
2119  poly_lift(&vars->m_lifted, &vars->m);
2120 
2121  for (unsigned i = 0; i < N; i++) {
2122  vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i];
2123  }
2124  poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse);
2125  poly_mod_phiN(&vars->r);
2126  poly_clamp(&vars->r);
2127 
2128  crypto_word_t ok = poly3_from_poly_checked(&vars->r3, &vars->r);
2129 
2130  // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather
2131  // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)|
2132  // (called |b|) with:
2133  // t = (−b(1)/N) mod Q
2134  // c' = b + tΦ(N) + lift(m) mod Q
2135  //
2136  // When polynomials are transmitted, the final coefficient is omitted and
2137  // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also,
2138  // |poly_lift| multiplies the result by (x-1) and therefore evaluating a
2139  // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so
2140  // (c - lift(m))(1) == 0.
2141  //
2142  // Although we defer the reduction above, |b| is conceptually reduced mod
2143  // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every
2144  // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is
2145  // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction.
2146  // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to
2147  // recover |c| at all so long as we do the checks in
2148  // |poly3_from_poly_checked|.
2149  //
2150  // The |poly_marshal| here then is just confirming that |poly_unmarshal| is
2151  // strict and could be omitted.
2152 
2154  "ciphertext is the wrong size");
2155  assert(ciphertext_len == sizeof(vars->expected_ciphertext));
2156  poly_marshal(vars->expected_ciphertext, &vars->c);
2157 
2158  poly_marshal_mod3(vars->m_bytes, &vars->m);
2159  poly_marshal_mod3(vars->r_bytes, &vars->r);
2160 
2162  CRYPTO_memcmp(ciphertext, vars->expected_ciphertext,
2163  sizeof(vars->expected_ciphertext)));
2164 
2165  SHA256_Init(&vars->hash_ctx);
2166  SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey));
2167  SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes));
2168  SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes));
2169  SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext,
2170  sizeof(vars->expected_ciphertext));
2171  SHA256_Final(vars->shared_key, &vars->hash_ctx);
2172 
2173  for (unsigned i = 0; i < sizeof(vars->shared_key); i++) {
2174  out_shared_key[i] =
2175  constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]);
2176  }
2177 
2178 out:
2179  OPENSSL_free(malloc_ptr);
2180  return 1;
2181 }
2182 
2184  const struct HRSS_public_key *in_pub) {
2185  const struct public_key *pub =
2186  public_key_from_external((struct HRSS_public_key *)in_pub);
2187  poly_marshal(out, &pub->ph);
2188 }
2189 
2191  const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
2192  struct public_key *pub = public_key_from_external(out);
2193  if (!poly_unmarshal(&pub->ph, in)) {
2194  return 0;
2195  }
2196  OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
2197  return 1;
2198 }
ptr
char * ptr
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:45
_gevent_test_main.result
result
Definition: _gevent_test_main.py:96
poly3_span::a
crypto_word_t * a
Definition: hrss.c:596
bn.h
malloc_align32
static void * malloc_align32(void **out_ptr, size_t size)
Definition: hrss.c:1922
hrss.h
poly3
Definition: third_party/boringssl-with-bazel/src/crypto/hrss/internal.h:35
public_key
Definition: hrss.c:1881
gen_build_yaml.out
dictionary out
Definition: src/benchmark/gen_build_yaml.py:24
RAND_bytes
#define RAND_bytes
Definition: boringssl_prefix_symbols.h:2060
HRSS_SAMPLE_BYTES
#define HRSS_SAMPLE_BYTES
Definition: hrss.h:42
private_key::hmac_key
uint8_t hmac_key[32]
Definition: hrss.c:1888
word_reverse
static crypto_word_t word_reverse(crypto_word_t in)
Definition: hrss.c:317
poly_unmarshal
static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES])
Definition: hrss.c:1646
POLY_BYTES
#define POLY_BYTES
Definition: hrss.c:1607
memset
return memset(p, 0, total)
poly_invert
static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out, const struct poly *in)
Definition: hrss.c:1580
poly3_zero
static void poly3_zero(struct poly3 *p)
Definition: hrss.c:489
poly3_reverse_700
static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in)
Definition: hrss.c:496
uint16_t
unsigned short uint16_t
Definition: stdint-msvc2008.h:79
poly3_mul_const
static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, crypto_word_t ma)
Definition: hrss.c:528
CONSTTIME_TRUE_W
#define CONSTTIME_TRUE_W
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:276
OPENSSL_UNUSED
#define OPENSSL_UNUSED
Definition: base.h:253
y
const double y
Definition: bloaty/third_party/googletest/googlemock/test/gmock-matchers_test.cc:3611
SHA256_CBLOCK
#define SHA256_CBLOCK
Definition: sha.h:152
poly_mul_x_minus_1
static void poly_mul_x_minus_1(struct poly *p)
Definition: hrss.c:1339
grpc::testing::sum
double sum(const T &container, F functor)
Definition: test/cpp/qps/stats.h:30
ext
void * ext
Definition: x509v3.h:87
private_key::ph_inverse
struct poly ph_inverse
Definition: hrss.c:1887
OPENSSL_ARRAY_SIZE
#define OPENSSL_ARRAY_SIZE(array)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:179
poly_Rq_mul
#define poly_Rq_mul
Definition: boringssl_prefix_symbols.h:3354
align_pointer
static void * align_pointer(void *ptr, size_t alignment)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:230
printf
_Use_decl_annotations_ int __cdecl printf(const char *_Format,...)
Definition: cs_driver.c:91
HRSS_poly3_invert
void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in)
Definition: hrss.c:844
mod3
static uint16_t mod3(int16_t a)
Definition: hrss.c:1394
ciphertext
const char * ciphertext
Definition: protobuf/src/google/protobuf/stubs/strutil_unittest.cc:86
poly_mul_novec_aux
static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, const uint16_t *a, const uint16_t *b, size_t n)
Definition: hrss.c:1252
u
OPENSSL_EXPORT pem_password_cb void * u
Definition: pem.h:351
internal.h
public_key::ph
struct poly ph
Definition: hrss.c:1882
poly3_top_bits_are_clear
static OPENSSL_UNUSED int poly3_top_bits_are_clear(const struct poly3 *p)
Definition: hrss.c:558
words
std::vector< std::string > words
Definition: bloaty/third_party/protobuf/src/google/protobuf/repeated_field_unittest.cc:1787
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
xds_manager.p
p
Definition: xds_manager.py:60
poly3_span
Definition: hrss.c:594
BITS_IN_LAST_WORD
#define BITS_IN_LAST_WORD
Definition: third_party/boringssl-with-bazel/src/crypto/hrss/internal.h:29
poly
Definition: hrss.c:917
uint8_t
unsigned char uint8_t
Definition: stdint-msvc2008.h:78
BITS_PER_WORD
#define BITS_PER_WORD
Definition: third_party/boringssl-with-bazel/src/crypto/hrss/internal.h:27
private_key::f_inverse
struct poly3 f f_inverse
Definition: hrss.c:1886
OPENSSL_memset
static void * OPENSSL_memset(void *dst, int c, size_t n)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:835
kSharedKey
static const char kSharedKey[]
Definition: hrss.c:1989
constant_time_is_zero_w
static crypto_word_t constant_time_is_zero_w(crypto_word_t a)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:372
HRSS_poly3_mul
void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, const struct poly3 *y)
Definition: hrss.c:710
VECS_PER_POLY
#define VECS_PER_POLY
Definition: hrss.c:909
HRSS_CIPHERTEXT_BYTES
#define HRSS_CIPHERTEXT_BYTES
Definition: hrss.h:52
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
OPENSSL_malloc
#define OPENSSL_malloc
Definition: boringssl_prefix_symbols.h:1885
POLY_MUL_SCRATCH::u
union POLY_MUL_SCRATCH::@356 u
for
for(map_begin_internal(intern, &it);!map_done(&it);map_next(&it))
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/map.c:207
poly2_lshift1
static void poly2_lshift1(struct poly2 *p)
Definition: hrss.c:395
memcpy
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))
in
const char * in
Definition: third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc:391
c
void c(T a)
Definition: miscompile_with_no_unique_address_test.cc:40
autogen_x86imm.f
f
Definition: autogen_x86imm.py:9
int16_t
signed short int16_t
Definition: stdint-msvc2008.h:76
OPENSSL_ia32cap_P
#define OPENSSL_ia32cap_P
Definition: boringssl_prefix_symbols.h:1874
a2
T::first_type a2
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:307
HRSS_generate_key
int HRSS_generate_key(struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, const uint8_t in[HRSS_SAMPLE_BYTES+HRSS_SAMPLE_BYTES+32])
Definition: hrss.c:1933
HRSS_public_key
Definition: hrss.h:35
HRSS_encap
int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32], const struct HRSS_public_key *in_pub, const uint8_t in[HRSS_SAMPLE_BYTES+HRSS_SAMPLE_BYTES])
Definition: hrss.c:1991
poly3_word_add
static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2)
Definition: hrss.c:510
poly2_zero
static void poly2_zero(struct poly2 *p)
Definition: hrss.c:312
poly3_span_sub
static void poly3_span_sub(const struct poly3_span *a, const struct poly3_span *b, size_t n)
Definition: hrss.c:610
swap
#define swap(a, b)
Definition: qsort.h:111
HRSS_PUBLIC_KEY_BYTES
#define HRSS_PUBLIC_KEY_BYTES
Definition: hrss.h:50
POLY_MUL_SCRATCH::novec
struct POLY_MUL_SCRATCH::@356::@357 novec
poly_invert_mod2
static void poly_invert_mod2(struct poly *out, const struct poly *in)
Definition: hrss.c:1532
constant_time_select_int
static int constant_time_select_int(crypto_word_t mask, int a, int b)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:441
HRSS_POLY3_BYTES
#define HRSS_POLY3_BYTES
Definition: hrss.h:57
poly_print
static OPENSSL_UNUSED void poly_print(const struct poly *p)
Definition: hrss.c:932
poly_lift
static void poly_lift(struct poly *out, const struct poly *a)
Definition: hrss.c:1767
setup.v
v
Definition: third_party/bloaty/third_party/capstone/bindings/python/setup.py:42
absl::str_format_internal::Flags::kZero
@ kZero
poly_mod_phiN
static void poly_mod_phiN(struct poly *p)
Definition: hrss.c:1351
HRSS_KEY_BYTES
#define HRSS_KEY_BYTES
Definition: hrss.h:54
sha.h
poly_from_poly2
static void poly_from_poly2(struct poly *out, const struct poly2 *in)
Definition: hrss.c:1484
hexdump
static OPENSSL_UNUSED void hexdump(const void *void_in, size_t len)
Definition: hrss.c:304
poly_marshal_mod3
static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES], const struct poly *in)
Definition: hrss.c:1705
OPENSSL_memcpy
static void * OPENSSL_memcpy(void *dst, const void *src, size_t n)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:819
poly3_fmsub
static void poly3_fmsub(struct poly3 *RESTRICT out, const struct poly3 *RESTRICT in, crypto_word_t ms, crypto_word_t ma)
Definition: hrss.c:539
poly3_rshift1
static void poly3_rshift1(struct poly3 *p)
Definition: hrss.c:588
poly_short_sample
static void poly_short_sample(struct poly *out, const uint8_t in[HRSS_SAMPLE_BYTES])
Definition: hrss.c:1731
lsb_to_all
static crypto_word_t lsb_to_all(crypto_word_t v)
Definition: hrss.c:347
private_key_from_external
static struct private_key * private_key_from_external(struct HRSS_private_key *ext)
Definition: hrss.c:1910
x
int x
Definition: bloaty/third_party/googletest/googlemock/test/gmock-matchers_test.cc:3610
poly2
Definition: third_party/boringssl-with-bazel/src/crypto/hrss/internal.h:31
poly3_span::s
crypto_word_t * s
Definition: hrss.c:595
bm_speedup.scale
def scale(a, mul)
Definition: bm_speedup.py:24
a1
T::first_type a1
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:305
WORDS_PER_POLY
#define WORDS_PER_POLY
Definition: third_party/boringssl-with-bazel/src/crypto/hrss/internal.h:28
poly2_top_bits_are_clear
static int poly2_top_bits_are_clear(const struct poly2 *p)
Definition: hrss.c:424
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
g
struct @717 g
poly2_mod_phiN
static void poly2_mod_phiN(struct poly2 *p)
Definition: hrss.c:350
UINT64_C
#define UINT64_C(val)
Definition: stdint-msvc2008.h:238
HRSS_marshal_public_key
void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub)
Definition: hrss.c:2183
poly_short_sample_plus
static void poly_short_sample_plus(struct poly *out, const uint8_t in[HRSS_SAMPLE_BYTES])
Definition: hrss.c:1746
n
int n
Definition: abseil-cpp/absl/container/btree_test.cc:1080
poly_marshal
static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in)
Definition: hrss.c:1610
poly3_word_sub
static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2)
Definition: hrss.c:519
RESTRICT
#define RESTRICT
Definition: hrss.c:31
poly3_cswap
static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap)
Definition: hrss.c:578
HRSS_parse_public_key
int HRSS_parse_public_key(struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES])
Definition: hrss.c:2190
poly3_span_add
static void poly3_span_add(const struct poly3_span *out, const struct poly3_span *a, const struct poly3_span *b, size_t n)
Definition: hrss.c:601
poly3_word_mul
static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2)
Definition: hrss.c:502
poly::v
uint16_t v[N+3]
Definition: hrss.c:928
sha256_state_st
Definition: sha.h:189
poly3_from_poly_checked
static crypto_word_t poly3_from_poly_checked(struct poly3 *out, const struct poly *in)
Definition: hrss.c:1441
rand.h
scratch
static char scratch[256]
Definition: test-random.c:27
N
#define N
Definition: sync_test.cc:37
poly3_mul_aux
static void poly3_mul_aux(const struct poly3_span *out, const struct poly3_span *scratch, const struct poly3_span *a, const struct poly3_span *b, size_t n)
Definition: hrss.c:622
poly2_fmadd
static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, crypto_word_t m)
Definition: hrss.c:387
Q
#define Q
Definition: hrss.c:904
SHA256_Init
#define SHA256_Init
Definition: boringssl_prefix_symbols.h:2156
absl::str_format_internal::LengthMod::t
@ t
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
private_key
Definition: hrss.c:1885
SHA256_DIGEST_LENGTH
#define SHA256_DIGEST_LENGTH
Definition: sha.h:155
fix_build_deps.r
r
Definition: fix_build_deps.py:491
poly3_mod_phiN
static void poly3_mod_phiN(struct poly3 *p)
Definition: hrss.c:563
public_key_from_external
static struct public_key * public_key_from_external(struct HRSS_public_key *ext)
Definition: hrss.c:1898
ok
bool ok
Definition: async_end2end_test.cc:197
poly2_cswap
static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap)
Definition: hrss.c:377
POLY_MUL_SCRATCH
Definition: hrss.c:948
POLY_MUL_SCRATCH::prod
uint16_t prod[2 *N]
Definition: hrss.c:952
cpu.h
poly_mul_novec
static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, const struct poly *x, const struct poly *y)
Definition: hrss.c:1305
POLY_MUL_SCRATCH::scratch
uint16_t scratch[1318]
Definition: hrss.c:953
mem.h
constant_time_select_8
static uint8_t constant_time_select_8(uint8_t mask, uint8_t a, uint8_t b)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:434
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
autogen_x86imm.tmp
tmp
Definition: autogen_x86imm.py:12
poly3_lshift1
static void poly3_lshift1(struct poly3 *p)
Definition: hrss.c:583
constant_time_eq_w
static crypto_word_t constant_time_eq_w(crypto_word_t a, crypto_word_t b)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:394
size
voidpf void uLong size
Definition: bloaty/third_party/zlib/contrib/minizip/ioapi.h:136
regress.m
m
Definition: regress/regress.py:25
HRSS_private_key
Definition: hrss.h:31
int32_t
signed int int32_t
Definition: stdint-msvc2008.h:77
mod3_from_modQ
static uint16_t mod3_from_modQ(uint16_t v)
Definition: hrss.c:1697
SHA256_Final
#define SHA256_Final
Definition: boringssl_prefix_symbols.h:2155
OPENSSL_free
#define OPENSSL_free
Definition: boringssl_prefix_symbols.h:1869
HRSS_decap
int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], const struct HRSS_private_key *in_priv, const uint8_t *ciphertext, size_t ciphertext_len)
Definition: hrss.c:2042
poly2_reverse_700
static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in)
Definition: hrss.c:362
CRYPTO_memcmp
#define CRYPTO_memcmp
Definition: boringssl_prefix_symbols.h:1178
poly2_from_poly
static void poly2_from_poly(struct poly2 *out, const struct poly *in)
Definition: hrss.c:1371
hmac.h
final_bit_to_all
static crypto_word_t final_bit_to_all(crypto_word_t v)
Definition: hrss.c:552
poly3_print
static OPENSSL_UNUSED void poly3_print(const struct poly3 *in)
Definition: hrss.c:466
OPENSSL_STATIC_ASSERT
#define OPENSSL_STATIC_ASSERT(cond, msg)
Definition: type_check.h:75
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
poly_mul
static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r, const struct poly *a, const struct poly *b)
Definition: hrss.c:1317
poly2_rshift1
static void poly2_rshift1(struct poly2 *p)
Definition: hrss.c:406
poly_from_poly3
static void poly_from_poly3(struct poly *out, const struct poly3 *in)
Definition: hrss.c:1502
poly2_clear_top_bits
static void poly2_clear_top_bits(struct poly2 *p)
Definition: hrss.c:418
poly_clamp
static void poly_clamp(struct poly *p)
Definition: hrss.c:1360
poly3_from_poly
static void poly3_from_poly(struct poly3 *out, const struct poly *in)
Definition: hrss.c:1403
SHA256_Update
#define SHA256_Update
Definition: boringssl_prefix_symbols.h:2159


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:02