aes_nohw.c
/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <openssl/aes.h>

#include <assert.h>
#include <string.h>

#include "../../internal.h"

#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf


// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper
// functions |aes_nohw_and|, etc., should be used instead. Note
// |aes_nohw_shift_left| and |aes_nohw_shift_right| measure the shift in
// logical bytes. That is, the shift value ranges from 0 to 15 independent of
// |aes_word_t| and |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.

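// As an illustrative sketch (not part of the library), here is how a shift by
// one logical byte behaves in the 32-bit configuration, where a logical byte
// is AES_NOHW_BATCH_SIZE = 2 bits wide:
//
//   uint32_t w = 0x000000c3;    // logical bytes 0..3 hold 3, 0, 0, 3
//   uint32_t s = w << (1 * 2);  // aes_nohw_shift_left(w, 1)
//   // s == 0x0000030c: every logical byte moved up one position.
//
// The same shift count would move a full 8-bit lane in the SSE2 build; the
// count is measured in logical bytes either way.
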
#if defined(OPENSSL_SSE2)
typedef __m128i aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work
// in MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE 16
#define AES_NOHW_BATCH_SIZE 8
#define AES_NOHW_ROW0_MASK \
  _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff)
#define AES_NOHW_ROW1_MASK \
  _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00)
#define AES_NOHW_ROW2_MASK \
  _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000)
#define AES_NOHW_ROW3_MASK \
  _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000)
#define AES_NOHW_COL01_MASK \
  _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff)
#define AES_NOHW_COL2_MASK \
  _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000)
#define AES_NOHW_COL3_MASK \
  _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000)

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return _mm_and_si128(a, b);
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return _mm_or_si128(a, b);
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return _mm_xor_si128(a, b);
}

static inline aes_word_t aes_nohw_not(aes_word_t a) {
  return _mm_xor_si128(
      a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff));
}

// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \
  _mm_slli_si128((a), (i))
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \
  _mm_srli_si128((a), (i))
#else  // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff)
#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000)
#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000)
#else  // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#define AES_NOHW_COL01_MASK 0x0000ffff
#define AES_NOHW_COL2_MASK 0x00ff0000
#define AES_NOHW_COL3_MASK 0xff000000
#endif  // OPENSSL_64_BIT

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return a & b;
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return a | b;
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return a ^ b;
}

static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }

static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
  return a << (i * AES_NOHW_BATCH_SIZE);
}

static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
  return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif  // OPENSSL_SSE2

OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
                      "batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
                      "AES_NOHW_WORD_SIZE is incorrect");


// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit
// words containing bitsliced blocks a, b, c, d, this would be as follows
// (vertical bars divide logical bytes):
//
//   batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//   ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is a no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
//   block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing
// bits in corresponding logical bytes. Continuing the 64-bit example:
//
//   block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
//   block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ...
//   block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ...
//   block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ...
//
//   batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|,
// immediately before or after |aes_nohw_transpose|.

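// For illustration only (not part of the build), a compacted block
// round-trips losslessly through the helpers defined below:
//
//   uint8_t in[16] = {0, 1, 2, /* ... */ 15}, out[16];
//   aes_word_t compact[AES_NOHW_BLOCK_WORDS];
//   aes_nohw_compact_block(compact, in);
//   aes_nohw_uncompact_block(out, compact);
//   // Now memcmp(in, out, 16) == 0.
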
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))

// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
  aes_word_t w[8];
} AES_NOHW_BATCH;

// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as
// |AES_NOHW_BATCH_SIZE| |AES_KEY|s so it should not be used as a long-term
// key representation.
typedef struct {
  // keys is an array of batches, one for each round key. Each batch stores
  // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
  AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;

// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  // Note the words are interleaved. The order comes from |aes_nohw_transpose|.
  // If |i| is zero and this is the 64-bit implementation, in[0] contains bits
  // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
  // w[4] so that bits 0 and 4 are in the correct position. (In general, bits
  // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
  // will be correctly placed.)
  assert(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  batch->w[i] = in[0];
#elif defined(OPENSSL_64_BIT)
  batch->w[i] = in[0];
  batch->w[i + 4] = in[1];
#else
  batch->w[i] = in[0];
  batch->w[i + 2] = in[1];
  batch->w[i + 4] = in[2];
  batch->w[i + 6] = in[3];
#endif
}

// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                      size_t i) {
  assert(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_SSE2)
  out[0] = batch->w[i];
#elif defined(OPENSSL_64_BIT)
  out[0] = batch->w[i];
  out[1] = batch->w[i + 4];
#else
  out[0] = batch->w[i];
  out[1] = batch->w[i + 2];
  out[2] = batch->w[i + 4];
  out[3] = batch->w[i + 6];
#endif
}

#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
                                             aes_word_t shift) {
  // See
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
  aes_word_t b = (a ^ (a >> shift)) & mask;
  return a ^ b ^ (b << shift);
}
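
// Worked example (illustrative): with mask = 0x0c and shift = 2, the delta
// swap exchanges the two-bit fields at positions 2-3 and 4-5. For
// a = 0b00110100:
//
//   b = (a ^ (a >> 2)) & 0x0c = 0b00001000
//   a ^ b ^ (b << 2)          = 0b00011100
//
// The field at bits 2-3 and the field at bits 4-5 have traded places.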

// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
  // quartets of those chunks:
  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
  //   0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
  //   0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  return a;
}

static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  return a;
}
#else  // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
  //   0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
  // Note:  0x00cc      = 0b0000_0000_1100_1100
  //        0x00cc << 6 = 0b0011_0011_0000_0000
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  // Now we swap groups of four bits (still numbering by pairs):
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
  //   0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  return a;
}

static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
  // Reverse the steps of |aes_nohw_compact_word|.
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  return a;
}

static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
                                                uint8_t a2, uint8_t a3) {
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
         ((uint32_t)a3 << 24);
}
#endif  // OPENSSL_64_BIT
#endif  // !OPENSSL_SSE2

static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                          const uint8_t in[16]) {
  memcpy(out, in, 16);
#if defined(OPENSSL_SSE2)
  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = aes_nohw_compact_word(out[0]);
  uint64_t a1 = aes_nohw_compact_word(out[1]);
  out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
  out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
  uint32_t a0 = aes_nohw_compact_word(out[0]);
  uint32_t a1 = aes_nohw_compact_word(out[1]);
  uint32_t a2 = aes_nohw_compact_word(out[2]);
  uint32_t a3 = aes_nohw_compact_word(out[3]);
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3);
  out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
  out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
  out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
#endif
}

static inline void aes_nohw_uncompact_block(
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_SSE2)
  memcpy(out, in, 16);  // No conversions needed.
#elif defined(OPENSSL_64_BIT)
  uint64_t a0 = in[0];
  uint64_t a1 = in[1];
  uint64_t b0 =
      aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
  uint64_t b1 =
      aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
  memcpy(out, &b0, 8);
  memcpy(out + 8, &b1, 8);
#else
  uint32_t a0 = in[0];
  uint32_t a1 = in[1];
  uint32_t a2 = in[2];
  uint32_t a3 = in[3];
  // Note clang, when building for ARM Thumb2, will sometimes miscompile
  // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
  // without optimizations. This bug was introduced in
  // https://reviews.llvm.org/rL340261 and fixed in
  // https://reviews.llvm.org/rL351310. The following is written to avoid this.
  uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3);
  uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8);
  uint32_t b2 =
      aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16);
  uint32_t b3 =
      aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24);
  b0 = aes_nohw_uncompact_word(b0);
  b1 = aes_nohw_uncompact_word(b1);
  b2 = aes_nohw_uncompact_word(b2);
  b3 = aes_nohw_uncompact_word(b3);
  memcpy(out, &b0, 4);
  memcpy(out + 4, &b1, 4);
  memcpy(out + 8, &b2, 4);
  memcpy(out + 12, &b3, 4);
#endif
}

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but
// it is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
                           /* uint32_t */ mask, /* const */ shift)        \
  do {                                                                    \
    __m128i swap =                                                        \
        _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \
                      _mm_set_epi32((mask), (mask), (mask), (mask)));     \
    *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift)));            \
    *(b) = _mm_xor_si128(*(b), swap);                                     \
  } while (0)
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
                                      uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
  aes_word_t mask_w = mask;
#endif
  // This is a variation on a delta swap.
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
  *a ^= swap << shift;
  *b ^= swap;
}
#endif  // OPENSSL_SSE2

// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE
// squares and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
  // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
  aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);

#if AES_NOHW_BATCH_SIZE >= 4
  // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
  aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif

#if AES_NOHW_BATCH_SIZE >= 8
  // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
  aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
  aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}

// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
                              size_t num_blocks) {
  // Don't leave unused blocks uninitialized.
  memset(out, 0, sizeof(AES_NOHW_BATCH));
  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_compact_block(block, in + 16 * i);
    aes_nohw_batch_set(out, block, i);
  }

  aes_nohw_transpose(out);
}

// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to
// |out|. |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
                                const AES_NOHW_BATCH *batch) {
  AES_NOHW_BATCH copy = *batch;
  aes_nohw_transpose(&copy);

  assert(num_blocks <= AES_NOHW_BATCH_SIZE);
  for (size_t i = 0; i < num_blocks; i++) {
    aes_word_t block[AES_NOHW_BLOCK_WORDS];
    aes_nohw_batch_get(&copy, block, i);
    aes_nohw_uncompact_block(out + 16 * i, block);
  }
}


// AES round steps.

static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
                                   const AES_NOHW_BATCH *key) {
  for (size_t i = 0; i < 8; i++) {
    batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
  }
}

static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/191.pdf, Appendix C.
  aes_word_t x0 = batch->w[7];
  aes_word_t x1 = batch->w[6];
  aes_word_t x2 = batch->w[5];
  aes_word_t x3 = batch->w[4];
  aes_word_t x4 = batch->w[3];
  aes_word_t x5 = batch->w[2];
  aes_word_t x6 = batch->w[1];
  aes_word_t x7 = batch->w[0];

  // Figure 2, the top linear transformation.
  aes_word_t y14 = aes_nohw_xor(x3, x5);
  aes_word_t y13 = aes_nohw_xor(x0, x6);
  aes_word_t y9 = aes_nohw_xor(x0, x3);
  aes_word_t y8 = aes_nohw_xor(x0, x5);
  aes_word_t t0 = aes_nohw_xor(x1, x2);
  aes_word_t y1 = aes_nohw_xor(t0, x7);
  aes_word_t y4 = aes_nohw_xor(y1, x3);
  aes_word_t y12 = aes_nohw_xor(y13, y14);
  aes_word_t y2 = aes_nohw_xor(y1, x0);
  aes_word_t y5 = aes_nohw_xor(y1, x6);
  aes_word_t y3 = aes_nohw_xor(y5, y8);
  aes_word_t t1 = aes_nohw_xor(x4, y12);
  aes_word_t y15 = aes_nohw_xor(t1, x5);
  aes_word_t y20 = aes_nohw_xor(t1, x1);
  aes_word_t y6 = aes_nohw_xor(y15, x7);
  aes_word_t y10 = aes_nohw_xor(y15, t0);
  aes_word_t y11 = aes_nohw_xor(y20, y9);
  aes_word_t y7 = aes_nohw_xor(x7, y11);
  aes_word_t y17 = aes_nohw_xor(y10, y11);
  aes_word_t y19 = aes_nohw_xor(y10, y8);
  aes_word_t y16 = aes_nohw_xor(t0, y11);
  aes_word_t y21 = aes_nohw_xor(y13, y16);
  aes_word_t y18 = aes_nohw_xor(x0, y16);

  // Figure 3, the middle non-linear section.
  aes_word_t t2 = aes_nohw_and(y12, y15);
  aes_word_t t3 = aes_nohw_and(y3, y6);
  aes_word_t t4 = aes_nohw_xor(t3, t2);
  aes_word_t t5 = aes_nohw_and(y4, x7);
  aes_word_t t6 = aes_nohw_xor(t5, t2);
  aes_word_t t7 = aes_nohw_and(y13, y16);
  aes_word_t t8 = aes_nohw_and(y5, y1);
  aes_word_t t9 = aes_nohw_xor(t8, t7);
  aes_word_t t10 = aes_nohw_and(y2, y7);
  aes_word_t t11 = aes_nohw_xor(t10, t7);
  aes_word_t t12 = aes_nohw_and(y9, y11);
  aes_word_t t13 = aes_nohw_and(y14, y17);
  aes_word_t t14 = aes_nohw_xor(t13, t12);
  aes_word_t t15 = aes_nohw_and(y8, y10);
  aes_word_t t16 = aes_nohw_xor(t15, t12);
  aes_word_t t17 = aes_nohw_xor(t4, t14);
  aes_word_t t18 = aes_nohw_xor(t6, t16);
  aes_word_t t19 = aes_nohw_xor(t9, t14);
  aes_word_t t20 = aes_nohw_xor(t11, t16);
  aes_word_t t21 = aes_nohw_xor(t17, y20);
  aes_word_t t22 = aes_nohw_xor(t18, y19);
  aes_word_t t23 = aes_nohw_xor(t19, y21);
  aes_word_t t24 = aes_nohw_xor(t20, y18);
  aes_word_t t25 = aes_nohw_xor(t21, t22);
  aes_word_t t26 = aes_nohw_and(t21, t23);
  aes_word_t t27 = aes_nohw_xor(t24, t26);
  aes_word_t t28 = aes_nohw_and(t25, t27);
  aes_word_t t29 = aes_nohw_xor(t28, t22);
  aes_word_t t30 = aes_nohw_xor(t23, t24);
  aes_word_t t31 = aes_nohw_xor(t22, t26);
  aes_word_t t32 = aes_nohw_and(t31, t30);
  aes_word_t t33 = aes_nohw_xor(t32, t24);
  aes_word_t t34 = aes_nohw_xor(t23, t33);
  aes_word_t t35 = aes_nohw_xor(t27, t33);
  aes_word_t t36 = aes_nohw_and(t24, t35);
  aes_word_t t37 = aes_nohw_xor(t36, t34);
  aes_word_t t38 = aes_nohw_xor(t27, t36);
  aes_word_t t39 = aes_nohw_and(t29, t38);
  aes_word_t t40 = aes_nohw_xor(t25, t39);
  aes_word_t t41 = aes_nohw_xor(t40, t37);
  aes_word_t t42 = aes_nohw_xor(t29, t33);
  aes_word_t t43 = aes_nohw_xor(t29, t40);
  aes_word_t t44 = aes_nohw_xor(t33, t37);
  aes_word_t t45 = aes_nohw_xor(t42, t41);
  aes_word_t z0 = aes_nohw_and(t44, y15);
  aes_word_t z1 = aes_nohw_and(t37, y6);
  aes_word_t z2 = aes_nohw_and(t33, x7);
  aes_word_t z3 = aes_nohw_and(t43, y16);
  aes_word_t z4 = aes_nohw_and(t40, y1);
  aes_word_t z5 = aes_nohw_and(t29, y7);
  aes_word_t z6 = aes_nohw_and(t42, y11);
  aes_word_t z7 = aes_nohw_and(t45, y17);
  aes_word_t z8 = aes_nohw_and(t41, y10);
  aes_word_t z9 = aes_nohw_and(t44, y12);
  aes_word_t z10 = aes_nohw_and(t37, y3);
  aes_word_t z11 = aes_nohw_and(t33, y4);
  aes_word_t z12 = aes_nohw_and(t43, y13);
  aes_word_t z13 = aes_nohw_and(t40, y5);
  aes_word_t z14 = aes_nohw_and(t29, y2);
  aes_word_t z15 = aes_nohw_and(t42, y9);
  aes_word_t z16 = aes_nohw_and(t45, y14);
  aes_word_t z17 = aes_nohw_and(t41, y8);

  // Figure 4, the bottom linear transformation.
  aes_word_t t46 = aes_nohw_xor(z15, z16);
  aes_word_t t47 = aes_nohw_xor(z10, z11);
  aes_word_t t48 = aes_nohw_xor(z5, z13);
  aes_word_t t49 = aes_nohw_xor(z9, z10);
  aes_word_t t50 = aes_nohw_xor(z2, z12);
  aes_word_t t51 = aes_nohw_xor(z2, z5);
  aes_word_t t52 = aes_nohw_xor(z7, z8);
  aes_word_t t53 = aes_nohw_xor(z0, z3);
  aes_word_t t54 = aes_nohw_xor(z6, z7);
  aes_word_t t55 = aes_nohw_xor(z16, z17);
  aes_word_t t56 = aes_nohw_xor(z12, t48);
  aes_word_t t57 = aes_nohw_xor(t50, t53);
  aes_word_t t58 = aes_nohw_xor(z4, t46);
  aes_word_t t59 = aes_nohw_xor(z3, t54);
  aes_word_t t60 = aes_nohw_xor(t46, t57);
  aes_word_t t61 = aes_nohw_xor(z14, t57);
  aes_word_t t62 = aes_nohw_xor(t52, t58);
  aes_word_t t63 = aes_nohw_xor(t49, t58);
  aes_word_t t64 = aes_nohw_xor(z4, t59);
  aes_word_t t65 = aes_nohw_xor(t61, t62);
  aes_word_t t66 = aes_nohw_xor(z1, t63);
  aes_word_t s0 = aes_nohw_xor(t59, t63);
  aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
  aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
  aes_word_t t67 = aes_nohw_xor(t64, t65);
  aes_word_t s3 = aes_nohw_xor(t53, t66);
  aes_word_t s4 = aes_nohw_xor(t51, t66);
  aes_word_t s5 = aes_nohw_xor(t47, t65);
  aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
  aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));

  batch->w[0] = s7;
  batch->w[1] = s6;
  batch->w[2] = s5;
  batch->w[3] = s4;
  batch->w[4] = s3;
  batch->w[5] = s2;
  batch->w[6] = s1;
  batch->w[7] = s0;
}

// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the
// AES S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) {
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  // Apply the circulant [0 0 1 0 0 1 0 1]. This is the inverse of the
  // circulant [1 0 0 0 1 1 1 1].
  aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7));
  aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0));
  aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1));
  aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2));
  aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3));
  aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4));
  aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5));
  aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6));

  // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant,
  // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to
  // 0x63.)
  batch->w[0] = aes_nohw_not(b0);
  batch->w[1] = b1;
  batch->w[2] = aes_nohw_not(b2);
  batch->w[3] = b3;
  batch->w[4] = b4;
  batch->w[5] = b5;
  batch->w[6] = b6;
  batch->w[7] = b7;
}

static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) {
  // We implement the inverse S-box using the forwards implementation with the
  // technique described in https://www.bearssl.org/constanttime.html#aes.
  //
  // The forwards S-box inverts its input and applies an affine transformation:
  // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then:
  //
  //   InvS(x) = Inv(InvA(x)) = InvA(S(InvA(x)))
  aes_nohw_sub_bytes_inv_affine(batch);
  aes_nohw_sub_bytes(batch);
  aes_nohw_sub_bytes_inv_affine(batch);
}
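
// Sanity check (illustrative): S(0x00) = 0x63, so InvS(0x63) must be 0x00.
// Tracing the identity above: InvA(0x63) = 0x00 (because Inv(0x00) = 0x00 and
// so A(0x00) = S(0x00) = 0x63), then S(0x00) = 0x63, then InvA(0x63) = 0x00,
// as required.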

// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
  (aes_nohw_or(aes_nohw_shift_right((v), (n) * 4),                    \
               aes_nohw_shift_left((v), 16 - (n) * 4)))

static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t w = batch->w[i];
    aes_word_t row0 = aes_nohw_and(w, AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(w, AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(w, AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(w, AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 1);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 3);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) {
  for (size_t i = 0; i < 8; i++) {
    aes_word_t w = batch->w[i];
    aes_word_t row0 = aes_nohw_and(w, AES_NOHW_ROW0_MASK);
    aes_word_t row1 = aes_nohw_and(w, AES_NOHW_ROW1_MASK);
    aes_word_t row2 = aes_nohw_and(w, AES_NOHW_ROW2_MASK);
    aes_word_t row3 = aes_nohw_and(w, AES_NOHW_ROW3_MASK);
    row1 = aes_nohw_rotate_cols_right(row1, 3);
    row2 = aes_nohw_rotate_cols_right(row2, 2);
    row3 = aes_nohw_rotate_cols_right(row3, 1);
    batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
  }
}

// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
         ((v << 12) & UINT64_C(0xf000f000f000f000));
#else
  return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}
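
// Illustrative 64-bit example: within one 16-bit column group, nibble i holds
// row i. For the group 0x4321 (row0 = 1, row1 = 2, row2 = 3, row3 = 4), the
// expression above yields 0x1432: nibble i of the result holds the old
// nibble i + 1, with old row 0 wrapping around to row 3.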

// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_SSE2)
  return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16));
#elif defined(OPENSSL_64_BIT)
  return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
         ((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
  return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}

static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
  // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
  aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
  aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
  aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
  aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
  aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
  aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
  aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
  aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
  aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
  aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
  aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
  aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
  aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
  aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
  aes_word_t a7_r7 = aes_nohw_xor(a7, r7);

  batch->w[0] =
      aes_nohw_xor(r0, aes_nohw_xor(a7_r7, aes_nohw_rotate_rows_twice(a0_r0)));
  batch->w[1] =
      aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
                   aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
  batch->w[2] =
      aes_nohw_xor(r2, aes_nohw_xor(a1_r1, aes_nohw_rotate_rows_twice(a2_r2)));
  batch->w[3] =
      aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
                   aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
                   aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
  batch->w[5] =
      aes_nohw_xor(r5, aes_nohw_xor(a4_r4, aes_nohw_rotate_rows_twice(a5_r5)));
  batch->w[6] =
      aes_nohw_xor(r6, aes_nohw_xor(a5_r5, aes_nohw_rotate_rows_twice(a6_r6)));
  batch->w[7] =
      aes_nohw_xor(r7, aes_nohw_xor(a6_r6, aes_nohw_rotate_rows_twice(a7_r7)));
}

static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) {
  aes_word_t a0 = batch->w[0];
  aes_word_t a1 = batch->w[1];
  aes_word_t a2 = batch->w[2];
  aes_word_t a3 = batch->w[3];
  aes_word_t a4 = batch->w[4];
  aes_word_t a5 = batch->w[5];
  aes_word_t a6 = batch->w[6];
  aes_word_t a7 = batch->w[7];

  // bsaes-x86_64.pl describes the following decomposition of the inverse
  // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler
  // multiplication.
  //
  //   | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
  //   | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  //   | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
  //   | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
  //
  // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is
  // described by the following bit equations:
  //
  //   b0 = a6
  //   b1 = a6 ^ a7
  //   b2 = a0 ^ a7
  //   b3 = a1 ^ a6
  //   b4 = a2 ^ a6 ^ a7
  //   b5 = a3 ^ a7
  //   b6 = a4
  //   b7 = a5
  //
  // Each coefficient is given by:
  //
  //   b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij
  //
  // We combine the two equations below. Note a_i(j+2) is a row rotation.
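  // As a quick sanity check (illustrative), multiplying 0x01 by 04 should
  // give 0x04: with a0 = 1 and all other bits zero, the equations above set
  // only b2 = a0 = 1, i.e. the result is 0x04.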
  aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0));
  aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1));
  aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2));
  aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3));
  aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4));
  aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5));
  aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6));
  aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7));

  batch->w[0] = aes_nohw_xor(a0, a6_r6);
  batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7));
  batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7));
  batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6));
  batch->w[4] =
      aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7));
  batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7));
  batch->w[6] = aes_nohw_xor(a6, a4_r4);
  batch->w[7] = aes_nohw_xor(a7, a5_r5);

  // Apply the [02 03 01 01] matrix, which is just MixColumns.
  aes_nohw_mix_columns(batch);
}

static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[0]);
  for (size_t i = 1; i < num_rounds; i++) {
    aes_nohw_sub_bytes(batch);
    aes_nohw_shift_rows(batch);
    aes_nohw_mix_columns(batch);
    aes_nohw_add_round_key(batch, &key->keys[i]);
  }
  aes_nohw_sub_bytes(batch);
  aes_nohw_shift_rows(batch);
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}

static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) {
  aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
  aes_nohw_inv_shift_rows(batch);
  aes_nohw_inv_sub_bytes(batch);
  for (size_t i = num_rounds - 1; i > 0; i--) {
    aes_nohw_add_round_key(batch, &key->keys[i]);
    aes_nohw_inv_mix_columns(batch);
    aes_nohw_inv_shift_rows(batch);
    aes_nohw_inv_sub_bytes(batch);
  }
  aes_nohw_add_round_key(batch, &key->keys[0]);
}


// Key schedule.

static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                       const AES_KEY *key) {
  for (size_t i = 0; i <= key->rounds; i++) {
    // Copy the round key into each block in the batch.
    for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
      aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
      memcpy(tmp, key->rd_key + 4 * i, 16);
      aes_nohw_batch_set(&out->keys[i], tmp, j);
    }
    aes_nohw_transpose(&out->keys[i]);
  }
}

static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                          0x20, 0x40, 0x80, 0x1b, 0x36};

// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
  rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
#if defined(OPENSSL_SSE2)
  return _mm_set_epi32(0, 0, 0, rcon);
#else
  return ((aes_word_t)rcon);
#endif
}
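
// For example (illustrative): in the 64-bit build AES_NOHW_BATCH_SIZE is 4,
// so aes_nohw_rcon_slice(0x1b, 0) returns 0xb and aes_nohw_rcon_slice(0x1b, 1)
// returns 0x1, splitting the rcon byte into logical-byte-sized slices that
// line up with a compacted round key word.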

static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
  AES_NOHW_BATCH batch;
  memset(&batch, 0, sizeof(batch));
  aes_nohw_batch_set(&batch, in, 0);
  aes_nohw_transpose(&batch);
  aes_nohw_sub_bytes(&batch);
  aes_nohw_transpose(&batch);
  aes_nohw_batch_get(&batch, out, 0);
}

static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
  key->rounds = 10;

  aes_word_t block[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block, in);
  memcpy(key->rd_key, block, 16);

  for (size_t i = 1; i <= 10; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block);
    uint8_t rcon = aes_nohw_rcon[i - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
      block[j] = aes_nohw_xor(
          block[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words. Note this is reordered from the
      // usual formulation to avoid needing masks.
      aes_word_t v = block[j];
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
      block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * i, block, 16);
  }
}

static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) {
  key->rounds = 12;

  aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS];
  aes_word_t *block1 = storage1, *block2 = storage2;

  // AES-192's key schedule is complex because each key schedule iteration
  // produces six words, but we compute on blocks and each block is four words.
  // We maintain a sliding window of two blocks, filled to 1.5 blocks at a
  // time. We loop below every three blocks or two key schedule iterations.
  //
  // On entry to the loop, |block1| and the first half of |block2| contain the
  // previous key schedule iteration. |block1| has been written to |key|, but
  // |block2| has not as it is incomplete.
  aes_nohw_compact_block(block1, in);
  memcpy(key->rd_key, block1, 16);

  uint8_t half_block[16] = {0};
  memcpy(half_block, in + 16, 8);
  aes_nohw_compact_block(block2, half_block);

  for (size_t i = 0; i < 4; i++) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[2 * i];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Compute the first two words of the next key schedule iteration, which
      // go in the second half of |block2|. The first two words of the previous
      // iteration are in the first half of |block1|. Apply |rcon| here too
      // because the shifts match.
      block2[j] = aes_nohw_or(
          block2[j],
          aes_nohw_shift_left(
              aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8));
      // Incorporate the transformed word and propagate. Note the last word of
      // the previous iteration corresponds to the second word of |copy|. This
      // is incorporated into the first word of the next iteration, or the
      // third word of |block2|.
      block2[j] = aes_nohw_xor(
          block2[j], aes_nohw_and(aes_nohw_shift_left(
                                      aes_nohw_rotate_rows_down(sub[j]), 4),
                                  AES_NOHW_COL2_MASK));
      block2[j] = aes_nohw_xor(
          block2[j],
          aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK));

      // Compute the remaining four words, which fill |block1|. Begin by moving
      // the corresponding words of the previous iteration: the second half of
      // |block1| and the first half of |block2|.
      block1[j] = aes_nohw_shift_right(block1[j], 8);
      block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8));
      // Incorporate the second word, computed previously in |block2|, and
      // propagate.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }

    // This completes two round keys. Note half of |block2| was computed in the
    // previous loop iteration but was not yet output.
    memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16);
    memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16);

    aes_nohw_sub_block(sub, block1);
    rcon = aes_nohw_rcon[2 * i + 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Compute the first four words of the next key schedule iteration in
      // |block2|. Begin by moving the corresponding words of the previous
      // iteration: the second half of |block2| and the first half of |block1|.
      block2[j] = aes_nohw_shift_right(block2[j], 8);
      block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8));
      // Incorporate rcon and the transformed word. Note the last word of the
      // previous iteration corresponds to the last word of |copy|.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j));
      block2[j] = aes_nohw_xor(
          block2[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));

      // Compute the last two words, which go in the first half of |block1|.
      // The last two words of the previous iteration are in the second half of
      // |block1|.
      block1[j] = aes_nohw_shift_right(block1[j], 8);
      // Propagate blocks and mask off the excess.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4));
      block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK);
    }

    // |block2| has a complete round key. |block1| will be completed in the
    // next iteration.
    memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16);

    // Swap blocks to restore the invariant.
    aes_word_t *tmp = block1;
    block1 = block2;
    block2 = tmp;
  }
}

static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
  key->rounds = 14;

  // Each key schedule iteration produces two round keys.
  aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(block1, in);
  memcpy(key->rd_key, block1, 16);

  aes_nohw_compact_block(block2, in + 16);
  memcpy(key->rd_key + 4, block2, 16);

  for (size_t i = 2; i <= 14; i += 2) {
    aes_word_t sub[AES_NOHW_BLOCK_WORDS];
    aes_nohw_sub_block(sub, block2);
    uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate |rcon| and the transformed word into the first word.
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
      block1[j] = aes_nohw_xor(
          block1[j],
          aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
      // Propagate to the remaining words.
      aes_word_t v = block1[j];
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
      block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * i, block1, 16);

    if (i == 14) {
      break;
    }

    aes_nohw_sub_block(sub, block1);
    for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
      // Incorporate the transformed word into the first word.
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
      // Propagate to the remaining words.
      aes_word_t v = block2[j];
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
      block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
    }
    memcpy(key->rd_key + 4 * (i + 1), block2, 16);
  }
}


// External API.

int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) {
  switch (bits) {
    case 128:
      aes_nohw_setup_key_128(aeskey, key);
      return 0;
    case 192:
      aes_nohw_setup_key_192(aeskey, key);
      return 0;
    case 256:
      aes_nohw_setup_key_256(aeskey, key);
      return 0;
  }
  return 1;
}

int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) {
  return aes_nohw_set_encrypt_key(key, bits, aeskey);
}

void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}

void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  AES_NOHW_BATCH batch;
  aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
  aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
  aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}
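
// A minimal usage sketch (illustrative, not part of the library):
//
//   AES_KEY key;
//   uint8_t k[16] = {0}, pt[16] = {0}, ct[16], back[16];
//   if (aes_nohw_set_encrypt_key(k, 128, &key) == 0) {
//     aes_nohw_encrypt(pt, ct, &key);
//     aes_nohw_decrypt(ct, back, &key);  // back == pt
//   }
//
// Note the key setup functions return zero on success and one on unsupported
// key sizes, per the switch above.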

static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
                                      const uint8_t b[16]) {
  for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
    aes_word_t x, y;
    memcpy(&x, a + i, sizeof(aes_word_t));
    memcpy(&y, b + i, sizeof(aes_word_t));
    x = aes_nohw_xor(x, y);
    memcpy(out + i, &x, sizeof(aes_word_t));
  }
}

void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                   size_t blocks, const AES_KEY *key,
                                   const uint8_t ivec[16]) {
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);

  // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
  alignas(AES_NOHW_WORD_SIZE) union {
    uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
    uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
  } ivs, enc_ivs;
  for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
    memcpy(ivs.u8 + 16 * i, ivec, 16);
  }

  uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
  for (;;) {
    // Update counters.
    for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
      ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
    }

    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, ivs.u8, todo);
    aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(enc_ivs.u8, todo, &batch);

    for (size_t i = 0; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs.u8 + 16 * i);
    }

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
    ctr += AES_NOHW_BATCH_SIZE;
  }
}

void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t *ivec, const int enc) {
  assert(len % 16 == 0);
  size_t blocks = len / 16;
  if (blocks == 0) {
    return;
  }

  AES_NOHW_SCHEDULE sched;
  aes_nohw_expand_round_keys(&sched, key);
  alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16];
  memcpy(iv, ivec, 16);

  if (enc) {
    // CBC encryption is not parallelizable.
    while (blocks > 0) {
      aes_nohw_xor_block(iv, iv, in);

      AES_NOHW_BATCH batch;
      aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1);
      aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
      aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);

      memcpy(iv, out, 16);

      in += 16;
      out += 16;
      blocks--;
    }
    memcpy(ivec, iv, 16);
    return;
  }

  for (;;) {
    size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
    // Make a copy of the input so we can decrypt in-place.
    alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16];
    memcpy(copy, in, todo * 16);

    AES_NOHW_BATCH batch;
    aes_nohw_to_batch(&batch, copy, todo);
    aes_nohw_decrypt_batch(&sched, key->rounds, &batch);
    aes_nohw_from_batch(out, todo, &batch);

    aes_nohw_xor_block(out, out, iv);
    for (size_t i = 1; i < todo; i++) {
      aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1));
    }

    // Save the last block as the IV.
    memcpy(iv, copy + 16 * (todo - 1), 16);

    blocks -= todo;
    if (blocks == 0) {
      break;
    }

    in += 16 * AES_NOHW_BATCH_SIZE;
    out += 16 * AES_NOHW_BATCH_SIZE;
  }

  memcpy(ivec, iv, 16);
}