gcm_nohw.c
Go to the documentation of this file.
1 /* Copyright (c) 2019, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 #include <openssl/base.h>
16 
17 #include "../../internal.h"
18 #include "internal.h"
19 
20 #if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
21 #include <emmintrin.h>
22 #endif
23 
24 
25 // This file contains a constant-time implementation of GHASH based on the notes
26 // in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
27 // algorithm described in
28 // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
29 //
30 // Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
31 // primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
32 // the 32-bit implementation, but we can use its intrinsics if necessary.
33 
34 #if defined(BORINGSSL_HAS_UINT128)
35 
36 static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
37  uint64_t b) {
38  // One term every four bits means the largest term is 64/4 = 16, which barely
39  // overflows into the next term. Using one term every five bits would cost 25
40  // multiplications instead of 16. It is faster to mask off the bottom four
41  // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits
42  // separately.
43  uint64_t a0 = a & UINT64_C(0x1111111111111110);
44  uint64_t a1 = a & UINT64_C(0x2222222222222220);
45  uint64_t a2 = a & UINT64_C(0x4444444444444440);
46  uint64_t a3 = a & UINT64_C(0x8888888888888880);
47 
48  uint64_t b0 = b & UINT64_C(0x1111111111111111);
49  uint64_t b1 = b & UINT64_C(0x2222222222222222);
50  uint64_t b2 = b & UINT64_C(0x4444444444444444);
51  uint64_t b3 = b & UINT64_C(0x8888888888888888);
52 
53  uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^
54  (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1);
55  uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^
56  (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2);
57  uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^
58  (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3);
59  uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^
60  (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0);
61 
62  // Multiply the bottom four bits of |a| with |b|.
63  uint64_t a0_mask = UINT64_C(0) - (a & 1);
64  uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1);
65  uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1);
66  uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1);
67  uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^
68  ((uint128_t)(a2_mask & b) << 2) ^
69  ((uint128_t)(a3_mask & b) << 3);
70 
71  *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^
72  (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^
73  (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^
74  (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra);
75  *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^
76  (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^
77  (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^
78  (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^
79  ((uint64_t)(extra >> 64));
80 }
81 
82 #elif defined(OPENSSL_SSE2)
83 
84 static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
85  // One term every four bits means the largest term is 32/4 = 8, which does not
86  // overflow into the next term.
87  __m128i aa = _mm_setr_epi32(a, 0, a, 0);
88  __m128i bb = _mm_setr_epi32(b, 0, b, 0);
89 
90  __m128i a0a0 =
91  _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
92  __m128i a2a2 =
93  _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
94  __m128i b0b1 =
95  _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
96  __m128i b2b3 =
97  _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));
98 
99  __m128i c0c1 =
100  _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
101  __m128i c2c3 =
102  _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));
103 
104  __m128i a1a1 =
105  _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
106  __m128i a3a3 =
107  _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
108  __m128i b3b0 =
109  _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
110  __m128i b1b2 =
111  _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));
112 
113  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
114  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
115  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
116  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));
117 
118  c0c1 = _mm_and_si128(
119  c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
120  c2c3 = _mm_and_si128(
121  c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));
122 
123  c0c1 = _mm_xor_si128(c0c1, c2c3);
124  // c0 ^= c1
125  c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
126  return c0c1;
127 }
128 
129 static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
130  uint64_t b) {
131  uint32_t a0 = a & 0xffffffff;
132  uint32_t a1 = a >> 32;
133  uint32_t b0 = b & 0xffffffff;
134  uint32_t b1 = b >> 32;
135  // Karatsuba multiplication.
136  __m128i lo = gcm_mul32_nohw(a0, b0);
137  __m128i hi = gcm_mul32_nohw(a1, b1);
138  __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
139  mid = _mm_xor_si128(mid, lo);
140  mid = _mm_xor_si128(mid, hi);
141  __m128i ret = _mm_unpacklo_epi64(lo, hi);
142  mid = _mm_slli_si128(mid, 4);
143  mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
144  ret = _mm_xor_si128(ret, mid);
145  memcpy(out_lo, &ret, 8);
146  memcpy(out_hi, ((char*)&ret) + 8, 8);
147 }
148 
149 #else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2
150 
152  // One term every four bits means the largest term is 32/4 = 8, which does not
153  // overflow into the next term.
154  uint32_t a0 = a & 0x11111111;
155  uint32_t a1 = a & 0x22222222;
156  uint32_t a2 = a & 0x44444444;
157  uint32_t a3 = a & 0x88888888;
158 
159  uint32_t b0 = b & 0x11111111;
160  uint32_t b1 = b & 0x22222222;
161  uint32_t b2 = b & 0x44444444;
162  uint32_t b3 = b & 0x88888888;
163 
164  uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^
165  (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1);
166  uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^
167  (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2);
168  uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^
169  (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3);
170  uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^
171  (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0);
172 
173  return (c0 & UINT64_C(0x1111111111111111)) |
174  (c1 & UINT64_C(0x2222222222222222)) |
175  (c2 & UINT64_C(0x4444444444444444)) |
176  (c3 & UINT64_C(0x8888888888888888));
177 }
178 
179 static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
180  uint64_t b) {
181  uint32_t a0 = a & 0xffffffff;
182  uint32_t a1 = a >> 32;
183  uint32_t b0 = b & 0xffffffff;
184  uint32_t b1 = b >> 32;
185  // Karatsuba multiplication.
186  uint64_t lo = gcm_mul32_nohw(a0, b0);
187  uint64_t hi = gcm_mul32_nohw(a1, b1);
188  uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
189  *out_lo = lo ^ (mid << 32);
190  *out_hi = hi ^ (mid >> 32);
191 }
192 
193 #endif // BORINGSSL_HAS_UINT128
194 
195 void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
196  // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
197  // avoids a shift by 1 in the multiplication, needed to account for bit
198  // reversal losing a bit after multiplication, that is,
199  // rev128(X) * rev128(Y) = rev255(X*Y).
200  //
201  // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
202  // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
203  //
204  // See also slide 16 of
205  // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
206  Htable[0].lo = Xi[1];
207  Htable[0].hi = Xi[0];
208 
209  uint64_t carry = Htable[0].hi >> 63;
210  carry = 0u - carry;
211 
212  Htable[0].hi <<= 1;
213  Htable[0].hi |= Htable[0].lo >> 63;
214  Htable[0].lo <<= 1;
215 
216  // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
217  // conditionally add 0xc200...0001.
218  Htable[0].lo ^= carry & 1;
219  Htable[0].hi ^= carry & UINT64_C(0xc200000000000000);
220 
221  // This implementation does not use the rest of |Htable|.
222 }
223 
224 static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) {
225  // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
226  // through |r3|. Note there is no byte or bit reversal because we are
227  // evaluating POLYVAL.
228  uint64_t r0, r1;
229  gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo);
230  uint64_t r2, r3;
231  gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi);
232  uint64_t mid0, mid1;
233  gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo);
234  mid0 ^= r0 ^ r2;
235  mid1 ^= r1 ^ r3;
236  r2 ^= mid1;
237  r1 ^= mid0;
238 
239  // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and
240  // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We
241  // have:
242  //
243  // 1 = x^121 + x^126 + x^127 + x^128
244  // x^-128 = x^-7 + x^-2 + x^-1 + 1
245  //
246  // This is the GHASH reduction step, but with bits flowing in reverse.
247 
248  // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
249  // another reduction steps. Instead, we gather the excess bits, incorporate
250  // them into |r0| and |r1| and reduce once. See slides 17-19
251  // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
252  r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);
253 
254  // 1
255  r2 ^= r0;
256  r3 ^= r1;
257 
258  // x^-1
259  r2 ^= r0 >> 1;
260  r2 ^= r1 << 63;
261  r3 ^= r1 >> 1;
262 
263  // x^-2
264  r2 ^= r0 >> 2;
265  r2 ^= r1 << 62;
266  r3 ^= r1 >> 2;
267 
268  // x^-7
269  r2 ^= r0 >> 7;
270  r2 ^= r1 << 57;
271  r3 ^= r1 >> 7;
272 
273  Xi[0] = r2;
274  Xi[1] = r3;
275 }
276 
277 void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16]) {
278  uint64_t swapped[2];
279  swapped[0] = CRYPTO_bswap8(Xi[1]);
280  swapped[1] = CRYPTO_bswap8(Xi[0]);
281  gcm_polyval_nohw(swapped, &Htable[0]);
282  Xi[0] = CRYPTO_bswap8(swapped[1]);
283  Xi[1] = CRYPTO_bswap8(swapped[0]);
284 }
285 
286 void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
287  size_t len) {
288  uint64_t swapped[2];
289  swapped[0] = CRYPTO_bswap8(Xi[1]);
290  swapped[1] = CRYPTO_bswap8(Xi[0]);
291 
292  while (len >= 16) {
293  uint64_t block[2];
294  OPENSSL_memcpy(block, inp, 16);
295  swapped[0] ^= CRYPTO_bswap8(block[1]);
296  swapped[1] ^= CRYPTO_bswap8(block[0]);
297  gcm_polyval_nohw(swapped, &Htable[0]);
298  inp += 16;
299  len -= 16;
300  }
301 
302  Xi[0] = CRYPTO_bswap8(swapped[1]);
303  Xi[1] = CRYPTO_bswap8(swapped[0]);
304 }
regen-readme.inp
inp
Definition: regen-readme.py:11
gcm_mul32_nohw
static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b)
Definition: gcm_nohw.c:151
u128
Definition: third_party/boringssl-with-bazel/src/crypto/fipsmodule/modes/internal.h:115
CRYPTO_bswap8
static uint64_t CRYPTO_bswap8(uint64_t x)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:759
u
OPENSSL_EXPORT pem_password_cb void * u
Definition: pem.h:351
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
uint8_t
unsigned char uint8_t
Definition: stdint-msvc2008.h:78
gcm_ghash_nohw
void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, size_t len)
Definition: gcm_nohw.c:286
base.h
block
Block * block
Definition: protobuf/src/google/protobuf/descriptor.cc:1041
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
memcpy
memcpy(mem, inblock.get(), min(CONTAINING_RECORD(inblock.get(), MEMBLOCK, data) ->size, size))
a2
T::first_type a2
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:307
b2
T::second_type b2
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:308
gcm_init_nohw
void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2])
Definition: gcm_nohw.c:195
uint64_t
unsigned __int64 uint64_t
Definition: stdint-msvc2008.h:90
OPENSSL_memcpy
static void * OPENSSL_memcpy(void *dst, const void *src, size_t n)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:819
a1
T::first_type a1
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:305
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
u128::hi
uint64_t hi
Definition: third_party/boringssl-with-bazel/src/crypto/fipsmodule/modes/internal.h:115
UINT64_C
#define UINT64_C(val)
Definition: stdint-msvc2008.h:238
H
#define H(b, c, d)
Definition: md4.c:114
b1
T::second_type b1
Definition: abseil-cpp/absl/container/internal/hash_function_defaults_test.cc:306
absl::hash_internal::c1
static const uint32_t c1
Definition: abseil-cpp/absl/hash/internal/city.cc:58
internal.h
gcm_gmult_nohw
void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16])
Definition: gcm_nohw.c:277
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
u128::lo
uint64_t lo
Definition: third_party/boringssl-with-bazel/src/crypto/fipsmodule/modes/internal.h:115
gcm_mul64_nohw
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, uint64_t b)
Definition: gcm_nohw.c:179
gcm_polyval_nohw
static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H)
Definition: gcm_nohw.c:224
len
int len
Definition: abseil-cpp/absl/base/internal/low_level_alloc_test.cc:46
absl::hash_internal::c2
static const uint32_t c2
Definition: abseil-cpp/absl/hash/internal/city.cc:59


grpc
Author(s):
autogenerated on Fri May 16 2025 02:58:25