poly1305_vec.c
Go to the documentation of this file.
1 /* Copyright (c) 2014, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 // This implementation of poly1305 is by Andrew Moon
16 // (https://github.com/floodyberry/poly1305-donna) and released as public
17 // domain. It implements SIMD vectorization based on the algorithm described in
18 // http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 64 byte
19 // block size
20 
21 #include <openssl/poly1305.h>
22 
23 #include "../internal.h"
24 
25 
26 #if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64)
27 
28 #include <emmintrin.h>
29 
// load_u32_le reads a 32-bit little-endian value from |in| with no alignment
// requirement. This file is x86-64 only, so the host byte order is already
// little-endian; the memcpy avoids unaligned-access and strict-aliasing UB.
static uint32_t load_u32_le(const uint8_t in[4]) {
  uint32_t v;
  OPENSSL_memcpy(&v, in, sizeof(v));
  return v;
}
35 
// load_u64_le reads a 64-bit little-endian value from |in| with no alignment
// requirement (x86-64 only, so host order is already little-endian).
static uint64_t load_u64_le(const uint8_t in[8]) {
  uint64_t v;
  OPENSSL_memcpy(&v, in, sizeof(v));
  return v;
}
41 
// store_u64_le writes |v| to |out| in little-endian order with no alignment
// requirement (x86-64 only, so host order is already little-endian).
static void store_u64_le(uint8_t out[8], uint64_t v) {
  OPENSSL_memcpy(out, &v, sizeof(v));
}
45 
// Shorthand for the 128-bit SSE2 vector type used throughout this file.
typedef __m128i xmmi;

// Mask selecting the low 26 bits of each 64-bit lane: limbs are kept in base
// 2^26 so per-lane 32x32->64 products (_mm_mul_epu32) cannot overflow.
static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = {
    (1 << 26) - 1, 0, (1 << 26) - 1, 0};
// The constant 5 in both lanes, used when folding carries out of the top
// limb (2^130 == 5 mod p, with p = 2^130 - 5).
static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0};
// 2^24 in both lanes. The fifth 26-bit limb covers bits 104..129 of a block,
// so OR-ing 2^24 into it sets bit 128 — the Poly1305 padding bit appended to
// every full 16-byte block.
static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = {
    (1 << 24), 0, (1 << 24), 0};
53 
54 static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; }
55 
56 static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; }
57 
58 static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) {
59  return (uint128_t)a * b;
60 }
61 
62 static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; }
63 
64 static inline uint64_t shr128(uint128_t v, const int shift) {
65  return (uint64_t)(v >> shift);
66 }
67 
68 static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) {
69  return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift);
70 }
71 
// poly1305_power holds one power of r in SIMD form: the five base-2^26 limbs
// of r^k live in R20..R24 (replicated into both 32-bit multiply lanes by the
// setup code), and S21..S24 cache the corresponding limbs premultiplied by 5
// for the modular reduction. The union lets the same 16 bytes be addressed
// as a vector, as two 64-bit words, or as four 32-bit words — the setup and
// finish code also stashes scalar values in otherwise-unused 32-bit slots.
typedef struct poly1305_power_t {
  union {
    xmmi v;
    uint64_t u[2];
    uint32_t d[4];
  } R20, R21, R22, R23, R24, S21, S22, S23, S24;
} poly1305_power;
79 
// poly1305_state_internal is the real working state, placed at a 64-byte
// boundary inside the caller-supplied poly1305_state buffer. P[0] and P[1]
// hold the SIMD limb forms of r^4 and r^2 respectively; the scalar r and the
// 16-byte pad are kept in the unused odd 32-bit words of P[1]. H/HH is the
// accumulator: five two-lane 26-bit limb vectors during the SIMD phase,
// reused as scalar storage (HH[0..2], 44/44/42-bit limbs) at finish time.
typedef struct poly1305_state_internal_t {
  poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144
                          bytes of free storage */
  union {
    xmmi H[5];  // 80 bytes
    uint64_t HH[10];
  };
  // uint64_t r0,r1,r2; [24 bytes]
  // uint64_t pad0,pad1; [16 bytes]
  uint64_t started;    // 8 bytes: nonzero once the SIMD path has begun
  uint64_t leftover;   // 8 bytes: count of buffered, unprocessed bytes
  uint8_t buffer[64];  // 64 bytes
} poly1305_state_internal; /* 448 bytes total + 63 bytes for
                              alignment = 511 bytes raw */
94 
96  sizeof(struct poly1305_state_internal_t) + 63 <= sizeof(poly1305_state),
97  "poly1305_state isn't large enough to hold aligned poly1305_state_internal_t");
98 
99 static inline poly1305_state_internal *poly1305_aligned_state(
101  return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63);
102 }
103 
// poly1305_min returns the smaller of |a| and |b|.
static inline size_t poly1305_min(size_t a, size_t b) {
  if (a < b) {
    return a;
  }
  return b;
}
107 
109  poly1305_state_internal *st = poly1305_aligned_state(state);
110  poly1305_power *p;
111  uint64_t r0, r1, r2;
112  uint64_t t0, t1;
113 
114  // clamp key
115  t0 = load_u64_le(key + 0);
116  t1 = load_u64_le(key + 8);
117  r0 = t0 & 0xffc0fffffff;
118  t0 >>= 44;
119  t0 |= t1 << 20;
120  r1 = t0 & 0xfffffc0ffff;
121  t1 >>= 24;
122  r2 = t1 & 0x00ffffffc0f;
123 
124  // store r in un-used space of st->P[1]
125  p = &st->P[1];
126  p->R20.d[1] = (uint32_t)(r0);
127  p->R20.d[3] = (uint32_t)(r0 >> 32);
128  p->R21.d[1] = (uint32_t)(r1);
129  p->R21.d[3] = (uint32_t)(r1 >> 32);
130  p->R22.d[1] = (uint32_t)(r2);
131  p->R22.d[3] = (uint32_t)(r2 >> 32);
132 
133  // store pad
134  p->R23.d[1] = load_u32_le(key + 16);
135  p->R23.d[3] = load_u32_le(key + 20);
136  p->R24.d[1] = load_u32_le(key + 24);
137  p->R24.d[3] = load_u32_le(key + 28);
138 
139  // H = 0
140  st->H[0] = _mm_setzero_si128();
141  st->H[1] = _mm_setzero_si128();
142  st->H[2] = _mm_setzero_si128();
143  st->H[3] = _mm_setzero_si128();
144  st->H[4] = _mm_setzero_si128();
145 
146  st->started = 0;
147  st->leftover = 0;
148 }
149 
// poly1305_first_block runs once 32 bytes of input are available. It squares
// the scalar r saved by CRYPTO_poly1305_init to get r^2 (stored in SIMD limb
// form in st->P[1]) and squares again for r^4 (stored in st->P[0]), re-saves
// the scalar r and pad, then loads the first two 16-byte blocks of |m| into
// the accumulator H as five two-lane base-2^26 limbs with the 2^128 padding
// bit set.
static void poly1305_first_block(poly1305_state_internal *st,
                                 const uint8_t *m) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  xmmi T5, T6;
  poly1305_power *p;
  uint128_t d[3];
  uint64_t r0, r1, r2;
  uint64_t r20, r21, r22, s22;
  uint64_t pad0, pad1;
  uint64_t c;
  uint64_t i;

  // pull out stored info: r (44/44/42-bit limbs) and the pad live in the
  // odd 32-bit words of st->P[1], where CRYPTO_poly1305_init put them.
  p = &st->P[1];

  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
  pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
  pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];

  // compute powers r^2,r^4: iteration 0 squares r and writes r^2 into
  // st->P[1]; iteration 1 squares that result and writes r^4 into st->P[0]
  // (via the p-- at the bottom of the loop).
  r20 = r0;
  r21 = r1;
  r22 = r2;
  for (i = 0; i < 2; i++) {
    // 5*4*r22: the limbs are 44/44/42 bits, so the "wrap" factor for the
    // top limb is 5 * 2^2 (2^130 == 5 mod p).
    s22 = r22 * (5 << 2);

    d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22));
    d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21));
    d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20));

    // carry-propagate the 128-bit partial products back to 44/44/42 limbs
    r20 = lo128(d[0]) & 0xfffffffffff;
    c = shr128(d[0], 44);
    d[1] = add128_64(d[1], c);
    r21 = lo128(d[1]) & 0xfffffffffff;
    c = shr128(d[1], 44);
    d[2] = add128_64(d[2], c);
    r22 = lo128(d[2]) & 0x3ffffffffff;
    c = shr128(d[2], 42);
    r20 += c * 5;
    c = (r20 >> 44);
    r20 = r20 & 0xfffffffffff;
    r21 += c;

    // re-split the power into five 26-bit limbs, broadcast each into both
    // SIMD multiply lanes, and precompute the 5*limb values used by the
    // vector reduction.
    p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->R21.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R22.v =
        _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff),
                          _MM_SHUFFLE(1, 0, 1, 0));
    p->R23.v = _mm_shuffle_epi32(
        _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff),
        _MM_SHUFFLE(1, 0, 1, 0));
    p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))),
                                 _MM_SHUFFLE(1, 0, 1, 0));
    p->S21.v = _mm_mul_epu32(p->R21.v, FIVE);
    p->S22.v = _mm_mul_epu32(p->R22.v, FIVE);
    p->S23.v = _mm_mul_epu32(p->R23.v, FIVE);
    p->S24.v = _mm_mul_epu32(p->R24.v, FIVE);
    p--;
  }

  // put saved info back (writing P[1].v above clobbered the stashed words)
  p = &st->P[1];
  p->R20.d[1] = (uint32_t)(r0);
  p->R20.d[3] = (uint32_t)(r0 >> 32);
  p->R21.d[1] = (uint32_t)(r1);
  p->R21.d[3] = (uint32_t)(r1 >> 32);
  p->R22.d[1] = (uint32_t)(r2);
  p->R22.d[3] = (uint32_t)(r2 >> 32);
  p->R23.d[1] = (uint32_t)(pad0);
  p->R23.d[3] = (uint32_t)(pad0 >> 32);
  p->R24.d[1] = (uint32_t)(pad1);
  p->R24.d[3] = (uint32_t)(pad1 >> 32);

  // H = [Mx,My]: split blocks m[0..15] (lane 0) and m[16..31] (lane 1) into
  // five 26-bit limbs per lane and set the 2^128 padding bit via HIBIT.
  T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                          _mm_loadl_epi64((const xmmi *)(m + 16)));
  T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                          _mm_loadl_epi64((const xmmi *)(m + 24)));
  st->H[0] = _mm_and_si128(MMASK, T5);
  st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
  st->H[2] = _mm_and_si128(MMASK, T5);
  st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
  st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
}
242 
// poly1305_blocks absorbs input in 64-byte chunks (four 16-byte Poly1305
// blocks), two blocks per SIMD lane per iteration:
//   H = H*[r^4,r^4] + M[0..31]*[r^2,r^2] + M[32..63]
// Any remainder below 64 bytes is left for the caller to buffer. Limbs are
// base 2^26 per lane; a single lazy carry sweep follows each iteration.
static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m,
                            size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi M0, M1, M2, M3, M4;
  xmmi C1, C2;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  while (bytes >= 64) {
    // H *= [r^4,r^4]: schoolbook 5x5 limb multiply; the S2x (= 5*R2x)
    // operands fold the high limbs back modulo p.
    p = &st->P[0];
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]*[r^2,r^2]: split m[0..31] into 26-bit limbs (with the
    // padding bit) and accumulate its product with r^2.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    p = &st->P[1];
    T5 = _mm_mul_epu32(M0, p->R20.v);
    T6 = _mm_mul_epu32(M0, p->R21.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M1, p->S24.v);
    T6 = _mm_mul_epu32(M1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M2, p->S23.v);
    T6 = _mm_mul_epu32(M2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M3, p->S22.v);
    T6 = _mm_mul_epu32(M3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M4, p->S21.v);
    T6 = _mm_mul_epu32(M4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(M0, p->R22.v);
    T6 = _mm_mul_epu32(M0, p->R23.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M1, p->R21.v);
    T6 = _mm_mul_epu32(M1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M2, p->R20.v);
    T6 = _mm_mul_epu32(M2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M3, p->S24.v);
    T6 = _mm_mul_epu32(M3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M4, p->S23.v);
    T6 = _mm_mul_epu32(M4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(M0, p->R24.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(M4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]: the second pair of blocks (m[32..63]) is added without
    // a multiply; it picks up its r factors on the next iteration/finish.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)),
                            _mm_loadl_epi64((const xmmi *)(m + 48)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)),
                            _mm_loadl_epi64((const xmmi *)(m + 56)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce: one interleaved carry chain 0->1->2->3->4 and 3->4->0->1,
    // multiplying the limb-4 carry by 5 as it wraps around.
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    m += 64;
    bytes -= 64;
  }

  st->H[0] = H0;
  st->H[1] = H1;
  st->H[2] = H2;
  st->H[3] = H3;
  st->H[4] = H4;
}
432 
// poly1305_combine ends the SIMD phase. If at least 32 buffered bytes remain
// it absorbs one more 32-byte double-block (H = H*[r^2,r^2] + M) and reports
// 32 bytes consumed (otherwise 0). It then multiplies lane 0 by r^2 and lane
// 1 by r, sums the two lanes, and stores the result in st->HH as three
// scalar 44/44/42-bit limbs for the scalar finalization in
// CRYPTO_poly1305_finish.
static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m,
                               size_t bytes) {
  const xmmi MMASK =
      _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask);
  const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128);
  const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5);

  poly1305_power *p;
  xmmi H0, H1, H2, H3, H4;
  xmmi M0, M1, M2, M3, M4;
  xmmi T0, T1, T2, T3, T4, T5, T6;
  xmmi C1, C2;

  uint64_t r0, r1, r2;
  uint64_t t0, t1, t2, t3, t4;
  uint64_t c;
  size_t consumed = 0;

  H0 = st->H[0];
  H1 = st->H[1];
  H2 = st->H[2];
  H3 = st->H[3];
  H4 = st->H[4];

  // p = [r^2,r^2]
  p = &st->P[1];

  if (bytes >= 32) {
    // H *= [r^2,r^2]: same 5x5 limb multiply as poly1305_blocks, but with
    // the r^2 power in both lanes.
    T0 = _mm_mul_epu32(H0, p->R20.v);
    T1 = _mm_mul_epu32(H0, p->R21.v);
    T2 = _mm_mul_epu32(H0, p->R22.v);
    T3 = _mm_mul_epu32(H0, p->R23.v);
    T4 = _mm_mul_epu32(H0, p->R24.v);
    T5 = _mm_mul_epu32(H1, p->S24.v);
    T6 = _mm_mul_epu32(H1, p->R20.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H2, p->S23.v);
    T6 = _mm_mul_epu32(H2, p->S24.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H3, p->S22.v);
    T6 = _mm_mul_epu32(H3, p->S23.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H4, p->S21.v);
    T6 = _mm_mul_epu32(H4, p->S22.v);
    T0 = _mm_add_epi64(T0, T5);
    T1 = _mm_add_epi64(T1, T6);
    T5 = _mm_mul_epu32(H1, p->R21.v);
    T6 = _mm_mul_epu32(H1, p->R22.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H2, p->R20.v);
    T6 = _mm_mul_epu32(H2, p->R21.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H3, p->S24.v);
    T6 = _mm_mul_epu32(H3, p->R20.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H4, p->S23.v);
    T6 = _mm_mul_epu32(H4, p->S24.v);
    T2 = _mm_add_epi64(T2, T5);
    T3 = _mm_add_epi64(T3, T6);
    T5 = _mm_mul_epu32(H1, p->R23.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H2, p->R22.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H3, p->R21.v);
    T4 = _mm_add_epi64(T4, T5);
    T5 = _mm_mul_epu32(H4, p->R20.v);
    T4 = _mm_add_epi64(T4, T5);

    // H += [Mx,My]: 32 buffered bytes, split into 26-bit limbs with the
    // padding bit set.
    T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)),
                            _mm_loadl_epi64((const xmmi *)(m + 16)));
    T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)),
                            _mm_loadl_epi64((const xmmi *)(m + 24)));
    M0 = _mm_and_si128(MMASK, T5);
    M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
    M2 = _mm_and_si128(MMASK, T5);
    M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
    M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

    T0 = _mm_add_epi64(T0, M0);
    T1 = _mm_add_epi64(T1, M1);
    T2 = _mm_add_epi64(T2, M2);
    T3 = _mm_add_epi64(T3, M3);
    T4 = _mm_add_epi64(T4, M4);

    // reduce (same lazy carry sweep as poly1305_blocks)
    C1 = _mm_srli_epi64(T0, 26);
    C2 = _mm_srli_epi64(T3, 26);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_and_si128(T3, MMASK);
    T1 = _mm_add_epi64(T1, C1);
    T4 = _mm_add_epi64(T4, C2);
    C1 = _mm_srli_epi64(T1, 26);
    C2 = _mm_srli_epi64(T4, 26);
    T1 = _mm_and_si128(T1, MMASK);
    T4 = _mm_and_si128(T4, MMASK);
    T2 = _mm_add_epi64(T2, C1);
    T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
    C1 = _mm_srli_epi64(T2, 26);
    C2 = _mm_srli_epi64(T0, 26);
    T2 = _mm_and_si128(T2, MMASK);
    T0 = _mm_and_si128(T0, MMASK);
    T3 = _mm_add_epi64(T3, C1);
    T1 = _mm_add_epi64(T1, C2);
    C1 = _mm_srli_epi64(T3, 26);
    T3 = _mm_and_si128(T3, MMASK);
    T4 = _mm_add_epi64(T4, C1);

    // H = (H*[r^2,r^2] + [Mx,My])
    H0 = T0;
    H1 = T1;
    H2 = T2;
    H3 = T3;
    H4 = T4;

    consumed = 32;
  }

  // finalize, H *= [r^2,r]: rebuild the scalar r from the stashed words and
  // write its 26-bit limbs into the upper lane (d[2]) of P[1], so lane 0
  // keeps r^2 and lane 1 gets r.
  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];

  p->R20.d[2] = (uint32_t)(r0)&0x3ffffff;
  p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  p->R24.d[2] = (uint32_t)((r2 >> 16));
  p->S21.d[2] = p->R21.d[2] * 5;
  p->S22.d[2] = p->R22.d[2] * 5;
  p->S23.d[2] = p->R23.d[2] * 5;
  p->S24.d[2] = p->R24.d[2] * 5;

  // H *= [r^2,r]
  T0 = _mm_mul_epu32(H0, p->R20.v);
  T1 = _mm_mul_epu32(H0, p->R21.v);
  T2 = _mm_mul_epu32(H0, p->R22.v);
  T3 = _mm_mul_epu32(H0, p->R23.v);
  T4 = _mm_mul_epu32(H0, p->R24.v);
  T5 = _mm_mul_epu32(H1, p->S24.v);
  T6 = _mm_mul_epu32(H1, p->R20.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H2, p->S23.v);
  T6 = _mm_mul_epu32(H2, p->S24.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H3, p->S22.v);
  T6 = _mm_mul_epu32(H3, p->S23.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H4, p->S21.v);
  T6 = _mm_mul_epu32(H4, p->S22.v);
  T0 = _mm_add_epi64(T0, T5);
  T1 = _mm_add_epi64(T1, T6);
  T5 = _mm_mul_epu32(H1, p->R21.v);
  T6 = _mm_mul_epu32(H1, p->R22.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H2, p->R20.v);
  T6 = _mm_mul_epu32(H2, p->R21.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H3, p->S24.v);
  T6 = _mm_mul_epu32(H3, p->R20.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H4, p->S23.v);
  T6 = _mm_mul_epu32(H4, p->S24.v);
  T2 = _mm_add_epi64(T2, T5);
  T3 = _mm_add_epi64(T3, T6);
  T5 = _mm_mul_epu32(H1, p->R23.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H2, p->R22.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H3, p->R21.v);
  T4 = _mm_add_epi64(T4, T5);
  T5 = _mm_mul_epu32(H4, p->R20.v);
  T4 = _mm_add_epi64(T4, T5);

  C1 = _mm_srli_epi64(T0, 26);
  C2 = _mm_srli_epi64(T3, 26);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_and_si128(T3, MMASK);
  T1 = _mm_add_epi64(T1, C1);
  T4 = _mm_add_epi64(T4, C2);
  C1 = _mm_srli_epi64(T1, 26);
  C2 = _mm_srli_epi64(T4, 26);
  T1 = _mm_and_si128(T1, MMASK);
  T4 = _mm_and_si128(T4, MMASK);
  T2 = _mm_add_epi64(T2, C1);
  T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
  C1 = _mm_srli_epi64(T2, 26);
  C2 = _mm_srli_epi64(T0, 26);
  T2 = _mm_and_si128(T2, MMASK);
  T0 = _mm_and_si128(T0, MMASK);
  T3 = _mm_add_epi64(T3, C1);
  T1 = _mm_add_epi64(T1, C2);
  C1 = _mm_srli_epi64(T3, 26);
  T3 = _mm_and_si128(T3, MMASK);
  T4 = _mm_add_epi64(T4, C1);

  // H = H[0]+H[1]: add the high lane onto the low lane of each limb.
  H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
  H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
  H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
  H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
  H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

  // Extract the five (now scalar) limbs and carry-propagate them back into
  // canonical 26-bit form.
  t0 = _mm_cvtsi128_si32(H0);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = _mm_cvtsi128_si32(H1) + c;
  c = (t1 >> 26);
  t1 &= 0x3ffffff;
  t2 = _mm_cvtsi128_si32(H2) + c;
  c = (t2 >> 26);
  t2 &= 0x3ffffff;
  t3 = _mm_cvtsi128_si32(H3) + c;
  c = (t3 >> 26);
  t3 &= 0x3ffffff;
  t4 = _mm_cvtsi128_si32(H4) + c;
  c = (t4 >> 26);
  t4 &= 0x3ffffff;
  t0 = t0 + (c * 5);
  c = (t0 >> 26);
  t0 &= 0x3ffffff;
  t1 = t1 + c;

  // Repack the five 26-bit limbs as three 44/44/42-bit limbs in st->HH.
  st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff);
  st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff);
  st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff);

  return consumed;
}
675 
677  size_t bytes) {
678  poly1305_state_internal *st = poly1305_aligned_state(state);
679  size_t want;
680 
681  // Work around a C language bug. See https://crbug.com/1019588.
682  if (bytes == 0) {
683  return;
684  }
685 
686  // need at least 32 initial bytes to start the accelerated branch
687  if (!st->started) {
688  if ((st->leftover == 0) && (bytes > 32)) {
689  poly1305_first_block(st, m);
690  m += 32;
691  bytes -= 32;
692  } else {
693  want = poly1305_min(32 - st->leftover, bytes);
694  OPENSSL_memcpy(st->buffer + st->leftover, m, want);
695  bytes -= want;
696  m += want;
697  st->leftover += want;
698  if ((st->leftover < 32) || (bytes == 0)) {
699  return;
700  }
701  poly1305_first_block(st, st->buffer);
702  st->leftover = 0;
703  }
704  st->started = 1;
705  }
706 
707  // handle leftover
708  if (st->leftover) {
709  want = poly1305_min(64 - st->leftover, bytes);
710  OPENSSL_memcpy(st->buffer + st->leftover, m, want);
711  bytes -= want;
712  m += want;
713  st->leftover += want;
714  if (st->leftover < 64) {
715  return;
716  }
717  poly1305_blocks(st, st->buffer, 64);
718  st->leftover = 0;
719  }
720 
721  // process 64 byte blocks
722  if (bytes >= 64) {
723  want = (bytes & ~63);
724  poly1305_blocks(st, m, want);
725  m += want;
726  bytes -= want;
727  }
728 
729  if (bytes) {
730  OPENSSL_memcpy(st->buffer + st->leftover, m, bytes);
731  st->leftover += bytes;
732  }
733 }
734 
736  poly1305_state_internal *st = poly1305_aligned_state(state);
737  size_t leftover = st->leftover;
738  uint8_t *m = st->buffer;
739  uint128_t d[3];
740  uint64_t h0, h1, h2;
741  uint64_t t0, t1;
742  uint64_t g0, g1, g2, c, nc;
743  uint64_t r0, r1, r2, s1, s2;
744  poly1305_power *p;
745 
746  if (st->started) {
747  size_t consumed = poly1305_combine(st, m, leftover);
748  leftover -= consumed;
749  m += consumed;
750  }
751 
752  // st->HH will either be 0 or have the combined result
753  h0 = st->HH[0];
754  h1 = st->HH[1];
755  h2 = st->HH[2];
756 
757  p = &st->P[1];
758  r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1];
759  r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1];
760  r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1];
761  s1 = r1 * (5 << 2);
762  s2 = r2 * (5 << 2);
763 
764  if (leftover < 16) {
765  goto poly1305_donna_atmost15bytes;
766  }
767 
768 poly1305_donna_atleast16bytes:
769  t0 = load_u64_le(m + 0);
770  t1 = load_u64_le(m + 8);
771  h0 += t0 & 0xfffffffffff;
772  t0 = shr128_pair(t1, t0, 44);
773  h1 += t0 & 0xfffffffffff;
774  h2 += (t1 >> 24) | ((uint64_t)1 << 40);
775 
776 poly1305_donna_mul:
777  d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)),
778  mul64x64_128(h2, s1));
779  d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)),
780  mul64x64_128(h2, s2));
781  d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)),
782  mul64x64_128(h2, r0));
783  h0 = lo128(d[0]) & 0xfffffffffff;
784  c = shr128(d[0], 44);
785  d[1] = add128_64(d[1], c);
786  h1 = lo128(d[1]) & 0xfffffffffff;
787  c = shr128(d[1], 44);
788  d[2] = add128_64(d[2], c);
789  h2 = lo128(d[2]) & 0x3ffffffffff;
790  c = shr128(d[2], 42);
791  h0 += c * 5;
792 
793  m += 16;
794  leftover -= 16;
795  if (leftover >= 16) {
796  goto poly1305_donna_atleast16bytes;
797  }
798 
799 // final bytes
800 poly1305_donna_atmost15bytes:
801  if (!leftover) {
802  goto poly1305_donna_finish;
803  }
804 
805  m[leftover++] = 1;
806  OPENSSL_memset(m + leftover, 0, 16 - leftover);
807  leftover = 16;
808 
809  t0 = load_u64_le(m + 0);
810  t1 = load_u64_le(m + 8);
811  h0 += t0 & 0xfffffffffff;
812  t0 = shr128_pair(t1, t0, 44);
813  h1 += t0 & 0xfffffffffff;
814  h2 += (t1 >> 24);
815 
816  goto poly1305_donna_mul;
817 
818 poly1305_donna_finish:
819  c = (h0 >> 44);
820  h0 &= 0xfffffffffff;
821  h1 += c;
822  c = (h1 >> 44);
823  h1 &= 0xfffffffffff;
824  h2 += c;
825  c = (h2 >> 42);
826  h2 &= 0x3ffffffffff;
827  h0 += c * 5;
828 
829  g0 = h0 + 5;
830  c = (g0 >> 44);
831  g0 &= 0xfffffffffff;
832  g1 = h1 + c;
833  c = (g1 >> 44);
834  g1 &= 0xfffffffffff;
835  g2 = h2 + c - ((uint64_t)1 << 42);
836 
837  c = (g2 >> 63) - 1;
838  nc = ~c;
839  h0 = (h0 & nc) | (g0 & c);
840  h1 = (h1 & nc) | (g1 & c);
841  h2 = (h2 & nc) | (g2 & c);
842 
843  // pad
844  t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1];
845  t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1];
846  h0 += (t0 & 0xfffffffffff);
847  c = (h0 >> 44);
848  h0 &= 0xfffffffffff;
849  t0 = shr128_pair(t1, t0, 44);
850  h1 += (t0 & 0xfffffffffff) + c;
851  c = (h1 >> 44);
852  h1 &= 0xfffffffffff;
853  t1 = (t1 >> 24);
854  h2 += (t1)+c;
855 
856  store_u64_le(mac + 0, ((h0) | (h1 << 44)));
857  store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24)));
858 }
859 
860 #endif // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64
gen_build_yaml.out
dictionary out
Definition: src/benchmark/gen_build_yaml.py:24
S21
#define S21
Definition: bloaty/third_party/protobuf/src/google/protobuf/stubs/structurally_valid.cc:105
absl::container_internal::H2
h2_t H2(size_t hash)
Definition: abseil-cpp/absl/container/internal/raw_hash_set.h:486
u
OPENSSL_EXPORT pem_password_cb void * u
Definition: pem.h:351
a
int a
Definition: abseil-cpp/absl/container/internal/hash_policy_traits_test.cc:88
xds_manager.p
p
Definition: xds_manager.py:60
uint8_t
unsigned char uint8_t
Definition: stdint-msvc2008.h:78
poly1305_aligned_state
static struct poly1305_state_st * poly1305_aligned_state(poly1305_state *state)
Definition: poly1305.c:57
OPENSSL_memset
static void * OPENSSL_memset(void *dst, int c, size_t n)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:835
uint32_t
unsigned int uint32_t
Definition: stdint-msvc2008.h:80
re2::T1
@ T1
Definition: bloaty/third_party/re2/util/rune.cc:31
for
for(map_begin_internal(intern, &it);!map_done(&it);map_next(&it))
Definition: bloaty/third_party/protobuf/php/ext/google/protobuf/map.c:207
in
const char * in
Definition: third_party/abseil-cpp/absl/strings/internal/str_format/parser_test.cc:391
c
void c(T a)
Definition: miscompile_with_no_unique_address_test.cc:40
poly1305.h
t0
static int64_t t0
Definition: bloaty/third_party/re2/util/benchmark.cc:44
setup.v
v
Definition: third_party/bloaty/third_party/capstone/bindings/python/setup.py:42
uint64_t
unsigned __int64 uint64_t
Definition: stdint-msvc2008.h:90
OPENSSL_memcpy
static void * OPENSSL_memcpy(void *dst, const void *src, size_t n)
Definition: third_party/boringssl-with-bazel/src/crypto/internal.h:819
re2::T3
@ T3
Definition: bloaty/third_party/re2/util/rune.cc:34
buffer
char buffer[1024]
Definition: libuv/docs/code/idle-compute/main.c:8
b
uint64_t b
Definition: abseil-cpp/absl/container/internal/layout_test.cc:53
absl::container_internal::H1
size_t H1(size_t hash, const ctrl_t *ctrl)
Definition: abseil-cpp/absl/container/internal/raw_hash_set.h:479
d
static const fe d
Definition: curve25519_tables.h:19
UINT64_C
#define UINT64_C(val)
Definition: stdint-msvc2008.h:238
H
#define H(b, c, d)
Definition: md4.c:114
CRYPTO_poly1305_init
#define CRYPTO_poly1305_init
Definition: boringssl_prefix_symbols.h:1184
P
Definition: miscompile_with_no_unique_address_test.cc:29
key
const char * key
Definition: hpack_parser_table.cc:164
CRYPTO_poly1305_finish
#define CRYPTO_poly1305_finish
Definition: boringssl_prefix_symbols.h:1183
bytes
uint8 bytes[10]
Definition: bloaty/third_party/protobuf/src/google/protobuf/io/coded_stream_unittest.cc:153
re2::T2
@ T2
Definition: bloaty/third_party/re2/util/rune.cc:33
ret
UniquePtr< SSL_SESSION > ret
Definition: ssl_x509.cc:1029
state
Definition: bloaty/third_party/zlib/contrib/blast/blast.c:41
poly1305_state
uint8_t poly1305_state[512]
Definition: poly1305.h:25
re2::T4
@ T4
Definition: bloaty/third_party/re2/util/rune.cc:35
re2::T5
@ T5
Definition: bloaty/third_party/re2/util/rune.cc:36
regress.m
m
Definition: regress/regress.py:25
t1
Table t1
Definition: abseil-cpp/absl/container/internal/raw_hash_set_allocator_test.cc:185
OPENSSL_STATIC_ASSERT
#define OPENSSL_STATIC_ASSERT(cond, msg)
Definition: type_check.h:75
i
uint64_t i
Definition: abseil-cpp/absl/container/btree_benchmark.cc:230
CRYPTO_poly1305_update
#define CRYPTO_poly1305_update
Definition: boringssl_prefix_symbols.h:1185


grpc
Author(s):
autogenerated on Fri May 16 2025 02:59:44