22 #include "../internal.h"
26 #if defined(OPENSSL_POLY1305_NEON)
32 #define addmulmod openssl_poly1305_neon2_addmulmod
33 #define blocks openssl_poly1305_neon2_blocks
// External routine; the #define above maps this name to
// openssl_poly1305_neon2_addmulmod, so the definition is not in this file
// (presumably hand-written NEON assembly -- TODO confirm).
// Among the visible parameters r is the only non-const pointer; x and y are
// read-only. Callers in this file pass one more fe1305x2 pointer after y.
// NOTE(review): the end of this prototype is not visible in this listing.
35 extern void addmulmod(fe1305x2 *
r,
const fe1305x2 *
x,
const fe1305x2 *
y,
// External routine; the #define above maps this name to
// openssl_poly1305_neon2_blocks, so the definition is not in this file.
// h is mutable; precomp and the byte pointer in are read-only. The caller in
// this file passes a byte count as a further argument and subtracts the int
// return value from that count.
// NOTE(review): presumably the return value is the number of bytes left
// unprocessed -- confirm against the implementation.
38 extern int blocks(fe1305x2 *h,
const fe1305x2 *precomp,
const uint8_t *
in,
// File-local helper taking a mutable fe1305x2 and returning nothing, so any
// result must be written back through r.
// NOTE(review): almost all of the body is missing from this listing; the only
// visible statement is a loop over i = 0, 1, 2. Presumably this brings *r to
// its fully reduced form before serialization -- confirm against the full
// source.
41 static void freeze(fe1305x2 *
r) {
56 for (
i = 0;
i < 3; ++
i) {
// Serializes a field element into the 16-byte buffer r.
// NOTE(review): the code that derives x0..x4 from x is not visible in this
// listing; presumably they are limbs read from x->v after reduction --
// confirm.
116 static void fe1305x2_tobytearray(
uint8_t r[16], fe1305x2 *
x) {
// Pack x0..x4 into four consecutive 32-bit words at r, r+4, r+8 and r+12.
// The shift amounts place x_k at bit offset 26*k of the 128-bit output:
// each word takes the not-yet-emitted high bits of one limb (>> 6, 12, 18)
// plus the low bits of the next limb (<< 26, 20, 14, 8).
132 store32(
r, x0 + (x1 << 26));
133 store32(
r + 4, (x1 >> 6) + (x2 << 20));
134 store32(
r + 8, (x2 >> 12) + (x3 << 14));
135 store32(
r + 12, (x3 >> 18) + (x4 << 8));
// Fills both interleaved lanes of *r from up to xlen bytes at x.
// Even indices of r->v (0,2,4,6,8) are written by the first unpack sequence
// and odd indices (1,3,5,7,9) by the second one, or set to zero by the final
// chained assignment.
// NOTE(review): the loop bodies, the declarations of i and t, and the
// condition that selects between the second unpack and the zeroing are not
// visible in this listing; the comments below describe only what is shown.
138 static void fe1305x2_frombytearray(fe1305x2 *
r,
const uint8_t *
x,
size_t xlen) {
// First pass: iterate over at most 16 input bytes (fewer if xlen < 16).
142 for (
i = 0; (
i < 16) && (
i < xlen);
i++) {
// Continue from wherever i stopped up to index 16 (presumably padding the
// 17-byte scratch buffer t -- confirm).
148 for (;
i < 17;
i++) {
// Unpack t into the even lane: words are loaded at byte offsets 0, 3, 6, 9
// with shifts 0, 2, 4, 6 and masked to 26 bits; the last value is the
// unmasked 32-bit word at offset 13.
152 r->v[0] = 0x3ffffff & load32(t);
153 r->v[2] = 0x3ffffff & (load32(t + 3) >> 2);
154 r->v[4] = 0x3ffffff & (load32(t + 6) >> 4);
155 r->v[6] = 0x3ffffff & (load32(t + 9) >> 6);
156 r->v[8] = load32(t + 13);
// Second pass: same loop shape as above, for the odd lane.
159 for (
i = 0; (
i < 16) && (
i < xlen);
i++) {
163 for (;
i < 17;
i++) {
// Unpack t into the odd lane with the same offsets, shifts and masks.
167 r->v[1] = 0x3ffffff & load32(t);
168 r->v[3] = 0x3ffffff & (load32(t + 3) >> 2);
169 r->v[5] = 0x3ffffff & (load32(t + 6) >> 4);
170 r->v[7] = 0x3ffffff & (load32(t + 9) >> 6);
171 r->v[9] = load32(t + 13);
// Alternative to the second unpack: clear the whole odd lane.
173 r->v[1] =
r->v[3] =
r->v[5] =
r->v[7] =
r->v[9] = 0;
// 16-byte-aligned fe1305x2 with static storage and no initializer, so every
// member is zero. Passed as the last addmulmod operand by several callers in
// this file (presumably "add nothing" -- confirm against addmulmod).
177 static const alignas(16) fe1305x2 zero;
188 "poly1305_state isn't large enough to hold aligned poly1305_state_st.");
// NOTE(review): the enclosing function header is not visible in this listing.
// Carve four consecutive fe1305x2 slots (r, h, c, precomp) out of st->data,
// starting at the first 16-byte-aligned address: 15 & -addr is the number of
// bytes needed to reach the next multiple of 16.
// NOTE(review): the pointer is cast to int for this computation. Only the low
// four bits are used, but on 64-bit targets the cast truncates the pointer;
// uintptr_t would express the intent without that conversion.
192 fe1305x2 *
const r = (fe1305x2 *)(st->data + (15 & (-(
int)st->data)));
193 fe1305x2 *
const h =
r + 1;
194 fe1305x2 *
const c =
h + 1;
195 fe1305x2 *
const precomp =
c + 1;
// Unpack key into five values taken at byte offsets 0, 3, 6, 9, 12 with
// shifts 0, 2, 4, 6, 8, and store each value into both lanes of r
// (v[2k] and v[2k+1]). Apart from the first, the masks also clear some bits
// below bit 26; presumably this is the Poly1305 clamping of r -- confirm.
197 r->v[1] =
r->v[0] = 0x3ffffff & load32(
key);
198 r->v[3] =
r->v[2] = 0x3ffff03 & (load32(
key + 3) >> 2);
199 r->v[5] =
r->v[4] = 0x3ffc0ff & (load32(
key + 6) >> 4);
200 r->v[7] =
r->v[6] = 0x3f03fff & (load32(
key + 9) >> 6);
201 r->v[9] =
r->v[8] = 0x00fffff & (load32(
key + 12) >> 8);
// Loop over all 10 lane entries; the body is not visible in this listing.
203 for (
size_t j = 0;
j < 10;
j++) {
// Fill precomp[0] from (r, r) and precomp[1] from (precomp[0], precomp[0]),
// each with the all-zero constant as the last operand. Presumably these are
// r^2 and r^4 in both lanes -- confirm against addmulmod.
207 addmulmod(precomp,
r,
r, &zero);
208 addmulmod(precomp + 1, precomp, precomp, &zero);
// NOTE(review): the enclosing function header is not visible in this listing.
// Locate the four consecutive fe1305x2 slots (r, h, c, precomp) at the first
// 16-byte-aligned address inside st->data (15 & -addr is the distance to the
// next multiple of 16).
217 fe1305x2 *
const r = (fe1305x2 *)(st->data + (15 & (-(
int)st->data)));
218 fe1305x2 *
const h =
r + 1;
219 fe1305x2 *
const c =
h + 1;
220 fe1305x2 *
const precomp =
c + 1;
// Loop over todo bytes; the body is not visible (presumably it tops up
// st->buf from the input -- confirm).
227 for (
size_t i = 0;
i <
todo;
i++) {
// Combine h with precomp[0] via addmulmod (zero as last operand), then load
// the whole of st->buf into c.
235 addmulmod(h, h, precomp, &zero);
236 fe1305x2_frombytearray(
c, st->
buf,
sizeof(st->
buf));
// Loop over all 10 lane entries; body not visible.
237 for (
size_t i = 0;
i < 10;
i++) {
// Bulk path: runs while more than 32 input bytes remain. tlen starts at
// 1048576 for each pass and is reduced by the return value of blocks();
// the statements between these lines are not visible in this listing.
244 while (in_len > 32) {
245 size_t tlen = 1048576;
249 tlen -= blocks(h, precomp,
in, tlen);
// Loop over the remaining in_len bytes; body not visible (presumably it
// stashes the tail in st->buf -- confirm).
255 for (
size_t i = 0;
i < in_len;
i++) {
// NOTE(review): the enclosing function header and the conditions guarding the
// statements below are not visible in this listing.
// Locate the four consecutive fe1305x2 slots (r, h, c, precomp) at the first
// 16-byte-aligned address inside st->data (15 & -addr is the distance to the
// next multiple of 16).
264 fe1305x2 *
const r = (fe1305x2 *)(st->data + (15 & (-(
int)st->data)));
265 fe1305x2 *
const h =
r + 1;
266 fe1305x2 *
const c =
h + 1;
267 fe1305x2 *
const precomp =
c + 1;
// Combine h with precomp[0], with the all-zero constant as last operand.
269 addmulmod(h, h, precomp, &zero);
// Overwrite the odd lane of precomp[0] with the odd lane of r, then combine
// h with the modified precomp[0] and c.
273 precomp->v[1] =
r->v[1];
274 precomp->v[3] =
r->v[3];
275 precomp->v[5] =
r->v[5];
276 precomp->v[7] =
r->v[7];
277 precomp->v[9] =
r->v[9];
278 addmulmod(h, h, precomp,
c);
// Combine h with r and c.
286 addmulmod(h, h,
r,
c);
// Load 16 bytes of st->key into c, then flip bit 24 of c->v[8].
// NOTE(review): presumably the flip cancels a padding bit set inside
// fe1305x2_frombytearray, so the key bytes are added unmodified -- confirm.
296 fe1305x2_frombytearray(
c, st->
key, 16);
297 c->v[8] ^= (1 << 24);
// Serialize h into the caller's mac buffer (16 bytes, per the
// fe1305x2_tobytearray signature).
304 fe1305x2_tobytearray(mac, h);
307 #endif // OPENSSL_POLY1305_NEON