68 #if defined(OPENSSL_PPC64LE)
// Shorthand for the AltiVec vector of unsigned 32-bit lanes (four lanes in a
// 128-bit vector register).
76 typedef vector
unsigned int vec_uint32_t;
// Shorthand for the AltiVec vector of unsigned bytes; the initializers in this
// file supply sixteen elements for it.
77 typedef vector
unsigned char vec_uint8_t;
// Byte selector passed to vec_perm in sched_00_15. The indices reverse the
// byte order inside each group of four bytes (3,2,1,0 / 7,6,5,4 / ...), i.e.
// they byte-swap every 32-bit word while keeping the words in place.
80 static const vec_uint8_t k_swap_endianness = {3, 2, 1, 0, 7, 6, 5, 4,
81 11, 10, 9, 8, 15, 14, 13, 12};
// Shift-count operands for the whole-vector octet shifts used in sched_16_31:
// k_4_bytes is passed to vec_sro and k_12_bytes to vec_slo. Those intrinsics
// read the count from the shift vector in units of bits (byte count =
// value >> 3), so 32 encodes 4 bytes and 96 encodes 12 bytes. All sixteen
// bytes hold the same value, so the count does not depend on which byte the
// instruction reads.
84 static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
85 32, 32, 32, 32, 32, 32, 32, 32};
86 static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
87 96, 96, 96, 96, 96, 96, 96, 96};
// Additive constants, one per 20-round range named in the suffix. The values
// are the SHA-1 round constants K_t from FIPS 180-4. They stay as macros
// (unsigned long literals) rather than typed objects.
89 #define K_00_19 0x5a827999UL
90 #define K_20_39 0x6ed9eba1UL
91 #define K_40_59 0x8f1bbcdcUL
92 #define K_60_79 0xca62c1d6UL
// sched_00_15 handles one 16-byte chunk of input: it loads the chunk from
// data, byte-swaps each 32-bit word, and stores the swapped words with k added
// lane-wise to pre_added.
//
//   pre_added  destination for (w + k); written with vec_st.
//              NOTE(review): vec_st is the aligned AltiVec store, so this
//              pointer is expected to be 16-byte aligned -- confirm at the
//              declaration of the buffer the callers pass in.
//   data       source bytes; read with vec_vsx_ld, and the local below is
//              named unaligned_data, so no alignment appears to be assumed.
//   k          vector added to the schedule words before the store.
106 static vec_uint32_t sched_00_15(vec_uint32_t *pre_added,
const void *
data,
108 const vector
unsigned char unaligned_data =
109 vec_vsx_ld(0, (
const unsigned char*)
data);
// Reinterpret the sixteen loaded bytes as 32-bit lanes (a cast, no data
// movement is expressed here).
110 const vec_uint32_t
v = (vec_uint32_t) unaligned_data;
// Reorder the bytes of v according to k_swap_endianness, which reverses the
// bytes inside every 32-bit word.
111 const vec_uint32_t w = vec_perm(
v,
v, k_swap_endianness);
// Store the schedule words with the round constant already added.
112 vec_st(w +
k, 0, pre_added);
// sched_16_31 derives a vector of schedule words from the four preceding
// vectors and stores it, with k added lane-wise, to pre_added.
//
//   pre_added            destination for (w + k); written with vec_st.
//   minus_4 .. minus_16  earlier schedule vectors, named by how many words
//                        back they start relative to the vector being built.
//   k                    vector added to the result before the store.
127 static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
128 vec_uint32_t minus_8, vec_uint32_t minus_12,
129 vec_uint32_t minus_16, vec_uint32_t
k) {
// minus_4 shifted right by 4 bytes as a whole vector. An octet shift fills
// with zeros, so one lane of minus_3 carries no data at this point.
130 const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
// Built from 8 bytes of each of minus_12 and minus_16.
131 const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8);
132 const vec_uint32_t k_1_bit = vec_splat_u32(1);
// XOR of the four inputs, each lane rotated left by one bit.
133 const vec_uint32_t w_prime =
134 vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
// Second pass: XOR in w_prime moved by 12 bytes and rotated by one bit.
// Presumably this supplies the term missing from the zero-filled lane of
// minus_3, whose input is a word of the vector being computed here.
135 const vec_uint32_t w =
136 w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
137 vec_st(w +
k, 0, pre_added);
// sched_32_79 derives a vector of schedule words as
//   rotate-left-by-2 (minus_6 ^ minus_16 ^ minus_28 ^ minus_32)
// and stores it, with k added lane-wise, to pre_added. Every input comes from
// an earlier vector, so, unlike sched_16_31, a single pass is enough.
//
//   pre_added            destination for (w + k); written with vec_st.
//   minus_4 .. minus_32  earlier schedule vectors, named by how many words
//                        back they start relative to the vector being built.
143 static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
144 vec_uint32_t minus_8, vec_uint32_t minus_16,
145 vec_uint32_t minus_28, vec_uint32_t minus_32,
// Built from 8 bytes of each of minus_4 and minus_8.
147 const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
148 const vec_uint32_t k_2_bits = vec_splat_u32(2);
149 const vec_uint32_t w =
150 vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
151 vec_st(w +
k, 0, pre_added);
// Bitwise round functions, one per 20-round range named in the suffix.
// Arguments are fully parenthesized; note that some are evaluated more than
// once, so pass side-effect-free expressions.
//
// F_00_19: bitwise select -- each result bit is taken from c where the
// corresponding bit of b is 1 and from d where it is 0. Written as
// ((c ^ d) & b) ^ d, which is equivalent to (b & c) | (~b & d).
160 #define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
// F_20_39: XOR of the three inputs.
161 #define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
// F_40_59: bitwise majority -- a result bit is 1 when at least two of the
// corresponding input bits are 1.
162 #define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
// F_60_79: same function as F_20_39.
163 #define F_60_79(b, c, d) F_20_39(b, c, d)
// Round macros, one per 20-round range. Each one
//   - assigns f = w[i] + e + rotate(a, 5) + F(b, c, d), using the F_* macro
//     with the matching suffix, and
//   - replaces b with rotate(b, 30).
// a, c, d and e are only read. The macros take variable names rather than
// values because f and b are assigned to; callers permute the argument order
// from one round to the next instead of moving values between variables.
// w and rotate are resolved at the expansion site.
166 #define BODY_00_19(i, a, b, c, d, e, f) \
168 (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
169 (b) = rotate((b), 30); \
172 #define BODY_20_39(i, a, b, c, d, e, f) \
174 (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
175 (b) = rotate((b), 30); \
178 #define BODY_40_59(i, a, b, c, d, e, f) \
180 (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
181 (b) = rotate((b), 30); \
184 #define BODY_60_79(i, a, b, c, d, e, f) \
186 (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
187 (b) = rotate((b), 30); \
203 vec_uint32_t
k = K_00_19_x_4;
204 const vec_uint32_t w0 = sched_00_15(vw + 0,
data + 0,
k);
205 BODY_00_19(0,
A, B,
C, D, E,
T);
206 BODY_00_19(1,
T,
A, B,
C, D, E);
207 BODY_00_19(2, E,
T,
A, B,
C, D);
208 BODY_00_19(3, D, E,
T,
A, B,
C);
210 const vec_uint32_t w4 = sched_00_15(vw + 1,
data + 16,
k);
211 BODY_00_19(4,
C, D, E,
T,
A, B);
212 BODY_00_19(5, B,
C, D, E,
T,
A);
213 BODY_00_19(6,
A, B,
C, D, E,
T);
214 BODY_00_19(7,
T,
A, B,
C, D, E);
216 const vec_uint32_t w8 = sched_00_15(vw + 2,
data + 32,
k);
217 BODY_00_19(8, E,
T,
A, B,
C, D);
218 BODY_00_19(9, D, E,
T,
A, B,
C);
219 BODY_00_19(10,
C, D, E,
T,
A, B);
220 BODY_00_19(11, B,
C, D, E,
T,
A);
222 const vec_uint32_t w12 = sched_00_15(vw + 3,
data + 48,
k);
223 BODY_00_19(12,
A, B,
C, D, E,
T);
224 BODY_00_19(13,
T,
A, B,
C, D, E);
225 BODY_00_19(14, E,
T,
A, B,
C, D);
226 BODY_00_19(15, D, E,
T,
A, B,
C);
228 const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0,
k);
229 BODY_00_19(16,
C, D, E,
T,
A, B);
230 BODY_00_19(17, B,
C, D, E,
T,
A);
231 BODY_00_19(18,
A, B,
C, D, E,
T);
232 BODY_00_19(19,
T,
A, B,
C, D, E);
235 const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4,
k);
236 BODY_20_39(20, E,
T,
A, B,
C, D);
237 BODY_20_39(21, D, E,
T,
A, B,
C);
238 BODY_20_39(22,
C, D, E,
T,
A, B);
239 BODY_20_39(23, B,
C, D, E,
T,
A);
241 const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8,
k);
242 BODY_20_39(24,
A, B,
C, D, E,
T);
243 BODY_20_39(25,
T,
A, B,
C, D, E);
244 BODY_20_39(26, E,
T,
A, B,
C, D);
245 BODY_20_39(27, D, E,
T,
A, B,
C);
247 const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12,
k);
248 BODY_20_39(28,
C, D, E,
T,
A, B);
249 BODY_20_39(29, B,
C, D, E,
T,
A);
250 BODY_20_39(30,
A, B,
C, D, E,
T);
251 BODY_20_39(31,
T,
A, B,
C, D, E);
253 const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0,
k);
254 BODY_20_39(32, E,
T,
A, B,
C, D);
255 BODY_20_39(33, D, E,
T,
A, B,
C);
256 BODY_20_39(34,
C, D, E,
T,
A, B);
257 BODY_20_39(35, B,
C, D, E,
T,
A);
259 const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4,
k);
260 BODY_20_39(36,
A, B,
C, D, E,
T);
261 BODY_20_39(37,
T,
A, B,
C, D, E);
262 BODY_20_39(38, E,
T,
A, B,
C, D);
263 BODY_20_39(39, D, E,
T,
A, B,
C);
266 const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8,
k);
272 const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12,
k);
278 const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16,
k);
284 const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20,
k);
290 const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24,
k);
297 const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28,
k);
303 const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32,
k);
309 const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36,
k);
315 const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40,
k);
322 (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44,
k);
348 #endif // OPENSSL_PPC64LE