File openssl-1_1-AES-GCM-performance-optimzation-with-stitched-method.patch of Package openssl-1_1
xxxxxxxxxx
1
From 44a563dde1584cd9284e80b6e45ee5019be8d36c Mon Sep 17 00:00:00 2001
2
From: Danny Tsen <dtsen@us.ibm.com>
3
Date: Mon, 18 Oct 2021 10:51:42 -0400
4
Subject: [PATCH] AES-GCM performance optimzation with stitched method for p9+
5
ppc64le
6
7
Assembly code reviewed by Shricharan Srivatsan <ssrivat@us.ibm.com>
8
9
Reviewed-by: Tomas Mraz <tomas@openssl.org>
10
Reviewed-by: Paul Dale <pauli@openssl.org>
11
(Merged from https://github.com/openssl/openssl/pull/16854)
12
---
13
Configurations/00-base-templates.conf | 2
14
crypto/evp/e_aes.c | 33
15
crypto/modes/asm/aes-gcm-ppc.pl | 1439 ++++++++++++++++++++++++++++++++++
16
crypto/modes/build.info | 1
17
4 files changed, 1466 insertions(+), 9 deletions(-)
18
create mode 100644 crypto/modes/asm/aes-gcm-ppc.pl
19
create mode 100644 providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
20
21
--- a/Configurations/00-base-templates.conf
22
+++ b/Configurations/00-base-templates.conf
23
24
bn_asm_src => "bn-ppc.s ppc-mont.s",
25
aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s",
26
sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s",
27
- modes_asm_src => "ghashp8-ppc.s",
28
+ modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s",
29
chacha_asm_src => "chacha-ppc.s",
30
poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s",
31
},
32
--- a/crypto/evp/e_aes.c
33
+++ b/crypto/evp/e_aes.c
34
35
# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
36
# define HWAES_xts_encrypt aes_p8_xts_encrypt
37
# define HWAES_xts_decrypt aes_p8_xts_decrypt
38
+# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300)
39
+# define AES_GCM_ENC_BYTES 128
40
+# define AES_GCM_DEC_BYTES 128
41
+size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
42
+ const void *key, unsigned char ivec[16], u64 *Xi);
43
+size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
44
+ const void *key, unsigned char ivec[16], u64 *Xi);
45
+void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
46
+# if PPC_AES_GCM_CAPABLE
47
+# define AES_gcm_encrypt ppc_aes_gcm_encrypt
48
+# define AES_gcm_decrypt ppc_aes_gcm_decrypt
49
+# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \
50
+ (gctx)->gcm.ghash==gcm_ghash_p8)
51
+# endif
52
#endif
53
54
#if defined(OPENSSL_CPUID_OBJ) && ( \
55
56
*/
57
# define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32)))
58
59
+# define AES_GCM_ENC_BYTES 32
60
+# define AES_GCM_DEC_BYTES 16
61
+
62
int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
63
AES_KEY *key);
64
int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
65
66
if (gctx->ctr) {
67
size_t bulk = 0;
68
#if defined(AES_GCM_ASM)
69
- if (len >= 32 && AES_GCM_ASM(gctx)) {
70
+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) {
71
if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
72
return -1;
73
74
75
} else {
76
size_t bulk = 0;
77
#if defined(AES_GCM_ASM2)
78
- if (len >= 32 && AES_GCM_ASM2(gctx)) {
79
+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) {
80
if (CRYPTO_gcm128_encrypt(&gctx->gcm, NULL, NULL, 0))
81
return -1;
82
83
84
if (gctx->ctr) {
85
size_t bulk = 0;
86
#if defined(AES_GCM_ASM)
87
- if (len >= 16 && AES_GCM_ASM(gctx)) {
88
+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) {
89
if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
90
return -1;
91
92
93
} else {
94
size_t bulk = 0;
95
#if defined(AES_GCM_ASM2)
96
- if (len >= 16 && AES_GCM_ASM2(gctx)) {
97
+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) {
98
if (CRYPTO_gcm128_decrypt(&gctx->gcm, NULL, NULL, 0))
99
return -1;
100
101
102
if (gctx->ctr) {
103
size_t bulk = 0;
104
#if defined(AES_GCM_ASM)
105
- if (len >= 32 && AES_GCM_ASM(gctx)) {
106
+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM(gctx)) {
107
size_t res = (16 - gctx->gcm.mres) % 16;
108
109
if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
110
111
} else {
112
size_t bulk = 0;
113
#if defined(AES_GCM_ASM2)
114
- if (len >= 32 && AES_GCM_ASM2(gctx)) {
115
+ if (len >= AES_GCM_ENC_BYTES && AES_GCM_ASM2(gctx)) {
116
size_t res = (16 - gctx->gcm.mres) % 16;
117
118
if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, res))
119
120
if (gctx->ctr) {
121
size_t bulk = 0;
122
#if defined(AES_GCM_ASM)
123
- if (len >= 16 && AES_GCM_ASM(gctx)) {
124
+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM(gctx)) {
125
size_t res = (16 - gctx->gcm.mres) % 16;
126
127
if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
128
129
} else {
130
size_t bulk = 0;
131
#if defined(AES_GCM_ASM2)
132
- if (len >= 16 && AES_GCM_ASM2(gctx)) {
133
+ if (len >= AES_GCM_DEC_BYTES && AES_GCM_ASM2(gctx)) {
134
size_t res = (16 - gctx->gcm.mres) % 16;
135
136
if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, res))
137
--- /dev/null
138
+++ b/crypto/modes/asm/aes-gcm-ppc.pl
139
140
+#! /usr/bin/env perl
141
+# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
142
+# Copyright 2021- IBM Inc. All rights reserved
143
+#
144
+# Licensed under the Apache License 2.0 (the "License"). You may not use
145
+# this file except in compliance with the License. You can obtain a copy
146
+# in the file LICENSE in the source distribution or at
147
+# https://www.openssl.org/source/license.html
148
+#
149
+#===================================================================================
150
+# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
151
+#
152
+# GHASH is based on the Karatsuba multiplication method.
153
+#
154
+# Xi xor X1
155
+#
156
+# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
157
+# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
158
+# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
159
+# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
160
+# (X4.h * H.h + X4.l * H.l + X4 * H)
161
+#
162
+# Xi = v0
163
+# H Poly = v2
164
+# Hash keys = v3 - v14
165
+# ( H.l, H, H.h)
166
+# ( H^2.l, H^2, H^2.h)
167
+# ( H^3.l, H^3, H^3.h)
168
+# ( H^4.l, H^4, H^4.h)
169
+#
170
+# v30 is IV
171
+# v31 - counter 1
172
+#
173
+# AES used,
174
+# vs0 - vs14 for round keys
175
+# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
176
+#
177
+# This implementation uses stitched AES-GCM approach to improve overall performance.
178
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
179
+#
180
+# Current large block (16384 bytes) performance per second with 128 bit key --
181
+#
182
+# Encrypt Decrypt
183
+# Power10[le] (3.5GHz) 5.32G 5.26G
184
+#
185
+# ===================================================================================
186
+#
187
+# $output is the last argument if it looks like a file (it has an extension)
188
+# $flavour is the first argument if it doesn't look like a file
189
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
190
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
191
+
192
+if ($flavour =~ /64/) {
193
+ $SIZE_T=8;
194
+ $LRSAVE=2*$SIZE_T;
195
+ $STU="stdu";
196
+ $POP="ld";
197
+ $PUSH="std";
198
+ $UCMP="cmpld";
199
+ $SHRI="srdi";
200
+} elsif ($flavour =~ /32/) {
201
+ $SIZE_T=4;
202
+ $LRSAVE=$SIZE_T;
203
+ $STU="stwu";
204
+ $POP="lwz";
205
+ $PUSH="stw";
206
+ $UCMP="cmplw";
207
+ $SHRI="srwi";
208
+} else { die "nonsense $flavour"; }
209
+
210
+$sp="r1";
211
+$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
212
+
213
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
214
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
215
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
216
+die "can't locate ppc-xlate.pl";
217
+
218
+open STDOUT,"| $^X $xlate $flavour \"$output\""
219
+ or die "can't call $xlate: $!";
220
+
221
+$code=<<___;
222
+.machine "any"
223
+.abiversion 2
224
+.text
225
+
226
+# 4x loops
227
+# v15 - v18 - input states
228
+# vs1 - vs9 - round keys
229
+#
230
+.macro Loop_aes_middle4x
231
+ xxlor 19+32, 1, 1
232
+ xxlor 20+32, 2, 2
233
+ xxlor 21+32, 3, 3
234
+ xxlor 22+32, 4, 4
235
+
236
+ vcipher 15, 15, 19
237
+ vcipher 16, 16, 19
238
+ vcipher 17, 17, 19
239
+ vcipher 18, 18, 19
240
+
241
+ vcipher 15, 15, 20
242
+ vcipher 16, 16, 20
243
+ vcipher 17, 17, 20
244
+ vcipher 18, 18, 20
245
+
246
+ vcipher 15, 15, 21
247
+ vcipher 16, 16, 21
248
+ vcipher 17, 17, 21
249
+ vcipher 18, 18, 21
250
+
251
+ vcipher 15, 15, 22
252
+ vcipher 16, 16, 22
253
+ vcipher 17, 17, 22
254
+ vcipher 18, 18, 22
255
+
256
+ xxlor 19+32, 5, 5
257
+ xxlor 20+32, 6, 6
258
+ xxlor 21+32, 7, 7
259
+ xxlor 22+32, 8, 8
260
+
261
+ vcipher 15, 15, 19
262
+ vcipher 16, 16, 19
263
+ vcipher 17, 17, 19
264
+ vcipher 18, 18, 19
265
+
266
+ vcipher 15, 15, 20
267
+ vcipher 16, 16, 20
268
+ vcipher 17, 17, 20
269
+ vcipher 18, 18, 20
270
+
271
+ vcipher 15, 15, 21
272
+ vcipher 16, 16, 21
273
+ vcipher 17, 17, 21
274
+ vcipher 18, 18, 21
275
+
276
+ vcipher 15, 15, 22
277
+ vcipher 16, 16, 22
278
+ vcipher 17, 17, 22
279
+ vcipher 18, 18, 22
280
+
281
+ xxlor 23+32, 9, 9
282
+ vcipher 15, 15, 23
283
+ vcipher 16, 16, 23
284
+ vcipher 17, 17, 23
285
+ vcipher 18, 18, 23
286
+.endm
287
+
288
+# 8x loops
289
+# v15 - v22 - input states
290
+# vs1 - vs9 - round keys
291
+#
292
+.macro Loop_aes_middle8x
293
+ xxlor 23+32, 1, 1
294
+ xxlor 24+32, 2, 2
295
+ xxlor 25+32, 3, 3
296
+ xxlor 26+32, 4, 4
297
+
298
+ vcipher 15, 15, 23
299
+ vcipher 16, 16, 23
300
+ vcipher 17, 17, 23
301
+ vcipher 18, 18, 23
302
+ vcipher 19, 19, 23
303
+ vcipher 20, 20, 23
304
+ vcipher 21, 21, 23
305
+ vcipher 22, 22, 23
306
+
307
+ vcipher 15, 15, 24
308
+ vcipher 16, 16, 24
309
+ vcipher 17, 17, 24
310
+ vcipher 18, 18, 24
311
+ vcipher 19, 19, 24
312
+ vcipher 20, 20, 24
313
+ vcipher 21, 21, 24
314
+ vcipher 22, 22, 24
315
+
316
+ vcipher 15, 15, 25
317
+ vcipher 16, 16, 25
318
+ vcipher 17, 17, 25
319
+ vcipher 18, 18, 25
320
+ vcipher 19, 19, 25
321
+ vcipher 20, 20, 25
322
+ vcipher 21, 21, 25
323
+ vcipher 22, 22, 25
324
+
325
+ vcipher 15, 15, 26
326
+ vcipher 16, 16, 26
327
+ vcipher 17, 17, 26
328
+ vcipher 18, 18, 26
329
+ vcipher 19, 19, 26
330
+ vcipher 20, 20, 26
331
+ vcipher 21, 21, 26
332
+ vcipher 22, 22, 26
333
+
334
+ xxlor 23+32, 5, 5
335
+ xxlor 24+32, 6, 6
336
+ xxlor 25+32, 7, 7
337
+ xxlor 26+32, 8, 8
338
+
339
+ vcipher 15, 15, 23
340
+ vcipher 16, 16, 23
341
+ vcipher 17, 17, 23
342
+ vcipher 18, 18, 23
343
+ vcipher 19, 19, 23
344
+ vcipher 20, 20, 23
345
+ vcipher 21, 21, 23
346
+ vcipher 22, 22, 23
347
+
348
+ vcipher 15, 15, 24
349
+ vcipher 16, 16, 24
350
+ vcipher 17, 17, 24
351
+ vcipher 18, 18, 24
352
+ vcipher 19, 19, 24
353
+ vcipher 20, 20, 24
354
+ vcipher 21, 21, 24
355
+ vcipher 22, 22, 24
356
+
357
+ vcipher 15, 15, 25
358
+ vcipher 16, 16, 25
359
+ vcipher 17, 17, 25
360
+ vcipher 18, 18, 25
361
+ vcipher 19, 19, 25
362
+ vcipher 20, 20, 25
363
+ vcipher 21, 21, 25
364
+ vcipher 22, 22, 25
365
+
366
+ vcipher 15, 15, 26
367
+ vcipher 16, 16, 26
368
+ vcipher 17, 17, 26
369
+ vcipher 18, 18, 26
370
+ vcipher 19, 19, 26
371
+ vcipher 20, 20, 26
372
+ vcipher 21, 21, 26
373
+ vcipher 22, 22, 26
374
+
375
+ xxlor 23+32, 9, 9
376
+ vcipher 15, 15, 23
377
+ vcipher 16, 16, 23
378
+ vcipher 17, 17, 23
379
+ vcipher 18, 18, 23
380
+ vcipher 19, 19, 23
381
+ vcipher 20, 20, 23
382
+ vcipher 21, 21, 23
383
+ vcipher 22, 22, 23
384
+.endm
385
+
386
+#
387
+# Compute 4x hash values based on Karatsuba method.
388
+#
389
+ppc_aes_gcm_ghash:
390
+ vxor 15, 15, 0
391
+
392
+ xxlxor 29, 29, 29
393
+
394
+ vpmsumd 23, 12, 15 # H4.L * X.L
395
+ vpmsumd 24, 9, 16
396
+ vpmsumd 25, 6, 17
397
+ vpmsumd 26, 3, 18
398
+
399
+ vxor 23, 23, 24
400
+ vxor 23, 23, 25
401
+ vxor 23, 23, 26 # L
402
+
403
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
404
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
405
+ vpmsumd 26, 7, 17
406
+ vpmsumd 27, 4, 18
407
+
408
+ vxor 24, 24, 25
409
+ vxor 24, 24, 26
410
+ vxor 24, 24, 27 # M
411
+
412
+ # sum hash and reduction with H Poly
413
+ vpmsumd 28, 23, 2 # reduction
414
+
415
+ xxlor 29+32, 29, 29
416
+ vsldoi 26, 24, 29, 8 # mL
417
+ vsldoi 29, 29, 24, 8 # mH
418
+ vxor 23, 23, 26 # mL + L
419
+
420
+ vsldoi 23, 23, 23, 8 # swap
421
+ vxor 23, 23, 28
422
+
423
+ vpmsumd 24, 14, 15 # H4.H * X.H
424
+ vpmsumd 25, 11, 16
425
+ vpmsumd 26, 8, 17
426
+ vpmsumd 27, 5, 18
427
+
428
+ vxor 24, 24, 25
429
+ vxor 24, 24, 26
430
+ vxor 24, 24, 27
431
+
432
+ vxor 24, 24, 29
433
+
434
+ # sum hash and reduction with H Poly
435
+ vsldoi 27, 23, 23, 8 # swap
436
+ vpmsumd 23, 23, 2
437
+ vxor 27, 27, 24
438
+ vxor 23, 23, 27
439
+
440
+ xxlor 32, 23+32, 23+32 # update hash
441
+
442
+ blr
443
+
444
+#
445
+# Combine two 4x ghash
446
+# v15 - v22 - input blocks
447
+#
448
+.macro ppc_aes_gcm_ghash2_4x
449
+ # first 4x hash
450
+ vxor 15, 15, 0 # Xi + X
451
+
452
+ xxlxor 29, 29, 29
453
+
454
+ vpmsumd 23, 12, 15 # H4.L * X.L
455
+ vpmsumd 24, 9, 16
456
+ vpmsumd 25, 6, 17
457
+ vpmsumd 26, 3, 18
458
+
459
+ vxor 23, 23, 24
460
+ vxor 23, 23, 25
461
+ vxor 23, 23, 26 # L
462
+
463
+ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
464
+ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
465
+ vpmsumd 26, 7, 17
466
+ vpmsumd 27, 4, 18
467
+
468
+ vxor 24, 24, 25
469
+ vxor 24, 24, 26
470
+
471
+ # sum hash and reduction with H Poly
472
+ vpmsumd 28, 23, 2 # reduction
473
+
474
+ xxlor 29+32, 29, 29
475
+
476
+ vxor 24, 24, 27 # M
477
+ vsldoi 26, 24, 29, 8 # mL
478
+ vsldoi 29, 29, 24, 8 # mH
479
+ vxor 23, 23, 26 # mL + L
480
+
481
+ vsldoi 23, 23, 23, 8 # swap
482
+ vxor 23, 23, 28
483
+
484
+ vpmsumd 24, 14, 15 # H4.H * X.H
485
+ vpmsumd 25, 11, 16
486
+ vpmsumd 26, 8, 17
487
+ vpmsumd 27, 5, 18
488
+
489
+ vxor 24, 24, 25
490
+ vxor 24, 24, 26
491
+ vxor 24, 24, 27 # H
492
+
493
+ vxor 24, 24, 29 # H + mH
494
+
495
+ # sum hash and reduction with H Poly
496
+ vsldoi 27, 23, 23, 8 # swap
497
+ vpmsumd 23, 23, 2
498
+ vxor 27, 27, 24
499
+ vxor 27, 23, 27 # 1st Xi
500
+
501
+ # 2nd 4x hash
502
+ vpmsumd 24, 9, 20
503
+ vpmsumd 25, 6, 21
504
+ vpmsumd 26, 3, 22
505
+ vxor 19, 19, 27 # Xi + X
506
+ vpmsumd 23, 12, 19 # H4.L * X.L
507
+
508
+ vxor 23, 23, 24
509
+ vxor 23, 23, 25
510
+ vxor 23, 23, 26 # L
511
+
512
+ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
513
+ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
514
+ vpmsumd 26, 7, 21
515
+ vpmsumd 27, 4, 22
516
+
517
+ vxor 24, 24, 25
518
+ vxor 24, 24, 26
519
+
520
+ # sum hash and reduction with H Poly
521
+ vpmsumd 28, 23, 2 # reduction
522
+
523
+ xxlor 29+32, 29, 29
524
+
525
+ vxor 24, 24, 27 # M
526
+ vsldoi 26, 24, 29, 8 # mL
527
+ vsldoi 29, 29, 24, 8 # mH
528
+ vxor 23, 23, 26 # mL + L
529
+
530
+ vsldoi 23, 23, 23, 8 # swap
531
+ vxor 23, 23, 28
532
+
533
+ vpmsumd 24, 14, 19 # H4.H * X.H
534
+ vpmsumd 25, 11, 20
535
+ vpmsumd 26, 8, 21
536
+ vpmsumd 27, 5, 22
537
+
538
+ vxor 24, 24, 25
539
+ vxor 24, 24, 26
540
+ vxor 24, 24, 27 # H
541
+
542
+ vxor 24, 24, 29 # H + mH
543
+
544
+ # sum hash and reduction with H Poly
545
+ vsldoi 27, 23, 23, 8 # swap
546
+ vpmsumd 23, 23, 2
547
+ vxor 27, 27, 24
548
+ vxor 23, 23, 27
549
+
550
+ xxlor 32, 23+32, 23+32 # update hash
551
+
552
+.endm
553
+
554
+#
555
+# Compute update single hash
556
+#
557
+.macro ppc_update_hash_1x
558
+ vxor 28, 28, 0
559
+
560
+ vxor 19, 19, 19
561
+
562
+ vpmsumd 22, 3, 28 # L
563
+ vpmsumd 23, 4, 28 # M
564
+ vpmsumd 24, 5, 28 # H
565
+
566
+ vpmsumd 27, 22, 2 # reduction
567
+
568
+ vsldoi 25, 23, 19, 8 # mL
569
+ vsldoi 26, 19, 23, 8 # mH
570
+ vxor 22, 22, 25 # LL + LL
571
+ vxor 24, 24, 26 # HH + HH
572
+
573
+ vsldoi 22, 22, 22, 8 # swap
574
+ vxor 22, 22, 27
575
+
576
+ vsldoi 20, 22, 22, 8 # swap
577
+ vpmsumd 22, 22, 2 # reduction
578
+ vxor 20, 20, 24
579
+ vxor 22, 22, 20
580
+
581
+ vmr 0, 22 # update hash
582
+
583
+.endm
584
+
585
+#
586
+# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
587
+# const AES_KEY *key, unsigned char iv[16],
588
+# void *Xip);
589
+#
590
+# r3 - inp
591
+# r4 - out
592
+# r5 - len
593
+# r6 - AES round keys
594
+# r7 - iv
595
+# r8 - Xi, HPoly, hash keys
596
+#
597
+.global ppc_aes_gcm_encrypt
598
+.align 5
599
+ppc_aes_gcm_encrypt:
600
+_ppc_aes_gcm_encrypt:
601
+
602
+ stdu 1,-512(1)
603
+ mflr 0
604
+
605
+ std 14,112(1)
606
+ std 15,120(1)
607
+ std 16,128(1)
608
+ std 17,136(1)
609
+ std 18,144(1)
610
+ std 19,152(1)
611
+ std 20,160(1)
612
+ std 21,168(1)
613
+ li 9, 256
614
+ stvx 20, 9, 1
615
+ addi 9, 9, 16
616
+ stvx 21, 9, 1
617
+ addi 9, 9, 16
618
+ stvx 22, 9, 1
619
+ addi 9, 9, 16
620
+ stvx 23, 9, 1
621
+ addi 9, 9, 16
622
+ stvx 24, 9, 1
623
+ addi 9, 9, 16
624
+ stvx 25, 9, 1
625
+ addi 9, 9, 16
626
+ stvx 26, 9, 1
627
+ addi 9, 9, 16
628
+ stvx 27, 9, 1
629
+ addi 9, 9, 16
630
+ stvx 28, 9, 1
631
+ addi 9, 9, 16
632
+ stvx 29, 9, 1
633
+ addi 9, 9, 16
634
+ stvx 30, 9, 1
635
+ addi 9, 9, 16
636
+ stvx 31, 9, 1
637
+ std 0, 528(1)
638
+
639
+ # Load Xi
640
+ lxvb16x 32, 0, 8 # load Xi
641
+
642
+ # load Hash - h^4, h^3, h^2, h
643
+ li 10, 32
644
+ lxvd2x 2+32, 10, 8 # H Poly
645
+ li 10, 48
646
+ lxvd2x 3+32, 10, 8 # Hl
647
+ li 10, 64
648
+ lxvd2x 4+32, 10, 8 # H
649
+ li 10, 80
650
+ lxvd2x 5+32, 10, 8 # Hh
651
+
652
+ li 10, 96
653
+ lxvd2x 6+32, 10, 8 # H^2l
654
+ li 10, 112
655
+ lxvd2x 7+32, 10, 8 # H^2
656
+ li 10, 128
657
+ lxvd2x 8+32, 10, 8 # H^2h
658
+
659
+ li 10, 144
660
+ lxvd2x 9+32, 10, 8 # H^3l
661
+ li 10, 160
662
+ lxvd2x 10+32, 10, 8 # H^3
663
+ li 10, 176
664
+ lxvd2x 11+32, 10, 8 # H^3h
665
+
666
+ li 10, 192
667
+ lxvd2x 12+32, 10, 8 # H^4l
668
+ li 10, 208
669
+ lxvd2x 13+32, 10, 8 # H^4
670
+ li 10, 224
671
+ lxvd2x 14+32, 10, 8 # H^4h
672
+
673
+ # initialize ICB: GHASH( IV ), IV - r7
674
+ lxvb16x 30+32, 0, 7 # load IV - v30
675
+
676
+ mr 12, 5 # length
677
+ li 11, 0 # block index
678
+
679
+ # counter 1
680
+ vxor 31, 31, 31
681
+ vspltisb 22, 1
682
+ vsldoi 31, 31, 22,1 # counter 1
683
+
684
+ # load round key to VSR
685
+ lxv 0, 0(6)
686
+ lxv 1, 0x10(6)
687
+ lxv 2, 0x20(6)
688
+ lxv 3, 0x30(6)
689
+ lxv 4, 0x40(6)
690
+ lxv 5, 0x50(6)
691
+ lxv 6, 0x60(6)
692
+ lxv 7, 0x70(6)
693
+ lxv 8, 0x80(6)
694
+ lxv 9, 0x90(6)
695
+ lxv 10, 0xa0(6)
696
+
697
+ # load rounds - 10 (128), 12 (192), 14 (256)
698
+ lwz 9,240(6)
699
+
700
+ #
701
+ # vxor state, state, w # addroundkey
702
+ xxlor 32+29, 0, 0
703
+ vxor 15, 30, 29 # IV + round key - add round key 0
704
+
705
+ cmpdi 9, 10
706
+ beq Loop_aes_gcm_8x
707
+
708
+ # load 2 more round keys (v11, v12)
709
+ lxv 11, 0xb0(6)
710
+ lxv 12, 0xc0(6)
711
+
712
+ cmpdi 9, 12
713
+ beq Loop_aes_gcm_8x
714
+
715
+ # load 2 more round keys (v13, v14)
716
+ lxv 13, 0xd0(6)
717
+ lxv 14, 0xe0(6)
718
+ cmpdi 9, 14
719
+ beq Loop_aes_gcm_8x
720
+
721
+ b aes_gcm_out
722
+
723
+.align 5
724
+Loop_aes_gcm_8x:
725
+ mr 14, 3
726
+ mr 9, 4
727
+
728
+ # n blocks
729
+ li 10, 128
730
+ divdu 10, 5, 10 # n 128 bytes-blocks
731
+ cmpdi 10, 0
732
+ beq Loop_last_block
733
+
734
+ vaddudm 30, 30, 31 # IV + counter
735
+ vxor 16, 30, 29
736
+ vaddudm 30, 30, 31
737
+ vxor 17, 30, 29
738
+ vaddudm 30, 30, 31
739
+ vxor 18, 30, 29
740
+ vaddudm 30, 30, 31
741
+ vxor 19, 30, 29
742
+ vaddudm 30, 30, 31
743
+ vxor 20, 30, 29
744
+ vaddudm 30, 30, 31
745
+ vxor 21, 30, 29
746
+ vaddudm 30, 30, 31
747
+ vxor 22, 30, 29
748
+
749
+ mtctr 10
750
+
751
+ li 15, 16
752
+ li 16, 32
753
+ li 17, 48
754
+ li 18, 64
755
+ li 19, 80
756
+ li 20, 96
757
+ li 21, 112
758
+
759
+ lwz 10, 240(6)
760
+
761
+Loop_8x_block:
762
+
763
+ lxvb16x 15, 0, 14 # load block
764
+ lxvb16x 16, 15, 14 # load block
765
+ lxvb16x 17, 16, 14 # load block
766
+ lxvb16x 18, 17, 14 # load block
767
+ lxvb16x 19, 18, 14 # load block
768
+ lxvb16x 20, 19, 14 # load block
769
+ lxvb16x 21, 20, 14 # load block
770
+ lxvb16x 22, 21, 14 # load block
771
+ addi 14, 14, 128
772
+
773
+ Loop_aes_middle8x
774
+
775
+ xxlor 23+32, 10, 10
776
+
777
+ cmpdi 10, 10
778
+ beq Do_next_ghash
779
+
780
+ # 192 bits
781
+ xxlor 24+32, 11, 11
782
+
783
+ vcipher 15, 15, 23
784
+ vcipher 16, 16, 23
785
+ vcipher 17, 17, 23
786
+ vcipher 18, 18, 23
787
+ vcipher 19, 19, 23
788
+ vcipher 20, 20, 23
789
+ vcipher 21, 21, 23
790
+ vcipher 22, 22, 23
791
+
792
+ vcipher 15, 15, 24
793
+ vcipher 16, 16, 24
794
+ vcipher 17, 17, 24
795
+ vcipher 18, 18, 24
796
+ vcipher 19, 19, 24
797
+ vcipher 20, 20, 24
798
+ vcipher 21, 21, 24
799
+ vcipher 22, 22, 24
800
+
801
+ xxlor 23+32, 12, 12
802
+
803
+ cmpdi 10, 12
804
+ beq Do_next_ghash
805
+
806
+ # 256 bits
807
+ xxlor 24+32, 13, 13
808
+
809
+ vcipher 15, 15, 23
810
+ vcipher 16, 16, 23
811
+ vcipher 17, 17, 23
812
+ vcipher 18, 18, 23
813
+ vcipher 19, 19, 23
814
+ vcipher 20, 20, 23
815
+ vcipher 21, 21, 23
816
+ vcipher 22, 22, 23
817
+
818
+ vcipher 15, 15, 24
819
+ vcipher 16, 16, 24
820
+ vcipher 17, 17, 24
821
+ vcipher 18, 18, 24
822
+ vcipher 19, 19, 24
823
+ vcipher 20, 20, 24
824
+ vcipher 21, 21, 24
825
+ vcipher 22, 22, 24
826
+
827
+ xxlor 23+32, 14, 14
828
+
829
+ cmpdi 10, 14
830
+ beq Do_next_ghash
831
+ b aes_gcm_out
832
+
833
+Do_next_ghash:
834
+
835
+ #
836
+ # last round
837
+ vcipherlast 15, 15, 23
838
+ vcipherlast 16, 16, 23
839
+
840
+ xxlxor 47, 47, 15
841
+ stxvb16x 47, 0, 9 # store output
842
+ xxlxor 48, 48, 16
843
+ stxvb16x 48, 15, 9 # store output
844
+
845
+ vcipherlast 17, 17, 23
846
+ vcipherlast 18, 18, 23
847
+
848
+ xxlxor 49, 49, 17
849
+ stxvb16x 49, 16, 9 # store output
850
+ xxlxor 50, 50, 18
851
+ stxvb16x 50, 17, 9 # store output
852
+
853
+ vcipherlast 19, 19, 23
854
+ vcipherlast 20, 20, 23
855
+
856
+ xxlxor 51, 51, 19
857
+ stxvb16x 51, 18, 9 # store output
858
+ xxlxor 52, 52, 20
859
+ stxvb16x 52, 19, 9 # store output
860
+
861
+ vcipherlast 21, 21, 23
862
+ vcipherlast 22, 22, 23
863
+
864
+ xxlxor 53, 53, 21
865
+ stxvb16x 53, 20, 9 # store output
866
+ xxlxor 54, 54, 22
867
+ stxvb16x 54, 21, 9 # store output
868
+
869
+ addi 9, 9, 128
870
+
871
+ # ghash here
872
+ ppc_aes_gcm_ghash2_4x
873
+
874
+ xxlor 27+32, 0, 0
875
+ vaddudm 30, 30, 31 # IV + counter
876
+ vmr 29, 30
877
+ vxor 15, 30, 27 # add round key
878
+ vaddudm 30, 30, 31
879
+ vxor 16, 30, 27
880
+ vaddudm 30, 30, 31
881
+ vxor 17, 30, 27
882
+ vaddudm 30, 30, 31
883
+ vxor 18, 30, 27
884
+ vaddudm 30, 30, 31
885
+ vxor 19, 30, 27
886
+ vaddudm 30, 30, 31
887
+ vxor 20, 30, 27
888
+ vaddudm 30, 30, 31
889
+ vxor 21, 30, 27
890
+ vaddudm 30, 30, 31
891
+ vxor 22, 30, 27
892
+
893
+ addi 12, 12, -128
894
+ addi 11, 11, 128
895
+
896
+ bdnz Loop_8x_block
897
+
898
+ vmr 30, 29
899
+
900
+Loop_last_block:
901
+ cmpdi 12, 0
902
+ beq aes_gcm_out
903
+
904
+ # loop last few blocks
905
+ li 10, 16
906
+ divdu 10, 12, 10
907
+
908
+ mtctr 10
909
+
910
+ lwz 10, 240(6)
911
+
912
+ cmpdi 12, 16
913
+ blt Final_block
914
+
915
+.macro Loop_aes_middle_1x
916
+ xxlor 19+32, 1, 1
917
+ xxlor 20+32, 2, 2
918
+ xxlor 21+32, 3, 3
919
+ xxlor 22+32, 4, 4
920
+
921
+ vcipher 15, 15, 19
922
+ vcipher 15, 15, 20
923
+ vcipher 15, 15, 21
924
+ vcipher 15, 15, 22
925
+
926
+ xxlor 19+32, 5, 5
927
+ xxlor 20+32, 6, 6
928
+ xxlor 21+32, 7, 7
929
+ xxlor 22+32, 8, 8
930
+
931
+ vcipher 15, 15, 19
932
+ vcipher 15, 15, 20
933
+ vcipher 15, 15, 21
934
+ vcipher 15, 15, 22
935
+
936
+ xxlor 19+32, 9, 9
937
+ vcipher 15, 15, 19
938
+.endm
939
+
940
+Next_rem_block:
941
+ lxvb16x 15, 0, 14 # load block
942
+
943
+ Loop_aes_middle_1x
944
+
945
+ xxlor 23+32, 10, 10
946
+
947
+ cmpdi 10, 10
948
+ beq Do_next_1x
949
+
950
+ # 192 bits
951
+ xxlor 24+32, 11, 11
952
+
953
+ vcipher 15, 15, 23
954
+ vcipher 15, 15, 24
955
+
956
+ xxlor 23+32, 12, 12
957
+
958
+ cmpdi 10, 12
959
+ beq Do_next_1x
960
+
961
+ # 256 bits
962
+ xxlor 24+32, 13, 13
963
+
964
+ vcipher 15, 15, 23
965
+ vcipher 15, 15, 24
966
+
967
+ xxlor 23+32, 14, 14
968
+
969
+ cmpdi 10, 14
970
+ beq Do_next_1x
971
+
972
+Do_next_1x:
973
+ vcipherlast 15, 15, 23
974
+
975
+ xxlxor 47, 47, 15
976
+ stxvb16x 47, 0, 9 # store output
977
+ addi 14, 14, 16
978
+ addi 9, 9, 16
979
+
980
+ vmr 28, 15
981
+ ppc_update_hash_1x
982
+
983
+ addi 12, 12, -16
984
+ addi 11, 11, 16
985
+ xxlor 19+32, 0, 0
986
+ vaddudm 30, 30, 31 # IV + counter
987
+ vxor 15, 30, 19 # add round key
988
+
989
+ bdnz Next_rem_block
990
+
991
+ cmpdi 12, 0
992
+ beq aes_gcm_out
993
+
994
+Final_block:
995
+ Loop_aes_middle_1x
996
+
997
+ xxlor 23+32, 10, 10
998
+
999
+ cmpdi 10, 10
1000
+ beq Do_final_1x
1001
+
1002
+ # 192 bits
1003
+ xxlor 24+32, 11, 11
1004
+
1005
+ vcipher 15, 15, 23
1006
+ vcipher 15, 15, 24
1007
+
1008
+ xxlor 23+32, 12, 12
1009
+
1010
+ cmpdi 10, 12
1011
+ beq Do_final_1x
1012
+
1013
+ # 256 bits
1014
+ xxlor 24+32, 13, 13
1015
+
1016
+ vcipher 15, 15, 23
1017
+ vcipher 15, 15, 24
1018
+
1019
+ xxlor 23+32, 14, 14
1020
+
1021
+ cmpdi 10, 14
1022
+ beq Do_final_1x
1023
+
1024
+Do_final_1x:
1025
+ vcipherlast 15, 15, 23
1026
+
1027
+ lxvb16x 15, 0, 14 # load last block
1028
+ xxlxor 47, 47, 15
1029
+
1030
+ # create partial block mask
1031
+ li 15, 16
1032
+ sub 15, 15, 12 # index to the mask
1033
+
1034
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
1035
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
1036
+ li 10, 192
1037
+ stvx 16, 10, 1
1038
+ addi 10, 10, 16
1039
+ stvx 17, 10, 1
1040
+
1041
+ addi 10, 1, 192
1042
+ lxvb16x 16, 15, 10 # load partial block mask
1043
+ xxland 47, 47, 16
1044
+
1045
+ vmr 28, 15
1046
+ ppc_update_hash_1x
1047
+
1048
+ # * should store only the remaining bytes.
1049
+ bl Write_partial_block
1050
+
1051
+ b aes_gcm_out
1052
+
1053
+#
1054
+# Write partial block
1055
+# r9 - output
1056
+# r12 - remaining bytes
1057
+# v15 - partial input data
1058
+#
1059
+Write_partial_block:
1060
+ li 10, 192
1061
+ stxvb16x 15+32, 10, 1 # last block
1062
+
1063
+ #add 10, 9, 11 # Output
1064
+ addi 10, 9, -1
1065
+ addi 16, 1, 191
1066
+
1067
+ mtctr 12 # remaining bytes
1068
+ li 15, 0
1069
+
1070
+Write_last_byte:
1071
+ lbzu 14, 1(16)
1072
+ stbu 14, 1(10)
1073
+ bdnz Write_last_byte
1074
+ blr
1075
+
1076
+aes_gcm_out:
1077
+ # out = state
1078
+ stxvb16x 32, 0, 8 # write out Xi
1079
+ add 3, 11, 12 # return count
1080
+
1081
+ li 9, 256
1082
+ lvx 20, 9, 1
1083
+ addi 9, 9, 16
1084
+ lvx 21, 9, 1
1085
+ addi 9, 9, 16
1086
+ lvx 22, 9, 1
1087
+ addi 9, 9, 16
1088
+ lvx 23, 9, 1
1089
+ addi 9, 9, 16
1090
+ lvx 24, 9, 1
1091
+ addi 9, 9, 16
1092
+ lvx 25, 9, 1
1093
+ addi 9, 9, 16
1094
+ lvx 26, 9, 1
1095
+ addi 9, 9, 16
1096
+ lvx 27, 9, 1
1097
+ addi 9, 9, 16
1098
+ lvx 28, 9, 1
1099
+ addi 9, 9, 16
1100
+ lvx 29, 9, 1
1101
+ addi 9, 9, 16
1102
+ lvx 30, 9, 1
1103
+ addi 9, 9, 16
1104
+ lvx 31, 9, 1
1105
+
1106
+ ld 0, 528(1)
1107
+ ld 14,112(1)
1108
+ ld 15,120(1)
1109
+ ld 16,128(1)
1110
+ ld 17,136(1)
1111
+ ld 18,144(1)
1112
+ ld 19,152(1)
1113
+ ld 20,160(1)
1114
+ ld 21,168(1)
1115
+
1116
+ mtlr 0
1117
+ addi 1, 1, 512
1118
+ blr
1119
+
1120
+#
1121
+# 8x Decrypt
1122
+#
1123
+.global ppc_aes_gcm_decrypt
1124
+.align 5
1125
+ppc_aes_gcm_decrypt:
1126
+_ppc_aes_gcm_decrypt:
1127
+
1128
+ stdu 1,-512(1)
1129
+ mflr 0
1130
+
1131
+ std 14,112(1)
1132
+ std 15,120(1)
1133
+ std 16,128(1)
1134
+ std 17,136(1)
1135
+ std 18,144(1)
1136
+ std 19,152(1)
1137
+ std 20,160(1)
1138
+ std 21,168(1)
1139
+ li 9, 256
1140
+ stvx 20, 9, 1
1141
+ addi 9, 9, 16
1142
+ stvx 21, 9, 1
1143
+ addi 9, 9, 16
1144
+ stvx 22, 9, 1
1145
+ addi 9, 9, 16
1146
+ stvx 23, 9, 1
1147
+ addi 9, 9, 16
1148
+ stvx 24, 9, 1
1149
+ addi 9, 9, 16
1150
+ stvx 25, 9, 1
1151
+ addi 9, 9, 16
1152
+ stvx 26, 9, 1
1153
+ addi 9, 9, 16
1154
+ stvx 27, 9, 1
1155
+ addi 9, 9, 16
1156
+ stvx 28, 9, 1
1157
+ addi 9, 9, 16
1158
+ stvx 29, 9, 1
1159
+ addi 9, 9, 16
1160
+ stvx 30, 9, 1
1161
+ addi 9, 9, 16
1162
+ stvx 31, 9, 1
1163
+ std 0, 528(1)
1164
+
1165
+ # Load Xi
1166
+ lxvb16x 32, 0, 8 # load Xi
1167
+
1168
+ # load Hash - h^4, h^3, h^2, h
1169
+ li 10, 32
1170
+ lxvd2x 2+32, 10, 8 # H Poly
1171
+ li 10, 48
1172
+ lxvd2x 3+32, 10, 8 # Hl
1173
+ li 10, 64
1174
+ lxvd2x 4+32, 10, 8 # H
1175
+ li 10, 80
1176
+ lxvd2x 5+32, 10, 8 # Hh
1177
+
1178
+ li 10, 96
1179
+ lxvd2x 6+32, 10, 8 # H^2l
1180
+ li 10, 112
1181
+ lxvd2x 7+32, 10, 8 # H^2
1182
+ li 10, 128
1183
+ lxvd2x 8+32, 10, 8 # H^2h
1184
+
1185
+ li 10, 144
1186
+ lxvd2x 9+32, 10, 8 # H^3l
1187
+ li 10, 160
1188
+ lxvd2x 10+32, 10, 8 # H^3
1189
+ li 10, 176
1190
+ lxvd2x 11+32, 10, 8 # H^3h
1191
+
1192
+ li 10, 192
1193
+ lxvd2x 12+32, 10, 8 # H^4l
1194
+ li 10, 208
1195
+ lxvd2x 13+32, 10, 8 # H^4
1196
+ li 10, 224
1197
+ lxvd2x 14+32, 10, 8 # H^4h
1198
+
1199
+ # initialize ICB: GHASH( IV ), IV - r7
1200
+ lxvb16x 30+32, 0, 7 # load IV - v30
1201
+
1202
+ mr 12, 5 # length
1203
+ li 11, 0 # block index
1204
+
1205
+ # counter 1
1206
+ vxor 31, 31, 31
1207
+ vspltisb 22, 1
1208
+ vsldoi 31, 31, 22,1 # counter 1
1209
+
1210
+ # load round key to VSR
1211
+ lxv 0, 0(6)
1212
+ lxv 1, 0x10(6)
1213
+ lxv 2, 0x20(6)
1214
+ lxv 3, 0x30(6)
1215
+ lxv 4, 0x40(6)
1216
+ lxv 5, 0x50(6)
1217
+ lxv 6, 0x60(6)
1218
+ lxv 7, 0x70(6)
1219
+ lxv 8, 0x80(6)
1220
+ lxv 9, 0x90(6)
1221
+ lxv 10, 0xa0(6)
1222
+
1223
+ # load rounds - 10 (128), 12 (192), 14 (256)
1224
+ lwz 9,240(6)
1225
+
1226
+ #
1227
+ # vxor state, state, w # addroundkey
1228
+ xxlor 32+29, 0, 0
1229
+ vxor 15, 30, 29 # IV + round key - add round key 0
1230
+
1231
+ cmpdi 9, 10
1232
+ beq Loop_aes_gcm_8x_dec
1233
+
1234
+ # load 2 more round keys (v11, v12)
1235
+ lxv 11, 0xb0(6)
1236
+ lxv 12, 0xc0(6)
1237
+
1238
+ cmpdi 9, 12
1239
+ beq Loop_aes_gcm_8x_dec
1240
+
1241
+ # load 2 more round keys (v13, v14)
1242
+ lxv 13, 0xd0(6)
1243
+ lxv 14, 0xe0(6)
1244
+ cmpdi 9, 14
1245
+ beq Loop_aes_gcm_8x_dec
1246
+
1247
+ b aes_gcm_out
1248
+
1249
+.align 5
1250
+Loop_aes_gcm_8x_dec:
1251
+ mr 14, 3
1252
+ mr 9, 4
1253
+
1254
+ # n blocks
1255
+ li 10, 128
1256
+ divdu 10, 5, 10 # n 128 bytes-blocks
1257
+ cmpdi 10, 0
1258
+ beq Loop_last_block_dec
1259
+
1260
+ vaddudm 30, 30, 31 # IV + counter
1261
+ vxor 16, 30, 29
1262
+ vaddudm 30, 30, 31
1263
+ vxor 17, 30, 29
1264
+ vaddudm 30, 30, 31
1265
+ vxor 18, 30, 29
1266
+ vaddudm 30, 30, 31
1267
+ vxor 19, 30, 29
1268
+ vaddudm 30, 30, 31
1269
+ vxor 20, 30, 29
1270
+ vaddudm 30, 30, 31
1271
+ vxor 21, 30, 29
1272
+ vaddudm 30, 30, 31
1273
+ vxor 22, 30, 29
1274
+
1275
+ mtctr 10
1276
+
1277
+ li 15, 16
1278
+ li 16, 32
1279
+ li 17, 48
1280
+ li 18, 64
1281
+ li 19, 80
1282
+ li 20, 96
1283
+ li 21, 112
1284
+
1285
+ lwz 10, 240(6)
1286
+
1287
+Loop_8x_block_dec:
1288
+
1289
+ lxvb16x 15, 0, 14 # load block
1290
+ lxvb16x 16, 15, 14 # load block
1291
+ lxvb16x 17, 16, 14 # load block
1292
+ lxvb16x 18, 17, 14 # load block
1293
+ lxvb16x 19, 18, 14 # load block
1294
+ lxvb16x 20, 19, 14 # load block
1295
+ lxvb16x 21, 20, 14 # load block
1296
+ lxvb16x 22, 21, 14 # load block
1297
+ addi 14, 14, 128
1298
+
1299
+ Loop_aes_middle8x
1300
+
1301
+ xxlor 23+32, 10, 10
1302
+
1303
+ cmpdi 10, 10
1304
+ beq Do_last_aes_dec
1305
+
1306
+ # 192 bits
1307
+ xxlor 24+32, 11, 11
1308
+
1309
+ vcipher 15, 15, 23
1310
+ vcipher 16, 16, 23
1311
+ vcipher 17, 17, 23
1312
+ vcipher 18, 18, 23
1313
+ vcipher 19, 19, 23
1314
+ vcipher 20, 20, 23
1315
+ vcipher 21, 21, 23
1316
+ vcipher 22, 22, 23
1317
+
1318
+ vcipher 15, 15, 24
1319
+ vcipher 16, 16, 24
1320
+ vcipher 17, 17, 24
1321
+ vcipher 18, 18, 24
1322
+ vcipher 19, 19, 24
1323
+ vcipher 20, 20, 24
1324
+ vcipher 21, 21, 24
1325
+ vcipher 22, 22, 24
1326
+
1327
+ xxlor 23+32, 12, 12
1328
+
1329
+ cmpdi 10, 12
1330
+ beq Do_last_aes_dec
1331
+
1332
+ # 256 bits
1333
+ xxlor 24+32, 13, 13
1334
+
1335
+ vcipher 15, 15, 23
1336
+ vcipher 16, 16, 23
1337
+ vcipher 17, 17, 23
1338
+ vcipher 18, 18, 23
1339
+ vcipher 19, 19, 23
1340
+ vcipher 20, 20, 23
1341
+ vcipher 21, 21, 23
1342
+ vcipher 22, 22, 23
1343
+
1344
+ vcipher 15, 15, 24
1345
+ vcipher 16, 16, 24
1346
+ vcipher 17, 17, 24
1347
+ vcipher 18, 18, 24
1348
+ vcipher 19, 19, 24
1349
+ vcipher 20, 20, 24
1350
+ vcipher 21, 21, 24
1351
+ vcipher 22, 22, 24
1352
+
1353
+ xxlor 23+32, 14, 14
1354
+
1355
+ cmpdi 10, 14
1356
+ beq Do_last_aes_dec
1357
+ b aes_gcm_out
1358
+
1359
+Do_last_aes_dec:
1360
+
1361
+ #
1362
+ # last round
1363
+ vcipherlast 15, 15, 23
1364
+ vcipherlast 16, 16, 23
1365
+
1366
+ xxlxor 47, 47, 15
1367
+ stxvb16x 47, 0, 9 # store output
1368
+ xxlxor 48, 48, 16
1369
+ stxvb16x 48, 15, 9 # store output
1370
+
1371
+ vcipherlast 17, 17, 23
1372
+ vcipherlast 18, 18, 23
1373
+
1374
+ xxlxor 49, 49, 17
1375
+ stxvb16x 49, 16, 9 # store output
1376
+ xxlxor 50, 50, 18
1377
+ stxvb16x 50, 17, 9 # store output
1378
+
1379
+ vcipherlast 19, 19, 23
1380
+ vcipherlast 20, 20, 23
1381
+
1382
+ xxlxor 51, 51, 19
1383
+ stxvb16x 51, 18, 9 # store output
1384
+ xxlxor 52, 52, 20
1385
+ stxvb16x 52, 19, 9 # store output
1386
+
1387
+ vcipherlast 21, 21, 23
1388
+ vcipherlast 22, 22, 23
1389
+
1390
+ xxlxor 53, 53, 21
1391
+ stxvb16x 53, 20, 9 # store output
1392
+ xxlxor 54, 54, 22
1393
+ stxvb16x 54, 21, 9 # store output
1394
+
1395
+ addi 9, 9, 128
1396
+
1397
+ xxlor 15+32, 15, 15
1398
+ xxlor 16+32, 16, 16
1399
+ xxlor 17+32, 17, 17
1400
+ xxlor 18+32, 18, 18
1401
+ xxlor 19+32, 19, 19
1402
+ xxlor 20+32, 20, 20
1403
+ xxlor 21+32, 21, 21
1404
+ xxlor 22+32, 22, 22
1405
+
1406
+ # ghash here
1407
+ ppc_aes_gcm_ghash2_4x
1408
+
1409
+ xxlor 27+32, 0, 0
1410
+ vaddudm 30, 30, 31 # IV + counter
1411
+ vmr 29, 30
1412
+ vxor 15, 30, 27 # add round key
1413
+ vaddudm 30, 30, 31
1414
+ vxor 16, 30, 27
1415
+ vaddudm 30, 30, 31
1416
+ vxor 17, 30, 27
1417
+ vaddudm 30, 30, 31
1418
+ vxor 18, 30, 27
1419
+ vaddudm 30, 30, 31
1420
+ vxor 19, 30, 27
1421
+ vaddudm 30, 30, 31
1422
+ vxor 20, 30, 27
1423
+ vaddudm 30, 30, 31
1424
+ vxor 21, 30, 27
1425
+ vaddudm 30, 30, 31
1426
+ vxor 22, 30, 27
1427
+ addi 12, 12, -128
1428
+ addi 11, 11, 128
1429
+
1430
+ bdnz Loop_8x_block_dec
1431
+
1432
+ vmr 30, 29
1433
+
1434
+Loop_last_block_dec:
1435
+ cmpdi 12, 0
1436
+ beq aes_gcm_out
1437
+
1438
+ # loop last few blocks
1439
+ li 10, 16
1440
+ divdu 10, 12, 10
1441
+
1442
+ mtctr 10
1443
+
1444
+ lwz 10,240(6)
1445
+
1446
+ cmpdi 12, 16
1447
+ blt Final_block_dec
1448
+
1449
+Next_rem_block_dec:
1450
+ lxvb16x 15, 0, 14 # load block
1451
+
1452
+ Loop_aes_middle_1x
1453
+
1454
+ xxlor 23+32, 10, 10
1455
+
1456
+ cmpdi 10, 10
1457
+ beq Do_next_1x_dec
1458
+
1459
+ # 192 bits
1460
+ xxlor 24+32, 11, 11
1461
+
1462
+ vcipher 15, 15, 23
1463
+ vcipher 15, 15, 24
1464
+
1465
+ xxlor 23+32, 12, 12
1466
+
1467
+ cmpdi 10, 12
1468
+ beq Do_next_1x_dec
1469
+
1470
+ # 256 bits
1471
+ xxlor 24+32, 13, 13
1472
+
1473
+ vcipher 15, 15, 23
1474
+ vcipher 15, 15, 24
1475
+
1476
+ xxlor 23+32, 14, 14
1477
+
1478
+ cmpdi 10, 14
1479
+ beq Do_next_1x_dec
1480
+
1481
+Do_next_1x_dec:
1482
+ vcipherlast 15, 15, 23
1483
+
1484
+ xxlxor 47, 47, 15
1485
+ stxvb16x 47, 0, 9 # store output
1486
+ addi 14, 14, 16
1487
+ addi 9, 9, 16
1488
+
1489
+ xxlor 28+32, 15, 15
1490
+ ppc_update_hash_1x
1491
+
1492
+ addi 12, 12, -16
1493
+ addi 11, 11, 16
1494
+ xxlor 19+32, 0, 0
1495
+ vaddudm 30, 30, 31 # IV + counter
1496
+ vxor 15, 30, 19 # add round key
1497
+
1498
+ bdnz Next_rem_block_dec
1499
+
1500
+ cmpdi 12, 0
1501
+ beq aes_gcm_out
1502
+
1503
+Final_block_dec:
1504
+ Loop_aes_middle_1x
1505
+
1506
+ xxlor 23+32, 10, 10
1507
+
1508
+ cmpdi 10, 10
1509
+ beq Do_final_1x_dec
1510
+
1511
+ # 192 bits
1512
+ xxlor 24+32, 11, 11
1513
+
1514
+ vcipher 15, 15, 23
1515
+ vcipher 15, 15, 24
1516
+
1517
+ xxlor 23+32, 12, 12
1518
+
1519
+ cmpdi 10, 12
1520
+ beq Do_final_1x_dec
1521
+
1522
+ # 256 bits
1523
+ xxlor 24+32, 13, 13
1524
+
1525
+ vcipher 15, 15, 23
1526
+ vcipher 15, 15, 24
1527
+
1528
+ xxlor 23+32, 14, 14
1529
+
1530
+ cmpdi 10, 14
1531
+ beq Do_final_1x_dec
1532
+
1533
+Do_final_1x_dec:
1534
+ vcipherlast 15, 15, 23
1535
+
1536
+ lxvb16x 15, 0, 14 # load block
1537
+ xxlxor 47, 47, 15
1538
+
1539
+ # create partial block mask
1540
+ li 15, 16
1541
+ sub 15, 15, 12 # index to the mask
1542
+
1543
+ vspltisb 16, -1 # first 16 bytes - 0xffff...ff
1544
+ vspltisb 17, 0 # second 16 bytes - 0x0000...00
1545
+ li 10, 192
1546
+ stvx 16, 10, 1
1547
+ addi 10, 10, 16
1548
+ stvx 17, 10, 1
1549
+
1550
+ addi 10, 1, 192
1551
+ lxvb16x 16, 15, 10 # load block mask
1552
+ xxland 47, 47, 16
1553
+
1554
+ xxlor 28+32, 15, 15
1555
+ ppc_update_hash_1x
1556
+
1557
+ # * should store only the remaining bytes.
1558
+ bl Write_partial_block
1559
+
1560
+ b aes_gcm_out
1561
+
1562
+
1563
+___
1564
+
1565
+foreach (split("\n",$code)) {
1566
+ s/\`([^\`]*)\`/eval $1/geo;
1567
+
1568
+ if ($flavour =~ /le$/o) { # little-endian
1569
+ s/le\?//o or
1570
+ s/be\?/#be#/o;
1571
+ } else {
1572
+ s/le\?/#le#/o or
1573
+ s/be\?//o;
1574
+ }
1575
+ print $_,"\n";
1576
+}
1577
+
1578
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
1579
--- a/crypto/modes/build.info
1580
+++ b/crypto/modes/build.info
1581
1582
GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME)
1583
GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME)
1584
GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME)
1585
+GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl $(PERLASM_SCHEME)
1586
GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME)
1587
INCLUDE[ghash-armv4.o]=..
1588
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME)
1589