File openssl-1_1-chacha20-performance-optimizations-for-ppc64le-with-.patch of Package openssl-1_1
From f596bbe4da779b56eea34d96168b557d78e1149a Mon Sep 17 00:00:00 2001
From: Deepankar Bhattacharjee <deepankar.b@in.ibm.com>
Date: Mon, 20 Sep 2021 10:45:15 -0400
Subject: [PATCH] chacha20 performance optimizations for ppc64le with 8x lanes,
 Performance increase around 50%.

Co-authored-by: Madhusudhanan Duraisamy <madurais@in.ibm.com>

Co-authored-by: Nilamjyoti Goswami <nilamgoswami@in.ibm.com>

Co-authored-by: Siva Sundar Anbareeswaran <srisivasundar@in.ibm.com>

Reviewed-by: Danny Tsen <dtsen@us.ibm.com>
Tested-by: Danny Tsen <dtsen@us.ibm.com>
Signed-off-by: Danny <dtsen@us.ibm.com>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/16637)
---
 Configurations/00-base-templates.conf |    2
 crypto/chacha/asm/chachap10-ppc.pl    | 1354 ++++++++++++++++++++++++++++++++++
 crypto/chacha/build.info              |    1
 crypto/perlasm/ppc-xlate.pl           |   17
 crypto/ppc_arch.h                     |    1
 crypto/ppccap.c                       |   24
 crypto/ppccpuid.pl                    |   11
 7 files changed, 1404 insertions(+), 6 deletions(-)
 create mode 100755 crypto/chacha/asm/chachap10-ppc.pl

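The new code is reached through the existing ChaCha20_ctr32() entry point, so
callers see the speedup transparently. A minimal C sketch of exercising
ChaCha20 through the public EVP interface (assuming OpenSSL 1.1.1 headers;
the helper name is illustrative and error handling is trimmed):

#include <stddef.h>
#include <openssl/evp.h>

/* Illustrative helper, not part of the patch: XOR ChaCha20 keystream over
 * "in" into "out".  iv[0..3] is the little-endian block counter and
 * iv[4..15] the nonce, as EVP_chacha20() expects. */
static int chacha20_xor(unsigned char *out, const unsigned char *in,
                        size_t len, const unsigned char key[32],
                        const unsigned char iv[16])
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, ok = 0;

    if (ctx != NULL) {
        ok = EVP_EncryptInit_ex(ctx, EVP_chacha20(), NULL, key, iv)
             && EVP_EncryptUpdate(ctx, out, &outl, in, (int)len);
        EVP_CIPHER_CTX_free(ctx);
    }
    return ok;
}

The quoted ~50% gain can be compared with, e.g., "openssl speed -evp chacha20"
on a POWER10 machine before and after the patch.
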
--- a/Configurations/00-base-templates.conf
+++ b/Configurations/00-base-templates.conf

aes_asm_src => "aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s",
sha1_asm_src => "sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s sha512p8-ppc.s",
modes_asm_src => "ghashp8-ppc.s aes-gcm-ppc.s",
- chacha_asm_src => "chacha-ppc.s",
+ chacha_asm_src => "chacha-ppc.s chachap10-ppc.s",
poly1305_asm_src=> "poly1305-ppc.s poly1305-ppcfp.s",
},
ppc64_asm => {
--- /dev/null
+++ b/crypto/chacha/asm/chachap10-ppc.pl

+#! /usr/bin/env perl
+# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# October 2015
+#
+# ChaCha20 for PowerPC/AltiVec.
+#
+# June 2018
+#
+# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
+# processors that can't issue more than one vector instruction per
+# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
+# interleave would perform better. Incidentally PowerISA 2.07 (first
+# implemented by POWER8) defined new usable instructions, hence 4xVSX
+# code path...
+#
+# Performance in cycles per byte out of large buffer.
+#
+#                      IALU/gcc-4.x    3xAltiVec+1xIALU    4xVSX
+#
+# Freescale e300       13.6/+115%      -                   -
+# PPC74x0/G4e          6.81/+310%      3.81                -
+# PPC970/G5            9.29/+160%      ?                   -
+# POWER7               8.62/+61%       3.35                -
+# POWER8               8.70/+51%       2.91                2.09
+# POWER9               8.80/+29%       4.44(*)             2.45(**)
+#
+# (*)  this is trade-off result, it's possible to improve it, but
+#      then it would negatively affect all others;
+# (**) POWER9 seems to be "allergic" to mixing vector and integer
+#      instructions, which is why switch to vector-only code pays
+#      off that much;
+
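For reference, the quarter round that every code path above (integer,
3xAltiVec+1xIALU, 4xVSX, and the new 8x path below) implements, written out in
plain C per RFC 8439; the rotation counts 16, 12, 8 and 7 are the constants the
assembly synthesizes with vspltisw. This is a reference sketch, not part of the
patch:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha20 quarter round over four 32-bit state words. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}
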
+# $output is the last argument if it looks like a file (it has an extension)
92
+# $flavour is the first argument if it doesn't look like a file
93
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
94
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
95
+
96
+if ($flavour =~ /64/) {
97
+ $SIZE_T =8;
98
+ $LRSAVE =2*$SIZE_T;
99
+ $STU ="stdu";
100
+ $POP ="ld";
101
+ $PUSH ="std";
102
+ $UCMP ="cmpld";
103
+} elsif ($flavour =~ /32/) {
104
+ $SIZE_T =4;
105
+ $LRSAVE =$SIZE_T;
106
+ $STU ="stwu";
107
+ $POP ="lwz";
108
+ $PUSH ="stw";
109
+ $UCMP ="cmplw";
110
+} else { die "nonsense $flavour"; }
111
+
112
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;
113
+
114
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
115
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
116
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
117
+die "can't locate ppc-xlate.pl";
118
+
119
+open STDOUT,"| $^X $xlate $flavour \"$output\""
120
+ or die "can't call $xlate: $!";
121
+
122
+$LOCALS=6*$SIZE_T;
123
+$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables
124
+
125
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
126
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
127
+ $code .= "\t$opcode\t".join(',',@_)."\n";
128
+}
129
+
130
+my $sp = "r1";
131
+
132
+my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
133
+
134
+
135
+{{{
136
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
137
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));
138
+my @K = map("v$_",(16..19));
139
+my $CTR = "v26";
140
+my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
141
+my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);
142
+my $beperm = "v31";
143
+
144
+my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
145
+
146
+my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload
147
+
148
+sub VSX_lane_ROUND_1x {
149
+my $a=@_[0];
150
+my $b=@_[1];
151
+my $c=@_[2];
152
+my $d=@_[3];
153
+my $odd=@_[4];
154
+ vadduwm ($a,$a,$b);
155
+ vxor ($d,$d,$a);
156
+ vrlw ($d,$d,$sixteen);
157
+ vadduwm ($c,$c,$d);
158
+ vxor ($b,$b,$c);
159
+ vrlw ($b,$b,$twelve);
160
+ vadduwm ($a,$a,$b);
161
+ vxor ($d,$d,$a);
162
+ vrlw ($d,$d,$eight);
163
+ vadduwm ($c,$c,$d);
164
+ vxor ($b,$b,$c);
165
+ vrlw ($b,$b,$seven);
166
+ xxsldwi ($c,$c,$c,2);
167
+ xxsldwi ($b,$b,$b,$odd?3:1);
168
+ xxsldwi ($d,$d,$d,$odd?1:3);
169
+}
170
+
171
+
172
+sub VSX_lane_ROUND_4x {
173
+my ($a0,$b0,$c0,$d0)=@_;
174
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
175
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
176
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
177
+my @x=map("\"v$_\"",(0..15));
178
+
179
+ (
180
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
181
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
182
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
183
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
184
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])",
185
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])",
186
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])",
187
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])",
188
+ "&vrlw (@x[$d0],@x[$d0],'$sixteen')",
189
+ "&vrlw (@x[$d1],@x[$d1],'$sixteen')",
190
+ "&vrlw (@x[$d2],@x[$d2],'$sixteen')",
191
+ "&vrlw (@x[$d3],@x[$d3],'$sixteen')",
192
+
193
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
194
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
195
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
196
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
197
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])",
198
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])",
199
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])",
200
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])",
201
+ "&vrlw (@x[$b0],@x[$b0],'$twelve')",
202
+ "&vrlw (@x[$b1],@x[$b1],'$twelve')",
203
+ "&vrlw (@x[$b2],@x[$b2],'$twelve')",
204
+ "&vrlw (@x[$b3],@x[$b3],'$twelve')",
205
+
206
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
207
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
208
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
209
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
210
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])",
211
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])",
212
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])",
213
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])",
214
+ "&vrlw (@x[$d0],@x[$d0],'$eight')",
215
+ "&vrlw (@x[$d1],@x[$d1],'$eight')",
216
+ "&vrlw (@x[$d2],@x[$d2],'$eight')",
217
+ "&vrlw (@x[$d3],@x[$d3],'$eight')",
218
+
219
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
220
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
221
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
222
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
223
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])",
224
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])",
225
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])",
226
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])",
227
+ "&vrlw (@x[$b0],@x[$b0],'$seven')",
228
+ "&vrlw (@x[$b1],@x[$b1],'$seven')",
229
+ "&vrlw (@x[$b2],@x[$b2],'$seven')",
230
+ "&vrlw (@x[$b3],@x[$b3],'$seven')"
231
+ );
232
+}
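VSX_lane_ROUND_4x above emits the four quarter rounds of one ChaCha20 half
round; the code below invokes it once with the column index set (0,4,8,12) and
once with the diagonal set (0,5,10,15), i.e. one double round per loop
iteration, ten iterations for the 20-round core. The same schedule in scalar C
(a sketch, not part of the patch, reusing chacha_quarter_round() from the
earlier note):

/* One double round: four column rounds, then four diagonal rounds. */
static void chacha_double_round(uint32_t x[16])
{
    int i;

    for (i = 0; i < 4; i++)        /* columns: (0,4,8,12) ... (3,7,11,15) */
        chacha_quarter_round(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);
    for (i = 0; i < 4; i++)        /* diagonals: (0,5,10,15) ... (3,4,9,14) */
        chacha_quarter_round(&x[i], &x[4 + ((i + 1) & 3)],
                             &x[8 + ((i + 2) & 3)], &x[12 + ((i + 3) & 3)]);
}
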
233
+
234
+$code.=<<___;
235
+
236
+.globl .ChaCha20_ctr32_vsx_p10
237
+.align 5
238
+.ChaCha20_ctr32_vsx_p10:
239
+ ${UCMP}i $len,256
240
+ bgt ChaCha20_ctr32_vsx_8x
241
+ $STU $sp,-$FRAME($sp)
242
+ mflr r0
243
+ li r10,`15+$LOCALS+64`
244
+ li r11,`31+$LOCALS+64`
245
+ mfspr r12,256
246
+ stvx v26,r10,$sp
247
+ addi r10,r10,32
248
+ stvx v27,r11,$sp
249
+ addi r11,r11,32
250
+ stvx v28,r10,$sp
251
+ addi r10,r10,32
252
+ stvx v29,r11,$sp
253
+ addi r11,r11,32
254
+ stvx v30,r10,$sp
255
+ stvx v31,r11,$sp
256
+ stw r12,`$FRAME-4`($sp) # save vrsave
257
+ li r12,-4096+63
258
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
259
+ mtspr 256,r12 # preserve 29 AltiVec registers
260
+
261
+ bl Lconsts # returns pointer Lsigma in r12
262
+ lvx_4w @K[0],0,r12 # load sigma
263
+ addi r12,r12,0x70
264
+ li $x10,16
265
+ li $x20,32
266
+ li $x30,48
267
+ li r11,64
268
+
269
+ lvx_4w @K[1],0,$key # load key
270
+ lvx_4w @K[2],$x10,$key
271
+ lvx_4w @K[3],0,$ctr # load counter
272
+
273
+ vxor $xt0,$xt0,$xt0
274
+ lvx_4w $xt1,r11,r12
275
+ vspltw $CTR,@K[3],0
276
+ vsldoi @K[3],@K[3],$xt0,4
277
+ vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0]
278
+ vadduwm $CTR,$CTR,$xt1
279
+
280
+ be?lvsl $beperm,0,$x10 # 0x00..0f
281
+ be?vspltisb $xt0,3 # 0x03..03
282
+ be?vxor $beperm,$beperm,$xt0 # swap bytes within words
283
+
284
+ li r0,10 # inner loop counter
285
+ mtctr r0
286
+ b Loop_outer_vsx
287
+
288
+.align 5
289
+Loop_outer_vsx:
290
+ lvx $xa0,$x00,r12 # load [smashed] sigma
291
+ lvx $xa1,$x10,r12
292
+ lvx $xa2,$x20,r12
293
+ lvx $xa3,$x30,r12
294
+
295
+ vspltw $xb0,@K[1],0 # smash the key
296
+ vspltw $xb1,@K[1],1
297
+ vspltw $xb2,@K[1],2
298
+ vspltw $xb3,@K[1],3
299
+
300
+ vspltw $xc0,@K[2],0
301
+ vspltw $xc1,@K[2],1
302
+ vspltw $xc2,@K[2],2
303
+ vspltw $xc3,@K[2],3
304
+
305
+ vmr $xd0,$CTR # smash the counter
306
+ vspltw $xd1,@K[3],1
307
+ vspltw $xd2,@K[3],2
308
+ vspltw $xd3,@K[3],3
309
+
310
+ vspltisw $sixteen,-16 # synthesize constants
311
+ vspltisw $twelve,12
312
+ vspltisw $eight,8
313
+ vspltisw $seven,7
314
+
315
+ ${UCMP}i $len,64
316
+ bgt Loop_vsx_4x
317
+
318
+ vmr $xa0,@K[0]
319
+ vmr $xb0,@K[1]
320
+ vmr $xc0,@K[2]
321
+ vmr $xd0,@K[3]
322
+
323
+Loop_vsx_1x:
324
+___
325
+ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,0);
326
+ VSX_lane_ROUND_1x($xa0, $xb0, $xc0,$xd0,1);
327
+
328
+$code.=<<___;
329
+
330
+ bdnz Loop_vsx_1x
331
+
332
+ vadduwm $xa0, $xa0, @K[0]
333
+ vadduwm $xb0, $xb0, @K[1]
334
+ vadduwm $xc0, $xc0, @K[2]
335
+ vadduwm $xd0, $xd0, @K[3]
336
+ ${UCMP}i $len,0x40
337
+ blt Ltail_vsx
338
+
339
+ lvx_4w $xt0,$x00, $inp
340
+ lvx_4w $xt1,$x10, $inp
341
+ lvx_4w $xt2,$x20, $inp
342
+ lvx_4w $xt3,$x30, $inp
343
+
344
+ vxor $xa0,$xa0,$xt0
345
+ vxor $xb0,$xb0,$xt1
346
+ vxor $xc0,$xc0,$xt2
347
+ vxor $xd0,$xd0,$xt3
348
+
349
+ stvx_4w $xa0,$x00,$out
350
+ stvx_4w $xb0,$x10,$out
351
+ addi $inp,$inp,0x40
352
+ stvx_4w $xc0,$x20,$out
353
+ subi $len,$len,0x40
354
+ stvx_4w $xd0,$x30,$out
355
+ addi $out,$out,0x40
356
+ beq Ldone_vsx
357
+
358
+Loop_vsx_4x:
359
+___
360
+ foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }
361
+ foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }
362
+$code.=<<___;
363
+
364
+ bdnz Loop_vsx_4x
365
+
366
+ vadduwm $xd0,$xd0,$CTR
367
+
368
+ vmrgew $xt0,$xa0,$xa1 # transpose data
369
+ vmrgew $xt1,$xa2,$xa3
370
+ vmrgow $xa0,$xa0,$xa1
371
+ vmrgow $xa2,$xa2,$xa3
372
+ vmrgew $xt2,$xb0,$xb1
373
+ vmrgew $xt3,$xb2,$xb3
374
+ vpermdi $xa1,$xa0,$xa2,0b00
375
+ vpermdi $xa3,$xa0,$xa2,0b11
376
+ vpermdi $xa0,$xt0,$xt1,0b00
377
+ vpermdi $xa2,$xt0,$xt1,0b11
378
+
379
+ vmrgow $xb0,$xb0,$xb1
380
+ vmrgow $xb2,$xb2,$xb3
381
+ vmrgew $xt0,$xc0,$xc1
382
+ vmrgew $xt1,$xc2,$xc3
383
+ vpermdi $xb1,$xb0,$xb2,0b00
384
+ vpermdi $xb3,$xb0,$xb2,0b11
385
+ vpermdi $xb0,$xt2,$xt3,0b00
386
+ vpermdi $xb2,$xt2,$xt3,0b11
387
+
388
+ vmrgow $xc0,$xc0,$xc1
389
+ vmrgow $xc2,$xc2,$xc3
390
+ vmrgew $xt2,$xd0,$xd1
391
+ vmrgew $xt3,$xd2,$xd3
392
+ vpermdi $xc1,$xc0,$xc2,0b00
393
+ vpermdi $xc3,$xc0,$xc2,0b11
394
+ vpermdi $xc0,$xt0,$xt1,0b00
395
+ vpermdi $xc2,$xt0,$xt1,0b11
396
+
397
+ vmrgow $xd0,$xd0,$xd1
398
+ vmrgow $xd2,$xd2,$xd3
399
+ vspltisw $xt0,4
400
+ vadduwm $CTR,$CTR,$xt0 # next counter value
401
+ vpermdi $xd1,$xd0,$xd2,0b00
402
+ vpermdi $xd3,$xd0,$xd2,0b11
403
+ vpermdi $xd0,$xt2,$xt3,0b00
404
+ vpermdi $xd2,$xt2,$xt3,0b11
405
+
406
+ vadduwm $xa0,$xa0,@K[0]
407
+ vadduwm $xb0,$xb0,@K[1]
408
+ vadduwm $xc0,$xc0,@K[2]
409
+ vadduwm $xd0,$xd0,@K[3]
410
+
411
+ be?vperm $xa0,$xa0,$xa0,$beperm
412
+ be?vperm $xb0,$xb0,$xb0,$beperm
413
+ be?vperm $xc0,$xc0,$xc0,$beperm
414
+ be?vperm $xd0,$xd0,$xd0,$beperm
415
+
416
+ ${UCMP}i $len,0x40
417
+ blt Ltail_vsx
418
+
419
+ lvx_4w $xt0,$x00,$inp
420
+ lvx_4w $xt1,$x10,$inp
421
+ lvx_4w $xt2,$x20,$inp
422
+ lvx_4w $xt3,$x30,$inp
423
+
424
+ vxor $xt0,$xt0,$xa0
425
+ vxor $xt1,$xt1,$xb0
426
+ vxor $xt2,$xt2,$xc0
427
+ vxor $xt3,$xt3,$xd0
428
+
429
+ stvx_4w $xt0,$x00,$out
430
+ stvx_4w $xt1,$x10,$out
431
+ addi $inp,$inp,0x40
432
+ stvx_4w $xt2,$x20,$out
433
+ subi $len,$len,0x40
434
+ stvx_4w $xt3,$x30,$out
435
+ addi $out,$out,0x40
436
+ beq Ldone_vsx
437
+
438
+ vadduwm $xa0,$xa1,@K[0]
439
+ vadduwm $xb0,$xb1,@K[1]
440
+ vadduwm $xc0,$xc1,@K[2]
441
+ vadduwm $xd0,$xd1,@K[3]
442
+
443
+ be?vperm $xa0,$xa0,$xa0,$beperm
444
+ be?vperm $xb0,$xb0,$xb0,$beperm
445
+ be?vperm $xc0,$xc0,$xc0,$beperm
446
+ be?vperm $xd0,$xd0,$xd0,$beperm
447
+
448
+ ${UCMP}i $len,0x40
449
+ blt Ltail_vsx
450
+
451
+ lvx_4w $xt0,$x00,$inp
452
+ lvx_4w $xt1,$x10,$inp
453
+ lvx_4w $xt2,$x20,$inp
454
+ lvx_4w $xt3,$x30,$inp
455
+
456
+ vxor $xt0,$xt0,$xa0
457
+ vxor $xt1,$xt1,$xb0
458
+ vxor $xt2,$xt2,$xc0
459
+ vxor $xt3,$xt3,$xd0
460
+
461
+ stvx_4w $xt0,$x00,$out
462
+ stvx_4w $xt1,$x10,$out
463
+ addi $inp,$inp,0x40
464
+ stvx_4w $xt2,$x20,$out
465
+ subi $len,$len,0x40
466
+ stvx_4w $xt3,$x30,$out
467
+ addi $out,$out,0x40
468
+ beq Ldone_vsx
469
+
470
+ vadduwm $xa0,$xa2,@K[0]
471
+ vadduwm $xb0,$xb2,@K[1]
472
+ vadduwm $xc0,$xc2,@K[2]
473
+ vadduwm $xd0,$xd2,@K[3]
474
+
475
+ be?vperm $xa0,$xa0,$xa0,$beperm
476
+ be?vperm $xb0,$xb0,$xb0,$beperm
477
+ be?vperm $xc0,$xc0,$xc0,$beperm
478
+ be?vperm $xd0,$xd0,$xd0,$beperm
479
+
480
+ ${UCMP}i $len,0x40
481
+ blt Ltail_vsx
482
+
483
+ lvx_4w $xt0,$x00,$inp
484
+ lvx_4w $xt1,$x10,$inp
485
+ lvx_4w $xt2,$x20,$inp
486
+ lvx_4w $xt3,$x30,$inp
487
+
488
+ vxor $xt0,$xt0,$xa0
489
+ vxor $xt1,$xt1,$xb0
490
+ vxor $xt2,$xt2,$xc0
491
+ vxor $xt3,$xt3,$xd0
492
+
493
+ stvx_4w $xt0,$x00,$out
494
+ stvx_4w $xt1,$x10,$out
495
+ addi $inp,$inp,0x40
496
+ stvx_4w $xt2,$x20,$out
497
+ subi $len,$len,0x40
498
+ stvx_4w $xt3,$x30,$out
499
+ addi $out,$out,0x40
500
+ beq Ldone_vsx
501
+
502
+ vadduwm $xa0,$xa3,@K[0]
503
+ vadduwm $xb0,$xb3,@K[1]
504
+ vadduwm $xc0,$xc3,@K[2]
505
+ vadduwm $xd0,$xd3,@K[3]
506
+
507
+ be?vperm $xa0,$xa0,$xa0,$beperm
508
+ be?vperm $xb0,$xb0,$xb0,$beperm
509
+ be?vperm $xc0,$xc0,$xc0,$beperm
510
+ be?vperm $xd0,$xd0,$xd0,$beperm
511
+
512
+ ${UCMP}i $len,0x40
513
+ blt Ltail_vsx
514
+
515
+ lvx_4w $xt0,$x00,$inp
516
+ lvx_4w $xt1,$x10,$inp
517
+ lvx_4w $xt2,$x20,$inp
518
+ lvx_4w $xt3,$x30,$inp
519
+
520
+ vxor $xt0,$xt0,$xa0
521
+ vxor $xt1,$xt1,$xb0
522
+ vxor $xt2,$xt2,$xc0
523
+ vxor $xt3,$xt3,$xd0
524
+
525
+ stvx_4w $xt0,$x00,$out
526
+ stvx_4w $xt1,$x10,$out
527
+ addi $inp,$inp,0x40
528
+ stvx_4w $xt2,$x20,$out
529
+ subi $len,$len,0x40
530
+ stvx_4w $xt3,$x30,$out
531
+ addi $out,$out,0x40
532
+ mtctr r0
533
+ bne Loop_outer_vsx
534
+
535
+Ldone_vsx:
536
+ lwz r12,`$FRAME-4`($sp) # pull vrsave
537
+ li r10,`15+$LOCALS+64`
538
+ li r11,`31+$LOCALS+64`
539
+ $POP r0, `$FRAME+$LRSAVE`($sp)
540
+ mtspr 256,r12 # restore vrsave
541
+ lvx v26,r10,$sp
542
+ addi r10,r10,32
543
+ lvx v27,r11,$sp
544
+ addi r11,r11,32
545
+ lvx v28,r10,$sp
546
+ addi r10,r10,32
547
+ lvx v29,r11,$sp
548
+ addi r11,r11,32
549
+ lvx v30,r10,$sp
550
+ lvx v31,r11,$sp
551
+ mtlr r0
552
+ addi $sp,$sp,$FRAME
553
+ blr
554
+
555
+.align 4
556
+Ltail_vsx:
557
+ addi r11,$sp,$LOCALS
558
+ mtctr $len
559
+ stvx_4w $xa0,$x00,r11 # offload block to stack
560
+ stvx_4w $xb0,$x10,r11
561
+ stvx_4w $xc0,$x20,r11
562
+ stvx_4w $xd0,$x30,r11
563
+ subi r12,r11,1 # prepare for *++ptr
564
+ subi $inp,$inp,1
565
+ subi $out,$out,1
566
+
567
+Loop_tail_vsx:
568
+ lbzu r6,1(r12)
569
+ lbzu r7,1($inp)
570
+ xor r6,r6,r7
571
+ stbu r6,1($out)
572
+ bdnz Loop_tail_vsx
573
+
574
+ stvx_4w $K[0],$x00,r11 # wipe copy of the block
575
+ stvx_4w $K[0],$x10,r11
576
+ stvx_4w $K[0],$x20,r11
577
+ stvx_4w $K[0],$x30,r11
578
+
579
+ b Ldone_vsx
580
+ .long 0
581
+ .byte 0,12,0x04,1,0x80,0,5,0
582
+ .long 0
583
+.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
584
+___
585
+}}}
586
+
587
+## This is the 8-block parallel implementation.  The heart of the ChaCha round uses vector
+# instructions that operate on VSR[32+X] (the vector-register half of the VSX file), and
+# running 8 blocks in parallel takes all 32 vector registers just to hold the block state.
+# A few register values are therefore parked in VSR[0]-VSR[31] and swapped back in around
+# the round operations, so the intermediate values stay available while all 8 blocks are
+# processed in parallel.
+#
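In scalar terms, "8 blocks in parallel" means each pass of the loop below
produces eight 64-byte keystream blocks whose counters differ only by the lane
index. A sketch of that data layout (not part of the patch; chacha20_block()
is a hypothetical stand-in for the 20-round block function):

#include <stdint.h>

/* Hypothetical block function: writes one keystream block (16 words) for the
 * given key, nonce and 32-bit block counter. */
void chacha20_block(uint32_t out[16], const uint32_t key[8],
                    const uint32_t nonce[3], uint32_t counter);

static void chacha20_8_blocks(uint32_t out[8][16], const uint32_t key[8],
                              const uint32_t nonce[3], uint32_t counter)
{
    unsigned int lane;

    for (lane = 0; lane < 8; lane++)   /* the asm runs all 8 lanes at once */
        chacha20_block(out[lane], key, nonce, counter + lane);
}
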
+{{{
593
+#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
594
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
595
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
596
+ $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
597
+ $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));
598
+my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));
599
+my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));
600
+my @K = map("v$_",27,(24..26));
601
+my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));
602
+my $xr0 = "v4";
603
+my $CTR0 = "v22";
604
+my $CTR1 = "v5";
605
+my $beperm = "v31";
606
+my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
607
+my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));
608
+my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
609
+my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
610
+my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));
611
+
612
+my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload
613
+
614
+sub VSX_lane_ROUND_8x {
615
+my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;
616
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
617
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
618
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
619
+my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
620
+my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
621
+my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
622
+my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));
623
+my @x=map("\"v$_\"",(0..31));
624
+
625
+ (
626
+ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13
627
+ "&vxxlorc (@x[$c7], $xv9,$xv9)",
628
+
629
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1
630
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2
631
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3
632
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4
633
+ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1
634
+ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2
635
+ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3
636
+ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4
637
+
638
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])",
639
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])",
640
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])",
641
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])",
642
+ "&vxor (@x[$d4],@x[$d4],@x[$a4])",
643
+ "&vxor (@x[$d5],@x[$d5],@x[$a5])",
644
+ "&vxor (@x[$d6],@x[$d6],@x[$a6])",
645
+ "&vxor (@x[$d7],@x[$d7],@x[$a7])",
646
+
647
+ "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
648
+ "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
649
+ "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
650
+ "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
651
+ "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
652
+ "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
653
+ "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
654
+ "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
655
+
656
+ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
657
+ "&vxxlorc (@x[$c7], $xv15,$xv15)",
658
+ "&vxxlorc (@x[$a7], $xv10,$xv10)",
659
+
660
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
661
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
662
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
663
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
664
+ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
665
+ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
666
+ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
667
+ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
668
+
669
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])",
670
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])",
671
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])",
672
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])",
673
+ "&vxor (@x[$b4],@x[$b4],@x[$c4])",
674
+ "&vxor (@x[$b5],@x[$b5],@x[$c5])",
675
+ "&vxor (@x[$b6],@x[$b6],@x[$c6])",
676
+ "&vxor (@x[$b7],@x[$b7],@x[$c7])",
677
+
678
+ "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
679
+ "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
680
+ "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
681
+ "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
682
+ "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
683
+ "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
684
+ "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
685
+ "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
686
+
687
+ "&vxxlorc (@x[$a7], $xv13,$xv13)",
688
+ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])",
689
+ "&vxxlorc (@x[$c7], $xv11,$xv11)",
690
+
691
+
692
+ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])",
693
+ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])",
694
+ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])",
695
+ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])",
696
+ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])",
697
+ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])",
698
+ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])",
699
+ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])",
700
+
701
+ "&vxor (@x[$d0],@x[$d0],@x[$a0])",
702
+ "&vxor (@x[$d1],@x[$d1],@x[$a1])",
703
+ "&vxor (@x[$d2],@x[$d2],@x[$a2])",
704
+ "&vxor (@x[$d3],@x[$d3],@x[$a3])",
705
+ "&vxor (@x[$d4],@x[$d4],@x[$a4])",
706
+ "&vxor (@x[$d5],@x[$d5],@x[$a5])",
707
+ "&vxor (@x[$d6],@x[$d6],@x[$a6])",
708
+ "&vxor (@x[$d7],@x[$d7],@x[$a7])",
709
+
710
+ "&vrlw (@x[$d0],@x[$d0],@x[$c7])",
711
+ "&vrlw (@x[$d1],@x[$d1],@x[$c7])",
712
+ "&vrlw (@x[$d2],@x[$d2],@x[$c7])",
713
+ "&vrlw (@x[$d3],@x[$d3],@x[$c7])",
714
+ "&vrlw (@x[$d4],@x[$d4],@x[$c7])",
715
+ "&vrlw (@x[$d5],@x[$d5],@x[$c7])",
716
+ "&vrlw (@x[$d6],@x[$d6],@x[$c7])",
717
+ "&vrlw (@x[$d7],@x[$d7],@x[$c7])",
718
+
719
+ "&vxxlorc (@x[$c7], $xv15,$xv15)",
720
+ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])",
721
+ "&vxxlorc (@x[$a7], $xv12,$xv12)",
722
+
723
+ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])",
724
+ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])",
725
+ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])",
726
+ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])",
727
+ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])",
728
+ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])",
729
+ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])",
730
+ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])",
731
+ "&vxor (@x[$b0],@x[$b0],@x[$c0])",
732
+ "&vxor (@x[$b1],@x[$b1],@x[$c1])",
733
+ "&vxor (@x[$b2],@x[$b2],@x[$c2])",
734
+ "&vxor (@x[$b3],@x[$b3],@x[$c3])",
735
+ "&vxor (@x[$b4],@x[$b4],@x[$c4])",
736
+ "&vxor (@x[$b5],@x[$b5],@x[$c5])",
737
+ "&vxor (@x[$b6],@x[$b6],@x[$c6])",
738
+ "&vxor (@x[$b7],@x[$b7],@x[$c7])",
739
+ "&vrlw (@x[$b0],@x[$b0],@x[$a7])",
740
+ "&vrlw (@x[$b1],@x[$b1],@x[$a7])",
741
+ "&vrlw (@x[$b2],@x[$b2],@x[$a7])",
742
+ "&vrlw (@x[$b3],@x[$b3],@x[$a7])",
743
+ "&vrlw (@x[$b4],@x[$b4],@x[$a7])",
744
+ "&vrlw (@x[$b5],@x[$b5],@x[$a7])",
745
+ "&vrlw (@x[$b6],@x[$b6],@x[$a7])",
746
+ "&vrlw (@x[$b7],@x[$b7],@x[$a7])",
747
+
748
+ "&vxxlorc (@x[$a7], $xv13,$xv13)",
749
+ );
750
+}
751
+
752
+$code.=<<___;
753
+
754
+.globl .ChaCha20_ctr32_vsx_8x
755
+.align 5
756
+.ChaCha20_ctr32_vsx_8x:
757
+ $STU $sp,-$FRAME($sp)
758
+ mflr r0
759
+ li r10,`15+$LOCALS+64`
760
+ li r11,`31+$LOCALS+64`
761
+ mfspr r12,256
762
+ stvx v24,r10,$sp
763
+ addi r10,r10,32
764
+ stvx v25,r11,$sp
765
+ addi r11,r11,32
766
+ stvx v26,r10,$sp
767
+ addi r10,r10,32
768
+ stvx v27,r11,$sp
769
+ addi r11,r11,32
770
+ stvx v28,r10,$sp
771
+ addi r10,r10,32
772
+ stvx v29,r11,$sp
773
+ addi r11,r11,32
774
+ stvx v30,r10,$sp
775
+ stvx v31,r11,$sp
776
+ stw r12,`$FRAME-4`($sp) # save vrsave
777
+ li r12,-4096+63
778
+ $PUSH r0, `$FRAME+$LRSAVE`($sp)
779
+ mtspr 256,r12 # preserve 29 AltiVec registers
780
+
781
+ bl Lconsts # returns pointer Lsigma in r12
782
+
783
+ lvx_4w @K[0],0,r12 # load sigma
784
+ addi r12,r12,0x70
785
+ li $x10,16
786
+ li $x20,32
787
+ li $x30,48
788
+ li r11,64
789
+
790
+ vspltisw $xa4,-16 # synthesize constants
791
+ vspltisw $xb4,12 # synthesize constants
792
+ vspltisw $xc4,8 # synthesize constants
793
+ vspltisw $xd4,7 # synthesize constants
794
+
795
+ lvx $xa0,$x00,r12 # load [smashed] sigma
796
+ lvx $xa1,$x10,r12
797
+ lvx $xa2,$x20,r12
798
+ lvx $xa3,$x30,r12
799
+
800
+ vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12
801
+ vxxlor $xv10 ,$xb4,$xb4
802
+ vxxlor $xv11 ,$xc4,$xc4
803
+ vxxlor $xv12 ,$xd4,$xd4
804
+ vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25
805
+ vxxlor $xv23 ,$xa1,$xa1
806
+ vxxlor $xv24 ,$xa2,$xa2
807
+ vxxlor $xv25 ,$xa3,$xa3
808
+
809
+ lvx_4w @K[1],0,$key # load key
810
+ lvx_4w @K[2],$x10,$key
811
+ lvx_4w @K[3],0,$ctr # load counter
812
+ vspltisw $xt3,4
813
+
814
+
815
+ vxor $xt2,$xt2,$xt2
816
+ lvx_4w $xt1,r11,r12
817
+ vspltw $xa2,@K[3],0 #save the original count after spltw
818
+ vsldoi @K[3],@K[3],$xt2,4
819
+ vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0]
820
+ vadduwm $xt1,$xa2,$xt1
821
+ vadduwm $xt3,$xt1,$xt3 # next counter value
822
+ vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8.
823
+
824
+ be?lvsl $beperm,0,$x10 # 0x00..0f
825
+ be?vspltisb $xt0,3 # 0x03..03
826
+ be?vxor $beperm,$beperm,$xt0 # swap bytes within words
827
+ be?vxxlor $xv26 ,$beperm,$beperm
828
+
829
+ vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2
830
+ vxxlor $xv1 ,@K[1],@K[1]
831
+ vxxlor $xv2 ,@K[2],@K[2]
832
+ vxxlor $xv3 ,@K[3],@K[3]
833
+ vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5
834
+ vxxlor $xv5 ,$xt3,$xt3
835
+ vxxlor $xv8 ,$xa0,$xa0
836
+
837
+ li r0,10 # inner loop counter
838
+ mtctr r0
839
+ b Loop_outer_vsx_8x
840
+
841
+.align 5
842
+Loop_outer_vsx_8x:
843
+ vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma
844
+ vxxlorc $xa1,$xv23,$xv23
845
+ vxxlorc $xa2,$xv24,$xv24
846
+ vxxlorc $xa3,$xv25,$xv25
847
+ vxxlorc $xa4,$xv22,$xv22
848
+ vxxlorc $xa5,$xv23,$xv23
849
+ vxxlorc $xa6,$xv24,$xv24
850
+ vxxlorc $xa7,$xv25,$xv25
851
+
852
+ vspltw $xb0,@K[1],0 # smash the key
853
+ vspltw $xb1,@K[1],1
854
+ vspltw $xb2,@K[1],2
855
+ vspltw $xb3,@K[1],3
856
+ vspltw $xb4,@K[1],0 # smash the key
857
+ vspltw $xb5,@K[1],1
858
+ vspltw $xb6,@K[1],2
859
+ vspltw $xb7,@K[1],3
860
+
861
+ vspltw $xc0,@K[2],0
862
+ vspltw $xc1,@K[2],1
863
+ vspltw $xc2,@K[2],2
864
+ vspltw $xc3,@K[2],3
865
+ vspltw $xc4,@K[2],0
866
+ vspltw $xc7,@K[2],3
867
+ vspltw $xc5,@K[2],1
868
+
869
+ vxxlorc $xd0,$xv4,$xv4 # smash the counter
870
+ vspltw $xd1,@K[3],1
871
+ vspltw $xd2,@K[3],2
872
+ vspltw $xd3,@K[3],3
873
+ vxxlorc $xd4,$xv5,$xv5 # smash the counter
874
+ vspltw $xd5,@K[3],1
875
+ vspltw $xd6,@K[3],2
876
+ vspltw $xd7,@K[3],3
877
+ vxxlorc $xc6,$xv8,$xv8 # copy of vspltw K[2],2 saved in v8; v26 held K[3], so this load has to wait until K[3] is done
878
+
879
+Loop_vsx_8x:
880
+___
881
+ foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
882
+ foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
883
+$code.=<<___;
884
+
885
+ bdnz Loop_vsx_8x
886
+ vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31
887
+ vxxlor $xv14 ,$xd5,$xd5 #
888
+ vxxlor $xv15 ,$xd6,$xd6 #
889
+ vxxlor $xv16 ,$xd7,$xd7 #
890
+
891
+ vxxlor $xv18 ,$xc4,$xc4 #
892
+ vxxlor $xv19 ,$xc5,$xc5 #
893
+ vxxlor $xv20 ,$xc6,$xc6 #
894
+ vxxlor $xv21 ,$xc7,$xc7 #
895
+
896
+ vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs
897
+ vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs
898
+ be?vxxlorc $beperm,$xv26,$xv26 # copy back the beperm.
899
+
900
+ vxxlorc @K[0],$xv0,$xv0 #27
901
+ vxxlorc @K[1],$xv1,$xv1 #24
902
+ vxxlorc @K[2],$xv2,$xv2 #25
903
+ vxxlorc @K[3],$xv3,$xv3 #26
904
+ vxxlorc $CTR0,$xv4,$xv4
905
+###changing to vertical
906
+
907
+ vmrgew $xt0,$xa0,$xa1 # transpose data
908
+ vmrgew $xt1,$xa2,$xa3
909
+ vmrgow $xa0,$xa0,$xa1
910
+ vmrgow $xa2,$xa2,$xa3
911
+
912
+ vmrgew $xt2,$xb0,$xb1
913
+ vmrgew $xt3,$xb2,$xb3
914
+ vmrgow $xb0,$xb0,$xb1
915
+ vmrgow $xb2,$xb2,$xb3
916
+
917
+ vadduwm $xd0,$xd0,$CTR0
918
+
919
+ vpermdi $xa1,$xa0,$xa2,0b00
920
+ vpermdi $xa3,$xa0,$xa2,0b11
921
+ vpermdi $xa0,$xt0,$xt1,0b00
922
+ vpermdi $xa2,$xt0,$xt1,0b11
923
+ vpermdi $xb1,$xb0,$xb2,0b00
924
+ vpermdi $xb3,$xb0,$xb2,0b11
925
+ vpermdi $xb0,$xt2,$xt3,0b00
926
+ vpermdi $xb2,$xt2,$xt3,0b11
927
+
928
+ vmrgew $xt0,$xc0,$xc1
929
+ vmrgew $xt1,$xc2,$xc3
930
+ vmrgow $xc0,$xc0,$xc1
931
+ vmrgow $xc2,$xc2,$xc3
932
+ vmrgew $xt2,$xd0,$xd1
933
+ vmrgew $xt3,$xd2,$xd3
934
+ vmrgow $xd0,$xd0,$xd1
935
+ vmrgow $xd2,$xd2,$xd3
936
+
937
+ vpermdi $xc1,$xc0,$xc2,0b00
938
+ vpermdi $xc3,$xc0,$xc2,0b11
939
+ vpermdi $xc0,$xt0,$xt1,0b00
940
+ vpermdi $xc2,$xt0,$xt1,0b11
941
+ vpermdi $xd1,$xd0,$xd2,0b00
942
+ vpermdi $xd3,$xd0,$xd2,0b11
943
+ vpermdi $xd0,$xt2,$xt3,0b00
944
+ vpermdi $xd2,$xt2,$xt3,0b11
945
+
946
+ vspltisw $xt0,8
947
+ vadduwm $CTR0,$CTR0,$xt0 # next counter value
948
+ vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5
949
+
950
+ vadduwm $xa0,$xa0,@K[0]
951
+ vadduwm $xb0,$xb0,@K[1]
952
+ vadduwm $xc0,$xc0,@K[2]
953
+ vadduwm $xd0,$xd0,@K[3]
954
+
955
+ be?vperm $xa0,$xa0,$xa0,$beperm
956
+ be?vperm $xb0,$xb0,$xb0,$beperm
957
+ be?vperm $xc0,$xc0,$xc0,$beperm
958
+ be?vperm $xd0,$xd0,$xd0,$beperm
959
+
960
+ ${UCMP}i $len,0x40
961
+ blt Ltail_vsx_8x
962
+
963
+ lvx_4w $xt0,$x00,$inp
964
+ lvx_4w $xt1,$x10,$inp
965
+ lvx_4w $xt2,$x20,$inp
966
+ lvx_4w $xt3,$x30,$inp
967
+
968
+ vxor $xt0,$xt0,$xa0
969
+ vxor $xt1,$xt1,$xb0
970
+ vxor $xt2,$xt2,$xc0
971
+ vxor $xt3,$xt3,$xd0
972
+
973
+ stvx_4w $xt0,$x00,$out
974
+ stvx_4w $xt1,$x10,$out
975
+ addi $inp,$inp,0x40
976
+ stvx_4w $xt2,$x20,$out
977
+ subi $len,$len,0x40
978
+ stvx_4w $xt3,$x30,$out
979
+ addi $out,$out,0x40
980
+ beq Ldone_vsx_8x
981
+
982
+ vadduwm $xa0,$xa1,@K[0]
983
+ vadduwm $xb0,$xb1,@K[1]
984
+ vadduwm $xc0,$xc1,@K[2]
985
+ vadduwm $xd0,$xd1,@K[3]
986
+
987
+ be?vperm $xa0,$xa0,$xa0,$beperm
988
+ be?vperm $xb0,$xb0,$xb0,$beperm
989
+ be?vperm $xc0,$xc0,$xc0,$beperm
990
+ be?vperm $xd0,$xd0,$xd0,$beperm
991
+
992
+ ${UCMP}i $len,0x40
993
+ blt Ltail_vsx_8x
994
+
995
+ lvx_4w $xt0,$x00,$inp
996
+ lvx_4w $xt1,$x10,$inp
997
+ lvx_4w $xt2,$x20,$inp
998
+ lvx_4w $xt3,$x30,$inp
999
+
1000
+ vxor $xt0,$xt0,$xa0
1001
+ vxor $xt1,$xt1,$xb0
1002
+ vxor $xt2,$xt2,$xc0
1003
+ vxor $xt3,$xt3,$xd0
1004
+
1005
+ stvx_4w $xt0,$x00,$out
1006
+ stvx_4w $xt1,$x10,$out
1007
+ addi $inp,$inp,0x40
1008
+ stvx_4w $xt2,$x20,$out
1009
+ subi $len,$len,0x40
1010
+ stvx_4w $xt3,$x30,$out
1011
+ addi $out,$out,0x40
1012
+ beq Ldone_vsx_8x
1013
+
1014
+ vadduwm $xa0,$xa2,@K[0]
1015
+ vadduwm $xb0,$xb2,@K[1]
1016
+ vadduwm $xc0,$xc2,@K[2]
1017
+ vadduwm $xd0,$xd2,@K[3]
1018
+
1019
+ be?vperm $xa0,$xa0,$xa0,$beperm
1020
+ be?vperm $xb0,$xb0,$xb0,$beperm
1021
+ be?vperm $xc0,$xc0,$xc0,$beperm
1022
+ be?vperm $xd0,$xd0,$xd0,$beperm
1023
+
1024
+ ${UCMP}i $len,0x40
1025
+ blt Ltail_vsx_8x
1026
+
1027
+ lvx_4w $xt0,$x00,$inp
1028
+ lvx_4w $xt1,$x10,$inp
1029
+ lvx_4w $xt2,$x20,$inp
1030
+ lvx_4w $xt3,$x30,$inp
1031
+
1032
+ vxor $xt0,$xt0,$xa0
1033
+ vxor $xt1,$xt1,$xb0
1034
+ vxor $xt2,$xt2,$xc0
1035
+ vxor $xt3,$xt3,$xd0
1036
+
1037
+ stvx_4w $xt0,$x00,$out
1038
+ stvx_4w $xt1,$x10,$out
1039
+ addi $inp,$inp,0x40
1040
+ stvx_4w $xt2,$x20,$out
1041
+ subi $len,$len,0x40
1042
+ stvx_4w $xt3,$x30,$out
1043
+ addi $out,$out,0x40
1044
+ beq Ldone_vsx_8x
1045
+
1046
+ vadduwm $xa0,$xa3,@K[0]
1047
+ vadduwm $xb0,$xb3,@K[1]
1048
+ vadduwm $xc0,$xc3,@K[2]
1049
+ vadduwm $xd0,$xd3,@K[3]
1050
+
1051
+ be?vperm $xa0,$xa0,$xa0,$beperm
1052
+ be?vperm $xb0,$xb0,$xb0,$beperm
1053
+ be?vperm $xc0,$xc0,$xc0,$beperm
1054
+ be?vperm $xd0,$xd0,$xd0,$beperm
1055
+
1056
+ ${UCMP}i $len,0x40
1057
+ blt Ltail_vsx_8x
1058
+
1059
+ lvx_4w $xt0,$x00,$inp
1060
+ lvx_4w $xt1,$x10,$inp
1061
+ lvx_4w $xt2,$x20,$inp
1062
+ lvx_4w $xt3,$x30,$inp
1063
+
1064
+ vxor $xt0,$xt0,$xa0
1065
+ vxor $xt1,$xt1,$xb0
1066
+ vxor $xt2,$xt2,$xc0
1067
+ vxor $xt3,$xt3,$xd0
1068
+
1069
+ stvx_4w $xt0,$x00,$out
1070
+ stvx_4w $xt1,$x10,$out
1071
+ addi $inp,$inp,0x40
1072
+ stvx_4w $xt2,$x20,$out
1073
+ subi $len,$len,0x40
1074
+ stvx_4w $xt3,$x30,$out
1075
+ addi $out,$out,0x40
1076
+ beq Ldone_vsx_8x
1077
+
1078
+# blk4-7: v24-v31 remain the same since the same logic as above applies; registers a4-b7 are
+# unchanged.  Load c4-c7 and d4-d7 into positions 8-15 so that v24-v31 can be reused.
+# VR0-VR3 are used to load temporary values; vr4 serves as xr0 instead of xt0.
+
1081
+ vxxlorc $CTR1 ,$xv5,$xv5
1082
+
1083
+ vxxlorc $xcn4 ,$xv18,$xv18
1084
+ vxxlorc $xcn5 ,$xv19,$xv19
1085
+ vxxlorc $xcn6 ,$xv20,$xv20
1086
+ vxxlorc $xcn7 ,$xv21,$xv21
1087
+
1088
+ vxxlorc $xdn4 ,$xv13,$xv13
1089
+ vxxlorc $xdn5 ,$xv14,$xv14
1090
+ vxxlorc $xdn6 ,$xv15,$xv15
1091
+ vxxlorc $xdn7 ,$xv16,$xv16
1092
+ vadduwm $xdn4,$xdn4,$CTR1
1093
+
1094
+ vxxlorc $xb6 ,$xv6,$xv6
1095
+ vxxlorc $xb7 ,$xv7,$xv7
1096
+#use xa1->xr0, as xt0...in the block 4-7
1097
+
1098
+ vmrgew $xr0,$xa4,$xa5 # transpose data
1099
+ vmrgew $xt1,$xa6,$xa7
1100
+ vmrgow $xa4,$xa4,$xa5
1101
+ vmrgow $xa6,$xa6,$xa7
1102
+ vmrgew $xt2,$xb4,$xb5
1103
+ vmrgew $xt3,$xb6,$xb7
1104
+ vmrgow $xb4,$xb4,$xb5
1105
+ vmrgow $xb6,$xb6,$xb7
1106
+
1107
+ vpermdi $xa5,$xa4,$xa6,0b00
1108
+ vpermdi $xa7,$xa4,$xa6,0b11
1109
+ vpermdi $xa4,$xr0,$xt1,0b00
1110
+ vpermdi $xa6,$xr0,$xt1,0b11
1111
+ vpermdi $xb5,$xb4,$xb6,0b00
1112
+ vpermdi $xb7,$xb4,$xb6,0b11
1113
+ vpermdi $xb4,$xt2,$xt3,0b00
1114
+ vpermdi $xb6,$xt2,$xt3,0b11
1115
+
1116
+ vmrgew $xr0,$xcn4,$xcn5
1117
+ vmrgew $xt1,$xcn6,$xcn7
1118
+ vmrgow $xcn4,$xcn4,$xcn5
1119
+ vmrgow $xcn6,$xcn6,$xcn7
1120
+ vmrgew $xt2,$xdn4,$xdn5
1121
+ vmrgew $xt3,$xdn6,$xdn7
1122
+ vmrgow $xdn4,$xdn4,$xdn5
1123
+ vmrgow $xdn6,$xdn6,$xdn7
1124
+
1125
+ vpermdi $xcn5,$xcn4,$xcn6,0b00
1126
+ vpermdi $xcn7,$xcn4,$xcn6,0b11
1127
+ vpermdi $xcn4,$xr0,$xt1,0b00
1128
+ vpermdi $xcn6,$xr0,$xt1,0b11
1129
+ vpermdi $xdn5,$xdn4,$xdn6,0b00
1130
+ vpermdi $xdn7,$xdn4,$xdn6,0b11
1131
+ vpermdi $xdn4,$xt2,$xt3,0b00
1132
+ vpermdi $xdn6,$xt2,$xt3,0b11
1133
+
1134
+ vspltisw $xr0,8
1135
+ vadduwm $CTR1,$CTR1,$xr0 # next counter value
1136
+ vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5
1137
+
1138
+ vadduwm $xan0,$xa4,@K[0]
1139
+ vadduwm $xbn0,$xb4,@K[1]
1140
+ vadduwm $xcn0,$xcn4,@K[2]
1141
+ vadduwm $xdn0,$xdn4,@K[3]
1142
+
1143
+ be?vperm $xan0,$xa4,$xa4,$beperm
1144
+ be?vperm $xbn0,$xb4,$xb4,$beperm
1145
+ be?vperm $xcn0,$xcn4,$xcn4,$beperm
1146
+ be?vperm $xdn0,$xdn4,$xdn4,$beperm
1147
+
1148
+ ${UCMP}i $len,0x40
1149
+ blt Ltail_vsx_8x_1
1150
+
1151
+ lvx_4w $xr0,$x00,$inp
1152
+ lvx_4w $xt1,$x10,$inp
1153
+ lvx_4w $xt2,$x20,$inp
1154
+ lvx_4w $xt3,$x30,$inp
1155
+
1156
+ vxor $xr0,$xr0,$xan0
1157
+ vxor $xt1,$xt1,$xbn0
1158
+ vxor $xt2,$xt2,$xcn0
1159
+ vxor $xt3,$xt3,$xdn0
1160
+
1161
+ stvx_4w $xr0,$x00,$out
1162
+ stvx_4w $xt1,$x10,$out
1163
+ addi $inp,$inp,0x40
1164
+ stvx_4w $xt2,$x20,$out
1165
+ subi $len,$len,0x40
1166
+ stvx_4w $xt3,$x30,$out
1167
+ addi $out,$out,0x40
1168
+ beq Ldone_vsx_8x
1169
+
1170
+ vadduwm $xan0,$xa5,@K[0]
1171
+ vadduwm $xbn0,$xb5,@K[1]
1172
+ vadduwm $xcn0,$xcn5,@K[2]
1173
+ vadduwm $xdn0,$xdn5,@K[3]
1174
+
1175
+ be?vperm $xan0,$xan0,$xan0,$beperm
1176
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm
1177
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm
1178
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm
1179
+
1180
+ ${UCMP}i $len,0x40
1181
+ blt Ltail_vsx_8x_1
1182
+
1183
+ lvx_4w $xr0,$x00,$inp
1184
+ lvx_4w $xt1,$x10,$inp
1185
+ lvx_4w $xt2,$x20,$inp
1186
+ lvx_4w $xt3,$x30,$inp
1187
+
1188
+ vxor $xr0,$xr0,$xan0
1189
+ vxor $xt1,$xt1,$xbn0
1190
+ vxor $xt2,$xt2,$xcn0
1191
+ vxor $xt3,$xt3,$xdn0
1192
+
1193
+ stvx_4w $xr0,$x00,$out
1194
+ stvx_4w $xt1,$x10,$out
1195
+ addi $inp,$inp,0x40
1196
+ stvx_4w $xt2,$x20,$out
1197
+ subi $len,$len,0x40
1198
+ stvx_4w $xt3,$x30,$out
1199
+ addi $out,$out,0x40
1200
+ beq Ldone_vsx_8x
1201
+
1202
+ vadduwm $xan0,$xa6,@K[0]
1203
+ vadduwm $xbn0,$xb6,@K[1]
1204
+ vadduwm $xcn0,$xcn6,@K[2]
1205
+ vadduwm $xdn0,$xdn6,@K[3]
1206
+
1207
+ be?vperm $xan0,$xan0,$xan0,$beperm
1208
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm
1209
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm
1210
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm
1211
+
1212
+ ${UCMP}i $len,0x40
1213
+ blt Ltail_vsx_8x_1
1214
+
1215
+ lvx_4w $xr0,$x00,$inp
1216
+ lvx_4w $xt1,$x10,$inp
1217
+ lvx_4w $xt2,$x20,$inp
1218
+ lvx_4w $xt3,$x30,$inp
1219
+
1220
+ vxor $xr0,$xr0,$xan0
1221
+ vxor $xt1,$xt1,$xbn0
1222
+ vxor $xt2,$xt2,$xcn0
1223
+ vxor $xt3,$xt3,$xdn0
1224
+
1225
+ stvx_4w $xr0,$x00,$out
1226
+ stvx_4w $xt1,$x10,$out
1227
+ addi $inp,$inp,0x40
1228
+ stvx_4w $xt2,$x20,$out
1229
+ subi $len,$len,0x40
1230
+ stvx_4w $xt3,$x30,$out
1231
+ addi $out,$out,0x40
1232
+ beq Ldone_vsx_8x
1233
+
1234
+ vadduwm $xan0,$xa7,@K[0]
1235
+ vadduwm $xbn0,$xb7,@K[1]
1236
+ vadduwm $xcn0,$xcn7,@K[2]
1237
+ vadduwm $xdn0,$xdn7,@K[3]
1238
+
1239
+ be?vperm $xan0,$xan0,$xan0,$beperm
1240
+ be?vperm $xbn0,$xbn0,$xbn0,$beperm
1241
+ be?vperm $xcn0,$xcn0,$xcn0,$beperm
1242
+ be?vperm $xdn0,$xdn0,$xdn0,$beperm
1243
+
1244
+ ${UCMP}i $len,0x40
1245
+ blt Ltail_vsx_8x_1
1246
+
1247
+ lvx_4w $xr0,$x00,$inp
1248
+ lvx_4w $xt1,$x10,$inp
1249
+ lvx_4w $xt2,$x20,$inp
1250
+ lvx_4w $xt3,$x30,$inp
1251
+
1252
+ vxor $xr0,$xr0,$xan0
1253
+ vxor $xt1,$xt1,$xbn0
1254
+ vxor $xt2,$xt2,$xcn0
1255
+ vxor $xt3,$xt3,$xdn0
1256
+
1257
+ stvx_4w $xr0,$x00,$out
1258
+ stvx_4w $xt1,$x10,$out
1259
+ addi $inp,$inp,0x40
1260
+ stvx_4w $xt2,$x20,$out
1261
+ subi $len,$len,0x40
1262
+ stvx_4w $xt3,$x30,$out
1263
+ addi $out,$out,0x40
1264
+ beq Ldone_vsx_8x
1265
+
1266
+ mtctr r0
1267
+ bne Loop_outer_vsx_8x
1268
+
1269
+Ldone_vsx_8x:
1270
+ lwz r12,`$FRAME-4`($sp) # pull vrsave
1271
+ li r10,`15+$LOCALS+64`
1272
+ li r11,`31+$LOCALS+64`
1273
+ $POP r0, `$FRAME+$LRSAVE`($sp)
1274
+ mtspr 256,r12 # restore vrsave
1275
+ lvx v24,r10,$sp
1276
+ addi r10,r10,32
1277
+ lvx v25,r11,$sp
1278
+ addi r11,r11,32
1279
+ lvx v26,r10,$sp
1280
+ addi r10,r10,32
1281
+ lvx v27,r11,$sp
1282
+ addi r11,r11,32
1283
+ lvx v28,r10,$sp
1284
+ addi r10,r10,32
1285
+ lvx v29,r11,$sp
1286
+ addi r11,r11,32
1287
+ lvx v30,r10,$sp
1288
+ lvx v31,r11,$sp
1289
+ mtlr r0
1290
+ addi $sp,$sp,$FRAME
1291
+ blr
1292
+
1293
+.align 4
1294
+Ltail_vsx_8x:
1295
+ addi r11,$sp,$LOCALS
1296
+ mtctr $len
1297
+ stvx_4w $xa0,$x00,r11 # offload block to stack
1298
+ stvx_4w $xb0,$x10,r11
1299
+ stvx_4w $xc0,$x20,r11
1300
+ stvx_4w $xd0,$x30,r11
1301
+ subi r12,r11,1 # prepare for *++ptr
1302
+ subi $inp,$inp,1
1303
+ subi $out,$out,1
1304
+ bl Loop_tail_vsx_8x
1305
+Ltail_vsx_8x_1:
1306
+ addi r11,$sp,$LOCALS
1307
+ mtctr $len
1308
+ stvx_4w $xan0,$x00,r11 # offload block to stack
1309
+ stvx_4w $xbn0,$x10,r11
1310
+ stvx_4w $xcn0,$x20,r11
1311
+ stvx_4w $xdn0,$x30,r11
1312
+ subi r12,r11,1 # prepare for *++ptr
1313
+ subi $inp,$inp,1
1314
+ subi $out,$out,1
1315
+ bl Loop_tail_vsx_8x
1316
+
1317
+Loop_tail_vsx_8x:
1318
+ lbzu r6,1(r12)
1319
+ lbzu r7,1($inp)
1320
+ xor r6,r6,r7
1321
+ stbu r6,1($out)
1322
+ bdnz Loop_tail_vsx_8x
1323
+
1324
+ stvx_4w $K[0],$x00,r11 # wipe copy of the block
1325
+ stvx_4w $K[0],$x10,r11
1326
+ stvx_4w $K[0],$x20,r11
1327
+ stvx_4w $K[0],$x30,r11
1328
+
1329
+ b Ldone_vsx_8x
1330
+ .long 0
1331
+ .byte 0,12,0x04,1,0x80,0,5,0
1332
+ .long 0
1333
+.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
1334
+___
1335
+}}}
1336
+
1337
+
1338
+$code.=<<___;
1339
+.align 5
1340
+Lconsts:
1341
+ mflr r0
1342
+ bcl 20,31,\$+4
1343
+ mflr r12 #vvvvv "distance between . and Lsigma
1344
+ addi r12,r12,`64-8`
1345
+ mtlr r0
1346
+ blr
1347
+ .long 0
1348
+ .byte 0,12,0x14,0,0,0,0,0
1349
+ .space `64-9*4`
1350
+Lsigma:
1351
+ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574
1352
+ .long 1,0,0,0
1353
+ .long 2,0,0,0
1354
+ .long 3,0,0,0
1355
+ .long 4,0,0,0
1356
+___
1357
+$code.=<<___ if ($LITTLE_ENDIAN);
1358
+ .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
1359
+ .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
1360
+___
1361
+$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words
1362
+ .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
1363
+ .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
1364
+___
1365
+$code.=<<___;
1366
+ .long 0x61707865,0x61707865,0x61707865,0x61707865
1367
+ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e
1368
+ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32
1369
+ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574
1370
+ .long 0,1,2,3
1371
+ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
1372
+.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
1373
+.align 2
1374
+___
1375
+
1376
+foreach (split("\n",$code)) {
1377
+ s/\`([^\`]*)\`/eval $1/ge;
1378
+
1379
+ # instructions prefixed with '?' are endian-specific and need
1380
+ # to be adjusted accordingly...
1381
+ if ($flavour !~ /le$/) { # big-endian
1382
+ s/be\?// or
1383
+ s/le\?/#le#/ or
1384
+ s/\?lvsr/lvsl/ or
1385
+ s/\?lvsl/lvsr/ or
1386
+ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
1387
+ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;
1388
+ } else { # little-endian
1389
+ s/le\?// or
1390
+ s/be\?/#be#/ or
1391
+ s/\?([a-z]+)/$1/ or
1392
+ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;
1393
+ }
1394
+
1395
+ print $_,"\n";
1396
+}
1397
+
1398
+close STDOUT or die "error closing STDOUT: $!";
1399
--- a/crypto/chacha/build.info
+++ b/crypto/chacha/build.info

$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl $(PERLASM_SCHEME)
GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl $(PERLASM_SCHEME)
+GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl $(PERLASM_SCHEME)
GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl $(PERLASM_SCHEME)
INCLUDE[chacha-armv4.o]=..
GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl $(PERLASM_SCHEME)
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl

$dm = oct($dm) if ($dm =~ /^0/);
" .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
};
+my $vxxlor = sub { # xxlor
+ my ($f, $vrt, $vra, $vrb) = @_;
+ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6;
+};
+my $vxxlorc = sub { # xxlor
+ my ($f, $vrt, $vra, $vrb) = @_;
+ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1;
+};

# PowerISA 2.07 stuff
sub vcrypto_op {

};
my $vmsumudm = sub { vfour_vsr(@_, 35); };

+# PowerISA 3.1 stuff
+my $brd = sub {
+ my ($f, $ra, $rs) = @_;
+ " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1);
+};
+my $vsrq = sub { vcrypto_op(@_, 517); };
+
+
+
while($line=<>) {

$line =~ s|[#!;].*$||; # get rid of asm-style comments...
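The vxxlor/vxxlorc handlers above hand-assemble the VSX xxlor (VSX logical OR,
used here as a register move) instruction word: primary opcode 60, extended
opcode 146 in XX3 form, with the low three bits (the AX/BX/TX extension bits)
selecting which operands live in the upper, vector half of the VSX register
file -- 6 for vxxlor and 1 for vxxlorc, which is how the 8x code spills vector
registers to VSRs 0-31 and restores them. A C restatement of the same
encoding (a sketch, not part of the patch):

#include <stdint.h>

/* Build the 32-bit instruction word exactly as the perlasm handlers do;
 * "ext" is 6 for vxxlor and 1 for vxxlorc. */
static uint32_t xxlor_word(unsigned int xt, unsigned int xa,
                           unsigned int xb, unsigned int ext)
{
    return (60u << 26) | (xt << 21) | (xa << 16) | (xb << 11)
           | (146u << 3) | ext;
}
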
--- a/crypto/ppc_arch.h
+++ b/crypto/ppc_arch.h

# define PPC_MADD300 (1<<4)
# define PPC_MFTB (1<<5)
# define PPC_MFSPR268 (1<<6)
+# define PPC_BRD31 (1<<7)

#endif
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c

void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
size_t len, const unsigned int key[8],
const unsigned int counter[4]);
+void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
+ size_t len, const unsigned int key[8],
+ const unsigned int counter[4]);
void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
size_t len, const unsigned int key[8],
const unsigned int counter[4])
{
- OPENSSL_ppccap_P & PPC_CRYPTO207
- ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
- : OPENSSL_ppccap_P & PPC_ALTIVEC
- ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
- : ChaCha20_ctr32_int(out, inp, len, key, counter);
+ OPENSSL_ppccap_P & PPC_BRD31
+ ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter)
+ :OPENSSL_ppccap_P & PPC_CRYPTO207
+ ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
+ : OPENSSL_ppccap_P & PPC_ALTIVEC
+ ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
+ : ChaCha20_ctr32_int(out, inp, len, key, counter);
}
#endif
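The replaced ternary chain reads more naturally as the if/else ladder below
(an equivalent restatement, not a proposed change); PPC_BRD31 is set further
down when the kernel advertises Power ISA 3.1 (HWCAP2 bit 18, i.e. POWER10 or
later), so the new P10 path takes precedence over the POWER8 VSX, AltiVec and
integer fallbacks:

void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
                    size_t len, const unsigned int key[8],
                    const unsigned int counter[4])
{
    if (OPENSSL_ppccap_P & PPC_BRD31)           /* POWER10: 8x-lane VSX */
        ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_CRYPTO207)  /* POWER8/9: 4x VSX */
        ChaCha20_ctr32_vsx(out, inp, len, key, counter);
    else if (OPENSSL_ppccap_P & PPC_ALTIVEC)    /* AltiVec only */
        ChaCha20_ctr32_vmx(out, inp, len, key, counter);
    else                                        /* generic integer code */
        ChaCha20_ctr32_int(out, inp, len, key, counter);
}
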
void OPENSSL_altivec_probe(void);
void OPENSSL_crypto207_probe(void);
void OPENSSL_madd300_probe(void);
+void OPENSSL_brd31_probe(void);

long OPENSSL_rdtsc_mftb(void);
long OPENSSL_rdtsc_mfspr268(void);

#define HWCAP2 26 /* AT_HWCAP2 */
#define HWCAP_VEC_CRYPTO (1U << 25)
#define HWCAP_ARCH_3_00 (1U << 23)
+#define HWCAP_ARCH_3_1 (1U << 18)

# if defined(__GNUC__) && __GNUC__>=2
__attribute__ ((constructor))

if (__power_set(0xffffffffU<<17)) /* POWER9 and later */
OPENSSL_ppccap_P |= PPC_MADD300;

+ if (__power_set(0xffffffffU<<18)) /* POWER10 and later */
+ OPENSSL_ppccap_P |= PPC_BRD31;
+
return;
# endif
#endif

if (hwcap2 & HWCAP_ARCH_3_00) {
OPENSSL_ppccap_P |= PPC_MADD300;
}
+
+ if (hwcap2 & HWCAP_ARCH_3_1) {
+ OPENSSL_ppccap_P |= PPC_BRD31;
+ }
}
#endif

--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl

.long 0
.byte 0,12,0x14,0,0,0,0,0

+.globl .OPENSSL_brd31_probe
+.align 4
+.OPENSSL_brd31_probe:
+ xor r0,r0,r0
+ brd r3,r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
+
+
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu: