File 0002-crypto-chacha-asm-chacha-s390x.pl-add-vx-code-path.patch of Package openssl-1_1
From f760137b2144740916afd9ff381451fa16c710de Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 4 Aug 2018 00:10:06 +0200
Subject: [PATCH] crypto/chacha/asm/chacha-s390x.pl: add vx code path.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>

Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6919)
---
 crypto/chacha/asm/chacha-s390x.pl | 816 ++++++++++++++++++++----------
 crypto/chacha/build.info | 1 +
 2 files changed, 558 insertions(+), 259 deletions(-)

Index: openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl
===================================================================
--- openssl-1.1.1c.orig/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:15:57.271195550 +0200
+++ openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:16:43.787489780 +0200

#
# 3 times faster than compiler-generated code.

-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path.
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+
+my $flavour = shift;

+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
 $SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
 $SIZE_T=8;
- $g="g";
}

+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- $code .= "\t$opcode\t".join(',',@_)."\n";
-}

my $sp="%r15";
-
my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
+my @v=map("%v$_",(16..31));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);

# Consider order in which variables are addressed by their
# index:

# 'c' stores and loads in the middle, but none in the beginning
# or end.

- (
- "&alr (@x[$a0],@x[$b0])", # Q1
- "&alr (@x[$a1],@x[$b1])", # Q2
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],16)",
- "&rll (@x[$d1],@x[$d1],16)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],12)",
- "&rll (@x[$b1],@x[$b1],12)",
-
- "&alr (@x[$a0],@x[$b0])",
- "&alr (@x[$a1],@x[$b1])",
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],8)",
- "&rll (@x[$d1],@x[$d1],8)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],7)",
- "&rll (@x[$b1],@x[$b1],7)",
-
- "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
- "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
- "&alr (@x[$a2],@x[$b2])", # Q3
- "&alr (@x[$a3],@x[$b3])", # Q4
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],16)",
- "&rll (@x[$d3],@x[$d3],16)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],12)",
- "&rll (@x[$b3],@x[$b3],12)",
-
- "&alr (@x[$a2],@x[$b2])",
- "&alr (@x[$a3],@x[$b3])",
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],8)",
- "&rll (@x[$d3],@x[$d3],8)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],7)",
- "&rll (@x[$b3],@x[$b3],7)"
- );
-}
-
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
- lt${g}r $len,$len # $len==0?
- bzr %r14
- a${g}hi $len,-64
- l${g}hi %r1,-$frame
- stm${g} %r6,%r15,`6*$SIZE_T`($sp)
- sl${g}r $out,$inp # difference
- la $len,0($inp,$len) # end of input minus 64
- larl %r7,.Lsigma
- lgr %r0,$sp
- la $sp,0(%r1,$sp)
- st${g} %r0,0($sp)
-
- lmg %r8,%r11,0($key) # load key
- lmg %r12,%r13,0($counter) # load counter
- lmg %r6,%r7,0(%r7) # load sigma constant
-
- la %r14,0($inp)
- st${g} $out,$frame+3*$SIZE_T($sp)
- st${g} $len,$frame+4*$SIZE_T($sp)
- stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
- srlg @x[12],%r12,32 # 32-bit counter value
- j .Loop_outer
-
-.align 16
-.Loop_outer:
- lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
- lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
- lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
- stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
- lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
- st @x[12],$stdframe+4*12($sp) # save counter
- st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
- lhi %r14,10
- j .Loop
-
-.align 4
-.Loop:
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- brct %r14,.Loop
-
- l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
- stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
- lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
- al @x[0],$stdframe+4*0($sp) # accumulate key schedule
- al @x[1],$stdframe+4*1($sp)
- al @x[2],$stdframe+4*2($sp)
- al @x[3],$stdframe+4*3($sp)
- al @x[4],$stdframe+4*4($sp)
- al @x[5],$stdframe+4*5($sp)
- al @x[6],$stdframe+4*6($sp)
- al @x[7],$stdframe+4*7($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lrvr @x[4],@x[4]
- lrvr @x[5],@x[5]
- lrvr @x[6],@x[6]
- lrvr @x[7],@x[7]
- al @x[12],$stdframe+4*12($sp)
- al @x[13],$stdframe+4*13($sp)
- al @x[14],$stdframe+4*14($sp)
- al @x[15],$stdframe+4*15($sp)
- lrvr @x[12],@x[12]
- lrvr @x[13],@x[13]
- lrvr @x[14],@x[14]
- lrvr @x[15],@x[15]
-
- la @t[0],0(@t[0],%r14) # reconstruct output pointer
- cl${g}r %r14,@t[1]
- jh .Ltail
-
- x @x[0],4*0(%r14) # xor with input
- x @x[1],4*1(%r14)
- st @x[0],4*0(@t[0]) # store output
- x @x[2],4*2(%r14)
- st @x[1],4*1(@t[0])
- x @x[3],4*3(%r14)
- st @x[2],4*2(@t[0])
- x @x[4],4*4(%r14)
- st @x[3],4*3(@t[0])
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
- x @x[5],4*5(%r14)
- st @x[4],4*4(@t[0])
- x @x[6],4*6(%r14)
- al @x[0],$stdframe+4*8($sp)
- st @x[5],4*5(@t[0])
- x @x[7],4*7(%r14)
- al @x[1],$stdframe+4*9($sp)
- st @x[6],4*6(@t[0])
- x @x[12],4*12(%r14)
- al @x[2],$stdframe+4*10($sp)
- st @x[7],4*7(@t[0])
- x @x[13],4*13(%r14)
- al @x[3],$stdframe+4*11($sp)
- st @x[12],4*12(@t[0])
- x @x[14],4*14(%r14)
- st @x[13],4*13(@t[0])
- x @x[15],4*15(%r14)
- st @x[14],4*14(@t[0])
- lrvr @x[0],@x[0]
- st @x[15],4*15(@t[0])
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lhi @x[12],1
- x @x[0],4*8(%r14)
- al @x[12],$stdframe+4*12($sp) # increment counter
- x @x[1],4*9(%r14)
- st @x[0],4*8(@t[0])
- x @x[2],4*10(%r14)
- st @x[1],4*9(@t[0])
- x @x[3],4*11(%r14)
- st @x[2],4*10(@t[0])
- st @x[3],4*11(@t[0])
-
- cl${g}r %r14,@t[1] # done yet?
- la %r14,64(%r14)
- jl .Loop_outer
-
-.Ldone:
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
- stmg %r0,%r3,$stdframe+4*12($sp)
-
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
- br %r14
-
-.align 16
-.Ltail:
- la @t[1],64($t[1])
- stm @x[0],@x[7],$stdframe+4*0($sp)
- sl${g}r @t[1],%r14
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
- l${g}hi @x[6],0
- stm @x[12],@x[15],$stdframe+4*12($sp)
- al @x[0],$stdframe+4*8($sp)
- al @x[1],$stdframe+4*9($sp)
- al @x[2],$stdframe+4*10($sp)
- al @x[3],$stdframe+4*11($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- stm @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
- llgc @x[4],0(@x[6],%r14)
- llgc @x[5],$stdframe(@x[6],$sp)
- xr @x[5],@x[4]
- stc @x[5],0(@x[6],@t[0])
- la @x[6],1(@x[6])
- brct @t[1],.Loop_tail
-
- j .Ldone
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
+ alr (@x[$a0],@x[$b0]); # Q1
+ alr (@x[$a1],@x[$b1]); # Q2
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],16);
+ rll (@x[$d1],@x[$d1],16);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],12);
+ rll (@x[$b1],@x[$b1],12);
+
+ alr (@x[$a0],@x[$b0]);
+ alr (@x[$a1],@x[$b1]);
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],8);
+ rll (@x[$d1],@x[$d1],8);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],7);
+ rll (@x[$b1],@x[$b1],7);
+
+ stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
+ lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
+
+ alr (@x[$a2],@x[$b2]); # Q3
+ alr (@x[$a3],@x[$b3]); # Q4
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],16);
+ rll (@x[$d3],@x[$d3],16);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],12);
+ rll (@x[$b3],@x[$b3],12);
+
+ alr (@x[$a2],@x[$b2]);
+ alr (@x[$a3],@x[$b3]);
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],8);
+ rll (@x[$d3],@x[$d3],8);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],7);
+ rll (@x[$b3],@x[$b3],7);
+}
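For reference, a minimal plain-Perl model of the quarter-round that each alr/xr/rll group above implements (add, xor, rotate by 16/12/8/7); quarter_round and rotl32 are illustrative names, not part of the patch:

    use strict;
    # one ChaCha20 quarter-round on a 16-word state held in @$s;
    # the assembly above interleaves two of these at a time and keeps
    # the 'c' words in memory rather than in registers
    sub rotl32 { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n))) & 0xffffffff }
    sub quarter_round {
        my ($s,$a,$b,$c,$d)=@_;
        $s->[$a]=($s->[$a]+$s->[$b])&0xffffffff; $s->[$d]=rotl32($s->[$d]^$s->[$a],16);
        $s->[$c]=($s->[$c]+$s->[$d])&0xffffffff; $s->[$b]=rotl32($s->[$b]^$s->[$c],12);
        $s->[$a]=($s->[$a]+$s->[$b])&0xffffffff; $s->[$d]=rotl32($s->[$d]^$s->[$a],8);
        $s->[$c]=($s->[$c]+$s->[$d])&0xffffffff; $s->[$b]=rotl32($s->[$b]^$s->[$c],7);
    }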
+
+sub VX_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],16);
+ verllf (@v[$d1],@v[$d1],16);
+ verllf (@v[$d2],@v[$d2],16);
+ verllf (@v[$d3],@v[$d3],16);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],12);
+ verllf (@v[$b1],@v[$b1],12);
+ verllf (@v[$b2],@v[$b2],12);
+ verllf (@v[$b3],@v[$b3],12);
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],8);
+ verllf (@v[$d1],@v[$d1],8);
+ verllf (@v[$d2],@v[$d2],8);
+ verllf (@v[$d3],@v[$d3],8);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],7);
+ verllf (@v[$b1],@v[$b1],7);
+ verllf (@v[$b2],@v[$b2],7);
+ verllf (@v[$b3],@v[$b3],7);
+}
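The map(($_&~3)+(($_+1)&3),...) lines in both ROUND and VX_ROUND derive the remaining quarter-round index quadruples from the first one; a standalone snippet to see what they expand to (the seed values match the callers below):

    my @q=(0,4,8,12);      # column-round seed; try (0,5,10,15) for the diagonal round
    for (1..3) {
        @q=map { ($_&~3)+(($_+1)&3) } @q;
        print "@q\n";
    }
    # column seed prints: 1 5 9 13 / 2 6 10 14 / 3 7 11 15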
+
+PERLASM_BEGIN($output);

-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
+INCLUDE ("s390x_arch.h");
+TEXT ();

- print $_,"\n";
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+# const unsigned int key[8], const unsigned int counter[4])
+{
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+
+# VX CODE PATH
+{
+my $off=$z*8*16+8; # offset(initial state)
+my $frame=$stdframe+4*16+$off;
+
+GLOBL ("ChaCha20_ctr32");
+TYPE ("ChaCha20_ctr32","\@function");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32");
+ larl ("%r1","OPENSSL_s390xcap_P");
+
+ lghi ("%r0",64);
+&{$z? \&cgr:\&cr} ($len,"%r0");
+ jle ("_s390x_chacha_novx");
+
+ lg ("%r0","S390X_STFLE+16(%r1)");
+ tmhh ("%r0",0x4000); # check for vector facility
+ jz ("_s390x_chacha_novx");
+
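How the tmhh operands above can be derived, assuming the usual z/Architecture STFLE numbering (bit 0 is the MSB of the first doubleword; the vector facility is bit 129):

    my $bit  = 129;                   # vector facility number
    my $off  = 8*int($bit/64);        # doubleword offset 16, hence S390X_STFLE+16
    my $mask = 1 << (63-($bit%64));   # bit within that doubleword
    printf "offset=%d, high-halfword mask=0x%04x\n", $off, $mask>>48;
    # prints: offset=16, high-halfword mask=0x4000 (tmhh tests register bits 0-15)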
+if (!$z) {
+ llgfr ($len,$len);
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+ lghi ("%r1",-$frame);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate stack frame
+
+ larl ("%r7",".Lsigma");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
+
+ vstm ("%v8","%v15","8($sp)") if ($z);
+
+ vlm ("%v1","%v2","0($key)"); # load key
+ vl ("%v0","0(%r7)"); # load sigma constant
+ vl ("%v3","0($counter)"); # load iv (counter||nonce)
+ l ("%r0","0($counter)"); # load counter
+ vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
+
+ srlg ("%r1",$len,8);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
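The driver-loop bookkeeping around this point, modeled in plain Perl ($len is the byte count passed in %r4; the example value is illustrative):

    my $len   = 1000;            # example input length
    my $iters = $len >> 8;       # srlg %r1,$len,8: number of 4-block (256-byte) passes
    my $rem   = $len & 0xff;     # ngr $len,0xff at .Lvx_4x_done: 0..255 leftover bytes
    print "$iters passes, $rem leftover bytes\n";   # 3 passes, 232 leftover bytes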
+ALIGN (16); # process 4 64-byte blocks
+LABEL (".Lvx_4x");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # copy initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_4x");
+
+ALIGN (16);
+LABEL (".Loop_vx_4x");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_4x");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","0($out)"); # store out
+
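What the vmrhf/vmrlf plus vperm sequence computes, sketched in plain Perl: after the rounds, vector register %v(16+i) holds word i of all four blocks (one block per lane), so serialization is in effect a 16x4 transpose (dummy data, illustration only):

    my @v = map { my $w=$_; [ map { "w${w}b$_" } 0..3 ] } 0..15;  # $v[$word][$lane]
    for my $j (0..3) {                     # block $j = lane $j of every state word
        my @block = map { $v[$_][$j] } 0..15;
        print "@block[0..3] ...\n";        # first four words of block $j
    }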
+ vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","128($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","128($out)"); # store out
+
+ ahi ("%r0",4);
+ st ("%r0","48+$off($sp)"); # update initial state
+
+ la ($inp,"256($inp)");
+ la ($out,"256($out)");
+ brctg ("%r1",".Lvx_4x");
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ lghi ("%r1",0xff);
+ ngr ($len,"%r1");
+ jnz (".Lvx_rem");
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vzero ("%v$_") for (16..31); # wipe ks and key copy
+ vstm ("%v16","%v17","16+$off($sp)");
+ vlm ("%v8","%v15","8($sp)") if ($z);
+
+ la ($sp,"$frame($sp)");
+&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+if (!$z) {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+ vzero ("%v$_") for (8..15);
+}
+ br ("%r14");
+ALIGN (16);
+LABEL (".Lvx_rem");
+ lhi ("%r0",64);
+
+ sr ($len,"%r0");
+ brc (2,".Lvx_rem_g64"); # cc==2?
+
+ lghi ("%r1",-$stdframe);
+
+ la ($counter,"48+$off($sp)"); # load updated iv
+ ar ($len,"%r0"); # restore len
+
+ lgr ("%r7",$counter);
+&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
+ la ($sp,"0(%r1,$sp)");
+
+ bras ("%r14","_s390x_chacha_novx");
+
+ la ($sp,"$stdframe($sp)");
+&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
+ lgr ($counter,"%r7");
+ j (".Lvx_done");
+
+ALIGN (16);
+LABEL (".Lvx_rem_g64");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # state = initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_rem");
+
+ALIGN (16);
+LABEL (".Loop_vx_rem");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_rem");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==4?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+12),"%v0","%v1","%v6");
+ vperm ("%v".($_+8),"%v0","%v1","%v7");
+}
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==4?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ vlr ("%v".($_+4),"%v$_") for (8..11);
+ j (".Lvx_tail");
+
+ALIGN (16);
+LABEL (".Lvx_tail");
+ ar ($len,"%r0"); # restore $len
+ ahi ($len,-1);
+
+ lhi ("%r0",16);
+for (0..2) {
+ vll ("%v0",$len,($_*16)."($inp)");
+ vx ("%v0","%v0","%v".($_+12));
+ vstl ("%v0",$len,($_*16)."($out)");
+ sr ($len,"%r0");
+ brc (4,".Lvx_done"); # cc==4?
+}
+ vll ("%v0",$len,"3*16($inp)");
+ vx ("%v0","%v0","%v15");
+ vstl ("%v0",$len,"3*16($out)");
+ j (".Lvx_done");
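The .Lvx_tail length handling, modeled in plain Perl: vll/vstl take the highest byte index rather than a byte count, so $len is decremented once up front and then reduced by 16 per chunk until the subtraction goes negative (assume a tail of 1..64 bytes; 37 is an example):

    my $len = 37;                                  # example tail length
    $len -= 1;                                     # ahi $len,-1: vll/vstl want length-1
    my $stores = 0;
    while ($len >= 0) { $stores++; $len -= 16 }    # the sr/brc pattern above
    print "$stores partial/full 16-byte stores\n"; # 3 stores cover the 37 bytes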
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+# NOVX CODE PATH
+{
+my $frame=$stdframe+4*20;
+
+TYPE ("_s390x_chacha_novx","\@function");
+ALIGN (32);
+LABEL ("_s390x_chacha_novx");
+&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
+ bzr ("%r14");
+&{$z? \&aghi:\&ahi} ($len,-64);
+&{$z? \&lghi:\&lhi} ("%r1",-$frame);
+&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
+&{$z? \&slgr:\&slr} ($out,$inp); # difference
+ la ($len,"0($inp,$len)"); # end of input minus 64
+ larl ("%r7",".Lsigma");
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)");
+
+ lmg ("%r8","%r11","0($key)"); # load key
+ lmg ("%r12","%r13","0($counter)"); # load counter
+ lmg ("%r6","%r7","0(%r7)"); # load sigma constant
+
+ la ("%r14","0($inp)");
+&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
+&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
+ stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+ srlg (@x[12],"%r12",32); # 32-bit counter value
+ j (".Loop_outer");
+
+ALIGN (16);
+LABEL (".Loop_outer");
+ lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
+ lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
+ lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
+ stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+ lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
+ st (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+ lhi ("%r14",10);
+ j (".Loop");
+
+ALIGN (4);
+LABEL (".Loop");
+ ROUND (0, 4, 8,12);
+ ROUND (0, 5,10,15);
+ brct ("%r14",".Loop");
+
+&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+ stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
+
+ al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
+ al (@x[1],"$stdframe+4*1($sp)");
+ al (@x[2],"$stdframe+4*2($sp)");
+ al (@x[3],"$stdframe+4*3($sp)");
+ al (@x[4],"$stdframe+4*4($sp)");
+ al (@x[5],"$stdframe+4*5($sp)");
+ al (@x[6],"$stdframe+4*6($sp)");
+ al (@x[7],"$stdframe+4*7($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lrvr (@x[4],@x[4]);
+ lrvr (@x[5],@x[5]);
+ lrvr (@x[6],@x[6]);
+ lrvr (@x[7],@x[7]);
+ al (@x[12],"$stdframe+4*12($sp)");
+ al (@x[13],"$stdframe+4*13($sp)");
+ al (@x[14],"$stdframe+4*14($sp)");
+ al (@x[15],"$stdframe+4*15($sp)");
+ lrvr (@x[12],@x[12]);
+ lrvr (@x[13],@x[13]);
+ lrvr (@x[14],@x[14]);
+ lrvr (@x[15],@x[15]);
+
762
+ la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
763
+&{$z? \&clgr:\&clr} ("%r14",@t[1]);
764
+ jh (".Ltail");
765
+
766
+ x (@x[0],"4*0(%r14)"); # xor with input
767
+ x (@x[1],"4*1(%r14)");
768
+ st (@x[0],"4*0(@t[0])"); # store output
769
+ x (@x[2],"4*2(%r14)");
770
+ st (@x[1],"4*1(@t[0])");
771
+ x (@x[3],"4*3(%r14)");
772
+ st (@x[2],"4*2(@t[0])");
773
+ x (@x[4],"4*4(%r14)");
774
+ st (@x[3],"4*3(@t[0])");
775
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
776
+ x (@x[5],"4*5(%r14)");
777
+ st (@x[4],"4*4(@t[0])");
778
+ x (@x[6],"4*6(%r14)");
779
+ al (@x[0],"$stdframe+4*8($sp)");
780
+ st (@x[5],"4*5(@t[0])");
781
+ x (@x[7],"4*7(%r14)");
782
+ al (@x[1],"$stdframe+4*9($sp)");
783
+ st (@x[6],"4*6(@t[0])");
784
+ x (@x[12],"4*12(%r14)");
785
+ al (@x[2],"$stdframe+4*10($sp)");
786
+ st (@x[7],"4*7(@t[0])");
787
+ x (@x[13],"4*13(%r14)");
788
+ al (@x[3],"$stdframe+4*11($sp)");
789
+ st (@x[12],"4*12(@t[0])");
790
+ x (@x[14],"4*14(%r14)");
791
+ st (@x[13],"4*13(@t[0])");
792
+ x (@x[15],"4*15(%r14)");
793
+ st (@x[14],"4*14(@t[0])");
794
+ lrvr (@x[0],@x[0]);
795
+ st (@x[15],"4*15(@t[0])");
796
+ lrvr (@x[1],@x[1]);
797
+ lrvr (@x[2],@x[2]);
798
+ lrvr (@x[3],@x[3]);
799
+ lhi (@x[12],1);
800
+ x (@x[0],"4*8(%r14)");
801
+ al (@x[12],"$stdframe+4*12($sp)"); # increment counter
802
+ x (@x[1],"4*9(%r14)");
803
+ st (@x[0],"4*8(@t[0])");
804
+ x (@x[2],"4*10(%r14)");
805
+ st (@x[1],"4*9(@t[0])");
806
+ x (@x[3],"4*11(%r14)");
807
+ st (@x[2],"4*10(@t[0])");
808
+ st (@x[3],"4*11(@t[0])");
809
+
810
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
811
+ la ("%r14","64(%r14)");
812
+ jl (".Loop_outer");
813
+
814
+LABEL (".Ldone");
815
+ xgr ("%r0","%r0");
816
+ xgr ("%r1","%r1");
817
+ xgr ("%r2","%r2");
818
+ xgr ("%r3","%r3");
819
+ stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
820
+ stmg ("%r0","%r3","$stdframe+4*12($sp)");
821
+
822
+&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
823
+ br ("%r14");
824
+
825
+ALIGN (16);
826
+LABEL (".Ltail");
827
+ la (@t[1],"64($t[1])");
828
+ stm (@x[0],@x[7],"$stdframe+4*0($sp)");
829
+&{$z? \&slgr:\&slr} (@t[1],"%r14");
830
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
831
+&{$z? \&lghi:\&lhi} (@x[6],0);
832
+ stm (@x[12],@x[15],"$stdframe+4*12($sp)");
833
+ al (@x[0],"$stdframe+4*8($sp)");
834
+ al (@x[1],"$stdframe+4*9($sp)");
835
+ al (@x[2],"$stdframe+4*10($sp)");
836
+ al (@x[3],"$stdframe+4*11($sp)");
837
+ lrvr (@x[0],@x[0]);
838
+ lrvr (@x[1],@x[1]);
839
+ lrvr (@x[2],@x[2]);
840
+ lrvr (@x[3],@x[3]);
841
+ stm (@x[0],@x[3],"$stdframe+4*8($sp)");
842
+
843
+LABEL (".Loop_tail");
844
+ llgc (@x[4],"0(@x[6],%r14)");
845
+ llgc (@x[5],"$stdframe(@x[6],$sp)");
846
+ xr (@x[5],@x[4]);
847
+ stc (@x[5],"0(@x[6],@t[0])");
848
+ la (@x[6],"1(@x[6])");
849
+ brct (@t[1],".Loop_tail");
850
+
851
+ j (".Ldone");
852
+SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
853
+}
854
}
855
-close STDOUT or die "error closing STDOUT: $!";
856
+################
857
+
858
+ALIGN (64);
859
+LABEL (".Lsigma");
860
+LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
861
+LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
862
+LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
863
+LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
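The first .Lsigma row is the ChaCha constant "expand 32-byte k" as little-endian 32-bit words; the second holds the per-lane counter offsets (0,1,2,3) added by vaf; the last two are vperm byte-selection patterns that gather words from the two source vectors while byte-reversing each word (big-endian register to little-endian keystream). A quick check of the first row:

    print pack("V4", 0x61707865,0x3320646e,0x79622d32,0x6b206574), "\n";
    # prints: expand 32-byte k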
+ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN (4);
+
+PERLASM_END();