File 0002-crypto-chacha-asm-chacha-s390x.pl-add-vx-code-path.patch of Package openssl-1_1 (Revision eeb624f0720e7a884f41874330f0e28f)
From f760137b2144740916afd9ff381451fa16c710de Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 4 Aug 2018 00:10:06 +0200
Subject: [PATCH] crypto/chacha/asm/chacha-s390x.pl: add vx code path.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>

Reviewed-by: Tim Hudson <tjh@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6919)
---
crypto/chacha/asm/chacha-s390x.pl | 816 ++++++++++++++++++++----------
crypto/chacha/build.info | 1 +
2 files changed, 558 insertions(+), 259 deletions(-)

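The rewritten perlasm below drives the assembler through the perlasm::s390x
module instead of a here-document, and it selects 31-bit or 64-bit
instructions through the "&{$z? \&insn_64:\&insn_31}" dispatch idiom that
recurs throughout the diff. A minimal standalone Perl sketch of that pattern
(the st/stg emitter subs here are simplified stand-ins for the module's real
emitters, not its actual API):

    use strict;

    my $z = 1;                          # 1: 64-bit zSeries ABI, 0: 31-bit S/390 ABI
    sub st  { print "\tst\t",  join(',',@_), "\n"; }   # emit 32-bit store
    sub stg { print "\tstg\t", join(',',@_), "\n"; }   # emit 64-bit store

    # Take a reference to the width-appropriate emitter and call it directly;
    # under the 64-bit ABI this prints "stg %r0,0(%r15)".
    &{$z ? \&stg : \&st} ("%r0", "0(%r15)");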
Index: openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl
===================================================================
--- openssl-1.1.1c.orig/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:15:57.271195550 +0200
+++ openssl-1.1.1c/crypto/chacha/asm/chacha-s390x.pl 2019-06-06 12:16:43.787489780 +0200

#! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy

#
# 3 times faster than compiler-generated code.

-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path.
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+
+my $flavour = shift;

+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}

+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- $code .= "\t$opcode\t".join(',',@_)."\n";
-}

my $sp="%r15";
-
my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
+my @v=map("%v$_",(16..31));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);

# Consider order in which variables are addressed by their
# index:

# 'c' stores and loads in the middle, but none in the beginning
# or end.

- (
- "&alr (@x[$a0],@x[$b0])", # Q1
- "&alr (@x[$a1],@x[$b1])", # Q2
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],16)",
- "&rll (@x[$d1],@x[$d1],16)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],12)",
- "&rll (@x[$b1],@x[$b1],12)",
-
- "&alr (@x[$a0],@x[$b0])",
- "&alr (@x[$a1],@x[$b1])",
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],8)",
- "&rll (@x[$d1],@x[$d1],8)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],7)",
- "&rll (@x[$b1],@x[$b1],7)",
-
- "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
- "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
- "&alr (@x[$a2],@x[$b2])", # Q3
- "&alr (@x[$a3],@x[$b3])", # Q4
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],16)",
- "&rll (@x[$d3],@x[$d3],16)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],12)",
- "&rll (@x[$b3],@x[$b3],12)",
-
- "&alr (@x[$a2],@x[$b2])",
- "&alr (@x[$a3],@x[$b3])",
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],8)",
- "&rll (@x[$d3],@x[$d3],8)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],7)",
- "&rll (@x[$b3],@x[$b3],7)"
- );
-}
-
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
- lt${g}r $len,$len # $len==0?
- bzr %r14
- a${g}hi $len,-64
- l${g}hi %r1,-$frame
- stm${g} %r6,%r15,`6*$SIZE_T`($sp)
- sl${g}r $out,$inp # difference
- la $len,0($inp,$len) # end of input minus 64
- larl %r7,.Lsigma
- lgr %r0,$sp
- la $sp,0(%r1,$sp)
- st${g} %r0,0($sp)
-
- lmg %r8,%r11,0($key) # load key
- lmg %r12,%r13,0($counter) # load counter
- lmg %r6,%r7,0(%r7) # load sigma constant
-
- la %r14,0($inp)
- st${g} $out,$frame+3*$SIZE_T($sp)
- st${g} $len,$frame+4*$SIZE_T($sp)
- stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
- srlg @x[12],%r12,32 # 32-bit counter value
- j .Loop_outer
-
-.align 16
-.Loop_outer:
- lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
- lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
- lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
- stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
- lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
- st @x[12],$stdframe+4*12($sp) # save counter
- st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
- lhi %r14,10
- j .Loop
-
-.align 4
-.Loop:
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- brct %r14,.Loop
-
- l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
- stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
- lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
- al @x[0],$stdframe+4*0($sp) # accumulate key schedule
- al @x[1],$stdframe+4*1($sp)
- al @x[2],$stdframe+4*2($sp)
- al @x[3],$stdframe+4*3($sp)
- al @x[4],$stdframe+4*4($sp)
- al @x[5],$stdframe+4*5($sp)
- al @x[6],$stdframe+4*6($sp)
- al @x[7],$stdframe+4*7($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lrvr @x[4],@x[4]
- lrvr @x[5],@x[5]
- lrvr @x[6],@x[6]
- lrvr @x[7],@x[7]
- al @x[12],$stdframe+4*12($sp)
- al @x[13],$stdframe+4*13($sp)
- al @x[14],$stdframe+4*14($sp)
- al @x[15],$stdframe+4*15($sp)
- lrvr @x[12],@x[12]
- lrvr @x[13],@x[13]
- lrvr @x[14],@x[14]
- lrvr @x[15],@x[15]
-
- la @t[0],0(@t[0],%r14) # reconstruct output pointer
- cl${g}r %r14,@t[1]
- jh .Ltail
-
- x @x[0],4*0(%r14) # xor with input
- x @x[1],4*1(%r14)
- st @x[0],4*0(@t[0]) # store output
- x @x[2],4*2(%r14)
- st @x[1],4*1(@t[0])
- x @x[3],4*3(%r14)
- st @x[2],4*2(@t[0])
- x @x[4],4*4(%r14)
- st @x[3],4*3(@t[0])
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
- x @x[5],4*5(%r14)
- st @x[4],4*4(@t[0])
- x @x[6],4*6(%r14)
- al @x[0],$stdframe+4*8($sp)
- st @x[5],4*5(@t[0])
- x @x[7],4*7(%r14)
- al @x[1],$stdframe+4*9($sp)
- st @x[6],4*6(@t[0])
- x @x[12],4*12(%r14)
- al @x[2],$stdframe+4*10($sp)
- st @x[7],4*7(@t[0])
- x @x[13],4*13(%r14)
- al @x[3],$stdframe+4*11($sp)
- st @x[12],4*12(@t[0])
- x @x[14],4*14(%r14)
- st @x[13],4*13(@t[0])
- x @x[15],4*15(%r14)
- st @x[14],4*14(@t[0])
- lrvr @x[0],@x[0]
- st @x[15],4*15(@t[0])
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lhi @x[12],1
- x @x[0],4*8(%r14)
- al @x[12],$stdframe+4*12($sp) # increment counter
- x @x[1],4*9(%r14)
- st @x[0],4*8(@t[0])
- x @x[2],4*10(%r14)
- st @x[1],4*9(@t[0])
- x @x[3],4*11(%r14)
- st @x[2],4*10(@t[0])
- st @x[3],4*11(@t[0])
-
- cl${g}r %r14,@t[1] # done yet?
- la %r14,64(%r14)
- jl .Loop_outer
-
-.Ldone:
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
- stmg %r0,%r3,$stdframe+4*12($sp)
-
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
- br %r14
-
-.align 16
-.Ltail:
- la @t[1],64($t[1])
- stm @x[0],@x[7],$stdframe+4*0($sp)
- sl${g}r @t[1],%r14
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
- l${g}hi @x[6],0
- stm @x[12],@x[15],$stdframe+4*12($sp)
- al @x[0],$stdframe+4*8($sp)
- al @x[1],$stdframe+4*9($sp)
- al @x[2],$stdframe+4*10($sp)
- al @x[3],$stdframe+4*11($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- stm @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
- llgc @x[4],0(@x[6],%r14)
- llgc @x[5],$stdframe(@x[6],$sp)
- xr @x[5],@x[4]
- stc @x[5],0(@x[6],@t[0])
- la @x[6],1(@x[6])
- brct @t[1],.Loop_tail
-
- j .Ldone
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
+ alr (@x[$a0],@x[$b0]); # Q1
+ alr (@x[$a1],@x[$b1]); # Q2
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],16);
+ rll (@x[$d1],@x[$d1],16);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],12);
+ rll (@x[$b1],@x[$b1],12);
+
+ alr (@x[$a0],@x[$b0]);
+ alr (@x[$a1],@x[$b1]);
+ xr (@x[$d0],@x[$a0]);
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],8);
+ rll (@x[$d1],@x[$d1],8);
+
+ alr ($xc,@x[$d0]);
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc);
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],7);
+ rll (@x[$b1],@x[$b1],7);
+
+ stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's
+ lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)");
+
+ alr (@x[$a2],@x[$b2]); # Q3
+ alr (@x[$a3],@x[$b3]); # Q4
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],16);
+ rll (@x[$d3],@x[$d3],16);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],12);
+ rll (@x[$b3],@x[$b3],12);
+
+ alr (@x[$a2],@x[$b2]);
+ alr (@x[$a3],@x[$b3]);
+ xr (@x[$d2],@x[$a2]);
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],8);
+ rll (@x[$d3],@x[$d3],8);
+
+ alr ($xc,@x[$d2]);
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc);
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],7);
+ rll (@x[$b3],@x[$b3],7);
+}
+
+sub VX_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],16);
+ verllf (@v[$d1],@v[$d1],16);
+ verllf (@v[$d2],@v[$d2],16);
+ verllf (@v[$d3],@v[$d3],16);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],12);
+ verllf (@v[$b1],@v[$b1],12);
+ verllf (@v[$b2],@v[$b2],12);
+ verllf (@v[$b3],@v[$b3],12);
+
+ vaf (@v[$a0],@v[$a0],@v[$b0]);
+ vaf (@v[$a1],@v[$a1],@v[$b1]);
+ vaf (@v[$a2],@v[$a2],@v[$b2]);
+ vaf (@v[$a3],@v[$a3],@v[$b3]);
+ vx (@v[$d0],@v[$d0],@v[$a0]);
+ vx (@v[$d1],@v[$d1],@v[$a1]);
+ vx (@v[$d2],@v[$d2],@v[$a2]);
+ vx (@v[$d3],@v[$d3],@v[$a3]);
+ verllf (@v[$d0],@v[$d0],8);
+ verllf (@v[$d1],@v[$d1],8);
+ verllf (@v[$d2],@v[$d2],8);
+ verllf (@v[$d3],@v[$d3],8);
+
+ vaf (@v[$c0],@v[$c0],@v[$d0]);
+ vaf (@v[$c1],@v[$c1],@v[$d1]);
+ vaf (@v[$c2],@v[$c2],@v[$d2]);
+ vaf (@v[$c3],@v[$c3],@v[$d3]);
+ vx (@v[$b0],@v[$b0],@v[$c0]);
+ vx (@v[$b1],@v[$b1],@v[$c1]);
+ vx (@v[$b2],@v[$b2],@v[$c2]);
+ vx (@v[$b3],@v[$b3],@v[$c3]);
+ verllf (@v[$b0],@v[$b0],7);
+ verllf (@v[$b1],@v[$b1],7);
+ verllf (@v[$b2],@v[$b2],7);
+ verllf (@v[$b3],@v[$b3],7);
+}
+
+PERLASM_BEGIN($output);

-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
+INCLUDE ("s390x_arch.h");
+TEXT ();

- print $_,"\n";
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+# const unsigned int key[8], const unsigned int counter[4])
+{
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+
+# VX CODE PATH
+{
+my $off=$z*8*16+8; # offset(initial state)
+my $frame=$stdframe+4*16+$off;
+
+GLOBL ("ChaCha20_ctr32");
+TYPE ("ChaCha20_ctr32","\@function");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32");
+ larl ("%r1","OPENSSL_s390xcap_P");
+
+ lghi ("%r0",64);
+&{$z? \&cgr:\&cr} ($len,"%r0");
+ jle ("_s390x_chacha_novx");
+
+ lg ("%r0","S390X_STFLE+16(%r1)");
+ tmhh ("%r0",0x4000); # check for vector facility
+ jz ("_s390x_chacha_novx");
+
+if (!$z) {
+ llgfr ($len,$len);
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+ lghi ("%r1",-$frame);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate stack frame
+
+ larl ("%r7",".Lsigma");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
+
+ vstm ("%v8","%v15","8($sp)") if ($z);
+
+ vlm ("%v1","%v2","0($key)"); # load key
+ vl ("%v0","0(%r7)"); # load sigma constant
+ vl ("%v3","0($counter)"); # load iv (counter||nonce)
+ l ("%r0","0($counter)"); # load counter
+ vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
+
+ srlg ("%r1",$len,8);
+ ltgr ("%r1","%r1");
+ jz (".Lvx_4x_done");
+
+ALIGN (16); # process 4 64-byte blocks
+LABEL (".Lvx_4x");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # copy initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_4x");
+
+ALIGN (16);
+LABEL (".Loop_vx_4x");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_4x");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","0($out)"); # store out
+
+ vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+ 8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v7","128($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
+ vstm ("%v0","%v7","128($out)"); # store out
+
+ ahi ("%r0",4);
+ st ("%r0","48+$off($sp)"); # update initial state
+
+ la ($inp,"256($inp)");
+ la ($out,"256($out)");
+ brctg ("%r1",".Lvx_4x");
+
+ALIGN (16);
+LABEL (".Lvx_4x_done");
+ lghi ("%r1",0xff);
+ ngr ($len,"%r1");
+ jnz (".Lvx_rem");
+
+ALIGN (16);
+LABEL (".Lvx_done");
+ vzero ("%v$_") for (16..31); # wipe ks and key copy
+ vstm ("%v16","%v17","16+$off($sp)");
+ vlm ("%v8","%v15","8($sp)") if ($z);
+
+ la ($sp,"$frame($sp)");
+&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
+
+if (!$z) {
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
+ vzero ("%v$_") for (8..15);
+}
+ br ("%r14");
+ALIGN (16);
+LABEL (".Lvx_rem");
+ lhi ("%r0",64);
+
+ sr ($len,"%r0");
+ brc (2,".Lvx_rem_g64"); # cc==2?
+
+ lghi ("%r1",-$stdframe);
+
+ la ($counter,"48+$off($sp)"); # load updated iv
+ ar ($len,"%r0"); # restore len
+
+ lgr ("%r7",$counter);
+&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
+ la ($sp,"0(%r1,$sp)");
+
+ bras ("%r14","_s390x_chacha_novx");
+
+ la ($sp,"$stdframe($sp)");
+&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
+ lgr ($counter,"%r7");
+ j (".Lvx_done");
+
+ALIGN (16);
+LABEL (".Lvx_rem_g64");
+ vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
+ # state
+ vl ("%v31","16(%r7)");
+ vaf ("%v12","%v12","%v31"); # increment counter
+
+ vlr (@v[$_],"%v$_") for (0..15); # state = initial state
+
+ lhi ("%r6",10);
+ j (".Loop_vx_rem");
+
+ALIGN (16);
+LABEL (".Loop_vx_rem");
+ VX_ROUND( 0, 4, 8,12); # column round
+ VX_ROUND( 0, 5,10,15); # diagonal round
+ brct ("%r6",".Loop_vx_rem");
+
+ vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
+ # state (mod 32)
+ vlm ("%v6","%v7","32(%r7)"); # load vperm operands
+
+for (0..3) { # blocks 1,2
+ vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+8),"%v0","%v1","%v6");
+ vperm ("%v".($_+12),"%v0","%v1","%v7");
+}
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==4?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+for (0..3) { # blocks 3,4
+ vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
+ vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
+ vperm ("%v".($_+12),"%v0","%v1","%v6");
+ vperm ("%v".($_+8),"%v0","%v1","%v7");
+}
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ brc (4,".Lvx_tail"); # cc==4?
+
+ vlm ("%v0","%v3","0($inp)"); # load in
+ vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
+ vstm ("%v0","%v3","0($out)"); # store out
+ jz (".Lvx_done");
+
+ la ($inp,"64($inp)");
+ la ($out,"64($out)");
+
+ sr ($len,"%r0");
+ vlr ("%v".($_+4),"%v$_") for (8..11);
+ j (".Lvx_tail");
+
+ALIGN (16);
+LABEL (".Lvx_tail");
+ ar ($len,"%r0"); # restore $len
+ ahi ($len,-1);
+
+ lhi ("%r0",16);
+for (0..2) {
+ vll ("%v0",$len,($_*16)."($inp)");
+ vx ("%v0","%v0","%v".($_+12));
+ vstl ("%v0",$len,($_*16)."($out)");
+ sr ($len,"%r0");
+ brc (4,".Lvx_done"); # cc==4?
+}
+ vll ("%v0",$len,"3*16($inp)");
+ vx ("%v0","%v0","%v15");
+ vstl ("%v0",$len,"3*16($out)");
+ j (".Lvx_done");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+# NOVX CODE PATH
+{
+my $frame=$stdframe+4*20;
+
+TYPE ("_s390x_chacha_novx","\@function");
+ALIGN (32);
+LABEL ("_s390x_chacha_novx");
+&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
+ bzr ("%r14");
+&{$z? \&aghi:\&ahi} ($len,-64);
+&{$z? \&lghi:\&lhi} ("%r1",-$frame);
+&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
+&{$z? \&slgr:\&slr} ($out,$inp); # difference
+ la ($len,"0($inp,$len)"); # end of input minus 64
+ larl ("%r7",".Lsigma");
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)");
+
+ lmg ("%r8","%r11","0($key)"); # load key
+ lmg ("%r12","%r13","0($counter)"); # load counter
+ lmg ("%r6","%r7","0(%r7)"); # load sigma constant
+
+ la ("%r14","0($inp)");
+&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)");
+&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)");
+ stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+ srlg (@x[12],"%r12",32); # 32-bit counter value
+ j (".Loop_outer");
+
+ALIGN (16);
+LABEL (".Loop_outer");
+ lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
+ lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
+ lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
+ stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+ lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
+ st (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+ lhi ("%r14",10);
+ j (".Loop");
+
+ALIGN (4);
+LABEL (".Loop");
+ ROUND (0, 4, 8,12);
+ ROUND (0, 5,10,15);
+ brct ("%r14",".Loop");
+
+&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+ stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)");
+
+ al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
+ al (@x[1],"$stdframe+4*1($sp)");
+ al (@x[2],"$stdframe+4*2($sp)");
+ al (@x[3],"$stdframe+4*3($sp)");
+ al (@x[4],"$stdframe+4*4($sp)");
+ al (@x[5],"$stdframe+4*5($sp)");
+ al (@x[6],"$stdframe+4*6($sp)");
+ al (@x[7],"$stdframe+4*7($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lrvr (@x[4],@x[4]);
+ lrvr (@x[5],@x[5]);
+ lrvr (@x[6],@x[6]);
+ lrvr (@x[7],@x[7]);
+ al (@x[12],"$stdframe+4*12($sp)");
+ al (@x[13],"$stdframe+4*13($sp)");
+ al (@x[14],"$stdframe+4*14($sp)");
+ al (@x[15],"$stdframe+4*15($sp)");
+ lrvr (@x[12],@x[12]);
+ lrvr (@x[13],@x[13]);
+ lrvr (@x[14],@x[14]);
+ lrvr (@x[15],@x[15]);
+
+ la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
+&{$z? \&clgr:\&clr} ("%r14",@t[1]);
+ jh (".Ltail");
+
+ x (@x[0],"4*0(%r14)"); # xor with input
+ x (@x[1],"4*1(%r14)");
+ st (@x[0],"4*0(@t[0])"); # store output
+ x (@x[2],"4*2(%r14)");
+ st (@x[1],"4*1(@t[0])");
+ x (@x[3],"4*3(%r14)");
+ st (@x[2],"4*2(@t[0])");
+ x (@x[4],"4*4(%r14)");
+ st (@x[3],"4*3(@t[0])");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
+ x (@x[5],"4*5(%r14)");
+ st (@x[4],"4*4(@t[0])");
+ x (@x[6],"4*6(%r14)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ st (@x[5],"4*5(@t[0])");
+ x (@x[7],"4*7(%r14)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ st (@x[6],"4*6(@t[0])");
+ x (@x[12],"4*12(%r14)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ st (@x[7],"4*7(@t[0])");
+ x (@x[13],"4*13(%r14)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ st (@x[12],"4*12(@t[0])");
+ x (@x[14],"4*14(%r14)");
+ st (@x[13],"4*13(@t[0])");
+ x (@x[15],"4*15(%r14)");
+ st (@x[14],"4*14(@t[0])");
+ lrvr (@x[0],@x[0]);
+ st (@x[15],"4*15(@t[0])");
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lhi (@x[12],1);
+ x (@x[0],"4*8(%r14)");
+ al (@x[12],"$stdframe+4*12($sp)"); # increment counter
+ x (@x[1],"4*9(%r14)");
+ st (@x[0],"4*8(@t[0])");
+ x (@x[2],"4*10(%r14)");
+ st (@x[1],"4*9(@t[0])");
+ x (@x[3],"4*11(%r14)");
+ st (@x[2],"4*10(@t[0])");
+ st (@x[3],"4*11(@t[0])");
+
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
+ la ("%r14","64(%r14)");
+ jl (".Loop_outer");
+
+LABEL (".Ldone");
+ xgr ("%r0","%r0");
+ xgr ("%r1","%r1");
+ xgr ("%r2","%r2");
+ xgr ("%r3","%r3");
+ stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
+ stmg ("%r0","%r3","$stdframe+4*12($sp)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail");
+ la (@t[1],"64($t[1])");
+ stm (@x[0],@x[7],"$stdframe+4*0($sp)");
+&{$z? \&slgr:\&slr} (@t[1],"%r14");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
+&{$z? \&lghi:\&lhi} (@x[6],0);
+ stm (@x[12],@x[15],"$stdframe+4*12($sp)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ stm (@x[0],@x[3],"$stdframe+4*8($sp)");
+
+LABEL (".Loop_tail");
+ llgc (@x[4],"0(@x[6],%r14)");
+ llgc (@x[5],"$stdframe(@x[6],$sp)");
+ xr (@x[5],@x[4]);
+ stc (@x[5],"0(@x[6],@t[0])");
+ la (@x[6],"1(@x[6])");
+ brct (@t[1],".Loop_tail");
+
+ j (".Ldone");
+SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
+}
}
-close STDOUT;
+################
+
+ALIGN (64);
+LABEL (".Lsigma");
+LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
+LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
+LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
+LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
+ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN (4);
+
+PERLASM_END();
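
For reference, both the scalar ROUND and the vector VX_ROUND generators in
this patch unroll four interleaved copies of the standard ChaCha20
quarter-round (add, xor, rotate by 16/12/8/7 on 32-bit words); the vx path
keeps state word n of four consecutive blocks in the four 32-bit lanes of
%v(16+n), so each VX_ROUND pass advances four 64-byte blocks at once. A
plain-Perl sketch of a single quarter-round (illustrative only; rotl32 and
quarter_round are not names from the patch):

    use strict;

    # Rotate a 32-bit word left by $n bits.
    sub rotl32 { my ($v,$n) = @_; (($v << $n) | ($v >> (32 - $n))) & 0xffffffff }

    # One ChaCha20 quarter-round as specified in RFC 7539.
    sub quarter_round {
        my ($a,$b,$c,$d) = @_;
        $a = ($a + $b) & 0xffffffff; $d = rotl32($d ^ $a, 16);
        $c = ($c + $d) & 0xffffffff; $b = rotl32($b ^ $c, 12);
        $a = ($a + $b) & 0xffffffff; $d = rotl32($d ^ $a,  8);
        $c = ($c + $d) & 0xffffffff; $b = rotl32($b ^ $c,  7);
        return ($a,$b,$c,$d);
    }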