File 0005-s390x-assembly-pack-import-chacha-from-cryptogams-re.patch of Package openssl-1_1
From d1229190bfbb19439589557e4d65f9bccab09b2d Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Mon, 25 Feb 2019 18:55:04 +0100
Subject: [PATCH] s390x assembly pack: import chacha from cryptogams repo

featuring 6x"horizontal" code path which is up to 25%
faster than present 4x"vertical" for larger blocks.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8287)
---
 crypto/chacha/asm/chacha-s390x.pl | 1006 +++++++++++++++++++++--------
 1 file changed, 719 insertions(+), 287 deletions(-)
diff --git a/crypto/chacha/asm/chacha-s390x.pl b/crypto/chacha/asm/chacha-s390x.pl
index abf7283dd8..51efe64408 100755
--- a/crypto/chacha/asm/chacha-s390x.pl
+++ b/crypto/chacha/asm/chacha-s390x.pl
#
# August 2018
#
-# Add vx code path.
+# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+#
+# February 2019
+#
+# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
+# 4x"vertical" submission [on z13] and >3 faster than scalar code.
+# But to harness overheads revert to transliteration of VSX code path
+# from chacha-ppc module, which is also 4x"vertical", to handle inputs
+# not longer than 256 bytes.
+
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;
+sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
-my @v=map("%v$_",(16..31));
-
-sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
rll (@x[$b3],@x[$b3],7);
}
-sub VX_ROUND {
+sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("%v$_",(0..15));
+
+ vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
+ vx (@x[$d0],@x[$d0],@x[$a0]);
+ verllf (@x[$d0],@x[$d0],16);
+ vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],16);
+ vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],16);
+ vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],16);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
+ vx (@x[$b0],@x[$b0],@x[$c0]);
+ verllf (@x[$b0],@x[$b0],12);
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],12);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],12);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],12);
+
+ vaf (@x[$a0],@x[$a0],@x[$b0]);
+ vx (@x[$d0],@x[$d0],@x[$a0]);
+ verllf (@x[$d0],@x[$d0],8);
+ vaf (@x[$a1],@x[$a1],@x[$b1]);
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],8);
+ vaf (@x[$a2],@x[$a2],@x[$b2]);
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],8);
+ vaf (@x[$a3],@x[$a3],@x[$b3]);
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],8);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
+ vx (@x[$b0],@x[$b0],@x[$c0]);
+ verllf (@x[$b0],@x[$b0],7);
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],7);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],7);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],7);
+}
- vaf (@v[$a0],@v[$a0],@v[$b0]);
- vaf (@v[$a1],@v[$a1],@v[$b1]);
- vaf (@v[$a2],@v[$a2],@v[$b2]);
- vaf (@v[$a3],@v[$a3],@v[$b3]);
- vx (@v[$d0],@v[$d0],@v[$a0]);
- vx (@v[$d1],@v[$d1],@v[$a1]);
- vx (@v[$d2],@v[$d2],@v[$a2]);
- vx (@v[$d3],@v[$d3],@v[$a3]);
- verllf (@v[$d0],@v[$d0],16);
- verllf (@v[$d1],@v[$d1],16);
- verllf (@v[$d2],@v[$d2],16);
- verllf (@v[$d3],@v[$d3],16);
-
- vaf (@v[$c0],@v[$c0],@v[$d0]);
- vaf (@v[$c1],@v[$c1],@v[$d1]);
- vaf (@v[$c2],@v[$c2],@v[$d2]);
- vaf (@v[$c3],@v[$c3],@v[$d3]);
- vx (@v[$b0],@v[$b0],@v[$c0]);
- vx (@v[$b1],@v[$b1],@v[$c1]);
- vx (@v[$b2],@v[$b2],@v[$c2]);
- vx (@v[$b3],@v[$b3],@v[$c3]);
- verllf (@v[$b0],@v[$b0],12);
- verllf (@v[$b1],@v[$b1],12);
- verllf (@v[$b2],@v[$b2],12);
- verllf (@v[$b3],@v[$b3],12);
-
- vaf (@v[$a0],@v[$a0],@v[$b0]);
- vaf (@v[$a1],@v[$a1],@v[$b1]);
- vaf (@v[$a2],@v[$a2],@v[$b2]);
- vaf (@v[$a3],@v[$a3],@v[$b3]);
- vx (@v[$d0],@v[$d0],@v[$a0]);
- vx (@v[$d1],@v[$d1],@v[$a1]);
- vx (@v[$d2],@v[$d2],@v[$a2]);
- vx (@v[$d3],@v[$d3],@v[$a3]);
- verllf (@v[$d0],@v[$d0],8);
- verllf (@v[$d1],@v[$d1],8);
- verllf (@v[$d2],@v[$d2],8);
- verllf (@v[$d3],@v[$d3],8);
-
- vaf (@v[$c0],@v[$c0],@v[$d0]);
- vaf (@v[$c1],@v[$c1],@v[$d1]);
- vaf (@v[$c2],@v[$c2],@v[$d2]);
- vaf (@v[$c3],@v[$c3],@v[$d3]);
- vx (@v[$b0],@v[$b0],@v[$c0]);
- vx (@v[$b1],@v[$b1],@v[$c1]);
- vx (@v[$b2],@v[$b2],@v[$c2]);
- vx (@v[$b3],@v[$b3],@v[$c3]);
- verllf (@v[$b0],@v[$b0],7);
- verllf (@v[$b1],@v[$b1],7);
- verllf (@v[$b2],@v[$b2],7);
- verllf (@v[$b3],@v[$b3],7);
+sub VX_ROUND {
+my @a=@_[0..5];
+my @b=@_[6..11];
+my @c=@_[12..17];
+my @d=@_[18..23];
+my $odd=@_[24];
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
+ verllf (@d[$_],@d[$_],16) for (0..5);
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
+ verllf (@b[$_],@b[$_],12) for (0..5);
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
+ verllf (@d[$_],@d[$_],8) for (0..5);
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
+ verllf (@b[$_],@b[$_],7) for (0..5);
+
+ vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
+ vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
+ vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}
PERLASM_BEGIN($output);
################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
# const unsigned int key[8], const unsigned int counter[4])
-{
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
-
-# VX CODE PATH
{
-my $off=$z*8*16+8; # offset(initial state)
-my $frame=$stdframe+4*16+$off;
+my $frame=$stdframe+4*20;
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
+my @t=map("%r$_",(8,9));
GLOBL ("ChaCha20_ctr32");
TYPE ("ChaCha20_ctr32","\@function");
larl ("%r1","OPENSSL_s390xcap_P");
lghi ("%r0",64);
+&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
+ bzr ("%r14");
+ lg ("%r1","S390X_STFLE+16(%r1)");
&{$z? \&clgr:\&clr} ($len,"%r0");
- jle ("_s390x_chacha_novx");
-
- lg ("%r0","S390X_STFLE+16(%r1)");
- tmhh ("%r0",0x4000); # check for vector facility
- jz ("_s390x_chacha_novx");
-
-if (!$z) {
- llgfr ($len,$len);
- std ("%f4","16*$SIZE_T+2*8($sp)");
- std ("%f6","16*$SIZE_T+3*8($sp)");
-}
-&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+ jle (".Lshort");
- lghi ("%r1",-$frame);
- lgr ("%r0",$sp);
- la ($sp,"0(%r1,$sp)"); # allocate stack frame
+ tmhh ("%r1",0x4000); # check for vx bit
+ jnz (".LChaCha20_ctr32_vx");
- larl ("%r7",".Lsigma");
-&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
-
- vstm ("%v8","%v15","8($sp)") if ($z);
-
- vlm ("%v1","%v2","0($key)"); # load key
- vl ("%v0","0(%r7)"); # load sigma constant
- vl ("%v3","0($counter)"); # load iv (counter||nonce)
- l ("%r0","0($counter)"); # load counter
- vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
-
- srlg ("%r1",$len,8);
- ltgr ("%r1","%r1");
- jz (".Lvx_4x_done");
-
-ALIGN (16); # process 4 64-byte blocks
-LABEL (".Lvx_4x");
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
- # state
- vl ("%v31","16(%r7)");
- vaf ("%v12","%v12","%v31"); # increment counter
-
- vlr (@v[$_],"%v$_") for (0..15); # copy initial state
-
- lhi ("%r6",10);
- j (".Loop_vx_4x");
-
-ALIGN (16);
-LABEL (".Loop_vx_4x");
- VX_ROUND( 0, 4, 8,12); # column round
- VX_ROUND( 0, 5,10,15); # diagonal round
- brct ("%r6",".Loop_vx_4x");
-
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
- # state (mod 32)
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
-
-for (0..3) { # blocks 1,2
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v7","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
- vstm ("%v0","%v7","0($out)"); # store out
-
- vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
-
-for (0..3) { # blocks 2,3
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v7","128($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
- vstm ("%v0","%v7","128($out)"); # store out
-
- ahi ("%r0",4);
- st ("%r0","48+$off($sp)"); # update initial state
-
- la ($inp,"256($inp)");
- la ($out,"256($out)");
- brctg ("%r1",".Lvx_4x");
-
-ALIGN (16);
-LABEL (".Lvx_4x_done");
- lghi ("%r1",0xff);
- ngr ($len,"%r1");
- jnz (".Lvx_rem");
-
-ALIGN (16);
-LABEL (".Lvx_done");
- vzero ("%v$_") for (16..31); # wipe ks and key copy
- vstm ("%v16","%v17","16+$off($sp)");
- vlm ("%v8","%v15","8($sp)") if ($z);
-
- la ($sp,"$frame($sp)");
-&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
-
-if (!$z) {
- ld ("%f4","16*$SIZE_T+2*8($sp)");
- ld ("%f6","16*$SIZE_T+3*8($sp)");
- vzero ("%v$_") for (8..15);
-}
- br ("%r14");
-ALIGN (16);
-LABEL (".Lvx_rem");
- lhi ("%r0",64);
-
- sr ($len,"%r0");
- brc (2,".Lvx_rem_g64"); # cc==2?
-
- lghi ("%r1",-$stdframe);
-
- la ($counter,"48+$off($sp)"); # load updated iv
- ar ($len,"%r0"); # restore len
-
- lgr ("%r7",$counter);
-&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
- la ($sp,"0(%r1,$sp)");
-
- bras ("%r14","_s390x_chacha_novx");
-
- la ($sp,"$stdframe($sp)");
-&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
- lgr ($counter,"%r7");
- j (".Lvx_done");
-
-ALIGN (16);
-LABEL (".Lvx_rem_g64");
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
- # state
- vl ("%v31","16(%r7)");
- vaf ("%v12","%v12","%v31"); # increment counter
-
- vlr (@v[$_],"%v$_") for (0..15); # state = initial state
-
- lhi ("%r6",10);
- j (".Loop_vx_rem");
-
-ALIGN (16);
-LABEL (".Loop_vx_rem");
- VX_ROUND( 0, 4, 8,12); # column round
- VX_ROUND( 0, 5,10,15); # diagonal round
- brct ("%r6",".Loop_vx_rem");
-
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
- # state (mod 32)
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
-
-for (0..3) { # blocks 1,2
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
-
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- brc (4,".Lvx_tail"); # cc==4?
-
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
- jz (".Lvx_done");
-
-for (0..3) { # blocks 3,4
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+12),"%v0","%v1","%v6");
- vperm ("%v".($_+8),"%v0","%v1","%v7");
-}
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- brc (4,".Lvx_tail"); # cc==4?
-
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
- jz (".Lvx_done");
-
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- vlr ("%v".($_+4),"%v$_") for (8..11);
- j (".Lvx_tail");
-
-ALIGN (16);
-LABEL (".Lvx_tail");
- ar ($len,"%r0"); # restore $len
- ahi ($len,-1);
-
- lhi ("%r0",16);
-for (0..2) {
- vll ("%v0",$len,($_*16)."($inp)");
- vx ("%v0","%v0","%v".($_+12));
- vstl ("%v0",$len,($_*16)."($out)");
- sr ($len,"%r0");
- brc (4,".Lvx_done"); # cc==4?
-}
- vll ("%v0",$len,"3*16($inp)");
- vx ("%v0","%v0","%v15");
- vstl ("%v0",$len,"3*16($out)");
- j (".Lvx_done");
-SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
-}
-
-# NOVX CODE PATH
-{
-my $frame=$stdframe+4*20;
-
-TYPE ("_s390x_chacha_novx","\@function");
-ALIGN (32);
-LABEL ("_s390x_chacha_novx");
-&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
- bzr ("%r14");
+LABEL (".Lshort");
&{$z? \&aghi:\&ahi} ($len,-64);
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
brct (@t[1],".Loop_tail");
j (".Ldone");
-SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+########################################################################
+# 4x"vertical" layout minimizes amount of instructions, but pipeline
+# runs underutilized [because of vector instructions' high latency].
+# On the other hand minimum amount of data it takes to fully utilize
+# the pipeline is higher, so that effectively, short inputs would be
+# processed slower. Hence this code path targeting <=256 bytes lengths.
+#
+{
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
+my @K=map("%v$_",(16..19));
+my $CTR="%v26";
+my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
+my $beperm="%v31";
+my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
+my $FRAME=$stdframe+4*16;
+
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_4x");
+LABEL (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) {
+ std ("%f8","$stdframe+8*0($sp)");
+ std ("%f9","$stdframe+8*1($sp)");
+ std ("%f10","$stdframe+8*2($sp)");
+ std ("%f11","$stdframe+8*3($sp)");
+ std ("%f12","$stdframe+8*4($sp)");
+ std ("%f13","$stdframe+8*5($sp)");
+ std ("%f14","$stdframe+8*6($sp)");
+ std ("%f15","$stdframe+8*7($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10);
+ lhi ("%r1",0);
+
+ vl (@K[0],"0(%r7)"); # load sigma
+ vl (@K[1],"0($key)"); # load key
+ vl (@K[2],"16($key)");
+ vl (@K[3],"0($counter)"); # load counter
+
+ vl ($beperm,"0x40(%r7)");
+ vl ($xt1,"0x50(%r7)");
+ vrepf ($CTR,@K[3],0);
+ vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
+ vaf ($CTR,$CTR,$xt1);
+
+#LABEL (".Loop_outer_4x");
+ vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma
+
+ vrepf ($xb0,@K[1],0); # smash the key
+ vrepf ($xb1,@K[1],1);
+ vrepf ($xb2,@K[1],2);
+ vrepf ($xb3,@K[1],3);
+
+ vrepf ($xc0,@K[2],0);
+ vrepf ($xc1,@K[2],1);
+ vrepf ($xc2,@K[2],2);
+ vrepf ($xc3,@K[2],3);
+
+ vlr ($xd0,$CTR);
+ vrepf ($xd1,@K[3],1);
+ vrepf ($xd2,@K[3],2);
+ vrepf ($xd3,@K[3],3);
+
+LABEL (".Loop_4x");
+ VX_lane_ROUND(0, 4, 8,12);
+ VX_lane_ROUND(0, 5,10,15);
+ brct ("%r0",".Loop_4x");
+
+ vaf ($xd0,$xd0,$CTR);
+
+ vmrhf ($xt0,$xa0,$xa1); # transpose data
+ vmrhf ($xt1,$xa2,$xa3);
+ vmrlf ($xt2,$xa0,$xa1);
+ vmrlf ($xt3,$xa2,$xa3);
+ vpdi ($xa0,$xt0,$xt1,0b0000);
+ vpdi ($xa1,$xt0,$xt1,0b0101);
+ vpdi ($xa2,$xt2,$xt3,0b0000);
+ vpdi ($xa3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xb0,$xb1);
+ vmrhf ($xt1,$xb2,$xb3);
+ vmrlf ($xt2,$xb0,$xb1);
+ vmrlf ($xt3,$xb2,$xb3);
+ vpdi ($xb0,$xt0,$xt1,0b0000);
+ vpdi ($xb1,$xt0,$xt1,0b0101);
+ vpdi ($xb2,$xt2,$xt3,0b0000);
+ vpdi ($xb3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xc0,$xc1);
+ vmrhf ($xt1,$xc2,$xc3);
+ vmrlf ($xt2,$xc0,$xc1);
+ vmrlf ($xt3,$xc2,$xc3);
+ vpdi ($xc0,$xt0,$xt1,0b0000);
+ vpdi ($xc1,$xt0,$xt1,0b0101);
+ vpdi ($xc2,$xt2,$xt3,0b0000);
+ vpdi ($xc3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xd0,$xd1);
+ vmrhf ($xt1,$xd2,$xd3);
+ vmrlf ($xt2,$xd0,$xd1);
+ vmrlf ($xt3,$xd2,$xd3);
+ vpdi ($xd0,$xt0,$xt1,0b0000);
+ vpdi ($xd1,$xt0,$xt1,0b0101);
+ vpdi ($xd2,$xt2,$xt3,0b0000);
+ vpdi ($xd3,$xt2,$xt3,0b0101);
+
+ #vrepif ($xt0,4);
+ #vaf ($CTR,$CTR,$xt0); # next counter value
+
+ vaf ($xa0,$xa0,@K[0]);
+ vaf ($xb0,$xb0,@K[1]);
+ vaf ($xc0,$xc0,@K[2]);
+ vaf ($xd0,$xd0,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+ #&{$z? \&clgfi:\&clfi} ($len,0x40);
+ #jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ #je (".Ldone_4x");
+
+ vaf ($xa0,$xa1,@K[0]);
+ vaf ($xb0,$xb1,@K[1]);
+ vaf ($xc0,$xc1,@K[2]);
+ vaf ($xd0,$xd1,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa2,@K[0]);
+ vaf ($xb0,$xb2,@K[1]);
+ vaf ($xc0,$xc2,@K[2]);
+ vaf ($xd0,$xd2,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa3,@K[0]);
+ vaf ($xb0,$xb3,@K[1]);
+ vaf ($xc0,$xc3,@K[2]);
+ vaf ($xd0,$xd3,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ #la $inp,0x40($inp));
+ #la $out,0x40($out));
+ #lhi %r0,10);
+ #&{$z? \&aghi:\&ahi} $len,-0x40);
+ #jne .Loop_outer_4x);
+
+LABEL (".Ldone_4x");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_4x");
+if (!$z) {
+ vlr ($xt0,$xb0);
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)");
+ vst ($xt0,"$stdframe+0x10($sp)");
+ vst ($xc0,"$stdframe+0x20($sp)");
+ vst ($xd0,"$stdframe+0x30($sp)");
+} else {
+ vlr ($xt0,$xc0);
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ vlr ($xt1,$xd0);
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)");
+ vst ($xb0,"$stdframe+0x10($sp)");
+ vst ($xt0,"$stdframe+0x20($sp)");
+ vst ($xt1,"$stdframe+0x30($sp)");
}
+ lghi ("%r1",0);
+
+LABEL (".Loop_tail_4x");
+ llgc ("%r5","0(%r1,$inp)");
+ llgc ("%r6","$stdframe(%r1,$sp)");
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)");
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_4x");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
+}
+
+########################################################################
+# 6x"horizontal" layout is optimal fit for the platform in its current
+# shape, more specifically for given vector instructions' latency. Well,
+# computational part of 8x"vertical" would be faster, but it consumes
+# all registers and dealing with that will diminish the return...
+#
+{
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
+ $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
+ $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
+my @K=map("%v$_",(27,24..26));
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
+my $beperm="%v31";
+my $FRAME=$stdframe + 4*16;
+
+GLOBL ("ChaCha20_ctr32_vx");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_vx");
+LABEL (".LChaCha20_ctr32_vx");
+&{$z? \&clgfi:\&clfi} ($len,256);
+ jle (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) {
+ std ("%f8","$FRAME-8*8($sp)");
+ std ("%f9","$FRAME-8*7($sp)");
+ std ("%f10","$FRAME-8*6($sp)");
+ std ("%f11","$FRAME-8*5($sp)");
+ std ("%f12","$FRAME-8*4($sp)");
+ std ("%f13","$FRAME-8*3($sp)");
+ std ("%f14","$FRAME-8*2($sp)");
+ std ("%f15","$FRAME-8*1($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10);
+
+ vlm (@K[1],@K[2],"0($key)"); # load key
+ vl (@K[3],"0($counter)"); # load counter
+
+ vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
+
+LABEL (".Loop_outer_vx");
+ vlr ($a0,@K[0]);
+ vlr ($b0,@K[1]);
+ vlr ($a1,@K[0]);
+ vlr ($b1,@K[1]);
+ vlr ($a2,@K[0]);
+ vlr ($b2,@K[1]);
+ vlr ($a3,@K[0]);
+ vlr ($b3,@K[1]);
+ vlr ($a4,@K[0]);
+ vlr ($b4,@K[1]);
+ vlr ($a5,@K[0]);
+ vlr ($b5,@K[1]);
+
+ vlr ($d0,@K[3]);
+ vaf ($d1,@K[3],$t1); # K[3]+1
+ vaf ($d2,@K[3],$t2); # K[3]+2
+ vaf ($d3,@K[3],$t3); # K[3]+3
+ vaf ($d4,$d2,$t2); # K[3]+4
+ vaf ($d5,$d2,$t3); # K[3]+5
+
+ vlr ($c0,@K[2]);
+ vlr ($c1,@K[2]);
+ vlr ($c2,@K[2]);
+ vlr ($c3,@K[2]);
+ vlr ($c4,@K[2]);
+ vlr ($c5,@K[2]);
+
+ vlr ($t1,$d1);
+ vlr ($t2,$d2);
+ vlr ($t3,$d3);
+
+ALIGN (4);
+LABEL (".Loop_vx");
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 0);
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 1);
+
+ brct ("%r0",".Loop_vx");
+
+ vaf ($a0,$a0,@K[0]);
+ vaf ($b0,$b0,@K[1]);
+ vaf ($c0,$c0,@K[2]);
+ vaf ($d0,$d0,@K[3]);
+ vaf ($a1,$a1,@K[0]);
+ vaf ($d1,$d1,$t1); # +K[3]+1
+
+ vperm ($a0,$a0,$a0,$beperm);
+ vperm ($b0,$b0,$b0,$beperm);
+ vperm ($c0,$c0,$c0,$beperm);
+ vperm ($d0,$d0,$d0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d2,$d2,$t2); # +K[3]+2
+ vaf ($d3,$d3,$t3); # +K[3]+3
+ vlm ($t0,$t3,"0($inp)");
+
+ vx ($a0,$a0,$t0);
+ vx ($b0,$b0,$t1);
+ vx ($c0,$c0,$t2);
+ vx ($d0,$d0,$t3);
+
+ vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($b1,$b1,@K[1]);
+ vaf ($c1,$c1,@K[2]);
+
+ vperm ($a0,$a1,$a1,$beperm);
+ vperm ($b0,$b1,$b1,$beperm);
+ vperm ($c0,$c1,$c1,$beperm);
+ vperm ($d0,$d1,$d1,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a2,$a2,@K[0]);
+ vaf ($b2,$b2,@K[1]);
+ vaf ($c2,$c2,@K[2]);
+
+ vperm ($a0,$a2,$a2,$beperm);
+ vperm ($b0,$b2,$b2,$beperm);
+ vperm ($c0,$c2,$c2,$beperm);
+ vperm ($d0,$d2,$d2,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a3,$a3,@K[0]);
+ vaf ($b3,$b3,@K[1]);
+ vaf ($c3,$c3,@K[2]);
+ vaf ($d2,@K[3],$t3); # K[3]+3
+
+ vperm ($a0,$a3,$a3,$beperm);
+ vperm ($b0,$b3,$b3,$beperm);
+ vperm ($c0,$c3,$c3,$beperm);
+ vperm ($d0,$d3,$d3,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d3,$d2,$t1); # K[3]+4
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a4,$a4,@K[0]);
+ vaf ($b4,$b4,@K[1]);
+ vaf ($c4,$c4,@K[2]);
+ vaf ($d4,$d4,$d3); # +K[3]+4
+ vaf ($d3,$d3,$t1); # K[3]+5
+ vaf (@K[3],$d2,$t3); # K[3]+=6
+
+ vperm ($a0,$a4,$a4,$beperm);
+ vperm ($b0,$b4,$b4,$beperm);
+ vperm ($c0,$c4,$c4,$beperm);
+ vperm ($d0,$d4,$d4,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a5,$a5,@K[0]);
+ vaf ($b5,$b5,@K[1]);
+ vaf ($c5,$c5,@K[2]);
+ vaf ($d5,$d5,$d3); # +K[3]+5
+
+ vperm ($a0,$a5,$a5,$beperm);
+ vperm ($b0,$b5,$b5,$beperm);
+ vperm ($c0,$c5,$c5,$beperm);
+ vperm ($d0,$d5,$d5,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+ lhi ("%r0",10);
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ jne (".Loop_outer_vx");
+
+LABEL (".Ldone_vx");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_vx");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+ vstm ($a0,$d0,"$stdframe($sp)");
+ lghi ("%r1",0);
+
+LABEL (".Loop_tail_vx");
+ llgc ("%r5","0(%r1,$inp)");
+ llgc ("%r6","$stdframe(%r1,$sp)");
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)");
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_vx");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################
-ALIGN (64);
+ALIGN (32);
LABEL (".Lsigma");
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
-LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
-LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
-LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
+LONG (1,0,0,0);
+LONG (2,0,0,0);
+LONG (3,0,0,0);
+LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap
+
+LONG (0,1,2,3);
+LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
+LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
+LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
+LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
+
ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN (4);
--
2.21.0