File 0003-crypto-poly1305-asm-poly1305-s390x.pl-add-vx-code-pa.patch of Package openssl-1_1
1007
1
From d6f4b0a8bfbe901c72294d8923eb5b6f54ca7732 Mon Sep 17 00:00:00 2001
2
From: Patrick Steuer <patrick.steuer@de.ibm.com>
3
Date: Mon, 6 Feb 2017 10:54:54 +0100
4
Subject: [PATCH] crypto/poly1305/asm/poly1305-s390x.pl: add vx code path.
5
6
Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>
7
8
Reviewed-by: Matt Caswell <matt@openssl.org>
9
Reviewed-by: Richard Levitte <levitte@openssl.org>
10
(Merged from https://github.com/openssl/openssl/pull/7991)
11
---
12
crypto/poly1305/asm/poly1305-s390x.pl | 944 +++++++++++++++++++++-----
13
1 file changed, 780 insertions(+), 164 deletions(-)
14
15
diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl
16
index 21ca86055e..390f9eefe7 100755
17
--- a/crypto/poly1305/asm/poly1305-s390x.pl
18
+++ b/crypto/poly1305/asm/poly1305-s390x.pl
19
20
#
21
# On side note, z13 enables vector base 2^26 implementation...
22
23
-$flavour = shift;
24
+#
25
+# January 2019
26
+#
27
+# Add vx code path (base 2^26).
28
+#
29
+# Copyright IBM Corp. 2019
30
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
31
32
+use strict;
33
+use FindBin qw($Bin);
34
+use lib "$Bin/../..";
35
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL);
36
+
37
+my $flavour = shift;
38
+
39
+my ($z,$SIZE_T);
40
if ($flavour =~ /3[12]/) {
41
+ $z=0; # S/390 ABI
42
$SIZE_T=4;
43
- $g="";
44
} else {
45
+ $z=1; # zSeries ABI
46
$SIZE_T=8;
47
- $g="g";
48
}
49
50
+my $output;
51
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
52
-open STDOUT,">$output";
53
54
-$sp="%r15";
55
+my $sp="%r15";
56
+
57
+# novx code path ctx layout
58
+# ---------------------------------
59
+# var value base off
60
+# ---------------------------------
61
+# u64 h[3] hash 2^64 0
62
+# u32 pad[2]
63
+# u64 r[2] key 2^64 32
64
+
65
+# vx code path ctx layout
66
+# ---------------------------------
67
+# var value base off
68
+# ---------------------------------
69
+# u32 acc1[5] r^2-acc 2^26 0
70
+# u32 pad
71
+# u32 acc2[5] r-acc 2^26 24
72
+# u32 pad
73
+# u32 r1[5] r 2^26 48
74
+# u32 r15[5] 5*r 2^26 68
75
+# u32 r2[5] r^2 2^26 88
76
+# u32 r25[5] 5*r^2 2^26 108
77
+# u32 r4[5] r^4 2^26 128
78
+# u32 r45[5] 5*r^4 2^26 148
79
+
80
+PERLASM_BEGIN($output);
81
+
82
+TEXT ();
83
+
84
+################
85
+# static void poly1305_init(void *ctx, const unsigned char key[16])
86
+{
87
+my ($ctx,$key)=map("%r$_",(2..3));
88
+my ($r0,$r1,$r2)=map("%r$_",(9,11,13));
89
90
-my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
91
+sub MUL_RKEY { # r*=key: multiply the value in ($r0,$r1,$r2) by the key at 32/40($ctx), folding bits >= 2^130 back in (times 5)
92
+my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7)); # even/odd pairs %r4:%r5 and %r6:%r7 receive mlgr products
93
+my ($t0,$t1,$s1)=map("%r$_",(8,10,12)); # pair with the enclosing $r0=%r9, $r1=%r11 as $t0:$r0 and $t1:$r1
94
+
95
+ lg ("%r0","32($ctx)"); # load key, low doubleword (key0)
96
+ lg ("%r1","40($ctx)"); # load key, high doubleword (key1)
97
+
98
+ srlg ($s1,"%r1",2);
99
+ algr ($s1,"%r1"); # s1 = key1 + key1>>2
100
+
101
+ lgr ($d0lo,$r0);
102
+ lgr ($d1lo,$r1);
103
+
104
+ mlgr ($d0hi,"%r0"); # $r0*key0 -> $d0hi:$d0lo
105
+ lgr ($r1,$d1lo);
106
+ mlgr ($d1hi,$s1); # $r1*s1 -> $d1hi:$d1lo
107
+
108
+ mlgr ($t0,"%r1"); # $r0*key1 -> $t0:$r0
109
+ mlgr ($t1,"%r0"); # $r1*key0 -> $t1:$r1
110
+
111
+ algr ($d0lo,$d1lo);
112
+ lgr ($d1lo,$r2);
113
+ alcgr ($d0hi,$d1hi);
114
+ lghi ($d1hi,0);
115
+
116
+ algr ($r1,$r0);
117
+ alcgr ($t1,$t0);
118
+
119
+ msgr ($d1lo,$s1); # $r2*s1
120
+ msgr ($r2,"%r0"); # $r2*key0
121
+
122
+ algr ($r1,$d1lo);
123
+ alcgr ($t1,$d1hi); # $d1hi is zero
124
+
125
+ algr ($r1,$d0hi);
126
+ alcgr ($r2,$t1);
127
+
128
+ lghi ($r0,-4); # final reduction step
129
+ ngr ($r0,$r2); # $r0 = $r2 with its low 2 bits cleared, i.e. 4*($r2>>2)
130
+ srlg ($t0,$r2,2);
131
+ algr ($r0,$t0); # $r0 = 5*($r2>>2)
132
+ lghi ($t1,3);
133
+ ngr ($r2,$t1); # keep only the low 2 bits of $r2
134
+
135
+ algr ($r0,$d0lo);
136
+ alcgr ($r1,$d1hi); # $d1hi is still zero
137
+ alcgr ($r2,$d1hi); # $d1hi is still zero
138
+}
139
+
140
+sub ST_R5R { # store r,5*r -> base 2^26: split ($r0,$r1,$r2) into five 26-bit limbs, store them at @off[0]($ctx) and 5*limbs at @off[1]($ctx)
140
+my @d=map("%r$_",(4..8)); # limbs 0..4 are built in %r4..%r8 so one stm can store them
141
+my @off=@_; # (offset of r, offset of 5*r) within ctx
142
+
143
+ lgr (@d[2],$r0);
144
+ lr ("%r1",@d[2]);
145
+ nilh ("%r1",1023); # low word & 0x03ffffff
146
+ lgr (@d[3],$r1);
147
+ lr (@d[0],"%r1"); # limb 0 = bits 0..25
148
+ srlg ("%r1",@d[2],52); # top 12 bits of $r0, used for limb 2
149
+ lgr (@d[4],$r2);
150
+ srlg ("%r0",@d[2],26);
151
+ sll (@d[4],24); # $r2<<24, used for limb 4
152
+ lr (@d[2],@d[3]);
153
+ nilh ("%r0",1023);
154
+ sll (@d[2],12); # $r1<<12 (32-bit shift)
155
+ lr (@d[1],"%r0"); # limb 1 = bits 26..51
156
+ &or (@d[2],"%r1"); # called as &or because "or" is a Perl operator
157
+ srlg ("%r1",@d[3],40); # top 24 bits of $r1, used for limb 4
158
+ nilh (@d[2],1023); # limb 2 = bits 52..77
159
+ &or (@d[4],"%r1");
160
+ srlg (@d[3],@d[3],14);
161
+ nilh (@d[4],1023); # limb 4 = bits 104..129
162
+ nilh (@d[3],1023); # limb 3 = bits 78..103
163
+
164
+ stm (@d[0],@d[4],"@off[0]($ctx)"); # store the five 32-bit limbs
165
+ mhi (@d[$_],5) for (0..4); # each limb *= 5
166
+ stm (@d[0],@d[4],"@off[1]($ctx)"); # store the five limbs of 5*r
167
+}
169
170
-$code.=<<___;
171
-.text
172
-
173
-.globl poly1305_init
174
-.type poly1305_init,\@function
175
-.align 16
176
-poly1305_init:
177
- lghi %r0,0
178
- lghi %r1,-1
179
- stg %r0,0($ctx) # zero hash value
180
- stg %r0,8($ctx)
181
- stg %r0,16($ctx)
182
-
183
- cl${g}r $inp,%r0
184
- je .Lno_key
185
-
186
- lrvg %r4,0($inp) # load little-endian key
187
- lrvg %r5,8($inp)
188
-
189
- nihl %r1,0xffc0 # 0xffffffc0ffffffff
190
- srlg %r0,%r1,4 # 0x0ffffffc0fffffff
191
- srlg %r1,%r1,4
192
- nill %r1,0xfffc # 0x0ffffffc0ffffffc
193
-
194
- ngr %r4,%r0
195
- ngr %r5,%r1
196
-
197
- stg %r4,32($ctx)
198
- stg %r5,40($ctx)
199
-
200
-.Lno_key:
201
- lghi %r2,0
202
- br %r14
203
-.size poly1305_init,.-poly1305_init
204
-___
205
+GLOBL ("poly1305_init");
205
+TYPE ("poly1305_init","\@function");
206
+ALIGN (16);
207
+LABEL ("poly1305_init");
208
+ lghi ("%r0",0);
209
+ lghi ("%r1",-1); # all ones, turned into the clamp masks below
210
+ stg ("%r0","0($ctx)"); # zero hash value / acc1
211
+ stg ("%r0","8($ctx)");
212
+ stg ("%r0","16($ctx)");
213
+
214
+&{$z? \&clgr:\&clr} ($key,"%r0"); # key pointer == 0? (pointer-width compare)
215
+ je (".Ldone"); # no key: nothing more to set up
216
+
217
+ lrvg ("%r4","0($key)"); # load little-endian key
218
+ lrvg ("%r5","8($key)");
219
+
220
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
221
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
222
+ srlg ("%r1","%r1",4);
223
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
224
+
225
+ ngr ("%r4","%r0"); # clamp key
226
+ ngr ("%r5","%r1");
227
+
228
+ stg ("%r4","32($ctx)"); # store clamped key (r[2] in the novx layout)
229
+ stg ("%r5","40($ctx)");
230
+
231
+ larl ("%r1","OPENSSL_s390xcap_P");
232
+ lg ("%r0","16(%r1)"); # doubleword at offset 16 of the capability vector
233
+ tmhh ("%r0",0x4000); # check for vector facility
234
+ jz (".Ldone"); # not available: keep the novx layout
235
+
236
+ larl ("%r4","poly1305_blocks_vx");
237
+ larl ("%r5","poly1305_emit_vx");
238
+
239
+&{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)"); # save %r6-%r13, overwritten by ST_R5R/MUL_RKEY
240
+&{$z? \&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)"); # store both vx entry points at ctx+232 ($z=1) or ctx+228 ($z=0); presumably the caller's function-pointer slots - confirm against the C context layout
241
+
242
+ lg ($r0,"32($ctx)"); # ($r0,$r1,$r2) = r
243
+ lg ($r1,"40($ctx)");
244
+ lghi ($r2,0);
245
+
246
+ ST_R5R (48,68); # store r,5*r
247
+
248
+ MUL_RKEY(); # r^2
249
+ ST_R5R (88,108); # store r^2,5*r^2
250
+
251
+ MUL_RKEY(); # r^3
252
+ MUL_RKEY(); # r^4
253
+ ST_R5R (128,148); # store r^4,5*r^4
254
+
255
+ lghi ("%r0",0);
256
+ stg ("%r0","24($ctx)"); # zero acc2
257
+ stg ("%r0","32($ctx)"); # overwrites the key copy at 32/40, which MUL_RKEY no longer needs
258
+ stg ("%r0","40($ctx)");
259
+
260
+&{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)"); # restore %r6-%r13
261
+ lghi ("%r2",1); # return 1 (vx path set up); NOTE(review): meaning to the C caller not visible here - confirm
262
+ br ("%r14");
263
+
264
+LABEL (".Ldone");
265
+ lghi ("%r2",0); # return 0
266
+ br ("%r14");
267
+SIZE ("poly1305_init",".-poly1305_init");
269
+}
270
+
271
+# VX CODE PATH
272
{
273
-my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
274
-my ($r0,$r1,$s1) = map("%r$_",(0..2));
275
+my $frame=8*16;
276
+my @m01=map("%v$_",(0..4));
277
+my @m23=map("%v$_",(5..9));
278
+my @tmp=@m23;
279
+my @acc=map("%v$_",(10..14));
280
+my @r=map("%v$_",(15..19));
281
+my @r5=map("%v$_",(20..24));
282
+my $padvec="%v26";
283
+my $mask4="%v27";
284
+my @vperm=map("%v$_",(28..30));
285
+my $mask="%v31";
286
+
287
+sub REDUCE { # propagate carries across the five base 2^26 limb vectors @acc[0..4]; uses $mask and clobbers @tmp[0..4] (which alias @m23)
287
+ vesrlg (@tmp[0],@acc[0],26); # carry = limb >> 26 (per doubleword)
288
+ vesrlg (@tmp[3],@acc[3],26);
289
+ vn (@acc[0],@acc[0],$mask); # keep the low 26 bits
290
+ vn (@acc[3],@acc[3],$mask);
291
+ vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
292
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
293
+
294
+ vesrlg (@tmp[1],@acc[1],26);
295
+ vesrlg (@tmp[4],@acc[4],26);
296
+ vn (@acc[1],@acc[1],$mask);
297
+ vn (@acc[4],@acc[4],$mask);
298
+ veslg (@tmp[0],@tmp[4],2); # 4 * (carry out of limb 4)
299
+ vag (@tmp[4],@tmp[4],@tmp[0]); # carry out of limb 4, times 5
300
+ vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
301
+ vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0
302
+
303
+ vesrlg (@tmp[2],@acc[2],26);
304
+ vesrlg (@tmp[0],@acc[0],26);
305
+ vn (@acc[2],@acc[2],$mask);
306
+ vn (@acc[0],@acc[0],$mask);
307
+ vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
308
+ vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
309
+
310
+ vesrlg (@tmp[3],@acc[3],26);
311
+ vn (@acc[3],@acc[3],$mask);
312
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
313
+}
315
316
-$code.=<<___;
317
-.globl poly1305_blocks
318
-.type poly1305_blocks,\@function
319
-.align 16
320
-poly1305_blocks:
321
- srl${g} $len,4 # fixed-up in 64-bit build
322
- lghi %r0,0
323
- cl${g}r $len,%r0
324
- je .Lno_data
325
+################
326
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
327
+# size_t len, u32 padbit)
328
+{
329
+my ($ctx,$inp,$len) = map("%r$_",(2..4));
330
+my $padbit="%r0";
331
+
332
+GLOBL ("poly1305_blocks_vx");
333
+TYPE ("poly1305_blocks_vx","\@function");
334
+ALIGN (16);
335
+LABEL ("poly1305_blocks_vx");
336
+if ($z) {
337
+ aghi ($sp,-$frame);
338
+ vstm ("%v8","%v15","0($sp)");
339
+} else {
340
+ std ("%f4","16*$SIZE_T+2*8($sp)");
341
+ std ("%f6","16*$SIZE_T+3*8($sp)");
342
+ llgfr ($len,$len);
343
+}
344
+ llgfr ($padbit,"%r5");
345
+ vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
346
+ larl ("%r5",".Lconst");
347
+ vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
348
+ sllg ($padbit,$padbit,24);
349
+ vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
350
+ vgbm ($mask4,0x0707);
351
+ vlvgp ($padvec,$padbit,$padbit);
352
+
353
+ srlg ("%r1",$len,6);
354
+ ltgr ("%r1","%r1");
355
+ jz (".Lvx_4x_done");
356
+
357
+ALIGN (16);
358
+LABEL (".Lvx_4x");
359
+ vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
360
+
361
+ # m01,m23 -> base 2^26
362
+
363
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
364
+ vperm (@m23[0],"%v22","%v23",@vperm[0]);
365
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
366
+ vperm (@m23[2],"%v22","%v23",@vperm[1]);
367
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
368
+ vperm (@m23[4],"%v22","%v23",@vperm[2]);
369
+
370
+ vesrlg (@m01[1],@m01[0],26);
371
+ vesrlg (@m23[1],@m23[0],26);
372
+ vesrlg (@m01[3],@m01[2],30);
373
+ vesrlg (@m23[3],@m23[2],30);
374
+ vesrlg (@m01[2],@m01[2],4);
375
+ vesrlg (@m23[2],@m23[2],4);
376
+
377
+ vn (@m01[4],@m01[4],$mask4);
378
+ vn (@m23[4],@m23[4],$mask4);
379
+for (0..3) {
380
+ vn (@m01[$_],@m01[$_],$mask);
381
+ vn (@m23[$_],@m23[$_],$mask);
382
+}
383
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
384
+ vaf (@m23[4],@m23[4],$padvec); # pad m23
385
+
386
+ # acc = acc * r^4 + m01 * r^2 + m23
387
+
388
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
389
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
390
+
391
+ vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
392
+ vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
393
+ vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
394
+ vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
395
+ vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
396
+
397
+ vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
398
+ vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
399
+ vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
400
+ vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
401
+ vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
402
+
403
+ vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
404
+ vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
405
+ vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
406
+ vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
407
+ vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
408
+
409
+ vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
410
+ vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
411
+ vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
412
+ vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
413
+ vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
414
+
415
+ vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
416
+ vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
417
+ vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
418
+ vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
419
+ vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
420
+
421
+ vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
422
+ vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
423
+
424
+ vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
425
+ vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
426
+ vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
427
+ vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
428
+ vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
429
+
430
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
431
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
432
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
433
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
434
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
435
+
436
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
437
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
438
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
439
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
440
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
441
+
442
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
443
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
444
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
445
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
446
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
447
+
448
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
449
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
450
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
451
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
452
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
453
+
454
+ REDUCE ();
455
+
456
+ la ($inp,"64($inp)");
457
+ brctg ("%r1",".Lvx_4x");
458
+
459
+ALIGN (16);
460
+LABEL (".Lvx_4x_done");
461
+ tml ($len,32);
462
+ jz (".Lvx_2x_done");
463
+
464
+ vlm ("%v20","%v21","0($inp)"); # load m0,m1
465
+
466
+ # m01 -> base 2^26
467
+
468
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
469
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
470
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
471
+
472
+ vesrlg (@m01[1],@m01[0],26);
473
+ vesrlg (@m01[3],@m01[2],30);
474
+ vesrlg (@m01[2],@m01[2],4);
475
+
476
+ vn (@m01[4],@m01[4],$mask4);
477
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
478
+
479
+ vaf (@m01[4],@m01[4],$padvec); # pad m01
480
+
481
+ # acc = acc * r^2+ m01
482
+
483
+ vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
484
+ vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
485
+
486
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
487
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
488
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
489
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
490
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
491
+
492
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
493
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
494
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
495
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
496
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
497
+
498
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
499
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
500
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
501
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
502
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
503
+
504
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
505
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
506
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
507
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
508
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
509
+
510
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
511
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
512
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
513
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
514
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
515
+
516
+ REDUCE ();
517
+
518
+ la ($inp,"32($inp)");
519
+
520
+ALIGN (16);
521
+LABEL (".Lvx_2x_done");
522
+ tml ($len,16);
523
+ jz (".Lvx_done");
524
+
525
+ vleig ($padvec,0,0);
526
+
527
+ vzero ("%v20");
528
+ vl ("%v21","0($inp)"); # load m0
529
+
530
+ # m0 -> base 2^26
531
+
532
+ vperm (@m01[0],"%v20","%v21",@vperm[0]);
533
+ vperm (@m01[2],"%v20","%v21",@vperm[1]);
534
+ vperm (@m01[4],"%v20","%v21",@vperm[2]);
535
+
536
+ vesrlg (@m01[1],@m01[0],26);
537
+ vesrlg (@m01[3],@m01[2],30);
538
+ vesrlg (@m01[2],@m01[2],4);
539
+
540
+ vn (@m01[4],@m01[4],$mask4);
541
+ vn (@m01[$_],@m01[$_],$mask) for (0..3);
542
+
543
+ vaf (@m01[4],@m01[4],$padvec); # pad m0
544
+
545
+ # acc = acc * r + m01
546
+
547
+ vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
548
+ vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
549
+
550
+ vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
551
+ vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
552
+ vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
553
+ vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
554
+ vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
555
+
556
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
557
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
558
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
559
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
560
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
561
+
562
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
563
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
564
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
565
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
566
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
567
+
568
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
569
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
570
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
571
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
572
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
573
+
574
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
575
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
576
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
577
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
578
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
579
+
580
+ REDUCE ();
581
+
582
+ALIGN (16);
583
+LABEL (".Lvx_done");
584
+ vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc
585
+ vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4);
586
+
587
+if ($z) {
588
+ vlm ("%v8","%v15","0($sp)");
589
+ la ($sp,"$frame($sp)");
590
+} else {
591
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
592
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
593
+}
594
+ br ("%r14");
595
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
596
+}
597
598
- stm${g} %r6,%r14,`6*$SIZE_T`($sp)
599
+################
600
+# static void poly1305_emit_vx(void *ctx, unsigned char mac[16],
601
+# const u32 nonce[4])
602
+{ # poly1305_emit_vx: combine acc1/acc2, reduce modulo 2^130-5, add the nonce and store the 16-byte mac
603
+my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
604
+
605
+GLOBL ("poly1305_emit_vx");
606
+TYPE ("poly1305_emit_vx","\@function");
607
+ALIGN (16);
608
+LABEL ("poly1305_emit_vx");
609
+if ($z) { # zSeries ABI: allocate $frame bytes and save %v8-%v15
610
+ aghi ($sp,-$frame);
611
+ vstm ("%v8","%v15","0($sp)");
612
+} else { # S/390 ABI: save %f4 and %f6 in the caller-provided frame
613
+ std ("%f4","16*$SIZE_T+2*8($sp)");
614
+ std ("%f6","16*$SIZE_T+3*8($sp)");
615
+}
616
+ larl ("%r5",".Lconst"); # %r5 -> constant table
617
618
- llgfr $padbit,$padbit # clear upper half, much needed with
619
+ vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1 (word element 1)
620
+ vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2 (word element 3)
621
+ vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2
622
+ vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2
623
+ vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r
624
+ vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r
625
+ vl ($mask,"48(%r5)"); # load mask (2^26-1 in each doubleword)
626
+
627
+ # acc = acc1 * r^2 + acc2 * r (one product per doubleword, summed below)
628
+
629
+ vmlof (@tmp[0],@acc[4],@r5[1]); # multiplies the odd word elements (1 and 3) loaded above
630
+ vmlof (@tmp[1],@acc[4],@r5[2]);
631
+ vmlof (@tmp[2],@acc[4],@r5[3]);
632
+ vmlof (@tmp[3],@acc[4],@r5[4]);
633
+ vmlof (@tmp[4],@acc[4],@r[0]);
634
+
635
+ vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
636
+ vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
637
+ vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
638
+ vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
639
+ vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
640
+
641
+ vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
642
+ vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
643
+ vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
644
+ vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
645
+ vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
646
+
647
+ vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
648
+ vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
649
+ vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
650
+ vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
651
+ vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
652
+
653
+ vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
654
+ vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
655
+ vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
656
+ vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
657
+ vmalof (@acc[0],@acc[0],@r[0],@tmp[0]); # @acc[0] is written last because the four lines above still read it
658
+
659
+ vzero ("%v27");
660
+ vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4); # add the two doublewords of each limb into one quadword
661
+
662
+ REDUCE ();
663
+
664
+ vesrlg (@tmp[1],@acc[1],26);
665
+ vn (@acc[1],@acc[1],$mask);
666
+ vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
667
+
668
+ vesrlg (@tmp[2],@acc[2],26);
669
+ vn (@acc[2],@acc[2],$mask);
670
+ vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
671
+
672
+ vesrlg (@tmp[3],@acc[3],26);
673
+ vn (@acc[3],@acc[3],$mask);
674
+ vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
675
+
676
+ # acc -> base 2^64
677
+ vleib ("%v30",6*8,7); # byte-shift count: 48 bits
678
+ vleib ("%v29",13*8,7); # byte-shift count: 104 bits
679
+ vleib ("%v28",3*8,7); # byte-shift count: 24 bits
680
+
681
+ veslg (@acc[1],@acc[1],26);
682
+ veslg (@acc[3],@acc[3],26);
683
+ vo (@acc[0],@acc[0],@acc[1]); # limb0 | limb1<<26
684
+ vo (@acc[2],@acc[2],@acc[3]); # limb2 | limb3<<26
685
+
686
+ veslg (@acc[2],@acc[2],4); # 4 bits here plus 48 bits below
687
+ vslb (@acc[2],@acc[2],"%v30"); # <<52
688
+ vo (@acc[0],@acc[0],@acc[2]);
689
+
690
+ vslb (@tmp[4],@acc[4],"%v29"); # <<104
691
+ vo (@acc[0],@acc[0],@tmp[4]); # @acc[0] = low 128 bits of acc
692
+
693
+ vsrlb (@acc[1],@acc[4],"%v28"); # >>24: @acc[1] = bits of acc at 2^128 and above
694
+
695
+ # acc %= 2^130-5
696
+ vone ("%v26");
697
+ vleig ("%v27",5,1); # %v27 = 5 (upper doubleword is still zero from vzero above)
698
+ vone ("%v29"); # %v29 = -1
699
+ vleig ("%v26",-4,1); # %v26 = -4 as a 128-bit value
700
+
701
+ vaq (@tmp[0],@acc[0],"%v27"); # low 128 bits of acc+5
702
+ vaccq (@tmp[1],@acc[0],"%v27"); # carry out of that addition
703
+
704
+ vaq (@tmp[1],@tmp[1],"%v26"); # carry-4
705
+ vaccq (@tmp[1],@tmp[1],@acc[1]); # 1 if acc+5 reaches 2^130, else 0
706
+
707
+ vaq (@tmp[1],@tmp[1],"%v29"); # select mask: 0 if acc+5 reached 2^130, else all ones
708
+
709
+ vn (@tmp[2],@tmp[1],@acc[0]); # acc where mask is set
710
+ vnc (@tmp[3],@tmp[0],@tmp[1]); # acc+5 where mask is clear
711
+ vo (@acc[0],@tmp[2],@tmp[3]);
712
+
713
+ # acc += nonce
714
+ vl (@vperm[0],"64(%r5)"); # load the byte-reversing vperm operand
715
+ vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3); # nonce[0] goes into the least significant word
716
+
717
+ vaq (@acc[0],@acc[0],@tmp[0]); # 128-bit add, carry out discarded
718
+
719
+ vperm (@acc[0],@acc[0],@acc[0],@vperm[0]); # reverse bytes for little-endian output
720
+ vst (@acc[0],"0($mac)"); # store mac
721
+
722
+if ($z) { # restore %v8-%v15 and release the frame
723
+ vlm ("%v8","%v15","0($sp)");
724
+ la ($sp,"$frame($sp)");
725
+} else { # restore %f4 and %f6
726
+ ld ("%f4","16*$SIZE_T+2*8($sp)");
727
+ ld ("%f6","16*$SIZE_T+3*8($sp)");
728
+}
729
+ br ("%r14");
730
+SIZE ("poly1305_emit_vx",".-poly1305_emit_vx");
731
+}
732
+}
733
+
734
+# NOVX CODE PATH
735
+{
736
+################
737
+# static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
738
+# u32 padbit)
739
+{
740
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
741
+
742
+my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
743
+my ($r0,$r1,$s1) = map("%r$_",(0..2));
744
+GLOBL ("poly1305_blocks");
745
+TYPE ("poly1305_blocks","\@function");
746
+ALIGN (16);
747
+LABEL ("poly1305_blocks");
748
+$z? srlg ($len,$len,4) :srl ($len,4);
749
+ lghi ("%r0",0);
750
+&{$z? \&clgr:\&clr} ($len,"%r0");
751
+ je (".Lno_data");
752
+
753
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
754
+
755
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
756
# non-64-bit ABI
757
- lg $r0,32($ctx) # load key
758
- lg $r1,40($ctx)
759
+ lg ($r0,"32($ctx)"); # load key
760
+ lg ($r1,"40($ctx)");
761
762
- lg $h0,0($ctx) # load hash value
763
- lg $h1,8($ctx)
764
- lg $h2,16($ctx)
765
+ lg ($h0,"0($ctx)"); # load hash value
766
+ lg ($h1,"8($ctx)");
767
+ lg ($h2,"16($ctx)");
768
769
- st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
770
- srlg $s1,$r1,2
771
- algr $s1,$r1 # s1 = r1 + r1>>2
772
- j .Loop
773
+&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
774
+ srlg ($s1,$r1,2);
775
+ algr ($s1,$r1); # s1 = r1 + r1>>2
776
+ j (".Loop");
777
778
-.align 16
779
-.Loop:
780
- lrvg $d0lo,0($inp) # load little-endian input
781
- lrvg $d1lo,8($inp)
782
- la $inp,16($inp)
783
+ALIGN (16);
784
+LABEL (".Loop");
785
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
786
+ lrvg ($d1lo,"8($inp)");
787
+ la ($inp,"16($inp)");
788
789
- algr $d0lo,$h0 # accumulate input
790
- alcgr $d1lo,$h1
791
+ algr ($d0lo,$h0); # accumulate input
792
+ alcgr ($d1lo,$h1);
793
794
- lgr $h0,$d0lo
795
- mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
796
- lgr $h1,$d1lo
797
- mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
798
+ lgr ($h0,$d0lo);
799
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
800
+ lgr ($h1,$d1lo);
801
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
802
803
- mlgr $t0,$r1 # h0*r1 -> $t0:$h0
804
- mlgr $t1,$r0 # h1*r0 -> $t1:$h1
805
- alcgr $h2,$padbit
806
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
807
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
808
+ alcgr ($h2,$padbit);
809
810
- algr $d0lo,$d1lo
811
- lgr $d1lo,$h2
812
- alcgr $d0hi,$d1hi
813
- lghi $d1hi,0
814
+ algr ($d0lo,$d1lo);
815
+ lgr ($d1lo,$h2);
816
+ alcgr ($d0hi,$d1hi);
817
+ lghi ($d1hi,0);
818
819
- algr $h1,$h0
820
- alcgr $t1,$t0
821
+ algr ($h1,$h0);
822
+ alcgr ($t1,$t0);
823
824
- msgr $d1lo,$s1 # h2*s1
825
- msgr $h2,$r0 # h2*r0
826
+ msgr ($d1lo,$s1); # h2*s1
827
+ msgr ($h2,$r0); # h2*r0
828
829
- algr $h1,$d1lo
830
- alcgr $t1,$d1hi # $d1hi is zero
831
+ algr ($h1,$d1lo);
832
+ alcgr ($t1,$d1hi); # $d1hi is zero
833
834
- algr $h1,$d0hi
835
- alcgr $h2,$t1
836
+ algr ($h1,$d0hi);
837
+ alcgr ($h2,$t1);
838
839
- lghi $h0,-4 # final reduction step
840
- ngr $h0,$h2
841
- srlg $t0,$h2,2
842
- algr $h0,$t0
843
- lghi $t1,3
844
- ngr $h2,$t1
845
+ lghi ($h0,-4); # final reduction step
846
+ ngr ($h0,$h2);
847
+ srlg ($t0,$h2,2);
848
+ algr ($h0,$t0);
849
+ lghi ($t1,3);
850
+ ngr ($h2,$t1);
851
852
- algr $h0,$d0lo
853
- alcgr $h1,$d1hi # $d1hi is still zero
854
- alcgr $h2,$d1hi # $d1hi is still zero
855
+ algr ($h0,$d0lo);
856
+ alcgr ($h1,$d1hi); # $d1hi is still zero
857
+ alcgr ($h2,$d1hi); # $d1hi is still zero
858
859
- brct$g $len,.Loop
860
+&{$z? \&brctg:\&brct} ($len,".Loop");
861
862
- l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
863
+&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
864
865
- stg $h0,0($ctx) # store hash value
866
- stg $h1,8($ctx)
867
- stg $h2,16($ctx)
868
+ stg ($h0,"0($ctx)"); # store hash value
869
+ stg ($h1,"8($ctx)");
870
+ stg ($h2,"16($ctx)");
871
872
- lm${g} %r6,%r14,`6*$SIZE_T`($sp)
873
-.Lno_data:
874
- br %r14
875
-.size poly1305_blocks,.-poly1305_blocks
876
-___
877
+&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)");
878
+LABEL (".Lno_data");
879
+ br ("%r14");
880
+SIZE ("poly1305_blocks",".-poly1305_blocks");
881
}
882
+
883
+################
884
+# static void poly1305_emit(void *ctx, unsigned char mac[16],
885
+# const u32 nonce[4])
886
{
887
-my ($mac,$nonce)=($inp,$len);
888
+my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
889
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
890
891
-$code.=<<___;
892
-.globl poly1305_emit
893
-.type poly1305_emit,\@function
894
-.align 16
895
-poly1305_emit:
896
- stm${g} %r6,%r9,`6*$SIZE_T`($sp)
897
-
898
- lg $h0,0($ctx)
899
- lg $h1,8($ctx)
900
- lg $h2,16($ctx)
901
-
902
- lghi %r0,5
903
- lghi %r1,0
904
- lgr $d0,$h0
905
- lgr $d1,$h1
906
-
907
- algr $h0,%r0 # compare to modulus
908
- alcgr $h1,%r1
909
- alcgr $h2,%r1
910
-
911
- srlg $h2,$h2,2 # did it borrow/carry?
912
- slgr %r1,$h2 # 0-$h2>>2
913
- lg $h2,0($nonce) # load nonce
914
- lghi %r0,-1
915
- lg $ctx,8($nonce)
916
- xgr %r0,%r1 # ~%r1
917
-
918
- ngr $h0,%r1
919
- ngr $d0,%r0
920
- ngr $h1,%r1
921
- ngr $d1,%r0
922
- ogr $h0,$d0
923
- rllg $d0,$h2,32 # flip nonce words
924
- ogr $h1,$d1
925
- rllg $d1,$ctx,32
926
-
927
- algr $h0,$d0 # accumulate nonce
928
- alcgr $h1,$d1
929
-
930
- strvg $h0,0($mac) # write little-endian result
931
- strvg $h1,8($mac)
932
-
933
- lm${g} %r6,%r9,`6*$SIZE_T`($sp)
934
- br %r14
935
-.size poly1305_emit,.-poly1305_emit
936
-
937
-.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
938
-___
939
-}
940
-
941
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
942
-$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
943
-
944
-print $code;
945
-close STDOUT or die "error closing STDOUT: $!";
946
+GLOBL ("poly1305_emit");
947
+TYPE ("poly1305_emit","\@function");
948
+ALIGN (16);
949
+LABEL ("poly1305_emit");
950
+&{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)");
951
+
952
+ lg ($h0,"0($ctx)");
953
+ lg ($h1,"8($ctx)");
954
+ lg ($h2,"16($ctx)");
955
+
956
+ lghi ("%r0",5);
957
+ lghi ("%r1",0);
958
+ lgr ($d0,$h0);
959
+ lgr ($d1,$h1);
960
+
961
+ algr ($h0,"%r0"); # compare to modulus
962
+ alcgr ($h1,"%r1");
963
+ alcgr ($h2,"%r1");
964
+
965
+ srlg ($h2,$h2,2); # did it borrow/carry?
966
+ slgr ("%r1",$h2); # 0-$h2>>2
967
+ lg ($h2,"0($nonce)"); # load nonce
968
+ lghi ("%r0",-1);
969
+ lg ($ctx,"8($nonce)");
970
+ xgr ("%r0","%r1"); # ~%r1
971
+
972
+ ngr ($h0,"%r1");
973
+ ngr ($d0,"%r0");
974
+ ngr ($h1,"%r1");
975
+ ngr ($d1,"%r0");
976
+ ogr ($h0,$d0);
977
+ rllg ($d0,$h2,32); # flip nonce words
978
+ ogr ($h1,$d1);
979
+ rllg ($d1,$ctx,32);
980
+
981
+ algr ($h0,$d0); # accumulate nonce
982
+ alcgr ($h1,$d1);
983
+
984
+ strvg ($h0,"0($mac)"); # write little-endian result
985
+ strvg ($h1,"8($mac)");
986
+
987
+&{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)");
988
+ br ("%r14");
989
+SIZE ("poly1305_emit",".-poly1305_emit");
990
+}
991
+}
992
+################
993
+
994
+ALIGN (128);
995
+LABEL (".Lconst");
996
+LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]]
997
+LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]]
998
+LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]]
999
+LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1]
1000
+LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian
1001
+STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
1002
+
1003
+PERLASM_END();
1004
--
1005
2.21.0
1006
1007