File 0006-s390x-assembly-pack-import-poly-from-cryptogams-repo.patch of Package openssl-1_1
From 2e6b615f795e8ca8ae830a00079c4ea064eaae42 Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Sat, 23 Mar 2019 00:03:24 +0100
Subject: [PATCH] s390x assembly pack: import poly from cryptogams repo

>=20% faster than present code.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8560)
---
crypto/poly1305/asm/poly1305-s390x.pl | 1455 ++++++++++++++-----------
crypto/poly1305/build.info | 1 +
2 files changed, 799 insertions(+), 657 deletions(-)

Index: openssl-1.1.1c/crypto/poly1305/asm/poly1305-s390x.pl
===================================================================
--- openssl-1.1.1c.orig/crypto/poly1305/asm/poly1305-s390x.pl 2019-06-06 12:18:53.384309579 +0200
+++ openssl-1.1.1c/crypto/poly1305/asm/poly1305-s390x.pl 2019-06-06 12:18:54.556316994 +0200
# Copyright IBM Corp. 2019
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>

+#
+# January 2019
+#
+# Add vector base 2^26 implementation. It's problematic to accurately
+# measure performance, because reference system is hardly idle. But
+# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
+# >=20% faster than IBM's submission on long inputs, and much faster on
+# short ones, because calculation of key powers is postponed till we
+# know that input is long enough to justify the additional overhead.
+
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
-use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL);
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);

my $flavour = shift;

my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

+my $stdframe=16*$SIZE_T+4*8;
my $sp="%r15";

51
-# novx code path ctx layout
52
-# ---------------------------------
53
-# var value base off
54
-# ---------------------------------
55
-# u64 h[3] hash 2^64 0
56
-# u32 pad[2]
57
-# u64 r[2] key 2^64 32
58
-
59
-# vx code path ctx layout
60
-# ---------------------------------
61
-# var value base off
62
-# ---------------------------------
63
-# u32 acc1[5] r^2-acc 2^26 0
64
-# u32 pad
65
-# u32 acc2[5] r-acc 2^26 24
66
-# u32 pad
67
-# u32 r1[5] r 2^26 48
68
-# u32 r15[5] 5*r 2^26 68
69
-# u32 r2[5] r^2 2^26 88
70
-# u32 r25[5] 5*r^2 2^26 108
71
-# u32 r4[5] r^4 2^26 128
72
-# u32 r45[5] 5*r^4 2^26 148
73
+my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
74
75
PERLASM_BEGIN($output);
76
77
+INCLUDE ("s390x_arch.h");
78
TEXT ();
79
80
################
81
# static void poly1305_init(void *ctx, const unsigned char key[16])
82
{
83
-my ($ctx,$key)=map("%r$_",(2..3));
84
-my ($r0,$r1,$r2)=map("%r$_",(9,11,13));
85
-
86
-sub MUL_RKEY { # r*=key
87
-my ($d0hi,$d0lo,$d1hi,$d1lo)=map("%r$_",(4..7));
88
-my ($t0,$t1,$s1)=map("%r$_",(8,10,12));
89
-
90
- lg ("%r0","32($ctx)");
91
- lg ("%r1","40($ctx)");
92
-
93
- srlg ($s1,"%r1",2);
94
- algr ($s1,"%r1");
95
-
96
- lgr ($d0lo,$r0);
97
- lgr ($d1lo,$r1);
98
-
99
- mlgr ($d0hi,"%r0");
100
- lgr ($r1,$d1lo);
101
- mlgr ($d1hi,$s1);
102
-
103
- mlgr ($t0,"%r1");
104
- mlgr ($t1,"%r0");
105
-
106
- algr ($d0lo,$d1lo);
107
- lgr ($d1lo,$r2);
108
- alcgr ($d0hi,$d1hi);
109
- lghi ($d1hi,0);
110
-
111
- algr ($r1,$r0);
112
- alcgr ($t1,$t0);
113
-
114
- msgr ($d1lo,$s1);
115
- msgr ($r2,"%r0");
116
-
117
- algr ($r1,$d1lo);
118
- alcgr ($t1,$d1hi);
119
-
120
- algr ($r1,$d0hi);
121
- alcgr ($r2,$t1);
122
-
123
- lghi ($r0,-4);
124
- ngr ($r0,$r2);
125
- srlg ($t0,$r2,2);
126
- algr ($r0,$t0);
127
- lghi ($t1,3);
128
- ngr ($r2,$t1);
129
-
130
- algr ($r0,$d0lo);
131
- alcgr ($r1,$d1hi);
132
- alcgr ($r2,$d1hi);
133
-}
134
-
135
-sub ST_R5R { # store r,5*r -> base 2^26
136
-my @d=map("%r$_",(4..8));
137
-my @off=@_;
138
-
139
- lgr (@d[2],$r0);
140
- lr ("%r1",@d[2]);
141
- nilh ("%r1",1023);
142
- lgr (@d[3],$r1);
143
- lr (@d[0],"%r1");
144
- srlg ("%r1",@d[2],52);
145
- lgr (@d[4],$r2);
146
- srlg ("%r0",@d[2],26);
147
- sll (@d[4],24);
148
- lr (@d[2],@d[3]);
149
- nilh ("%r0",1023);
150
- sll (@d[2],12);
151
- lr (@d[1],"%r0");
152
- &or (@d[2],"%r1");
153
- srlg ("%r1",@d[3],40);
154
- nilh (@d[2],1023);
155
- &or (@d[4],"%r1");
156
- srlg (@d[3],@d[3],14);
157
- nilh (@d[4],1023);
158
- nilh (@d[3],1023);
159
-
160
- stm (@d[0],@d[4],"@off[0]($ctx)");
161
- mhi (@d[$_],5) for (0..4);
162
- stm (@d[0],@d[4],"@off[1]($ctx)");
163
-}
164
-
165
GLOBL ("poly1305_init");
166
TYPE ("poly1305_init","\@function");
167
ALIGN (16);
168
LABEL ("poly1305_init");
169
lghi ("%r0",0);
170
lghi ("%r1",-1);
171
- stg ("%r0","0($ctx)"); # zero hash value / acc1
172
+ stg ("%r0","0($ctx)"); # zero hash value
173
stg ("%r0","8($ctx)");
174
stg ("%r0","16($ctx)");
175
+ st ("%r0","24($ctx)"); # clear is_base2_26
176
+ lgr ("%r5",$ctx); # reassign $ctx
177
+ lghi ("%r2",0);
178
179
-&{$z? \&clgr:\&clr} ($key,"%r0");
180
- je (".Ldone");
181
+&{$z? \&clgr:\&clr} ($inp,"%r0");
182
+ je (".Lno_key");
183
184
- lrvg ("%r4","0($key)"); # load little-endian key
185
- lrvg ("%r5","8($key)");
186
+ lrvg ("%r2","0($inp)"); # load little-endian key
187
+ lrvg ("%r3","8($inp)");
188
189
- nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
190
- srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
191
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
192
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
193
srlg ("%r1","%r1",4);
194
- nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
195
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
196
197
- ngr ("%r4","%r0");
198
- ngr ("%r5","%r1");
199
+ ngr ("%r2","%r0");
200
+ ngr ("%r3","%r1");
201
202
- stg ("%r4","32($ctx)");
203
- stg ("%r5","40($ctx)");
204
+ stmg ("%r2","%r3","32(%r5)");
205
206
larl ("%r1","OPENSSL_s390xcap_P");
207
lg ("%r0","16(%r1)");
208
- tmhh ("%r0",0x4000); # check for vector facility
209
- jz (".Ldone");
210
-
211
- larl ("%r4","poly1305_blocks_vx");
212
- larl ("%r5","poly1305_emit_vx");
213
-
214
-&{$z? \&stmg:\&stm} ("%r6","%r13","6*$SIZE_T($sp)");
215
-&{$z? \&stmg:\&stm} ("%r4","%r5","4*$z+228($ctx)");
216
-
217
- lg ($r0,"32($ctx)");
218
- lg ($r1,"40($ctx)");
219
- lghi ($r2,0);
220
-
221
- ST_R5R (48,68); # store r,5*r
222
-
223
- MUL_RKEY();
224
- ST_R5R (88,108); # store r^2,5*r^2
225
-
226
- MUL_RKEY();
227
- MUL_RKEY();
228
- ST_R5R (128,148); # store r^4,5*r^4
229
-
230
- lghi ("%r0",0);
231
- stg ("%r0","24($ctx)"); # zero acc2
232
- stg ("%r0","32($ctx)");
233
- stg ("%r0","40($ctx)");
234
-
235
-&{$z? \&lmg:\&lm} ("%r6","%r13","6*$SIZE_T($sp)");
236
+ srlg ("%r0","%r0",62);
237
+ nill ("%r0",1); # extract vx bit
238
+ lcgr ("%r0","%r0");
239
+ larl ("%r1",".Lpoly1305_blocks");
240
+ larl ("%r2",".Lpoly1305_blocks_vx");
241
+ larl ("%r3",".Lpoly1305_emit");
242
+&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector
243
+&{$z? \&ngr:\&nr} ("%r2","%r0");
244
+&{$z? \&xgr:\&xr} ("%r2","%r1");
245
+&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)");
246
lghi ("%r2",1);
247
- br ("%r14");
248
-
249
-LABEL (".Ldone");
250
- lghi ("%r2",0);
251
+LABEL (".Lno_key");
252
br ("%r14");
253
SIZE ("poly1305_init",".-poly1305_init");
254
}
255
256
-# VX CODE PATH
257
-{
258
-my $frame=8*16;
259
-my @m01=map("%v$_",(0..4));
260
-my @m23=map("%v$_",(5..9));
261
-my @tmp=@m23;
262
-my @acc=map("%v$_",(10..14));
263
-my @r=map("%v$_",(15..19));
264
-my @r5=map("%v$_",(20..24));
265
-my $padvec="%v26";
266
-my $mask4="%v27";
267
-my @vperm=map("%v$_",(28..30));
268
-my $mask="%v31";
269
-
270
-sub REDUCE {
271
- vesrlg (@tmp[0],@acc[0],26);
272
- vesrlg (@tmp[3],@acc[3],26);
273
- vn (@acc[0],@acc[0],$mask);
274
- vn (@acc[3],@acc[3],$mask);
275
- vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
276
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
277
-
278
- vesrlg (@tmp[1],@acc[1],26);
279
- vesrlg (@tmp[4],@acc[4],26);
280
- vn (@acc[1],@acc[1],$mask);
281
- vn (@acc[4],@acc[4],$mask);
282
- veslg (@tmp[0],@tmp[4],2);
283
- vag (@tmp[4],@tmp[4],@tmp[0]); # h[4]*=5
284
- vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
285
- vag (@acc[0],@acc[0],@tmp[4]); # carry 4->0
286
-
287
- vesrlg (@tmp[2],@acc[2],26);
288
- vesrlg (@tmp[0],@acc[0],26);
289
- vn (@acc[2],@acc[2],$mask);
290
- vn (@acc[0],@acc[0],$mask);
291
- vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
292
- vag (@acc[1],@acc[1],@tmp[0]); # carry 0->1
293
-
294
- vesrlg (@tmp[3],@acc[3],26);
295
- vn (@acc[3],@acc[3],$mask);
296
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
297
-}
298
-
299
################
300
-# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
301
-# size_t len, u32 padbit)
302
+# static void poly1305_blocks(void *ctx, const unsigned char *inp,
303
+# size_t len, u32 padbit)
304
{
305
-my ($ctx,$inp,$len) = map("%r$_",(2..4));
306
-my $padbit="%r0";
307
-
308
-GLOBL ("poly1305_blocks_vx");
309
-TYPE ("poly1305_blocks_vx","\@function");
310
-ALIGN (16);
311
-LABEL ("poly1305_blocks_vx");
312
-if ($z) {
313
- aghi ($sp,-$frame);
314
- vstm ("%v8","%v15","0($sp)");
315
-} else {
316
- std ("%f4","16*$SIZE_T+2*8($sp)");
317
- std ("%f6","16*$SIZE_T+3*8($sp)");
318
- llgfr ($len,$len);
319
-}
320
- llgfr ($padbit,"%r5");
321
- vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
322
- larl ("%r5",".Lconst");
323
- vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
324
- sllg ($padbit,$padbit,24);
325
- vlm (@vperm[0],$mask,"0(%r5)"); # load vperm ops, mask
326
- vgbm ($mask4,0x0707);
327
- vlvgp ($padvec,$padbit,$padbit);
328
-
329
- srlg ("%r1",$len,6);
330
- ltgr ("%r1","%r1");
331
- jz (".Lvx_4x_done");
332
-
333
-ALIGN (16);
334
-LABEL (".Lvx_4x");
335
- vlm ("%v20","%v23","0($inp)"); # load m0,m1,m2,m3
336
-
337
- # m01,m23 -> base 2^26
338
-
339
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
340
- vperm (@m23[0],"%v22","%v23",@vperm[0]);
341
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
342
- vperm (@m23[2],"%v22","%v23",@vperm[1]);
343
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
344
- vperm (@m23[4],"%v22","%v23",@vperm[2]);
345
-
346
- vesrlg (@m01[1],@m01[0],26);
347
- vesrlg (@m23[1],@m23[0],26);
348
- vesrlg (@m01[3],@m01[2],30);
349
- vesrlg (@m23[3],@m23[2],30);
350
- vesrlg (@m01[2],@m01[2],4);
351
- vesrlg (@m23[2],@m23[2],4);
352
-
353
- vn (@m01[4],@m01[4],$mask4);
354
- vn (@m23[4],@m23[4],$mask4);
355
-for (0..3) {
356
- vn (@m01[$_],@m01[$_],$mask);
357
- vn (@m23[$_],@m23[$_],$mask);
358
-}
359
- vaf (@m01[4],@m01[4],$padvec); # pad m01
360
- vaf (@m23[4],@m23[4],$padvec); # pad m23
361
-
362
- # acc = acc * r^4 + m01 * r^2 + m23
363
-
364
- vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
365
- vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
366
-
367
- vmalof (@tmp[0],@m01[4],@r5[1],@m23[0]);
368
- vmalof (@tmp[1],@m01[4],@r5[2],@m23[1]);
369
- vmalof (@tmp[2],@m01[4],@r5[3],@m23[2]);
370
- vmalof (@tmp[3],@m01[4],@r5[4],@m23[3]);
371
- vmalof (@tmp[4],@m01[4],@r[0],@m23[4]);
372
-
373
- vmalof (@tmp[0],@m01[3],@r5[2],@tmp[0]);
374
- vmalof (@tmp[1],@m01[3],@r5[3],@tmp[1]);
375
- vmalof (@tmp[2],@m01[3],@r5[4],@tmp[2]);
376
- vmalof (@tmp[3],@m01[3],@r[0],@tmp[3]);
377
- vmalof (@tmp[4],@m01[3],@r[1],@tmp[4]);
378
-
379
- vmalof (@tmp[0],@m01[2],@r5[3],@tmp[0]);
380
- vmalof (@tmp[1],@m01[2],@r5[4],@tmp[1]);
381
- vmalof (@tmp[2],@m01[2],@r[0],@tmp[2]);
382
- vmalof (@tmp[3],@m01[2],@r[1],@tmp[3]);
383
- vmalof (@tmp[4],@m01[2],@r[2],@tmp[4]);
384
-
385
- vmalof (@tmp[0],@m01[1],@r5[4],@tmp[0]);
386
- vmalof (@tmp[1],@m01[1],@r[0],@tmp[1]);
387
- vmalof (@tmp[2],@m01[1],@r[1],@tmp[2]);
388
- vmalof (@tmp[3],@m01[1],@r[2],@tmp[3]);
389
- vmalof (@tmp[4],@m01[1],@r[3],@tmp[4]);
390
-
391
- vmalof (@tmp[0],@m01[0],@r[0],@tmp[0]);
392
- vmalof (@tmp[1],@m01[0],@r[1],@tmp[1]);
393
- vmalof (@tmp[2],@m01[0],@r[2],@tmp[2]);
394
- vmalof (@tmp[3],@m01[0],@r[3],@tmp[3]);
395
- vmalof (@tmp[4],@m01[0],@r[4],@tmp[4]);
396
-
397
- vlrepf (@r5[$_],"4*$_+148($ctx)") for (0..4); # load 5*r^4
398
- vlrepf (@r[$_],"4*$_+128($ctx)") for (0..4); # load r^4
399
-
400
- vmalof (@tmp[0],@acc[4],@r5[1],@tmp[0]);
401
- vmalof (@tmp[1],@acc[4],@r5[2],@tmp[1]);
402
- vmalof (@tmp[2],@acc[4],@r5[3],@tmp[2]);
403
- vmalof (@tmp[3],@acc[4],@r5[4],@tmp[3]);
404
- vmalof (@tmp[4],@acc[4],@r[0],@tmp[4]);
405
-
406
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
407
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
408
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
409
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
410
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
411
-
412
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
413
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
414
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
415
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
416
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
417
-
418
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
419
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
420
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
421
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
422
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
423
-
424
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
425
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
426
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
427
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
428
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
429
-
430
- REDUCE ();
431
-
432
- la ($inp,"64($inp)");
433
- brctg ("%r1",".Lvx_4x");
434
-
435
-ALIGN (16);
436
-LABEL (".Lvx_4x_done");
437
- tml ($len,32);
438
- jz (".Lvx_2x_done");
439
-
440
- vlm ("%v20","%v21","0($inp)"); # load m0,m1
441
-
442
- # m01 -> base 2^26
443
-
444
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
445
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
446
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
447
-
448
- vesrlg (@m01[1],@m01[0],26);
449
- vesrlg (@m01[3],@m01[2],30);
450
- vesrlg (@m01[2],@m01[2],4);
451
-
452
- vn (@m01[4],@m01[4],$mask4);
453
- vn (@m01[$_],@m01[$_],$mask) for (0..3);
454
-
455
- vaf (@m01[4],@m01[4],$padvec); # pad m01
456
-
457
- # acc = acc * r^2+ m01
458
-
459
- vlrepf (@r5[$_],"4*$_+108($ctx)") for (0..4); # load 5*r^2
460
- vlrepf (@r[$_],"4*$_+88($ctx)") for (0..4); # load r^2
461
-
462
- vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
463
- vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
464
- vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
465
- vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
466
- vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
467
-
468
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
469
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
470
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
471
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
472
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
473
-
474
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
475
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
476
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
477
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
478
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
479
-
480
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
481
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
482
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
483
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
484
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
485
-
486
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
487
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
488
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
489
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
490
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
491
-
492
- REDUCE ();
493
-
494
- la ($inp,"32($inp)");
495
-
496
-ALIGN (16);
497
-LABEL (".Lvx_2x_done");
498
- tml ($len,16);
499
- jz (".Lvx_done");
500
-
501
- vleig ($padvec,0,0);
502
-
503
- vzero ("%v20");
504
- vl ("%v21","0($inp)"); # load m0
505
-
506
- # m0 -> base 2^26
507
-
508
- vperm (@m01[0],"%v20","%v21",@vperm[0]);
509
- vperm (@m01[2],"%v20","%v21",@vperm[1]);
510
- vperm (@m01[4],"%v20","%v21",@vperm[2]);
511
-
512
- vesrlg (@m01[1],@m01[0],26);
513
- vesrlg (@m01[3],@m01[2],30);
514
- vesrlg (@m01[2],@m01[2],4);
515
-
516
- vn (@m01[4],@m01[4],$mask4);
517
- vn (@m01[$_],@m01[$_],$mask) for (0..3);
518
-
519
- vaf (@m01[4],@m01[4],$padvec); # pad m0
520
-
521
- # acc = acc * r + m01
522
-
523
- vlrepf (@r5[$_],"4*$_+68($ctx)") for (0..4); # load 5*r
524
- vlrepf (@r[$_],"4*$_+48($ctx)") for (0..4); # load r
525
-
526
- vmalof (@tmp[0],@acc[4],@r5[1],@m01[0]);
527
- vmalof (@tmp[1],@acc[4],@r5[2],@m01[1]);
528
- vmalof (@tmp[2],@acc[4],@r5[3],@m01[2]);
529
- vmalof (@tmp[3],@acc[4],@r5[4],@m01[3]);
530
- vmalof (@tmp[4],@acc[4],@r[0],@m01[4]);
531
-
532
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
533
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
534
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
535
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
536
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
537
-
538
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
539
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
540
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
541
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
542
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
543
-
544
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
545
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
546
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
547
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
548
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
549
-
550
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
551
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
552
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
553
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
554
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
555
-
556
- REDUCE ();
557
-
558
-ALIGN (16);
559
-LABEL (".Lvx_done");
560
- vstef (@acc[$_],"4*$_($ctx)",1) for (0..4); # store acc
561
- vstef (@acc[$_],"24+4*$_($ctx)",3) for (0..4);
562
-
563
-if ($z) {
564
- vlm ("%v8","%v15","0($sp)");
565
- la ($sp,"$frame($sp)");
566
-} else {
567
- ld ("%f4","16*$SIZE_T+2*8($sp)");
568
- ld ("%f6","16*$SIZE_T+3*8($sp)");
569
-}
570
- br ("%r14");
571
-SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
572
-}
573
-
574
-################
575
-# static void poly1305_emit_vx(void *ctx, unsigned char mac[16],
576
-# const u32 nonce[4])
577
-{
578
-my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
579
-
580
-GLOBL ("poly1305_emit_vx");
581
-TYPE ("poly1305_emit_vx","\@function");
582
-ALIGN (16);
583
-LABEL ("poly1305_emit_vx");
584
-if ($z) {
585
- aghi ($sp,-$frame);
586
- vstm ("%v8","%v15","0($sp)");
587
-} else {
588
- std ("%f4","16*$SIZE_T+2*8($sp)");
589
- std ("%f6","16*$SIZE_T+3*8($sp)");
590
-}
591
- larl ("%r5",".Lconst");
592
-
593
- vlef (@acc[$_],"4*$_($ctx)",1) for (0..4); # load acc1
594
- vlef (@acc[$_],"24+4*$_($ctx)",3) for (0..4); # load acc2
595
- vlef (@r5[$_],"108+4*$_($ctx)",1) for (0..4); # load 5*r^2
596
- vlef (@r[$_],"88+4*$_($ctx)",1) for (0..4); # load r^2
597
- vlef (@r5[$_],"68+4*$_($ctx)",3) for (0..4); # load 5*r
598
- vlef (@r[$_],"48+4*$_($ctx)",3) for (0..4); # load r
599
- vl ($mask,"48(%r5)"); # load mask
600
-
601
- # acc = acc1 * r^2 + acc2 * r
602
-
603
- vmlof (@tmp[0],@acc[4],@r5[1]);
604
- vmlof (@tmp[1],@acc[4],@r5[2]);
605
- vmlof (@tmp[2],@acc[4],@r5[3]);
606
- vmlof (@tmp[3],@acc[4],@r5[4]);
607
- vmlof (@tmp[4],@acc[4],@r[0]);
608
-
609
- vmalof (@tmp[0],@acc[3],@r5[2],@tmp[0]);
610
- vmalof (@tmp[1],@acc[3],@r5[3],@tmp[1]);
611
- vmalof (@tmp[2],@acc[3],@r5[4],@tmp[2]);
612
- vmalof (@tmp[3],@acc[3],@r[0],@tmp[3]);
613
- vmalof (@tmp[4],@acc[3],@r[1],@tmp[4]);
614
-
615
- vmalof (@tmp[0],@acc[2],@r5[3],@tmp[0]);
616
- vmalof (@tmp[1],@acc[2],@r5[4],@tmp[1]);
617
- vmalof (@tmp[2],@acc[2],@r[0],@tmp[2]);
618
- vmalof (@tmp[3],@acc[2],@r[1],@tmp[3]);
619
- vmalof (@tmp[4],@acc[2],@r[2],@tmp[4]);
620
-
621
- vmalof (@tmp[0],@acc[1],@r5[4],@tmp[0]);
622
- vmalof (@tmp[1],@acc[1],@r[0],@tmp[1]);
623
- vmalof (@tmp[2],@acc[1],@r[1],@tmp[2]);
624
- vmalof (@tmp[3],@acc[1],@r[2],@tmp[3]);
625
- vmalof (@tmp[4],@acc[1],@r[3],@tmp[4]);
626
-
627
- vmalof (@acc[1],@acc[0],@r[1],@tmp[1]);
628
- vmalof (@acc[2],@acc[0],@r[2],@tmp[2]);
629
- vmalof (@acc[3],@acc[0],@r[3],@tmp[3]);
630
- vmalof (@acc[4],@acc[0],@r[4],@tmp[4]);
631
- vmalof (@acc[0],@acc[0],@r[0],@tmp[0]);
632
-
633
- vzero ("%v27");
634
- vsumqg (@acc[$_],@acc[$_],"%v27") for (0..4);
635
-
636
- REDUCE ();
637
-
638
- vesrlg (@tmp[1],@acc[1],26);
639
- vn (@acc[1],@acc[1],$mask);
640
- vag (@acc[2],@acc[2],@tmp[1]); # carry 1->2
641
-
642
- vesrlg (@tmp[2],@acc[2],26);
643
- vn (@acc[2],@acc[2],$mask);
644
- vag (@acc[3],@acc[3],@tmp[2]); # carry 2->3
645
-
646
- vesrlg (@tmp[3],@acc[3],26);
647
- vn (@acc[3],@acc[3],$mask);
648
- vag (@acc[4],@acc[4],@tmp[3]); # carry 3->4
649
-
650
- # acc -> base 2^64
651
- vleib ("%v30",6*8,7);
652
- vleib ("%v29",13*8,7);
653
- vleib ("%v28",3*8,7);
654
-
655
- veslg (@acc[1],@acc[1],26);
656
- veslg (@acc[3],@acc[3],26);
657
- vo (@acc[0],@acc[0],@acc[1]);
658
- vo (@acc[2],@acc[2],@acc[3]);
659
-
660
- veslg (@acc[2],@acc[2],4);
661
- vslb (@acc[2],@acc[2],"%v30"); # <<52
662
- vo (@acc[0],@acc[0],@acc[2]);
663
-
664
- vslb (@tmp[4],@acc[4],"%v29"); # <<104
665
- vo (@acc[0],@acc[0],@tmp[4]);
666
-
667
- vsrlb (@acc[1],@acc[4],"%v28"); # >>24
668
-
669
- # acc %= 2^130-5
670
- vone ("%v26");
671
- vleig ("%v27",5,1);
672
- vone ("%v29");
673
- vleig ("%v26",-4,1);
674
-
675
- vaq (@tmp[0],@acc[0],"%v27");
676
- vaccq (@tmp[1],@acc[0],"%v27");
677
-
678
- vaq (@tmp[1],@tmp[1],"%v26");
679
- vaccq (@tmp[1],@tmp[1],@acc[1]);
680
-
681
- vaq (@tmp[1],@tmp[1],"%v29");
682
-
683
- vn (@tmp[2],@tmp[1],@acc[0]);
684
- vnc (@tmp[3],@tmp[0],@tmp[1]);
685
- vo (@acc[0],@tmp[2],@tmp[3]);
686
-
687
- # acc += nonce
688
- vl (@vperm[0],"64(%r5)");
689
- vlef (@tmp[0],"4*$_($nonce)",3-$_) for (0..3);
690
-
691
- vaq (@acc[0],@acc[0],@tmp[0]);
692
-
693
- vperm (@acc[0],@acc[0],@acc[0],@vperm[0]);
694
- vst (@acc[0],"0($mac)"); # store mac
695
-
696
-if ($z) {
697
- vlm ("%v8","%v15","0($sp)");
698
- la ($sp,"$frame($sp)");
699
-} else {
700
- ld ("%f4","16*$SIZE_T+2*8($sp)");
701
- ld ("%f6","16*$SIZE_T+3*8($sp)");
702
-}
703
- br ("%r14");
704
-SIZE ("poly1305_emit_vx",".-poly1305_emit_vx");
705
-}
706
-}
707
-
708
-# NOVX CODE PATH
709
-{
710
-################
711
-# static void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
712
-# u32 padbit)
713
-{
714
-my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
715
-
716
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
717
my ($r0,$r1,$s1) = map("%r$_",(0..2));
718
+
719
GLOBL ("poly1305_blocks");
720
TYPE ("poly1305_blocks","\@function");
721
ALIGN (16);
722
LABEL ("poly1305_blocks");
723
-$z? srlg ($len,$len,4) :srl ($len,4);
724
- lghi ("%r0",0);
725
-&{$z? \&clgr:\&clr} ($len,"%r0");
726
- je (".Lno_data");
727
+LABEL (".Lpoly1305_blocks");
728
+&{$z? \&ltgr:\&ltr} ("%r0",$len);
729
+ jz (".Lno_data");
730
731
&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
732
733
- llgfr ($padbit,$padbit); # clear upper half, much needed with
734
- # non-64-bit ABI
735
- lg ($r0,"32($ctx)"); # load key
736
- lg ($r1,"40($ctx)");
737
-
738
- lg ($h0,"0($ctx)"); # load hash value
739
+ lg ($h0,"0($ctx)"); # load hash value
740
lg ($h1,"8($ctx)");
741
lg ($h2,"16($ctx)");
742
743
+LABEL (".Lpoly1305_blocks_entry");
744
+if ($z) {
745
+ srlg ($len,$len,4);
746
+} else {
747
+ srl ($len,4);
748
+}
749
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
750
+ # non-64-bit ABI
751
+ lg ($r0,"32($ctx)"); # load key
752
+ lg ($r1,"40($ctx)");
753
+
754
&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
755
srlg ($s1,$r1,2);
756
algr ($s1,$r1); # s1 = r1 + r1>>2
757
758
759
ALIGN (16);
760
LABEL (".Loop");
761
- lrvg ($d0lo,"0($inp)"); # load little-endian input
762
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
763
lrvg ($d1lo,"8($inp)");
764
la ($inp,"16($inp)");
765
766
- algr ($d0lo,$h0); # accumulate input
767
+ algr ($d0lo,$h0); # accumulate input
768
alcgr ($d1lo,$h1);
769
+ alcgr ($h2,$padbit);
770
771
lgr ($h0,$d0lo);
772
- mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
773
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
774
lgr ($h1,$d1lo);
775
- mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
776
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
777
778
- mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
779
- mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
780
- alcgr ($h2,$padbit);
781
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
782
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
783
784
algr ($d0lo,$d1lo);
785
lgr ($d1lo,$h2);
786
787
algr ($h1,$h0);
788
alcgr ($t1,$t0);
789
790
- msgr ($d1lo,$s1); # h2*s1
791
- msgr ($h2,$r0); # h2*r0
792
+ msgr ($d1lo,$s1); # h2*s1
793
+ msgr ($h2,$r0); # h2*r0
794
795
algr ($h1,$d1lo);
796
- alcgr ($t1,$d1hi); # $d1hi is zero
797
+ alcgr ($t1,$d1hi); # $d1hi is zero
798
799
algr ($h1,$d0hi);
800
alcgr ($h2,$t1);
801
802
- lghi ($h0,-4); # final reduction step
803
+ lghi ($h0,-4); # final reduction step
804
ngr ($h0,$h2);
805
srlg ($t0,$h2,2);
806
algr ($h0,$t0);
807
808
ngr ($h2,$t1);
809
810
algr ($h0,$d0lo);
811
- alcgr ($h1,$d1hi); # $d1hi is still zero
812
- alcgr ($h2,$d1hi); # $d1hi is still zero
813
+ alcgr ($h1,$d1hi); # $d1hi is still zero
814
+ alcgr ($h2,$d1hi); # $d1hi is still zero
815
816
&{$z? \&brctg:\&brct} ($len,".Loop");
817
818
&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
819
820
- stg ($h0,"0($ctx)"); # store hash value
821
+ stg ($h0,"0($ctx)"); # store hash value
822
stg ($h1,"8($ctx)");
823
stg ($h2,"16($ctx)");
824
825
826
}
827
828
################
829
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
830
+# size_t len, u32 padbit)
831
+{
832
+my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
833
+my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
834
+my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
835
+my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
836
+my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
837
+my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
838
+my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));
839
+
840
+my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));
841
+
842
+TYPE ("poly1305_blocks_vx","\@function");
843
+ALIGN (16);
844
+LABEL ("poly1305_blocks_vx");
845
+LABEL (".Lpoly1305_blocks_vx");
846
+&{$z? \&clgfi:\&clfi} ($len,128);
847
+ jhe ("__poly1305_blocks_vx");
848
+
849
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
850
+
851
+ lg ($d0,"0($ctx)");
852
+ lg ($d1,"8($ctx)");
853
+ lg ($d2,"16($ctx)");
854
+
855
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
856
+ srlg ($h0,$d0,32);
857
+ llgfr ("%r1",$d1);
858
+ srlg ($h1,$d1,32);
859
+ srlg ($h2,$d2,32);
860
+
861
+ sllg ("%r0","%r0",26);
862
+ algr ($h0,"%r0");
863
+ sllg ("%r0",$h1,52);
864
+ srlg ($h1,$h1,12);
865
+ sllg ("%r1","%r1",14);
866
+ algr ($h0,"%r0");
867
+ alcgr ($h1,"%r1");
868
+ sllg ("%r0",$h2,40);
869
+ srlg ($h2,$h2,24);
870
+ lghi ("%r1",0);
871
+ algr ($h1,"%r0");
872
+ alcgr ($h2,"%r1");
873
+
874
+ llgf ("%r0","24($ctx)"); # is_base2_26
875
+ lcgr ("%r0","%r0");
876
+
877
+ xgr ($h0,$d0); # choose between radixes
878
+ xgr ($h1,$d1);
879
+ xgr ($h2,$d2);
880
+ ngr ($h0,"%r0");
881
+ ngr ($h1,"%r0");
882
+ ngr ($h2,"%r0");
883
+ xgr ($h0,$d0);
884
+ xgr ($h1,$d1);
885
+ xgr ($h2,$d2);
886
+
887
+ lhi ("%r0",0);
888
+ st ("%r0","24($ctx)"); # clear is_base2_26
889
+
890
+ j (".Lpoly1305_blocks_entry");
891
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
892
+
893
+TYPE ("__poly1305_mul","\@function");
894
+ALIGN (16);
895
+LABEL ("__poly1305_mul");
896
+ vmlof ($ACC0,$H0,$R0);
897
+ vmlof ($ACC1,$H0,$R1);
898
+ vmlof ($ACC2,$H0,$R2);
899
+ vmlof ($ACC3,$H0,$R3);
900
+ vmlof ($ACC4,$H0,$R4);
901
+
902
+ vmalof ($ACC0,$H1,$S4,$ACC0);
903
+ vmalof ($ACC1,$H1,$R0,$ACC1);
904
+ vmalof ($ACC2,$H1,$R1,$ACC2);
905
+ vmalof ($ACC3,$H1,$R2,$ACC3);
906
+ vmalof ($ACC4,$H1,$R3,$ACC4);
907
+
908
+ vmalof ($ACC0,$H2,$S3,$ACC0);
909
+ vmalof ($ACC1,$H2,$S4,$ACC1);
910
+ vmalof ($ACC2,$H2,$R0,$ACC2);
911
+ vmalof ($ACC3,$H2,$R1,$ACC3);
912
+ vmalof ($ACC4,$H2,$R2,$ACC4);
913
+
914
+ vmalof ($ACC0,$H3,$S2,$ACC0);
915
+ vmalof ($ACC1,$H3,$S3,$ACC1);
916
+ vmalof ($ACC2,$H3,$S4,$ACC2);
917
+ vmalof ($ACC3,$H3,$R0,$ACC3);
918
+ vmalof ($ACC4,$H3,$R1,$ACC4);
919
+
920
+ vmalof ($ACC0,$H4,$S1,$ACC0);
921
+ vmalof ($ACC1,$H4,$S2,$ACC1);
922
+ vmalof ($ACC2,$H4,$S3,$ACC2);
923
+ vmalof ($ACC3,$H4,$S4,$ACC3);
924
+ vmalof ($ACC4,$H4,$R0,$ACC4);
925
+
926
+ ################################################################
927
+ # lazy reduction
928
+
929
+ vesrlg ($H4,$ACC3,26);
930
+ vesrlg ($H1,$ACC0,26);
931
+ vn ($H3,$ACC3,$mask26);
932
+ vn ($H0,$ACC0,$mask26);
933
+ vag ($H4,$H4,$ACC4); # h3 -> h4
934
+ vag ($H1,$H1,$ACC1); # h0 -> h1
935
+
936
+ vesrlg ($ACC4,$H4,26);
937
+ vesrlg ($ACC1,$H1,26);
938
+ vn ($H4,$H4,$mask26);
939
+ vn ($H1,$H1,$mask26);
940
+ vag ($H0,$H0,$ACC4);
941
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
942
+
943
+ veslg ($ACC4,$ACC4,2); # <<2
944
+ vesrlg ($ACC2,$H2,26);
945
+ vn ($H2,$H2,$mask26);
946
+ vag ($H0,$H0,$ACC4); # h4 -> h0
947
+ vag ($H3,$H3,$ACC2); # h2 -> h3
948
+
949
+ vesrlg ($ACC0,$H0,26);
950
+ vesrlg ($ACC3,$H3,26);
951
+ vn ($H0,$H0,$mask26);
952
+ vn ($H3,$H3,$mask26);
953
+ vag ($H1,$H1,$ACC0); # h0 -> h1
954
+ vag ($H4,$H4,$ACC3); # h3 -> h4
955
+ br ("%r14");
956
+SIZE ("__poly1305_mul",".-__poly1305_mul");
957
+
958
+TYPE ("__poly1305_blocks_vx","\@function");
959
+ALIGN (16);
960
+LABEL ("__poly1305_blocks_vx");
961
+&{$z? \&lgr:\&lr} ("%r0",$sp);
962
+&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)");
963
+if (!$z) {
964
+ std ("%f4","16*$SIZE_T+2*8($sp)");
965
+ std ("%f6","16*$SIZE_T+3*8($sp)");
966
+ ahi ($sp,-$stdframe);
967
+ st ("%r0","0($sp)"); # back-chain
968
+
969
+ llgfr ($len,$len); # so that srlg works on $len
970
+} else {
971
+ aghi ($sp,"-($stdframe+8*8)");
972
+ stg ("%r0","0($sp)"); # back-chain
973
+
974
+ std ("%f8","$stdframe+0*8($sp)");
975
+ std ("%f9","$stdframe+1*8($sp)");
976
+ std ("%f10","$stdframe+2*8($sp)");
977
+ std ("%f11","$stdframe+3*8($sp)");
978
+ std ("%f12","$stdframe+4*8($sp)");
979
+ std ("%f13","$stdframe+5*8($sp)");
980
+ std ("%f14","$stdframe+6*8($sp)");
981
+ std ("%f15","$stdframe+7*8($sp)");
982
+}
983
+ larl ("%r1",".Lconst");
984
+ vgmg ($mask26,38,63);
985
+ vlm ($bswaplo,$bswapmi,"16(%r1)");
986
+
987
+ &lt ("%r0","24($ctx)"); # is_base2_26?
988
+ jnz (".Lskip_init");
989
+
990
+ lg ($h0,"32($ctx)"); # load key base 2^64
991
+ lg ($h1,"40($ctx)");
992
+
993
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
994
+ srlg ($d1,$h0,52);
995
+ risbg ($h0,$h0,38,0x80+63,0);
996
+ vlvgg ($R0,$h0,0);
997
+ risbg ($d1,$h1,38,51,12);
998
+ vlvgg ($R1,$d0,0);
999
+ risbg ($d0,$h1,38,63,50);
1000
+ vlvgg ($R2,$d1,0);
1001
+ srlg ($d1,$h1,40);
1002
+ vlvgg ($R3,$d0,0);
1003
+ vlvgg ($R4,$d1,0);
1004
+
1005
+ veslg ($S1,$R1,2);
1006
+ veslg ($S2,$R2,2);
1007
+ veslg ($S3,$R3,2);
1008
+ veslg ($S4,$R4,2);
1009
+ vlr ($H0,$R0);
1010
+ vlr ($H1,$R1);
1011
+ vlr ($H2,$R2);
1012
+ vlr ($H3,$R3);
1013
+ vlr ($H4,$R4);
1014
+ vag ($S1,$S1,$R1); # * 5
1015
+ vag ($S2,$S2,$R2);
1016
+ vag ($S3,$S3,$R3);
1017
+ vag ($S4,$S4,$R4);
1018
+
1019
+ brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:-
1020
+
1021
+ vpdi ($R0,$H0,$R0,0); # r^2:r^1
1022
+ vpdi ($R1,$H1,$R1,0);
1023
+ vpdi ($R2,$H2,$R2,0);
1024
+ vpdi ($R3,$H3,$R3,0);
1025
+ vpdi ($R4,$H4,$R4,0);
1026
+ vpdi ($H0,$H0,$H0,0); # r^2:r^2
1027
+ vpdi ($H1,$H1,$H1,0);
1028
+ vpdi ($H2,$H2,$H2,0);
1029
+ vpdi ($H3,$H3,$H3,0);
1030
+ vpdi ($H4,$H4,$H4,0);
1031
+ veslg ($S1,$R1,2);
1032
+ veslg ($S2,$R2,2);
1033
+ veslg ($S3,$R3,2);
1034
+ veslg ($S4,$R4,2);
1035
+ vag ($S1,$S1,$R1); # * 5
1036
+ vag ($S2,$S2,$R2);
1037
+ vag ($S3,$S3,$R3);
1038
+ vag ($S4,$S4,$R4);
1039
+
1040
+ brasl ("%r14,__poly1305_mul"); # r^2:r^2 * r^2:r^1
1041
+
1042
+ vl ($I0,"0(%r1)"); # borrow $I0
1043
+ vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3
1044
+ vperm ($R1,$R1,$H1,$I0);
1045
+ vperm ($R2,$R2,$H2,$I0);
1046
+ vperm ($R3,$R3,$H3,$I0);
1047
+ vperm ($R4,$R4,$H4,$I0);
1048
+ veslf ($S1,$R1,2);
1049
+ veslf ($S2,$R2,2);
1050
+ veslf ($S3,$R3,2);
1051
+ veslf ($S4,$R4,2);
1052
+ vaf ($S1,$S1,$R1); # * 5
1053
+ vaf ($S2,$S2,$R2);
1054
+ vaf ($S3,$S3,$R3);
1055
+ vaf ($S4,$S4,$R4);
1056
+
1057
+ lg ($h0,"0($ctx)"); # load hash base 2^64
1058
+ lg ($h1,"8($ctx)");
1059
+ lg ($h2,"16($ctx)");
1060
+
1061
+ vzero ($H0);
1062
+ vzero ($H1);
1063
+ vzero ($H2);
1064
+ vzero ($H3);
1065
+ vzero ($H4);
1066
+
1067
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
1068
+ srlg ($d1,$h0,52);
1069
+ risbg ($h0,$h0,38,0x80+63,0);
1070
+ vlvgg ($H0,$h0,0);
1071
+ risbg ($d1,$h1,38,51,12);
1072
+ vlvgg ($H1,$d0,0);
1073
+ risbg ($d0,$h1,38,63,50);
1074
+ vlvgg ($H2,$d1,0);
1075
+ srlg ($d1,$h1,40);
1076
+ vlvgg ($H3,$d0,0);
1077
+ risbg ($d1,$h2,37,39,24);
1078
+ vlvgg ($H4,$d1,0);
1079
+
1080
+ lhi ("%r0",1);
1081
+ st ("%r0","24($ctx)"); # set is_base2_26
1082
+
1083
+ vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26
1084
+
1085
+ vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4
1086
+ vpdi ($R1,$R1,$R1,0);
1087
+ vpdi ($S1,$S1,$S1,0);
1088
+ vpdi ($R2,$R2,$R2,0);
1089
+ vpdi ($S2,$S2,$S2,0);
1090
+ vpdi ($R3,$R3,$R3,0);
1091
+ vpdi ($S3,$S3,$S3,0);
1092
+ vpdi ($R4,$R4,$R4,0);
1093
+ vpdi ($S4,$S4,$S4,0);
1094
+
1095
+ j (".Loaded_hash");
1096
+
1097
+ALIGN (16);
1098
+LABEL (".Lskip_init");
1099
+ vllezf ($H0,"0($ctx)"); # load hash base 2^26
1100
+ vllezf ($H1,"4($ctx)");
1101
+ vllezf ($H2,"8($ctx)");
1102
+ vllezf ($H3,"12($ctx)");
1103
+ vllezf ($H4,"16($ctx)");
1104
+
1105
+ vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4
1106
+ vlrepg ($R1,"0x40($ctx)");
1107
+ vlrepg ($S1,"0x50($ctx)");
1108
+ vlrepg ($R2,"0x60($ctx)");
1109
+ vlrepg ($S2,"0x70($ctx)");
1110
+ vlrepg ($R3,"0x80($ctx)");
1111
+ vlrepg ($S3,"0x90($ctx)");
1112
+ vlrepg ($R4,"0xa0($ctx)");
1113
+ vlrepg ($S4,"0xb0($ctx)");
1114
+
1115
+LABEL (".Loaded_hash");
1116
+ vzero ($I1);
1117
+ vzero ($I3);
1118
+
1119
+ vlm ($T1,$T4,"0x00($inp)"); # load first input block
1120
+ la ($inp,"0x40($inp)");
1121
+ vgmg ($mask26,6,31);
1122
+ vgmf ($I4,5,5); # padbit<<2
1123
+
1124
+ vperm ($I0,$T3,$T4,$bswaplo);
1125
+ vperm ($I2,$T3,$T4,$bswapmi);
1126
+ vperm ($T3,$T3,$T4,$bswaphi);
1127
+
1128
+ verimg ($I1,$I0,$mask26,6); # >>26
1129
+ veslg ($I0,$I0,32);
1130
+ veslg ($I2,$I2,28); # >>4
1131
+ verimg ($I3,$T3,$mask26,18); # >>14
1132
+ verimg ($I4,$T3,$mask26,58); # >>38
1133
+ vn ($I0,$I0,$mask26);
1134
+ vn ($I2,$I2,$mask26);
1135
+ vesrlf ($I4,$I4,2); # >>2
1136
+
1137
+ vgmg ($mask26,38,63);
1138
+ vperm ($T3,$T1,$T2,$bswaplo);
1139
+ vperm ($T4,$T1,$T2,$bswaphi);
1140
+ vperm ($T2,$T1,$T2,$bswapmi);
1141
+
1142
+ verimg ($I0,$T3,$mask26,0);
1143
+ verimg ($I1,$T3,$mask26,38); # >>26
1144
+ verimg ($I2,$T2,$mask26,60); # >>4
1145
+ verimg ($I3,$T4,$mask26,50); # >>14
1146
+ vesrlg ($T4,$T4,40);
1147
+ vo ($I4,$I4,$T4);
1148
+
1149
+ srlg ("%r0",$len,6);
1150
+&{$z? \&aghi:\&ahi} ("%r0",-1);
1151
+
1152
+ALIGN (16);
1153
+LABEL (".Loop_vx");
1154
+ vmlef ($ACC0,$I0,$R0);
1155
+ vmlef ($ACC1,$I0,$R1);
1156
+ vmlef ($ACC2,$I0,$R2);
1157
+ vmlef ($ACC3,$I0,$R3);
1158
+ vmlef ($ACC4,$I0,$R4);
1159
+
1160
+ vmalef ($ACC0,$I1,$S4,$ACC0);
1161
+ vmalef ($ACC1,$I1,$R0,$ACC1);
1162
+ vmalef ($ACC2,$I1,$R1,$ACC2);
1163
+ vmalef ($ACC3,$I1,$R2,$ACC3);
1164
+ vmalef ($ACC4,$I1,$R3,$ACC4);
1165
+
1166
+ vaf ($H2,$H2,$I2);
1167
+ vaf ($H0,$H0,$I0);
1168
+ vaf ($H3,$H3,$I3);
1169
+ vaf ($H1,$H1,$I1);
1170
+ vaf ($H4,$H4,$I4);
1171
+
1172
+ vmalef ($ACC0,$I2,$S3,$ACC0);
1173
+ vmalef ($ACC1,$I2,$S4,$ACC1);
1174
+ vmalef ($ACC2,$I2,$R0,$ACC2);
1175
+ vmalef ($ACC3,$I2,$R1,$ACC3);
1176
+ vmalef ($ACC4,$I2,$R2,$ACC4);
1177
+
1178
+ vlm ($T1,$T4,"0x00($inp)"); # load next input block
1179
+ la ($inp,"0x40($inp)");
1180
+ vgmg ($mask26,6,31);
1181
+
1182
+ vmalef ($ACC0,$I3,$S2,$ACC0);
1183
+ vmalef ($ACC1,$I3,$S3,$ACC1);
1184
+ vmalef ($ACC2,$I3,$S4,$ACC2);
1185
+ vmalef ($ACC3,$I3,$R0,$ACC3);
1186
+ vmalef ($ACC4,$I3,$R1,$ACC4);
1187
+
1188
+ vperm ($I0,$T3,$T4,$bswaplo);
1189
+ vperm ($I2,$T3,$T4,$bswapmi);
1190
+ vperm ($T3,$T3,$T4,$bswaphi);
1191
+
1192
+ vmalef ($ACC0,$I4,$S1,$ACC0);
1193
+ vmalef ($ACC1,$I4,$S2,$ACC1);
1194
+ vmalef ($ACC2,$I4,$S3,$ACC2);
1195
+ vmalef ($ACC3,$I4,$S4,$ACC3);
1196
+ vmalef ($ACC4,$I4,$R0,$ACC4);
1197
+
1198
+ verimg ($I1,$I0,$mask26,6); # >>26
1199
+ veslg ($I0,$I0,32);
1200
+ veslg ($I2,$I2,28); # >>4
1201
+ verimg ($I3,$T3,$mask26,18); # >>14
1202
+
1203
+ vmalof ($ACC0,$H0,$R0,$ACC0);
1204
+ vmalof ($ACC1,$H0,$R1,$ACC1);
1205
+ vmalof ($ACC2,$H0,$R2,$ACC2);
1206
+ vmalof ($ACC3,$H0,$R3,$ACC3);
1207
+ vmalof ($ACC4,$H0,$R4,$ACC4);
1208
+
1209
+ vgmf ($I4,5,5); # padbit<<2
1210
+ verimg ($I4,$T3,$mask26,58); # >>38
1211
+ vn ($I0,$I0,$mask26);
1212
+ vn ($I2,$I2,$mask26);
1213
+ vesrlf ($I4,$I4,2); # >>2
1214
+
1215
+ vmalof ($ACC0,$H1,$S4,$ACC0);
1216
+ vmalof ($ACC1,$H1,$R0,$ACC1);
1217
+ vmalof ($ACC2,$H1,$R1,$ACC2);
1218
+ vmalof ($ACC3,$H1,$R2,$ACC3);
1219
+ vmalof ($ACC4,$H1,$R3,$ACC4);
1220
+
1221
+ vgmg ($mask26,38,63);
1222
+ vperm ($T3,$T1,$T2,$bswaplo);
1223
+ vperm ($T4,$T1,$T2,$bswaphi);
1224
+ vperm ($T2,$T1,$T2,$bswapmi);
1225
+
1226
+ vmalof ($ACC0,$H2,$S3,$ACC0);
1227
+ vmalof ($ACC1,$H2,$S4,$ACC1);
1228
+ vmalof ($ACC2,$H2,$R0,$ACC2);
1229
+ vmalof ($ACC3,$H2,$R1,$ACC3);
1230
+ vmalof ($ACC4,$H2,$R2,$ACC4);
1231
+
1232
+ verimg ($I0,$T3,$mask26,0);
1233
+ verimg ($I1,$T3,$mask26,38); # >>26
1234
+ verimg ($I2,$T2,$mask26,60); # >>4
1235
+
1236
+ vmalof ($ACC0,$H3,$S2,$ACC0);
1237
+ vmalof ($ACC1,$H3,$S3,$ACC1);
1238
+ vmalof ($ACC2,$H3,$S4,$ACC2);
1239
+ vmalof ($ACC3,$H3,$R0,$ACC3);
1240
+ vmalof ($ACC4,$H3,$R1,$ACC4);
1241
+
1242
+ verimg ($I3,$T4,$mask26,50); # >>14
1243
+ vesrlg ($T4,$T4,40);
1244
+ vo ($I4,$I4,$T4);
1245
+
1246
+ vmalof ($ACC0,$H4,$S1,$ACC0);
1247
+ vmalof ($ACC1,$H4,$S2,$ACC1);
1248
+ vmalof ($ACC2,$H4,$S3,$ACC2);
1249
+ vmalof ($ACC3,$H4,$S4,$ACC3);
1250
+ vmalof ($ACC4,$H4,$R0,$ACC4);
1251
+
1252
+ ################################################################
1253
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1254
+ # and P. Schwabe
1255
+
1256
+ vesrlg ($H4,$ACC3,26);
1257
+ vesrlg ($H1,$ACC0,26);
1258
+ vn ($H3,$ACC3,$mask26);
1259
+ vn ($H0,$ACC0,$mask26);
1260
+ vag ($H4,$H4,$ACC4); # h3 -> h4
1261
+ vag ($H1,$H1,$ACC1); # h0 -> h1
1262
+
1263
+ vesrlg ($ACC4,$H4,26);
1264
+ vesrlg ($ACC1,$H1,26);
1265
+ vn ($H4,$H4,$mask26);
1266
+ vn ($H1,$H1,$mask26);
1267
+ vag ($H0,$H0,$ACC4);
1268
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
1269
+
1270
+ veslg ($ACC4,$ACC4,2); # <<2
1271
+ vesrlg ($ACC2,$H2,26);
1272
+ vn ($H2,$H2,$mask26);
1273
+ vag ($H0,$H0,$ACC4); # h4 -> h0
1274
+ vag ($H3,$H3,$ACC2); # h2 -> h3
1275
+
1276
+ vesrlg ($ACC0,$H0,26);
1277
+ vesrlg ($ACC3,$H3,26);
1278
+ vn ($H0,$H0,$mask26);
1279
+ vn ($H3,$H3,$mask26);
1280
+ vag ($H1,$H1,$ACC0); # h0 -> h1
1281
+ vag ($H4,$H4,$ACC3); # h3 -> h4
1282
+
1283
+&{$z? \&brctg:\&brct} ("%r0",".Loop_vx");
1284
+
1285
+ vlm ($R0,$S4,"48($ctx)"); # load all powers
1286
+
1287
+ lghi ("%r0",0x30);
1288
+&{$z? \&lcgr:\&lcr} ($len,$len);
1289
+&{$z? \&ngr:\&nr} ($len,"%r0");
1290
+&{$z? \&slgr:\&slr} ($inp,$len);
1291
+
1292
+LABEL (".Last");
1293
+ vmlef ($ACC0,$I0,$R0);
1294
+ vmlef ($ACC1,$I0,$R1);
1295
+ vmlef ($ACC2,$I0,$R2);
1296
+ vmlef ($ACC3,$I0,$R3);
1297
+ vmlef ($ACC4,$I0,$R4);
1298
+
1299
+ vmalef ($ACC0,$I1,$S4,$ACC0);
1300
+ vmalef ($ACC1,$I1,$R0,$ACC1);
1301
+ vmalef ($ACC2,$I1,$R1,$ACC2);
1302
+ vmalef ($ACC3,$I1,$R2,$ACC3);
1303
+ vmalef ($ACC4,$I1,$R3,$ACC4);
1304
+
1305
+ vaf ($H0,$H0,$I0);
1306
+ vaf ($H1,$H1,$I1);
1307
+ vaf ($H2,$H2,$I2);
1308
+ vaf ($H3,$H3,$I3);
1309
+ vaf ($H4,$H4,$I4);
1310
+
1311
+ vmalef ($ACC0,$I2,$S3,$ACC0);
1312
+ vmalef ($ACC1,$I2,$S4,$ACC1);
1313
+ vmalef ($ACC2,$I2,$R0,$ACC2);
1314
+ vmalef ($ACC3,$I2,$R1,$ACC3);
1315
+ vmalef ($ACC4,$I2,$R2,$ACC4);
1316
+
1317
+ vmalef ($ACC0,$I3,$S2,$ACC0);
1318
+ vmalef ($ACC1,$I3,$S3,$ACC1);
1319
+ vmalef ($ACC2,$I3,$S4,$ACC2);
1320
+ vmalef ($ACC3,$I3,$R0,$ACC3);
1321
+ vmalef ($ACC4,$I3,$R1,$ACC4);
1322
+
1323
+ vmalef ($ACC0,$I4,$S1,$ACC0);
1324
+ vmalef ($ACC1,$I4,$S2,$ACC1);
1325
+ vmalef ($ACC2,$I4,$S3,$ACC2);
1326
+ vmalef ($ACC3,$I4,$S4,$ACC3);
1327
+ vmalef ($ACC4,$I4,$R0,$ACC4);
1328
+
1329
+ vmalof ($ACC0,$H0,$R0,$ACC0);
1330
+ vmalof ($ACC1,$H0,$R1,$ACC1);
1331
+ vmalof ($ACC2,$H0,$R2,$ACC2);
1332
+ vmalof ($ACC3,$H0,$R3,$ACC3);
1333
+ vmalof ($ACC4,$H0,$R4,$ACC4);
1334
+
1335
+ vmalof ($ACC0,$H1,$S4,$ACC0);
1336
+ vmalof ($ACC1,$H1,$R0,$ACC1);
1337
+ vmalof ($ACC2,$H1,$R1,$ACC2);
1338
+ vmalof ($ACC3,$H1,$R2,$ACC3);
1339
+ vmalof ($ACC4,$H1,$R3,$ACC4);
1340
+
1341
+ vmalof ($ACC0,$H2,$S3,$ACC0);
1342
+ vmalof ($ACC1,$H2,$S4,$ACC1);
1343
+ vmalof ($ACC2,$H2,$R0,$ACC2);
1344
+ vmalof ($ACC3,$H2,$R1,$ACC3);
1345
+ vmalof ($ACC4,$H2,$R2,$ACC4);
1346
+
1347
+ vmalof ($ACC0,$H3,$S2,$ACC0);
1348
+ vmalof ($ACC1,$H3,$S3,$ACC1);
1349
+ vmalof ($ACC2,$H3,$S4,$ACC2);
1350
+ vmalof ($ACC3,$H3,$R0,$ACC3);
1351
+ vmalof ($ACC4,$H3,$R1,$ACC4);
1352
+
1353
+ vmalof ($ACC0,$H4,$S1,$ACC0);
1354
+ vmalof ($ACC1,$H4,$S2,$ACC1);
1355
+ vmalof ($ACC2,$H4,$S3,$ACC2);
1356
+ vmalof ($ACC3,$H4,$S4,$ACC3);
1357
+ vmalof ($ACC4,$H4,$R0,$ACC4);
1358
+
1359
+ ################################################################
1360
+ # horizontal addition
1361
+
1362
+ vzero ($H0);
1363
+ vsumqg ($ACC0,$ACC0,$H0);
1364
+ vsumqg ($ACC1,$ACC1,$H0);
1365
+ vsumqg ($ACC2,$ACC2,$H0);
1366
+ vsumqg ($ACC3,$ACC3,$H0);
1367
+ vsumqg ($ACC4,$ACC4,$H0);
1368
+
1369
+ ################################################################
1370
+ # lazy reduction
1371
+
1372
+ vesrlg ($H4,$ACC3,26);
1373
+ vesrlg ($H1,$ACC0,26);
1374
+ vn ($H3,$ACC3,$mask26);
1375
+ vn ($H0,$ACC0,$mask26);
1376
+ vag ($H4,$H4,$ACC4); # h3 -> h4
1377
+ vag ($H1,$H1,$ACC1); # h0 -> h1
1378
+
1379
+ vesrlg ($ACC4,$H4,26);
1380
+ vesrlg ($ACC1,$H1,26);
1381
+ vn ($H4,$H4,$mask26);
1382
+ vn ($H1,$H1,$mask26);
1383
+ vag ($H0,$H0,$ACC4);
1384
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
1385
+
1386
+ veslg ($ACC4,$ACC4,2); # <<2
1387
+ vesrlg ($ACC2,$H2,26);
1388
+ vn ($H2,$H2,$mask26);
1389
+ vag ($H0,$H0,$ACC4); # h4 -> h0
1390
+ vag ($H3,$H3,$ACC2); # h2 -> h3
1391
+
1392
+ vesrlg ($ACC0,$H0,26);
1393
+ vesrlg ($ACC3,$H3,26);
1394
+ vn ($H0,$H0,$mask26);
1395
+ vn ($H3,$H3,$mask26);
1396
+ vag ($H1,$H1,$ACC0); # h0 -> h1
1397
+ vag ($H4,$H4,$ACC3); # h3 -> h4
1398
+
1399
+&{$z? \&clgfi:\&clfi} ($len,0);
1400
+ je (".Ldone");
1401
+
1402
+ vlm ($T1,$T4,"0x00($inp)"); # load last partial block
1403
+ vgmg ($mask26,6,31);
1404
+ vgmf ($I4,5,5); # padbit<<2
1405
+
1406
+ vperm ($I0,$T3,$T4,$bswaplo);
1407
+ vperm ($I2,$T3,$T4,$bswapmi);
1408
+ vperm ($T3,$T3,$T4,$bswaphi);
1409
+
1410
+ vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1
1411
+ vl ($ACC1,"0x60($len,%r1)");
1412
+
1413
+ verimg ($I1,$I0,$mask26,6); # >>26
1414
+ veslg ($I0,$I0,32);
1415
+ veslg ($I2,$I2,28); # >>4
1416
+ verimg ($I3,$T3,$mask26,18); # >>14
1417
+ verimg ($I4,$T3,$mask26,58); # >>38
1418
+ vn ($I0,$I0,$mask26);
1419
+ vn ($I2,$I2,$mask26);
1420
+ vesrlf ($I4,$I4,2); # >>2
1421
+
1422
+ vgmg ($mask26,38,63);
1423
+ vperm ($T3,$T1,$T2,$bswaplo);
1424
+ vperm ($T4,$T1,$T2,$bswaphi);
1425
+ vperm ($T2,$T1,$T2,$bswapmi);
1426
+
1427
+ verimg ($I0,$T3,$mask26,0);
1428
+ verimg ($I1,$T3,$mask26,38); # >>26
1429
+ verimg ($I2,$T2,$mask26,60); # >>4
1430
+ verimg ($I3,$T4,$mask26,50); # >>14
1431
+ vesrlg ($T4,$T4,40);
1432
+ vo ($I4,$I4,$T4);
1433
+
1434
+ vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane
1435
+ vn ($I0,$I0,$ACC1); # mask redundant lane[s]
1436
+ vperm ($H1,$H1,$H1,$ACC0);
1437
+ vn ($I1,$I1,$ACC1);
1438
+ vperm ($H2,$H2,$H2,$ACC0);
1439
+ vn ($I2,$I2,$ACC1);
1440
+ vperm ($H3,$H3,$H3,$ACC0);
1441
+ vn ($I3,$I3,$ACC1);
1442
+ vperm ($H4,$H4,$H4,$ACC0);
1443
+ vn ($I4,$I4,$ACC1);
1444
+
1445
+ vaf ($I0,$I0,$H0); # accumulate hash
1446
+ vzero ($H0); # wipe hash value
1447
+ vaf ($I1,$I1,$H1);
1448
+ vzero ($H1);
1449
+ vaf ($I2,$I2,$H2);
1450
+ vzero ($H2);
1451
+ vaf ($I3,$I3,$H3);
1452
+ vzero ($H3);
1453
+ vaf ($I4,$I4,$H4);
1454
+ vzero ($H4);
1455
+
1456
+&{$z? \&lghi:\&lhi} ($len,0);
1457
+ j (".Last");
1458
+ # I don't bother to tell apart cases when only one multiplication
1459
+ # pass is sufficient, because I argue that mispredicted branch
1460
+ # penalties are comparable to overhead of sometimes redundant
1461
+ # multiplication pass...
1462
+
1463
+LABEL (".Ldone");
1464
+ vstef ($H0,"0($ctx)",3); # store hash base 2^26
1465
+ vstef ($H1,"4($ctx)",3);
1466
+ vstef ($H2,"8($ctx)",3);
1467
+ vstef ($H3,"12($ctx)",3);
1468
+ vstef ($H4,"16($ctx)",3);
1469
+
1470
+if ($z) {
1471
+ ld ("%f8","$stdframe+0*8($sp)");
1472
+ ld ("%f9","$stdframe+1*8($sp)");
1473
+ ld ("%f10","$stdframe+2*8($sp)");
1474
+ ld ("%f11","$stdframe+3*8($sp)");
1475
+ ld ("%f12","$stdframe+4*8($sp)");
1476
+ ld ("%f13","$stdframe+5*8($sp)");
1477
+ ld ("%f14","$stdframe+6*8($sp)");
1478
+ ld ("%f15","$stdframe+7*8($sp)");
1479
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
1480
+} else {
1481
+ ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
1482
+ ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
1483
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
1484
+}
1485
+ br ("%r14");
1486
+SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
1487
+}
1488
+
1489
+################
1490
# static void poly1305_emit(void *ctx, unsigned char mac[16],
1491
# const u32 nonce[4])
1492
{
1493
-my ($ctx,$mac,$nonce) = map("%r$_",(2..4));
1494
-my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
1495
+my ($mac,$nonce)=($inp,$len);
1496
+my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));
1497
1498
GLOBL ("poly1305_emit");
1499
TYPE ("poly1305_emit","\@function");
1500
ALIGN (16);
1501
LABEL ("poly1305_emit");
1502
-&{$z? \&stmg:\&stm} ("%r6","%r9","6*$SIZE_T($sp)");
1503
+LABEL (".Lpoly1305_emit");
1504
+&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)");
1505
1506
- lg ($h0,"0($ctx)");
1507
- lg ($h1,"8($ctx)");
1508
- lg ($h2,"16($ctx)");
1509
+ lg ($d0,"0($ctx)");
1510
+ lg ($d1,"8($ctx)");
1511
+ lg ($d2,"16($ctx)");
1512
+
1513
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
1514
+ srlg ($h0,$d0,32);
1515
+ llgfr ("%r1",$d1);
1516
+ srlg ($h1,$d1,32);
1517
+ srlg ($h2,$d2,32);
1518
+
1519
+ sllg ("%r0","%r0",26);
1520
+ algr ($h0,"%r0");
1521
+ sllg ("%r0",$h1,52);
1522
+ srlg ($h1,$h1,12);
1523
+ sllg ("%r1","%r1",14);
1524
+ algr ($h0,"%r0");
1525
+ alcgr ($h1,"%r1");
1526
+ sllg ("%r0",$h2,40);
1527
+ srlg ($h2,$h2,24);
1528
+ lghi ("%r1",0);
1529
+ algr ($h1,"%r0");
1530
+ alcgr ($h2,"%r1");
1531
+
1532
+ llgf ("%r0","24($ctx)"); # is_base2_26
1533
+ lcgr ("%r0","%r0");
1534
+
1535
+ xgr ($h0,$d0); # choose between radixes
1536
+ xgr ($h1,$d1);
1537
+ xgr ($h2,$d2);
1538
+ ngr ($h0,"%r0");
1539
+ ngr ($h1,"%r0");
1540
+ ngr ($h2,"%r0");
1541
+ xgr ($h0,$d0);
1542
+ xgr ($h1,$d1);
1543
+ xgr ($h2,$d2);
1544
1545
lghi ("%r0",5);
1546
- lghi ("%r1",0);
1547
lgr ($d0,$h0);
1548
lgr ($d1,$h1);
1549
1550
- algr ($h0,"%r0"); # compare to modulus
1551
+ algr ($h0,"%r0"); # compare to modulus
1552
alcgr ($h1,"%r1");
1553
alcgr ($h2,"%r1");
1554
1555
- srlg ($h2,$h2,2); # did it borrow/carry?
1556
- slgr ("%r1",$h2); # 0-$h2>>2
1557
- lg ($h2,"0($nonce)"); # load nonce
1558
- lghi ("%r0",-1);
1559
+ srlg ($h2,$h2,2); # did it borrow/carry?
1560
+ slgr ("%r1",$h2); # 0-$h2>>2
1561
+ lg ($d2,"0($nonce)"); # load nonce
1562
lg ($ctx,"8($nonce)");
1563
- xgr ("%r0","%r1"); # ~%r1
1564
1565
+ xgr ($h0,$d0);
1566
+ xgr ($h1,$d1);
1567
ngr ($h0,"%r1");
1568
- ngr ($d0,"%r0");
1569
ngr ($h1,"%r1");
1570
- ngr ($d1,"%r0");
1571
- ogr ($h0,$d0);
1572
- rllg ($d0,$h2,32); # flip nonce words
1573
- ogr ($h1,$d1);
1574
+ xgr ($h0,$d0);
1575
+ rllg ($d0,$d2,32); # flip nonce words
1576
+ xgr ($h1,$d1);
1577
rllg ($d1,$ctx,32);
1578
1579
- algr ($h0,$d0); # accumulate nonce
1580
+ algr ($h0,$d0); # accumulate nonce
1581
alcgr ($h1,$d1);
1582
1583
- strvg ($h0,"0($mac)"); # write little-endian result
1584
+ strvg ($h0,"0($mac)"); # write little-endian result
1585
strvg ($h1,"8($mac)");
1586
1587
-&{$z? \&lmg:\&lm} ("%r6","%r9","6*$SIZE_T($sp)");
1588
+&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)");
1589
br ("%r14");
1590
SIZE ("poly1305_emit",".-poly1305_emit");
1591
}
1592
-}
1593
+
1594
################
1595
1596
-ALIGN (128);
1597
+ALIGN (16);
1598
LABEL (".Lconst");
1599
-LONG (0x00060504,0x03020100,0x00161514,0x13121110); # vperm op[m[1],m[0]]
1600
-LONG (0x000c0b0a,0x09080706,0x001c1b1a,0x19181716); # vperm op[m[3],m[2]]
1601
-LONG (0x00000000,0x000f0e0d,0x00000000,0x001f1e1d); # vperm op[ - ,m[4]]
1602
-LONG (0x00000000,0x03ffffff,0x00000000,0x03ffffff); # [0,2^26-1,0,2^26-1]
1603
-LONG (0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); # vperm op endian
1604
+LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd
1605
+LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks
1606
+LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
1607
+LONG (0x00000000,0x09080706,0x00000000,0x19181716);
1608
+
1609
+LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks
1610
+LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
1611
+LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
1612
+
1613
+LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff);
1614
+LONG (0xffffffff,0x00000000,0xffffffff,0x00000000);
1615
+LONG (0x00000000,0x00000000,0xffffffff,0x00000000);
1616
+
1617
STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
1618
1619
PERLASM_END();
Index: openssl-1.1.1c/crypto/poly1305/build.info
===================================================================
--- openssl-1.1.1c.orig/crypto/poly1305/build.info 2019-06-06 12:18:54.556316994 +0200
+++ openssl-1.1.1c/crypto/poly1305/build.info 2019-06-06 12:19:24.232504722 +0200
INCLUDE[poly1305-armv8.o]=..
GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-mips.o]=..
+INCLUDE[poly1305-s390x.o]=..
GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME)

BEGINRAW[Makefile(unix)]