File 0005-s390x-assembly-pack-import-chacha-from-cryptogams-re.patch of Package openssl-1_1
From d1229190bfbb19439589557e4d65f9bccab09b2d Mon Sep 17 00:00:00 2001
From: Patrick Steuer <patrick.steuer@de.ibm.com>
Date: Mon, 25 Feb 2019 18:55:04 +0100
Subject: [PATCH] s390x assembly pack: import chacha from cryptogams repo

featuring 6x"horizontal" code path which is up to 25%
faster than present 4x"vertical" for larger blocks.

Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com>

Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8287)
---
 crypto/chacha/asm/chacha-s390x.pl | 1006 +++++++++++++++++++++--------
 1 file changed, 719 insertions(+), 287 deletions(-)
diff --git a/crypto/chacha/asm/chacha-s390x.pl b/crypto/chacha/asm/chacha-s390x.pl
index abf7283dd8..51efe64408 100755
--- a/crypto/chacha/asm/chacha-s390x.pl
+++ b/crypto/chacha/asm/chacha-s390x.pl
#
# August 2018
#
-# Add vx code path.
+# Add vx code path: 4x"vertical".
#
# Copyright IBM Corp. 2018
# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+#
+# February 2019
+#
+# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
+# 4x"vertical" submission [on z13] and >3 faster than scalar code.
+# But to harness overheads revert to transliteration of VSX code path
+# from chacha-ppc module, which is also 4x"vertical", to handle inputs
+# not longer than 256 bytes.
+
use strict;
use FindBin qw($Bin);
use lib "$Bin/../..";
my $sp="%r15";
my $stdframe=16*$SIZE_T+4*8;
+sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
-my @v=map("%v$_",(16..31));
-
-sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
rll (@x[$b3],@x[$b3],7);
}
-sub VX_ROUND {
+sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("%v$_",(0..15));
+
+ vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1
+ vx (@x[$d0],@x[$d0],@x[$a0]);
+ verllf (@x[$d0],@x[$d0],16);
+ vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],16);
+ vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],16);
+ vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],16);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
+ vx (@x[$b0],@x[$b0],@x[$c0]);
+ verllf (@x[$b0],@x[$b0],12);
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],12);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],12);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],12);
+
+ vaf (@x[$a0],@x[$a0],@x[$b0]);
+ vx (@x[$d0],@x[$d0],@x[$a0]);
+ verllf (@x[$d0],@x[$d0],8);
+ vaf (@x[$a1],@x[$a1],@x[$b1]);
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],8);
+ vaf (@x[$a2],@x[$a2],@x[$b2]);
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],8);
+ vaf (@x[$a3],@x[$a3],@x[$b3]);
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],8);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]);
+ vx (@x[$b0],@x[$b0],@x[$c0]);
+ verllf (@x[$b0],@x[$b0],7);
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],7);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],7);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],7);
+}
- vaf (@v[$a0],@v[$a0],@v[$b0]);
- vaf (@v[$a1],@v[$a1],@v[$b1]);
- vaf (@v[$a2],@v[$a2],@v[$b2]);
- vaf (@v[$a3],@v[$a3],@v[$b3]);
- vx (@v[$d0],@v[$d0],@v[$a0]);
- vx (@v[$d1],@v[$d1],@v[$a1]);
- vx (@v[$d2],@v[$d2],@v[$a2]);
- vx (@v[$d3],@v[$d3],@v[$a3]);
- verllf (@v[$d0],@v[$d0],16);
- verllf (@v[$d1],@v[$d1],16);
- verllf (@v[$d2],@v[$d2],16);
- verllf (@v[$d3],@v[$d3],16);
-
- vaf (@v[$c0],@v[$c0],@v[$d0]);
- vaf (@v[$c1],@v[$c1],@v[$d1]);
- vaf (@v[$c2],@v[$c2],@v[$d2]);
- vaf (@v[$c3],@v[$c3],@v[$d3]);
- vx (@v[$b0],@v[$b0],@v[$c0]);
- vx (@v[$b1],@v[$b1],@v[$c1]);
- vx (@v[$b2],@v[$b2],@v[$c2]);
- vx (@v[$b3],@v[$b3],@v[$c3]);
- verllf (@v[$b0],@v[$b0],12);
- verllf (@v[$b1],@v[$b1],12);
- verllf (@v[$b2],@v[$b2],12);
- verllf (@v[$b3],@v[$b3],12);
-
- vaf (@v[$a0],@v[$a0],@v[$b0]);
- vaf (@v[$a1],@v[$a1],@v[$b1]);
- vaf (@v[$a2],@v[$a2],@v[$b2]);
- vaf (@v[$a3],@v[$a3],@v[$b3]);
- vx (@v[$d0],@v[$d0],@v[$a0]);
- vx (@v[$d1],@v[$d1],@v[$a1]);
- vx (@v[$d2],@v[$d2],@v[$a2]);
- vx (@v[$d3],@v[$d3],@v[$a3]);
- verllf (@v[$d0],@v[$d0],8);
- verllf (@v[$d1],@v[$d1],8);
- verllf (@v[$d2],@v[$d2],8);
- verllf (@v[$d3],@v[$d3],8);
-
- vaf (@v[$c0],@v[$c0],@v[$d0]);
- vaf (@v[$c1],@v[$c1],@v[$d1]);
- vaf (@v[$c2],@v[$c2],@v[$d2]);
- vaf (@v[$c3],@v[$c3],@v[$d3]);
- vx (@v[$b0],@v[$b0],@v[$c0]);
- vx (@v[$b1],@v[$b1],@v[$c1]);
- vx (@v[$b2],@v[$b2],@v[$c2]);
- vx (@v[$b3],@v[$b3],@v[$c3]);
- verllf (@v[$b0],@v[$b0],7);
- verllf (@v[$b1],@v[$b1],7);
- verllf (@v[$b2],@v[$b2],7);
- verllf (@v[$b3],@v[$b3],7);
+sub VX_ROUND {
+my @a=@_[0..5];
+my @b=@_[6..11];
+my @c=@_[12..17];
+my @d=@_[18..23];
+my $odd=@_[24];
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
+ verllf (@d[$_],@d[$_],16) for (0..5);
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
+ verllf (@b[$_],@b[$_],12) for (0..5);
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5);
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5);
+ verllf (@d[$_],@d[$_],8) for (0..5);
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5);
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5);
+ verllf (@b[$_],@b[$_],7) for (0..5);
+
+ vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5);
+ vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5);
+ vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5);
}
PERLASM_BEGIN($output);
################
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
# const unsigned int key[8], const unsigned int counter[4])
-{
my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
-
-# VX CODE PATH
{
-my $off=$z*8*16+8; # offset(initial state)
-my $frame=$stdframe+4*16+$off;
+my $frame=$stdframe+4*20;
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
+my @t=map("%r$_",(8,9));
GLOBL ("ChaCha20_ctr32");
TYPE ("ChaCha20_ctr32","\@function");
larl ("%r1","OPENSSL_s390xcap_P");
lghi ("%r0",64);
+&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
+ bzr ("%r14");
+ lg ("%r1","S390X_STFLE+16(%r1)");
&{$z? \&clgr:\&clr} ($len,"%r0");
- jle ("_s390x_chacha_novx");
-
- lg ("%r0","S390X_STFLE+16(%r1)");
- tmhh ("%r0",0x4000); # check for vector facility
- jz ("_s390x_chacha_novx");
-
-if (!$z) {
- llgfr ($len,$len);
- std ("%f4","16*$SIZE_T+2*8($sp)");
- std ("%f6","16*$SIZE_T+3*8($sp)");
-}
-&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+ jle (".Lshort");
- lghi ("%r1",-$frame);
- lgr ("%r0",$sp);
- la ($sp,"0(%r1,$sp)"); # allocate stack frame
+ tmhh ("%r1",0x4000); # check for vx bit
+ jnz (".LChaCha20_ctr32_vx");
- larl ("%r7",".Lsigma");
-&{$z? \&stg:\&st} ("%r0","0($sp)"); # backchain
-
- vstm ("%v8","%v15","8($sp)") if ($z);
-
- vlm ("%v1","%v2","0($key)"); # load key
- vl ("%v0","0(%r7)"); # load sigma constant
- vl ("%v3","0($counter)"); # load iv (counter||nonce)
- l ("%r0","0($counter)"); # load counter
- vstm ("%v0","%v3","$off($sp)"); # copy initial state to stack
-
- srlg ("%r1",$len,8);
- ltgr ("%r1","%r1");
- jz (".Lvx_4x_done");
-
-ALIGN (16); # process 4 64-byte blocks
-LABEL (".Lvx_4x");
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
- # state
- vl ("%v31","16(%r7)");
- vaf ("%v12","%v12","%v31"); # increment counter
-
- vlr (@v[$_],"%v$_") for (0..15); # copy initial state
-
- lhi ("%r6",10);
- j (".Loop_vx_4x");
-
-ALIGN (16);
-LABEL (".Loop_vx_4x");
- VX_ROUND( 0, 4, 8,12); # column round
- VX_ROUND( 0, 5,10,15); # diagonal round
- brct ("%r6",".Loop_vx_4x");
-
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
- # state (mod 32)
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
-
-for (0..3) { # blocks 1,2
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v7","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
- vstm ("%v0","%v7","0($out)"); # store out
-
- vlm ("%v6","%v7","32(%r7)"); # restore vperm operands
-
-for (0..3) { # blocks 2,3
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+ 8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v7","128($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..7); # out = in ^ ks
- vstm ("%v0","%v7","128($out)"); # store out
-
- ahi ("%r0",4);
- st ("%r0","48+$off($sp)"); # update initial state
-
- la ($inp,"256($inp)");
- la ($out,"256($out)");
- brctg ("%r1",".Lvx_4x");
-
-ALIGN (16);
-LABEL (".Lvx_4x_done");
- lghi ("%r1",0xff);
- ngr ($len,"%r1");
- jnz (".Lvx_rem");
-
-ALIGN (16);
-LABEL (".Lvx_done");
- vzero ("%v$_") for (16..31); # wipe ks and key copy
- vstm ("%v16","%v17","16+$off($sp)");
- vlm ("%v8","%v15","8($sp)") if ($z);
-
- la ($sp,"$frame($sp)");
-&{$z? \&lmg:\&lm} ("%r6","%r7","6*$SIZE_T($sp)");
-
-if (!$z) {
- ld ("%f4","16*$SIZE_T+2*8($sp)");
- ld ("%f6","16*$SIZE_T+3*8($sp)");
- vzero ("%v$_") for (8..15);
-}
- br ("%r14");
-ALIGN (16);
-LABEL (".Lvx_rem");
- lhi ("%r0",64);
-
- sr ($len,"%r0");
- brc (2,".Lvx_rem_g64"); # cc==2?
-
- lghi ("%r1",-$stdframe);
-
- la ($counter,"48+$off($sp)"); # load updated iv
- ar ($len,"%r0"); # restore len
-
- lgr ("%r7",$counter);
-&{$z? \&stg:\&st} ("%r14","14*$SIZE_T+$frame($sp)");
- la ($sp,"0(%r1,$sp)");
-
- bras ("%r14","_s390x_chacha_novx");
-
- la ($sp,"$stdframe($sp)");
-&{$z? \&lg:\&l} ("%r14","14*$SIZE_T+$frame($sp)");
- lgr ($counter,"%r7");
- j (".Lvx_done");
-
-ALIGN (16);
-LABEL (".Lvx_rem_g64");
- vlrepf ("%v$_",($_*4)."+$off($sp)") for (0..15); # load initial
- # state
- vl ("%v31","16(%r7)");
- vaf ("%v12","%v12","%v31"); # increment counter
-
- vlr (@v[$_],"%v$_") for (0..15); # state = initial state
-
- lhi ("%r6",10);
- j (".Loop_vx_rem");
-
-ALIGN (16);
-LABEL (".Loop_vx_rem");
- VX_ROUND( 0, 4, 8,12); # column round
- VX_ROUND( 0, 5,10,15); # diagonal round
- brct ("%r6",".Loop_vx_rem");
-
- vaf (@v[$_],@v[$_],"%v$_") for (0..15); # state += initial
- # state (mod 32)
- vlm ("%v6","%v7","32(%r7)"); # load vperm operands
-
-for (0..3) { # blocks 1,2
- vmrhf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrhf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+8),"%v0","%v1","%v6");
- vperm ("%v".($_+12),"%v0","%v1","%v7");
-}
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+8)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
-
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- brc (4,".Lvx_tail"); # cc==4?
-
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
- jz (".Lvx_done");
-
-for (0..3) { # blocks 3,4
- vmrlf ("%v0",@v[$_*4+0],@v[$_*4+1]); # ks = serialize(state)
- vmrlf ("%v1",@v[$_*4+2],@v[$_*4+3]);
- vperm ("%v".($_+12),"%v0","%v1","%v6");
- vperm ("%v".($_+8),"%v0","%v1","%v7");
-}
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- brc (4,".Lvx_tail"); # cc==4?
-
- vlm ("%v0","%v3","0($inp)"); # load in
- vx ("%v$_","%v$_","%v".($_+12)) for (0..3); # out = in ^ ks
- vstm ("%v0","%v3","0($out)"); # store out
- jz (".Lvx_done");
-
- la ($inp,"64($inp)");
- la ($out,"64($out)");
-
- sr ($len,"%r0");
- vlr ("%v".($_+4),"%v$_") for (8..11);
- j (".Lvx_tail");
-
-ALIGN (16);
-LABEL (".Lvx_tail");
- ar ($len,"%r0"); # restore $len
- ahi ($len,-1);
-
- lhi ("%r0",16);
-for (0..2) {
- vll ("%v0",$len,($_*16)."($inp)");
- vx ("%v0","%v0","%v".($_+12));
- vstl ("%v0",$len,($_*16)."($out)");
- sr ($len,"%r0");
- brc (4,".Lvx_done"); # cc==4?
-}
- vll ("%v0",$len,"3*16($inp)");
- vx ("%v0","%v0","%v15");
- vstl ("%v0",$len,"3*16($out)");
- j (".Lvx_done");
-SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
-}
-
-# NOVX CODE PATH
-{
-my $frame=$stdframe+4*20;
-
-TYPE ("_s390x_chacha_novx","\@function");
-ALIGN (32);
-LABEL ("_s390x_chacha_novx");
-&{$z? \&ltgr:\&ltr} ($len,$len); # $len==0?
- bzr ("%r14");
+LABEL (".Lshort");
&{$z? \&aghi:\&ahi} ($len,-64);
&{$z? \&lghi:\&lhi} ("%r1",-$frame);
&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)");
brct (@t[1],".Loop_tail");
j (".Ldone");
-SIZE ("_s390x_chacha_novx",".-_s390x_chacha_novx");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+########################################################################
+# 4x"vertical" layout minimizes amount of instructions, but pipeline
+# runs underutilized [because of vector instructions' high latency].
+# On the other hand minimum amount of data it takes to fully utilize
+# the pipeline is higher, so that effectively, short inputs would be
+# processed slower. Hence this code path targeting <=256 bytes lengths.
+#
+{
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15));
+my @K=map("%v$_",(16..19));
+my $CTR="%v26";
+my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30));
+my $beperm="%v31";
+my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10)));
+my $FRAME=$stdframe+4*16;
+
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_4x");
+LABEL (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) {
+ std ("%f8","$stdframe+8*0($sp)");
+ std ("%f9","$stdframe+8*1($sp)");
+ std ("%f10","$stdframe+8*2($sp)");
+ std ("%f11","$stdframe+8*3($sp)");
+ std ("%f12","$stdframe+8*4($sp)");
+ std ("%f13","$stdframe+8*5($sp)");
+ std ("%f14","$stdframe+8*6($sp)");
+ std ("%f15","$stdframe+8*7($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10);
+ lhi ("%r1",0);
+
+ vl (@K[0],"0(%r7)"); # load sigma
+ vl (@K[1],"0($key)"); # load key
+ vl (@K[2],"16($key)");
+ vl (@K[3],"0($counter)"); # load counter
+
+ vl ($beperm,"0x40(%r7)");
+ vl ($xt1,"0x50(%r7)");
+ vrepf ($CTR,@K[3],0);
+ vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
+ vaf ($CTR,$CTR,$xt1);
+
+#LABEL (".Loop_outer_4x");
+ vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma
+
+ vrepf ($xb0,@K[1],0); # smash the key
+ vrepf ($xb1,@K[1],1);
+ vrepf ($xb2,@K[1],2);
+ vrepf ($xb3,@K[1],3);
+
+ vrepf ($xc0,@K[2],0);
+ vrepf ($xc1,@K[2],1);
+ vrepf ($xc2,@K[2],2);
+ vrepf ($xc3,@K[2],3);
+
+ vlr ($xd0,$CTR);
+ vrepf ($xd1,@K[3],1);
+ vrepf ($xd2,@K[3],2);
+ vrepf ($xd3,@K[3],3);
+
+LABEL (".Loop_4x");
+ VX_lane_ROUND(0, 4, 8,12);
+ VX_lane_ROUND(0, 5,10,15);
+ brct ("%r0",".Loop_4x");
+
+ vaf ($xd0,$xd0,$CTR);
+
+ vmrhf ($xt0,$xa0,$xa1); # transpose data
+ vmrhf ($xt1,$xa2,$xa3);
+ vmrlf ($xt2,$xa0,$xa1);
+ vmrlf ($xt3,$xa2,$xa3);
+ vpdi ($xa0,$xt0,$xt1,0b0000);
+ vpdi ($xa1,$xt0,$xt1,0b0101);
+ vpdi ($xa2,$xt2,$xt3,0b0000);
+ vpdi ($xa3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xb0,$xb1);
+ vmrhf ($xt1,$xb2,$xb3);
+ vmrlf ($xt2,$xb0,$xb1);
+ vmrlf ($xt3,$xb2,$xb3);
+ vpdi ($xb0,$xt0,$xt1,0b0000);
+ vpdi ($xb1,$xt0,$xt1,0b0101);
+ vpdi ($xb2,$xt2,$xt3,0b0000);
+ vpdi ($xb3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xc0,$xc1);
+ vmrhf ($xt1,$xc2,$xc3);
+ vmrlf ($xt2,$xc0,$xc1);
+ vmrlf ($xt3,$xc2,$xc3);
+ vpdi ($xc0,$xt0,$xt1,0b0000);
+ vpdi ($xc1,$xt0,$xt1,0b0101);
+ vpdi ($xc2,$xt2,$xt3,0b0000);
+ vpdi ($xc3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xd0,$xd1);
+ vmrhf ($xt1,$xd2,$xd3);
+ vmrlf ($xt2,$xd0,$xd1);
+ vmrlf ($xt3,$xd2,$xd3);
+ vpdi ($xd0,$xt0,$xt1,0b0000);
+ vpdi ($xd1,$xt0,$xt1,0b0101);
+ vpdi ($xd2,$xt2,$xt3,0b0000);
+ vpdi ($xd3,$xt2,$xt3,0b0101);
+
+ #vrepif ($xt0,4);
+ #vaf ($CTR,$CTR,$xt0); # next counter value
+
+ vaf ($xa0,$xa0,@K[0]);
+ vaf ($xb0,$xb0,@K[1]);
+ vaf ($xc0,$xc0,@K[2]);
+ vaf ($xd0,$xd0,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+ #&{$z? \&clgfi:\&clfi} ($len,0x40);
+ #jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ #je (".Ldone_4x");
+
+ vaf ($xa0,$xa1,@K[0]);
+ vaf ($xb0,$xb1,@K[1]);
+ vaf ($xc0,$xc1,@K[2]);
+ vaf ($xd0,$xd1,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa2,@K[0]);
+ vaf ($xb0,$xb2,@K[1]);
+ vaf ($xc0,$xc2,@K[2]);
+ vaf ($xd0,$xd2,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa3,@K[0]);
+ vaf ($xb0,$xb3,@K[1]);
+ vaf ($xc0,$xc3,@K[2]);
+ vaf ($xd0,$xd3,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ #la $inp,0x40($inp));
+ #la $out,0x40($out));
+ #lhi %r0,10);
+ #&{$z? \&aghi:\&ahi} $len,-0x40);
+ #jne .Loop_outer_4x);
+
+LABEL (".Ldone_4x");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_4x");
+if (!$z) {
+ vlr ($xt0,$xb0);
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)");
+ vst ($xt0,"$stdframe+0x10($sp)");
+ vst ($xc0,"$stdframe+0x20($sp)");
+ vst ($xd0,"$stdframe+0x30($sp)");
+} else {
+ vlr ($xt0,$xc0);
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ vlr ($xt1,$xd0);
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)");
+ vst ($xb0,"$stdframe+0x10($sp)");
+ vst ($xt0,"$stdframe+0x20($sp)");
+ vst ($xt1,"$stdframe+0x30($sp)");
}
+ lghi ("%r1",0);
+
+LABEL (".Loop_tail_4x");
+ llgc ("%r5","0(%r1,$inp)");
+ llgc ("%r6","$stdframe(%r1,$sp)");
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)");
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_4x");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
+}
+
+########################################################################
+# 6x"horizontal" layout is optimal fit for the platform in its current
+# shape, more specifically for given vector instructions' latency. Well,
+# computational part of 8x"vertical" would be faster, but it consumes
+# all registers and dealing with that will diminish the return...
+#
+{
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
+ $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
+ $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23));
+my @K=map("%v$_",(27,24..26));
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30);
+my $beperm="%v31";
+my $FRAME=$stdframe + 4*16;
+
+GLOBL ("ChaCha20_ctr32_vx");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_vx");
+LABEL (".LChaCha20_ctr32_vx");
+&{$z? \&clgfi:\&clfi} ($len,256);
+ jle (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)");
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) {
+ std ("%f8","$FRAME-8*8($sp)");
+ std ("%f9","$FRAME-8*7($sp)");
+ std ("%f10","$FRAME-8*6($sp)");
+ std ("%f11","$FRAME-8*5($sp)");
+ std ("%f12","$FRAME-8*4($sp)");
+ std ("%f13","$FRAME-8*3($sp)");
+ std ("%f14","$FRAME-8*2($sp)");
+ std ("%f15","$FRAME-8*1($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10);
+
+ vlm (@K[1],@K[2],"0($key)"); # load key
+ vl (@K[3],"0($counter)"); # load counter
+
+ vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
+
+LABEL (".Loop_outer_vx");
+ vlr ($a0,@K[0]);
+ vlr ($b0,@K[1]);
+ vlr ($a1,@K[0]);
+ vlr ($b1,@K[1]);
+ vlr ($a2,@K[0]);
+ vlr ($b2,@K[1]);
+ vlr ($a3,@K[0]);
+ vlr ($b3,@K[1]);
+ vlr ($a4,@K[0]);
+ vlr ($b4,@K[1]);
+ vlr ($a5,@K[0]);
+ vlr ($b5,@K[1]);
+
+ vlr ($d0,@K[3]);
+ vaf ($d1,@K[3],$t1); # K[3]+1
+ vaf ($d2,@K[3],$t2); # K[3]+2
+ vaf ($d3,@K[3],$t3); # K[3]+3
+ vaf ($d4,$d2,$t2); # K[3]+4
+ vaf ($d5,$d2,$t3); # K[3]+5
+
+ vlr ($c0,@K[2]);
+ vlr ($c1,@K[2]);
+ vlr ($c2,@K[2]);
+ vlr ($c3,@K[2]);
+ vlr ($c4,@K[2]);
+ vlr ($c5,@K[2]);
+
+ vlr ($t1,$d1);
+ vlr ($t2,$d2);
+ vlr ($t3,$d3);
+
+ALIGN (4);
+LABEL (".Loop_vx");
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 0);
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 1);
+
+ brct ("%r0",".Loop_vx");
+
+ vaf ($a0,$a0,@K[0]);
+ vaf ($b0,$b0,@K[1]);
+ vaf ($c0,$c0,@K[2]);
+ vaf ($d0,$d0,@K[3]);
+ vaf ($a1,$a1,@K[0]);
+ vaf ($d1,$d1,$t1); # +K[3]+1
+
+ vperm ($a0,$a0,$a0,$beperm);
+ vperm ($b0,$b0,$b0,$beperm);
+ vperm ($c0,$c0,$c0,$beperm);
+ vperm ($d0,$d0,$d0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d2,$d2,$t2); # +K[3]+2
+ vaf ($d3,$d3,$t3); # +K[3]+3
+ vlm ($t0,$t3,"0($inp)");
+
+ vx ($a0,$a0,$t0);
+ vx ($b0,$b0,$t1);
+ vx ($c0,$c0,$t2);
+ vx ($d0,$d0,$t3);
+
+ vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($b1,$b1,@K[1]);
+ vaf ($c1,$c1,@K[2]);
+
+ vperm ($a0,$a1,$a1,$beperm);
+ vperm ($b0,$b1,$b1,$beperm);
+ vperm ($c0,$c1,$c1,$beperm);
+ vperm ($d0,$d1,$d1,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a2,$a2,@K[0]);
+ vaf ($b2,$b2,@K[1]);
+ vaf ($c2,$c2,@K[2]);
+
+ vperm ($a0,$a2,$a2,$beperm);
+ vperm ($b0,$b2,$b2,$beperm);
+ vperm ($c0,$c2,$c2,$beperm);
+ vperm ($d0,$d2,$d2,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a3,$a3,@K[0]);
+ vaf ($b3,$b3,@K[1]);
+ vaf ($c3,$c3,@K[2]);
+ vaf ($d2,@K[3],$t3); # K[3]+3
+
+ vperm ($a0,$a3,$a3,$beperm);
+ vperm ($b0,$b3,$b3,$beperm);
+ vperm ($c0,$c3,$c3,$beperm);
+ vperm ($d0,$d3,$d3,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d3,$d2,$t1); # K[3]+4
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a4,$a4,@K[0]);
+ vaf ($b4,$b4,@K[1]);
+ vaf ($c4,$c4,@K[2]);
+ vaf ($d4,$d4,$d3); # +K[3]+4
+ vaf ($d3,$d3,$t1); # K[3]+5
+ vaf (@K[3],$d2,$t3); # K[3]+=6
+
+ vperm ($a0,$a4,$a4,$beperm);
+ vperm ($b0,$b4,$b4,$beperm);
+ vperm ($c0,$c4,$c4,$beperm);
+ vperm ($d0,$d4,$d4,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a5,$a5,@K[0]);
+ vaf ($b5,$b5,@K[1]);
+ vaf ($c5,$c5,@K[2]);
+ vaf ($d5,$d5,$d3); # +K[3]+5
+
+ vperm ($a0,$a5,$a5,$beperm);
+ vperm ($b0,$b5,$b5,$beperm);
+ vperm ($c0,$c5,$c5,$beperm);
+ vperm ($d0,$d5,$d5,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+ lhi ("%r0",10);
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ jne (".Loop_outer_vx");
+
+LABEL (".Ldone_vx");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_vx");
+if (!$z) {
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+ vstm ($a0,$d0,"$stdframe($sp)");
+ lghi ("%r1",0);
+
+LABEL (".Loop_tail_vx");
+ llgc ("%r5","0(%r1,$inp)");
+ llgc ("%r6","$stdframe(%r1,$sp)");
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)");
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_vx");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
}
################
-ALIGN (64);
+ALIGN (32);
LABEL (".Lsigma");
LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
-LONG (0x00000000,0x00000001,0x00000002,0x00000003); # vaf counter increment
-LONG (0x03020100,0x07060504,0x13121110,0x17161514); # vperm serialization
-LONG (0x0b0a0908,0x0f0e0d0c,0x1b1a1918,0x1f1e1d1c); # vperm serialization
+LONG (1,0,0,0);
+LONG (2,0,0,0);
+LONG (3,0,0,0);
+LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap
+
+LONG (0,1,2,3);
+LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma
+LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
+LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
+LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
+
ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
ALIGN (4);
--
2.21.0