#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# Usage: <this script> <flavour> [<output>]
#
# <flavour> must contain "64" or "32"; a trailing "le" selects the
# little-endian variant of the generated code.  Both arguments are
# forwarded unchanged to ppc-xlate.pl, which receives the generated
# assembly on its standard input.

use strict;
use warnings;

my $flavour = shift;
my $output  = shift;
$flavour = "" unless defined $flavour;	# rejected as "nonsense" below
$output  = "" unless defined $output;	# optional; keeps the command line as before

# ABI parameters derived from the flavour.  They are not referenced by
# the assembly below; the branch also serves to reject unknown flavours.
my ($SIZE_T, $LRSAVE, $STU, $POP, $PUSH);
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Directory part of $0 (including the trailing separator), or "" when
# the script was invoked without one.  Set explicitly rather than
# reading $1 after a match that may have failed.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the two
# alternative tree layouts.
my $xlate;
foreach my $candidate ("ppc-xlate.pl",
		       "../../perlasm/ppc-xlate.pl",
		       "../../../arch/powerpc/crypto/ppc-xlate.pl") {
	if (-f "$dir$candidate") {
		$xlate = "$dir$candidate";
		last;
	}
}
defined $xlate or die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# "or" (not "||") is required: "||" would bind to the command string,
# which is always true, and a failed open would go unnoticed.
open(STDOUT, "| $^X $xlate $flavour $output") or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my $vrsave="r12";

# Three routines are emitted.  Each saves SPR 256 in $vrsave on entry
# and restores it before returning.
#
#   gcm_init_p8   loads H from r4 and stores a four-entry table
#                 (offsets 0x00, 0x10, 0x20, 0x30) at r3.
#   gcm_gmult_p8  loads Xi from r3 and the table from r4, performs one
#                 multiplication/reduction and writes Xi back to r3.
#   gcm_ghash_p8  as gcm_gmult_p8, but first folds input read from r5
#                 into Xi, 16 bytes per iteration, r6 being the length.
#                 NOTE(review): the first block is loaded
#                 unconditionally, so the length is presumably a
#                 non-zero multiple of 16 -- confirm against callers.
#
# Lines prefixed with "le?" are kept only for little-endian flavours;
# see the post-processing loop after the template.
my $code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz	"GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Resolve the endian-conditional prefixes line by line: for the active
# endianness the prefix is stripped so the instruction is emitted, for
# the other one it is rewritten to start with "#" (a comment).  At most
# one prefix is handled per line.
my $little_endian = ($flavour =~ /le$/);

foreach my $line (split("\n",$code)) {
	if ($little_endian) {
		$line =~ s/le\?//	or
		$line =~ s/be\?/#be#/;
	} else {
		$line =~ s/le\?/#le#/	or
		$line =~ s/be\?//;
	}
	print $line,"\n";
}

# Closing the pipe flushes the output and waits for the translator.  A
# false return means either an I/O error ($! set) or a non-zero wait
# status of the child ($? set), so translator failures are not ignored.
close(STDOUT)
	or die "error closing pipe to $xlate: ", ($! ? "$!" : "wait status $?");