1#!/usr/bin/env perl 2# SPDX-License-Identifier: GPL-2.0 3 4# This code is taken from the OpenSSL project but the author (Andy Polyakov) 5# has relicensed it under the GPLv2. Therefore this program is free software; 6# you can redistribute it and/or modify it under the terms of the GNU General 7# Public License version 2 as published by the Free Software Foundation. 8# 9# The original headers, including the original license headers, are 10# included below for completeness. 11 12# ==================================================================== 13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 14# project. The module is, however, dual licensed under OpenSSL and 15# CRYPTOGAMS licenses depending on where you obtain it. For further 16# details see https://www.openssl.org/~appro/cryptogams/. 17# ==================================================================== 18# 19# GHASH for PowerISA v2.07. 20# 21# July 2014 22# 23# Accurate performance measurements are problematic, because it's 24# always virtualized setup with possibly throttled processor. 25# Relative comparison is therefore more informative. This initial 26# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x 27# faster than "4-bit" integer-only compiler-generated 64-bit code. 28# "Initial version" means that there is room for further improvement. 

# Usage: <this script> <flavour> <output>
#
#   flavour  target ABI string; it must contain "64" or "32", and a
#            trailing "le" selects the little-endian variant of the
#            generated code (see the filter loop at the end).
#   output   passed through unchanged to ppc-xlate.pl.
#
# The script builds the assembly text in $code, resolves the "le?" and
# "be?" line prefixes according to the flavour, and pipes the result
# into ppc-xlate.pl, which is started as a child process on STDOUT.

$flavour=shift;
$output =shift;

# ABI-dependent parameters.  Only the sanity check on $flavour matters
# for the code below; none of these variables is referenced by the
# assembly template in this file.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Look for the translator next to this script first, then in the
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Low-precedence "or" is required here: with "||" the die() would be
# attached to the (always true) command string instead of to open(),
# so a failure to start the translator could never be reported.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# General-purpose registers r3..r6 carry the arguments.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

# Vector register allocation.
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
# Extra temporaries for gcm_init_htable.  They alias the registers of
# $Hl/$H/$Hh and are only used there after those values were stored.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);

# Routines emitted by the template below:
#
#   .gcm_init_p10     loads H from r4, computes the "twisted" H and
#                     stores four vectors at r3+0x00..0x30: the 0xc2
#                     reduction constant, H.lo, H and H.hi.
#   .gcm_init_htable  same first four entries, then additionally stores
#                     the lo/full/hi triples for H^2 (0x40..0x60),
#                     H^3 (0x70..0x90) and H^4 (0xa0..0xc0).
#   .gcm_gmult_p10    loads Xi from r3 and the table from r4, performs
#                     one multiplication with reduction and writes Xi
#                     back to r3.
#   .gcm_ghash_p10    like gmult, but folds in input read from $inp in
#                     16-byte steps, with $len as the loop counter.
#
# Lines prefixed with "le?" are only kept for little-endian flavours.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p10
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p10,.-.gcm_init_p10

.globl	.gcm_init_htable
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90

	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	vpmsumd		$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	vpmsumd		$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	vpmsumd		$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	vpmsumd		$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vsldoi		$t4,$Xm1,$zero,8
	vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	vxor		$Xl1,$Xl1,$t4
	vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vpmsumd		$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	stvx_u		$H2l,r8,r3		# save H^4
	stvx_u		$H2,r9,r3
	stvx_u		$H2h,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_htable,.-.gcm_init_htable

.globl	.gcm_gmult_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p10,.-.gcm_gmult_p10

.globl	.gcm_ghash_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	le?lvsl		$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	le?vxor		$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p10,.-.gcm_ghash_p10

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Resolve the endian-specific line prefixes.  For a flavour ending in
# "le" the "le?" prefix is dropped (the line is kept as is) and "be?"
# is rewritten to "#be#"; for every other flavour it is the other way
# round.  At most one substitution is applied per line.
foreach (split("\n",$code)) {
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

# STDOUT is a pipe to the translator: close() both flushes the pending
# output and waits for the child, returning false if either the flush
# or the child failed.  Check it so that a broken translation cannot
# pass as success.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush