#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.
#
# Usage: the generated assembly is printed to STDOUT, or written to the
# file named by the last command-line argument when one is given.

use strict;
use warnings;

# Symbolic names for the Alpha registers used by the generated code.
# The trailing "# $n" notes give the hardware register number.
my $cnt="v0";	# $0
my $t0="t0";
my $t1="t1";
my $t2="t2";
my $Thi0="t3";	# $4
my $Tlo0="t4";
my $Thi1="t5";
my $Tlo1="t6";
my $rem="t7";	# $8
#################
my $Xi="a0";	# $16, input argument block
my $Htbl="a1";
my $inp="a2";
my $len="a3";
my $nlo="a4";	# $20
my $nhi="a5";
my $Zhi="t8";
my $Zlo="t9";
my $Xhi="t10";	# $24
my $Xlo="t11";
my $remp="t12";
my $rem_4bit="AT";	# $28

# Registers that hold the current 16-byte input block in gcm_ghash_4bit.
my $inhi="s0";
my $inlo="s1";

# Accumulates the assembly text that is printed at the end of the script.
my $code;

# loop() appends one expansion of the multiplication core to $code.
#
# The emitted code consumes the bytes of $Xlo from index 7 down to 0 and
# then the bytes of $Xhi from index 7 down to 0.  For every byte it uses
# the low and the high nibble to index 16-byte entries of the table at
# $Htbl, and folds the 4 bits shifted out of $Zlo back in through the
# 8-byte entries of the table addressed by $rem_4bit.  The result is left
# in $Zhi:$Zlo; $t0, $cnt, $nlo, $nhi, $remp, $rem and the $T* registers
# are used as scratch.
#
# $N counts the expansions so that the .Looplo/.Loophi labels of each
# expansion are unique within the generated file.
{ my $N;
  sub loop {

	$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

# gcm_gmult_4bit: arguments arrive in $Xi (a0) and $Htbl (a1).  The
# prologue loads the two 64-bit halves of Xi and calls picmeup, which
# points $rem_4bit at the rem_4bit table.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	loop();

# gcm_gmult_4bit epilogue: byte-swap $Zlo and $Zhi and store them back to Xi.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

# gcm_ghash_4bit: arguments arrive in $Xi (a0), $Htbl (a1), $inp (a2) and
# $len (a3).  The prologue saves ra, s0 and s1 in a 32-byte frame, since
# s0/s1 hold the input block.  Input is fetched with ldq_u/extql/extqh
# pairs, so $inp need not be 8-byte aligned.  Each pass of .Louter
# consumes 16 bytes and XORs them into Xi before the multiplication.
# NOTE(review): $len is decremented by 16 and only tested for zero, so the
# code presumably expects a non-zero multiple of 16 -- confirm with callers.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	loop();

# gcm_ghash_4bit tail: byte-swap the product into $Xhi:$Xlo.  While input
# remains, the loads of the next block are interleaved with the swap and
# control returns to .Louter; otherwise .Ldone finishes the swap, stores
# Xi and restores s0/s1.  This is followed by the picmeup helper and the
# rem_4bit table.  picmeup obtains the address of .Lpic with "br" and adds
# 12, i.e. it skips the three 4-byte instructions (lda, ret, nop) that
# separate .Lpic from rem_4bit.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___

# The last command-line argument, when present, names the output file;
# otherwise the assembly goes to the inherited STDOUT.  Three-argument
# open keeps characters in the name from being read as an open mode, and
# a failed open is reported instead of being ignored.
my $output=pop;
if ($output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";