#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

use strict;
use warnings;

# Skip leading command-line arguments until one looks like a file name
# ("name.ext"); that argument becomes the output file.
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually found, and fail
# loudly if it cannot be opened. Without a name the generated code goes
# to the inherited STDOUT instead of being silently lost.
if ($output) {
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Accumulates the generated assembly text; printed once at the end.
my $code = "";

my ($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");	# argument vector

# Partial products of the low ($Alo) and high ($Ahi) halfwords of A with
# each of the four bytes of B.
my ($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
my ($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
# The four bytes of B.
my ($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# Operand registers for the second and third multiplications; they share
# registers with $Alo and $B_1.
my ($A,$B)=($Alo,$B_1);
# Holds the 0xFF byte mask (loaded with MVK in the function prologue).
my $xFF="B1";

# mul_1x1_upper(A, B)
#
# Appends the first half of one multiplication to $code: splits B into
# four bytes and A into two halfwords, then issues the eight XORMPY
# (16x8 bits) partial products. The partial products are left in the
# $Alox*/$Ahix* registers for mul_1x1_merged or mul_1x1_lower to combine.
sub mul_1x1_upper {
    my ($A,$B)=@_;
    $code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits multiplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
    return;
}

# mul_1x1_merged(OUTlo, OUThi, A, B)
#
# Appends code that combines the partial products of the PREVIOUS
# multiplication into OUTlo/OUThi (same shift/XOR sequence as
# mul_1x1_lower) while, in the same execute packets, starting the NEXT
# multiplication of A by B (same XORMPY set as mul_1x1_upper).
#
# The new $Alo x $B_0 product is parked in A1 and only moved into $Alox0
# by the final MV, because the previous $Alox0 is still read in between.
sub mul_1x1_merged {
    my ($OUTlo,$OUThi,$A,$B)=@_;
    $code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16, $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	XORMPY	$Alo,$B_2,$Alox2
	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
||	XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	XORMPY	$Ahi,$B_0,$Ahix0
||	XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
||	MV	A1,$Alox0
___
    return;
}

# mul_1x1_lower(OUTlo, OUThi)
#
# Appends the second half of one multiplication to $code: shifts and
# XORs the partial products left in $Alox*/$Ahix* into the register pair
# OUTlo/OUThi. No new multiplication is started.
sub mul_1x1_lower {
    my ($OUTlo,$OUThi)=@_;
    $code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
    return;
}

# Function prologue: section, symbol naming for the two ABIs, entry
# label, and the 0xFF mask load.
$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
	.endif

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___

# Three multiplications: a0*b0, a1*b1 and (a0+a1)*(b0+b1), where "+" is
# XOR. Each heredoc below that starts with "||" deliberately extends the
# last execute packet emitted by the preceding sub call, so the order of
# these statements must not change.
mul_1x1_upper($a0,$b0);				# a0·b0
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
mul_1x1_merged("A28","B28",$A,$B);		# a0·b0/a1·b1
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
mul_1x1_merged("A31","B31",$A,$B);		# a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0·b0+a1·b1
___
mul_1x1_lower("A30","B30");			# (a0+a1)·(b0+b1)

# Epilogue: fold the middle term into the result and store the four
# result words through $rp. NOTE(review): "BNOP B3" is presumably the
# return branch (B3 as return address) - confirm against the TI C6000
# calling convention.
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

print $code;
# Buffered write errors only surface at close; do not exit 0 with a
# truncated output file.
close STDOUT or die "error closing STDOUT: $!";