These 4 files exist in the git repository for rust-ring, and are from the same commit where 0.16.20 is taken from. They were not added to the include list in Cargo.toml, so they were not added to the tarball. --- crypto/curve25519/make_curve25519_tables.py | 222 +++++ crypto/fipsmodule/aes/asm/vpaes-armv7.pl | 896 ++++++++++++++++++ crypto/fipsmodule/aes/asm/vpaes-armv8.pl | 837 ++++++++++++++++ .../fipsmodule/modes/asm/ghash-neon-armv8.pl | 294 ++++++ 4 files changed, 2249 insertions(+) create mode 100755 crypto/curve25519/make_curve25519_tables.py create mode 100644 crypto/fipsmodule/aes/asm/vpaes-armv7.pl create mode 100755 crypto/fipsmodule/aes/asm/vpaes-armv8.pl create mode 100644 crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl diff --git a/crypto/curve25519/make_curve25519_tables.py b/crypto/curve25519/make_curve25519_tables.py new file mode 100755 index 0000000..50dee2a --- /dev/null +++ b/crypto/curve25519/make_curve25519_tables.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright (c) 2020, Google Inc. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +import StringIO +import subprocess + +# Base field Z_p +p = 2**255 - 19 + +def modp_inv(x): + return pow(x, p-2, p) + +# Square root of -1 +modp_sqrt_m1 = pow(2, (p-1) // 4, p) + +# Compute corresponding x-coordinate, with low bit corresponding to +# sign, or return None on failure +def recover_x(y, sign): + if y >= p: + return None + x2 = (y*y-1) * modp_inv(d*y*y+1) + if x2 == 0: + if sign: + return None + else: + return 0 + + # Compute square root of x2 + x = pow(x2, (p+3) // 8, p) + if (x*x - x2) % p != 0: + x = x * modp_sqrt_m1 % p + if (x*x - x2) % p != 0: + return None + + if (x & 1) != sign: + x = p - x + return x + +# Curve constant +d = -121665 * modp_inv(121666) % p + +# Base point +g_y = 4 * modp_inv(5) % p +g_x = recover_x(g_y, 0) + +# Points are represented as affine tuples (x, y). 
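As a quick aside, the constants defined above can be checked with a short standalone sketch (not part of the patched file); it simply repeats the recover_x logic and verifies the standard Ed25519 curve relation -x^2 + y^2 = 1 + d*x^2*y^2 (mod p):

p = 2**255 - 19
d = -121665 * pow(121666, p - 2, p) % p
g_y = 4 * pow(5, p - 2, p) % p

# Mirror recover_x(g_y, 0): square root of (y^2 - 1)/(d*y^2 + 1), choosing the
# root with an even low bit.
x2 = (g_y * g_y - 1) * pow(d * g_y * g_y + 1, p - 2, p) % p
g_x = pow(x2, (p + 3) // 8, p)
if (g_x * g_x - x2) % p != 0:
  g_x = g_x * pow(2, (p - 1) // 4, p) % p
if g_x & 1:
  g_x = p - g_x

# The recovered base point must lie on the curve.
assert (-g_x * g_x + g_y * g_y - 1 - d * g_x * g_x * g_y * g_y) % p == 0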
+
+def point_add(P, Q):
+  x1, y1 = P
+  x2, y2 = Q
+  x3 = ((x1*y2 + y1*x2) * modp_inv(1 + d*x1*x2*y1*y2)) % p
+  y3 = ((y1*y2 + x1*x2) * modp_inv(1 - d*x1*x2*y1*y2)) % p
+  return (x3, y3)
+
+# Computes Q = s * P
+def point_mul(s, P):
+  Q = (0, 1)  # Neutral element
+  while s > 0:
+    if s & 1:
+      Q = point_add(Q, P)
+    P = point_add(P, P)
+    s >>= 1
+  return Q
+
+def to_bytes(x):
+  ret = bytearray(32)
+  for i in range(len(ret)):
+    ret[i] = x % 256
+    x >>= 8
+  assert x == 0
+  return ret
+
+def to_ge_precomp(P):
+  # typedef struct {
+  #   fe_loose yplusx;
+  #   fe_loose yminusx;
+  #   fe_loose xy2d;
+  # } ge_precomp;
+  x, y = P
+  return ((y + x) % p, (y - x) % p, (x * y * 2 * d) % p)
+
+def to_base_25_5(x):
+  limbs = (26, 25, 26, 25, 26, 25, 26, 25, 26, 25)
+  ret = []
+  for l in limbs:
+    ret.append(x & ((1<<l) - 1))
+    x >>= l
+  assert x == 0
+  return ret
+
+def to_base_51(x):
+  ret = []
+  for _ in range(5):
+    ret.append(x & ((1<<51) - 1))
+    x >>= 51
+  assert x == 0
+  return ret
+
+def to_literal(x):
+  ret = "{{\n#if defined(BORINGSSL_CURVE25519_64BIT)\n"
+  ret += ", ".join(map(str, to_base_51(x)))
+  ret += "\n#else\n"
+  ret += ", ".join(map(str, to_base_25_5(x)))
+  ret += "\n#endif\n}}"
+  return ret
+
+def main():
+  d2 = (2 * d) % p
+
+  small_precomp = bytearray()
+  for i in range(1, 16):
+    s = (i&1) | ((i&2) << (64-1)) | ((i&4) << (128-2)) | ((i&8) << (192-3))
+    P = point_mul(s, (g_x, g_y))
+    small_precomp += to_bytes(P[0])
+    small_precomp += to_bytes(P[1])
+
+  large_precomp = []
+  for i in range(32):
+    large_precomp.append([])
+    for j in range(8):
+      P = point_mul((j + 1) << (i * 8), (g_x, g_y))
+      large_precomp[-1].append(to_ge_precomp(P))
+
+  bi_precomp = []
+  for i in range(8):
+    P = point_mul(2*i + 1, (g_x, g_y))
+    bi_precomp.append(to_ge_precomp(P))
+
+
+  buf = StringIO.StringIO()
+  buf.write("""/* Copyright (c) 2020, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+// This file is generated from
+// ./make_curve25519_tables.py > curve25519_tables.h
+
+
+static const fe d = """)
+  buf.write(to_literal(d))
+  buf.write(""";
+
+static const fe sqrtm1 = """)
+  buf.write(to_literal(modp_sqrt_m1))
+  buf.write(""";
+
+static const fe d2 = """)
+  buf.write(to_literal(d2))
+  buf.write(""";
+
+#if defined(OPENSSL_SMALL)
+
+// This block of code replaces the standard base-point table with a much smaller
+// one. The standard table is 30,720 bytes while this one is just 960.
+//
+// This table contains 15 pairs of group elements, (x, y), where each field
+// element is serialised with |fe_tobytes|. If |i| is the index of the group
+// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀
+// is the most significant bit). The value of the group element is then:
+// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator.
+static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""") + for i, b in enumerate(small_precomp): + buf.write("0x%02x, " % b) + buf.write(""" +}; + +#else + +// k25519Precomp[i][j] = (j+1)*256^i*B +static const ge_precomp k25519Precomp[32][8] = { +""") + for child in large_precomp: + buf.write("{\n") + for val in child: + buf.write("{\n") + for term in val: + buf.write(to_literal(term) + ",\n") + buf.write("},\n") + buf.write("},\n") + buf.write("""}; + +#endif // OPENSSL_SMALL + +// Bi[i] = (2*i+1)*B +static const ge_precomp Bi[8] = { +""") + for val in bi_precomp: + buf.write("{\n") + for term in val: + buf.write(to_literal(term) + ",\n") + buf.write("},\n") + buf.write("""}; +""") + + proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE) + proc.communicate(buf.getvalue()) + +if __name__ == "__main__": + main() diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl new file mode 100644 index 0000000..d36a97a --- /dev/null +++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl @@ -0,0 +1,896 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# Adapted from the original x86_64 version and 's ARMv8 +# version. +# +# armv7, aarch64, and x86_64 differ in several ways: +# +# * x86_64 SSSE3 instructions are two-address (destination operand is also a +# source), while NEON is three-address (destination operand is separate from +# two sources). +# +# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16. +# +# * x86_64 instructions can take memory references, while ARM is a load/store +# architecture. This means we sometimes need a spare register. +# +# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb), +# while armv7 only has a 64-bit byte shuffle (vtbl). +# +# This means this armv7 version must be a mix of both aarch64 and x86_64 +# implementations. armv7 and aarch64 have analogous SIMD instructions, so we +# base the instructions on aarch64. However, we cannot use aarch64's register +# allocation. x86_64's register count matches, but x86_64 is two-address. +# vpaes-armv8.pl already accounts for this in the comments, which use +# three-address AVX instructions instead of the original SSSE3 ones. We base +# register usage on these comments, which are preserved in this file. +# +# This means we do not use separate input and output registers as in aarch64 and +# cannot pin as many constants in the preheat functions. However, the load/store +# architecture means we must still deviate from x86_64 in places. +# +# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source +# and destination and 128-bit table. Fortunately, armv7 also allows addressing +# upper and lower halves of each 128-bit register. The lower half of q{N} is +# d{2*N}. The upper half is d{2*N+1}. 
Instead of the following non-existent +# instruction, +# +# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0. +# +# we write: +# +# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0. +# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1. +# +# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and +# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note, +# however, that destination (q0) and table (q1) registers may no longer match. +# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the +# two-address pshufb always matched these operands, so this is common.) +# +# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR +# expands to an ADD or SUB of the pc register to find an address. That immediate +# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation. +# This means larger values must be more aligned. +# +# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may +# use either encoding (do we actually need to support this?). In ARM mode, the +# distances get large enough to require 16-byte alignment. Moving constants +# closer to their use resolves most of this, but common constants in +# _vpaes_consts are used by the whole file. Affected ADR instructions must be +# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this +# constraint have been commented. +# +# For details on ARM's immediate value encoding scheme, see +# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ +# +# Finally, a summary of armv7 and aarch64 SIMD syntax differences: +# +# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not. +# +# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones). +# aarch64 names registers like v0, and denotes half-width operations in an +# instruction suffix (see below). +# +# * aarch64 embeds size and lane information in register suffixes. v0.16b is +# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s. +# armv7 embeds the total size in the register name (see above) and the size of +# each element in an instruction suffix, which may look like vmov.i8, +# vshr.u8, or vtbl.8, depending on instruction. 
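As an illustration of the q{N}#lo / q{N}#hi convention described above: it is purely source-level notation, rewritten to d-registers (d{2*N} and d{2*N+1}) by a regex at the bottom of this file just before the assembly is printed. A rough standalone Python equivalent of that substitution (a sketch, not part of the patched file):

import re

def expand_halves(line):
  # q{N}#lo -> d{2N}, q{N}#hi -> d{2N+1}
  return re.sub(r"\bq(\d+)#(lo|hi)",
                lambda m: "d%d" % (2 * int(m.group(1)) + (m.group(2) == "hi")),
                line)

assert expand_halves("vtbl.8 q1#lo, {q2}, q1#lo") == "vtbl.8 d2, {q2}, d2"
assert expand_halves("vtbl.8 q1#hi, {q2}, q1#hi") == "vtbl.8 d3, {q2}, d3"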
+ +use strict; + +my $flavour = shift; +my $output; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir=$1; +my $xlate; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +my $code = ""; + +$code.=<<___; +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: @ mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: @ sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv: @ inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: @ input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: @ sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: @ sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: @ sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 +___ + +{ +my ($inp,$out,$key) = map("r$_", (0..2)); + +my ($invlo,$invhi) = map("q$_", (10..11)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15)); + +$code.=<<___; +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [$key] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, $key + ldr r8, [$key,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 q1#hi, {q2}, q1#hi + vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 q2#hi, {q3}, q0#hi + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 q4#hi, {$sb1t}, q2#hi + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 q0#hi, {$sb1u}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 q5#hi, {$sb2t}, q2#hi + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 q2#hi, {$sb2u}, q3#hi + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 q3#hi, {q0}, q1#hi + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 q5#hi, {q0}, q4#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 q4#hi, {q3}, q1#hi + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 q5#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 q2#hi, {$invlo}, q3#hi + vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 q3#hi, {$invlo}, q4#hi + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! 
@ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q1}, q2#hi + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 q2#hi, {q0}, q3#hi + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 q0#hi, {q2}, q1#hi + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl GFp_vpaes_encrypt +.type GFp_vpaes_encrypt,%function +.align 4 +GFp_vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. + stmdb sp!, {r7-r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8-d11} + + vld1.64 {q0}, [$inp] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [$out] + + vldmia sp!, {d8-d11} + ldmia sp!, {r7-r11, pc} @ return +.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt +___ +} +{ +my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3"); +my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12)); + +$code.=<<___; +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. + +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_rcon: @ rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: @ output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: @ deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 $s63, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 $s0F, #0x0f @ .Lk_s0F + vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv + vld1.64 {$rcon}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. 
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) + + @ *ring*: Decryption removed. + +.Lschedule_go: + cmp $bits, #192 @ cmp \$192, %esi + bhi .Lschedule_256 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov $inp, #10 @ mov \$10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov $inp, #7 @ mov \$7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs $inp, $inp, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. swap xmm7 and xmm6 + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add $out, $out, #32 @ add \$32, %rdx + vmov q2, q0 + vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 q0#hi, {q2}, q1#hi + +.Lschedule_mangle_last_dec: + sub $out, $out, #16 @ add \$-16, %rdx + veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. 
+@@ Clobbers q1-q4, r11. +@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1 + vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. + adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 q2#hi, {$invhi}, q1#hi + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 q3#hi, {$invlo}, q0#hi + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 q4#hi, {$invlo}, q1#hi + veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 q3#hi, {$invlo}, q3#hi + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 q2#hi, {$invlo}, q4#hi + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 q4#hi, {q15}, q3#hi + vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 q1#hi, {q14}, q2#hi + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 + vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 q2#hi, {q14}, q1#hi + vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 q0#hi, {q15}, q0#hi + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. 
+@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst $dir, $dir + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. + veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 @ add \$16, %rdx + vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 q4#hi, {q2}, q5#hi + vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 q1#hi, {q4}, q5#hi + vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 q3#hi, {q1}, q5#hi + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 q2#hi, {q3}, q1#hi + add r8, r8, #64-16 @ add \$-16, %r8 + and r8, r8, #~(1<<6) @ and \$0x30, %r8 + vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl GFp_vpaes_set_encrypt_key +.type GFp_vpaes_set_encrypt_key,%function +.align 4 +GFp_vpaes_set_encrypt_key: + stmdb sp!, {r7-r11, lr} + vstmdb sp!, {d8-d15} + + lsr r9, $bits, #5 @ shr \$5,%eax + add r9, r9, #5 @ \$5,%eax + str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 @ mov \$0,%ecx + mov r8, #0x30 @ mov \$0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key +___ +} + +{ +my ($out, $inp) = map("r$_", (0..1)); +my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12)); + +$code .= <<___; + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. 
This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: + .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b + .quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl GFp_vpaes_encrypt_key_to_bsaes +.type GFp_vpaes_encrypt_key_to_bsaes,%function +.align 4 +GFp_vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {$mc_forward}, [r2] + vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [$inp,#240] + add r2, r2, #1 + str r2, [$out,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [$inp]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. +.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [$inp]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 q2#lo, {q0}, q1#lo + vtbl.8 q2#hi, {q0}, q1#hi + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. 
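The "its own inverse" claim just above can be checked directly: multiplying a 4-byte column by the circulant 0,1,1,1 XORs together its three non-trivial byte rotations, and over GF(2) that map squares to the identity. A minimal standalone sketch (not part of the patched file):

def circulant_0111(col):
  # XOR of the column's rotations by 1, 2 and 3 bytes (circulant 0,1,1,1).
  rot = lambda c, n: c[n:] + c[:n]
  return [a ^ b ^ c for a, b, c in zip(rot(col, 1), rot(col, 2), rot(col, 3))]

col = [0x01, 0x23, 0x45, 0x89]
assert circulant_0111(circulant_0111(col)) == col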
+ vtbl.8 q1#lo, {q0}, $mc_forward#lo + vtbl.8 q1#hi, {q0}, $mc_forward#hi + vmov q0, q1 + vtbl.8 q2#lo, {q1}, $mc_forward#lo + vtbl.8 q2#hi, {q1}, $mc_forward#hi + veor q0, q0, q2 + vtbl.8 q1#lo, {q2}, $mc_forward#lo + vtbl.8 q1#hi, {q2}, $mc_forward#hi + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, $s63 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [$out]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ $s63_raw, not $s63. + veor q0, q0, $s63_raw + vrev32.8 q0, q0 + vst1.64 {q0}, [$out] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes +___ +} + +{ +# Register-passed parameters. +my ($inp, $out, $len, $key) = map("r$_", 0..3); +# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and +# $tmp. $ctr is r7 because it must be preserved across calls. +my ($ctr, $ivec, $tmp) = map("r$_", 7..9); + +# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code .= <<___; +.globl GFp_vpaes_ctr32_encrypt_blocks +.type GFp_vpaes_ctr32_encrypt_blocks,%function +.align 4 +GFp_vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7-r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8-d15} + + cmp $len, #0 + @ $ivec is passed on the stack. + ldr $ivec, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. + mov $tmp, $key + mov $key, $len + mov $len, $tmp +___ +my ($len, $key) = ($key, $len); +$code .= <<___; + + @ Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + vld1.8 {q7}, [$ivec] + + bl _vpaes_preheat + rev $ctr, $ctr @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [$inp]! @ Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [$out]! + subs $len, $len, #1 + @ Update the counter. + add $ctr, $ctr, #1 + rev $tmp, $ctr + vmov.32 q7#hi[1], $tmp + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8-d15} + ldmia sp!, {r7-r11, pc} @ return +.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks +___ +} + +foreach (split("\n",$code)) { + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl new file mode 100755 index 0000000..b31bbb8 --- /dev/null +++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl @@ -0,0 +1,837 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. 
+## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. +## +###################################################################### +# ARMv8 NEON adaptation by +# +# Reason for undertaken effort is that there is at least one popular +# SoC based on Cortex-A53 that doesn't have crypto extensions. +# +# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] +# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] +# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] +# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] +# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] +# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] +# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] +# +# (*) ECB denotes approximate result for parallelizable modes +# such as CBC decrypt, CTR, etc.; +# (**) these results are worse than scalar compiler-generated +# code, but it's constant-time and therefore preferred; +# (***) presented for reference/comparison purposes; + +$flavour = shift; +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$code.=<<___; +#include + +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:// mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // 
deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +___ + +{ +my ($inp,$out,$key) = map("x$_",(0..2)); + +my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); +my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); +my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); + +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl GFp_vpaes_encrypt +.type GFp_vpaes_encrypt,%function +.align 4 +GFp_vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [$inp] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, $key + ldr w8, [$key,#240] // pull rounds + adrp x11, :pg_hi21:.Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {$iptlo}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {$ipthi}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {$sb1t}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {$sb1u}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {$sb2t}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {$sb2u}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {$invhi},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {$invlo},v8.16b + tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {$invlo},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {$invlo},v11.16b + tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {$invlo},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {$sbou}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {$sbot}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x +___ +} +{ +my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); + +$code.=<<___; +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, :pg_hi21:.Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, :pg_hi21:.Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, :pg_hi21:.Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 + adrp x11, :pg_hi21:.Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! 
+ add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) + + cmp $bits, #192 // cmp \$192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov $inp, #10 // mov \$10, %esi + +.Loop_schedule_128: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub $inp, $inp, #8 + ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov $inp, #4 // mov \$4, %esi + +.Loop_schedule_192: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov $inp, #7 // mov \$7, %esi + +.Loop_schedule_256: + sub $inp, $inp, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz $inp, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz $dir, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add $out, $out, #32 // add \$32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d-v21.2d}, [x11] // reload constants + sub $out, $out, #16 // add \$-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 + ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
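To make the schedule_round/schedule_transform pair above easier to follow (my aside, not from the patch): strip away the vpaes basis change and the mangling, and what is being computed is the ordinary FIPS-197 AES-128 key expansion, with exactly the "rotate by one byte", S-box, rcon and XOR "smear" steps the comments describe. A self-contained Python sketch of that textbook expansion follows; the names are mine and this is not code from ring or BoringSSL, and note that the round keys vpaes actually stores are kept in its transformed, mangled form, so the bytes written out differ from the textbook ones:

def _aes_sbox():
    # Build the AES S-box from the GF(2^8) inverse plus the affine map,
    # walking the nonzero field elements with generator 3 (a standard trick).
    rotl = lambda v, n: ((v << n) | (v >> (8 - n))) & 0xFF
    sbox = [0] * 256
    p = q = 1
    while True:
        p = (p ^ (p << 1) ^ (0x1B if p & 0x80 else 0)) & 0xFF  # p *= 3
        q ^= q << 1; q &= 0xFF                                 # q /= 3
        q ^= q << 2; q &= 0xFF
        q ^= q << 4; q &= 0xFF
        if q & 0x80:
            q ^= 0x09
        sbox[p] = q ^ rotl(q, 1) ^ rotl(q, 2) ^ rotl(q, 3) ^ rotl(q, 4) ^ 0x63
        if p == 1:
            break
    sbox[0] = 0x63
    return sbox

SBOX = _aes_sbox()
assert SBOX[0x53] == 0xED          # the SubBytes example value from FIPS-197

def expand_key_128(key):
    # 16-byte key -> 44 32-bit words, i.e. 11 round keys.
    w = [int.from_bytes(key[4 * i:4 * i + 4], "big") for i in range(4)]
    rcon = 0x01
    for i in range(4, 44):
        t = w[i - 1]
        if i % 4 == 0:
            t = ((t << 8) | (t >> 24)) & 0xFFFFFFFF            # rotate by one byte
            t = int.from_bytes(bytes(SBOX[b] for b in t.to_bytes(4, "big")), "big")
            t ^= rcon << 24                                    # add rcon
            rcon = ((rcon << 1) ^ (0x1B if rcon & 0x80 else 0)) & 0xFF
        w.append(w[i - 4] ^ t)                                 # the XOR "smear"
    return w

w = expand_key_128(bytes.fromhex("2b7e151628aed2a6abf7158809cf4f3c"))
assert w[4] == 0xA0FAFE17          # first expanded word, FIPS-197 appendix A.1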
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add $out, $out, #16 // add \$16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #64-16 // add \$-16, %r8 + and x8, x8, #~(1<<6) // and \$0x30, %r8 + st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl GFp_vpaes_set_encrypt_key +.type GFp_vpaes_set_encrypt_key,%function +.align 4 +GFp_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, $bits, #5 // shr \$5,%eax + add w9, w9, #5 // \$5,%eax + str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov $dir, #0 // mov \$0,%ecx + mov x8, #0x30 // mov \$0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key +___ +} +{ +my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4)); +my ($ctr, $ctr_tmp) = ("w6", "w7"); + +# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, +# const AES_KEY *key, const uint8_t ivec[16]); +$code.=<<___; +.globl GFp_vpaes_ctr32_encrypt_blocks +.type GFp_vpaes_ctr32_encrypt_blocks,%function +.align 4 +GFp_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz $len, .Lctr32_done + + // Note, unlike the other functions, $len here is measured in blocks, + // not bytes. + mov x17, $len + mov x2, $key + + // Load the IV and counter portion. + ldr $ctr, [$ivec, #12] + ld1 {v7.16b}, [$ivec] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev $ctr, $ctr // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [$inp], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [$out], #16 + subs x17, x17, #1 + // Update the counter. + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v7.s[3], $ctr_tmp + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. 
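A note on the counter handling above (my aside, not part of the patch): the IV block keeps a big-endian 32-bit counter in its last word, which is what the ldr/rev/add/rev/mov-into-lane-3 sequence maintains, and one block is peeled off when the block count is odd so the main loop can run _vpaes_encrypt_2x on pairs. The net effect is ordinary CTR with a 32-bit wrapping counter, roughly as below; encrypt_block is a stand-in for one AES block encryption, not a real ring API:

def ctr32_encrypt_blocks(encrypt_block, iv16, data):
    # iv16: 16-byte block whose last 4 bytes are the big-endian counter.
    # data must be a whole number of 16-byte blocks (the asm takes a block
    # count rather than a byte count, per the comment above).
    assert len(iv16) == 16 and len(data) % 16 == 0
    prefix, ctr = iv16[:12], int.from_bytes(iv16[12:], "big")
    out = bytearray()
    for off in range(0, len(data), 16):
        keystream = encrypt_block(prefix + ctr.to_bytes(4, "big"))
        out += bytes(a ^ b for a, b in zip(data[off:off + 16], keystream))
        ctr = (ctr + 1) & 0xFFFFFFFF          # only the low 32 bits wrap
    return bytes(out)

# Exercising the plumbing with a dummy "cipher" (obviously not AES):
dummy = lambda block: bytes(b ^ 0xAA for b in block)
assert ctr32_encrypt_blocks(dummy, bytes(16), bytes(32)) == \
       dummy(bytes(16)) + dummy(bytes(12) + (1).to_bytes(4, "big"))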
+ mov v15.16b, v7.16b + mov v14.16b, v7.16b + add $ctr, $ctr, #1 + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [$out], #32 + subs x17, x17, #2 + // Update the counter. + add $ctr_tmp, $ctr, #1 + add $ctr, $ctr, #2 + rev $ctr_tmp, $ctr_tmp + mov v14.s[3], $ctr_tmp + rev $ctr_tmp, $ctr + mov v15.s[3], $ctr_tmp + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks +___ +} + +print $code; + +close STDOUT or die "error closing STDOUT"; diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl new file mode 100644 index 0000000..7e52ad6 --- /dev/null +++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl @@ -0,0 +1,294 @@ +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It +# implements the multiplication algorithm described in: +# +# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software +# Polynomial Multiplication on ARM Processors using the NEON Engine. +# +# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf +# +# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is +# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit +# NEON, the low and high halves of the 128-bit register q0 are accessible as +# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of +# vN. Where the 32-bit version would use the upper half, this file must keep +# halves in separate registers. +# +# The other distinction is in syntax. 32-bit NEON embeds lane information in the +# instruction name, while AArch64 uses suffixes on the registers. For instance, +# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: +# +# vshl.i64 q0, q0, #1 +# +# in 64-bit, it would be written: +# +# shl v0.2d, v0.2d, #1 +# +# See Programmer's Guide for ARMv8-A, section 7 for details. +# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf +# +# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ +# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials +# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit +# polynomial and is conditioned on the PMULL extension. This file emulates the +# latter with the former. 
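Before the Perl preamble, a short aside of mine (not part of the quoted file) on what "polynomial multiplication" means here: it is carry-less multiplication over GF(2)[x], XOR instead of addition and no carries. pmull on .1d operands does one 64x64 carry-less multiply; pmull on .8b operands does eight independent 8x8 ones, which is why the code below has to stitch byte-sliced partial products back together. A reference sketch in Python; clmul64_from_bytes is the naive schoolbook decomposition, not the rotate-and-mask scheme from the Câmara et al. paper that the assembly actually uses:

import random

def clmul(a, b):
    # Reference carry-less multiply of two non-negative ints, any width.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

# (x^3 + x + 1) * (x + 1) = x^4 + x^3 + x^2 + 1 over GF(2)[x]:
assert clmul(0b1011, 0b11) == 0b11101

def clmul64_from_bytes(a, b):
    # Same result as clmul(a, b) for 64-bit inputs, built only out of 8x8
    # carry-less multiplies, schoolbook style; carry-less multiplication is
    # bilinear over GF(2), so byte decomposition with XOR accumulation works.
    r = 0
    for i in range(8):
        for j in range(8):
            r ^= clmul((a >> (8 * i)) & 0xFF, (b >> (8 * j)) & 0xFF) << (8 * (i + j))
    return r

a, b = random.getrandbits(64), random.getrandbits(64)
assert clmul64_from_bytes(a, b) == clmul(a, b)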
+ +use strict; + +my $flavour = shift; +my $output; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; + my $dir = $1; + my $xlate; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open OUT,"| \"$^X\" $xlate $flavour $output"; + *STDOUT=*OUT; +} else { + open OUT,">$output"; + *STDOUT=*OUT; +} + +my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block +my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); +my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); +# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers +# to spare. +my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); +my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); +my ($k48_k32, $k16_k0) = map("v$_", (24..25)); + +my $code = ""; + +# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b +# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. +sub clmul64x64 { +my ($r, $a, $b) = @_; +$code .= <<___; + ext $t0.8b, $a.8b, $a.8b, #1 // A1 + pmull $t0.8h, $t0.8b, $b.8b // F = A1*B + ext $r.8b, $b.8b, $b.8b, #1 // B1 + pmull $r.8h, $a.8b, $r.8b // E = A*B1 + ext $t1.8b, $a.8b, $a.8b, #2 // A2 + pmull $t1.8h, $t1.8b, $b.8b // H = A2*B + ext $t3.8b, $b.8b, $b.8b, #2 // B2 + pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 + ext $t2.8b, $a.8b, $a.8b, #3 // A3 + eor $t0.16b, $t0.16b, $r.16b // L = E + F + pmull $t2.8h, $t2.8b, $b.8b // J = A3*B + ext $r.8b, $b.8b, $b.8b, #3 // B3 + eor $t1.16b, $t1.16b, $t3.16b // M = G + H + pmull $r.8h, $a.8b, $r.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) + // vand \$t0#hi, \$t0#hi, \$k48 + // veor \$t0#lo, \$t0#lo, \$t0#hi + // + // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) + // vand \$t1#hi, \$t1#hi, \$k32 + // veor \$t1#lo, \$t1#lo, \$t1#hi + // + // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) + // vand \$t2#hi, \$t2#hi, \$k16 + // veor \$t2#lo, \$t2#lo, \$t2#hi + // + // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 \$t3#hi, #0 + // + // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext $t3.8b, $b.8b, $b.8b, #4 // B4 + eor $t2.16b, $t2.16b, $r.16b // N = I + J + pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
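On the "pair up independent instructions" remark above (my aside, not part of the patch): the zip1/zip2 hunk that follows packs the low halves of two registers into one vector and the high halves into another, so a single EOR/AND/EOR applies the 32-bit version's mask-and-fold step to two products at once, with k48 paired against k32 and k16 against k0 from the .Lmasks table at the end of the file. A lane-level Python model with made-up helper names:

def zip1(a, b):            # a, b are (d0, d1) pairs of 64-bit lanes
    return (a[0], b[0])

def zip2(a, b):
    return (a[1], b[1])

def veor(a, b):
    return (a[0] ^ b[0], a[1] ^ b[1])

def vand(a, b):
    return (a[0] & b[0], a[1] & b[1])

def fold_one(t, k):
    # The three-instruction 32-bit NEON sequence quoted above,
    # for a single (lo, hi) register.
    lo, hi = t
    lo ^= hi
    hi &= k
    lo ^= hi
    return (lo, hi)

def fold_pair(t0, t1, k_a, k_b):
    # What the zip1/zip2 hunk does: fold t0 against k_a and t1 against k_b
    # with one EOR/AND/EOR over packed lows and highs, then zip back.
    lows, highs = zip1(t0, t1), zip2(t0, t1)
    lows = veor(lows, highs)
    highs = vand(highs, (k_a, k_b))
    lows = veor(lows, highs)
    return (lows[0], highs[0]), (lows[1], highs[1])

t0, t1 = (0x1234, 0xABCD), (0x5678, 0xEF01)
assert fold_pair(t0, t1, 0xFF, 0x0F) == (fold_one(t0, 0xFF), fold_one(t1, 0x0F))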
+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d + zip1 $t2l_t3l.2d, $t2.2d, $t3.2d + zip2 $t0h_t1h.2d, $t0.2d, $t1.2d + zip2 $t2h_t3h.2d, $t2.2d, $t3.2d + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b + and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b + eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b + eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b + zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d + zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d + zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d + + ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 + ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 + pmull $r.8h, $a.8b, $b.8b // D = A*B + ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 + ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 + eor $t0.16b, $t0.16b, $t1.16b + eor $t2.16b, $t2.16b, $t3.16b + eor $r.16b, $r.16b, $t0.16b + eor $r.16b, $r.16b, $t2.16b +___ +} + +$code .= <<___; +#include + +.text + +.global GFp_gcm_init_neon +.type GFp_gcm_init_neon,%function +.align 4 +GFp_gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. + ld1 {$t1.2d}, [x1] // load H + movi $t3.16b, #0xe1 + shl $t3.2d, $t3.2d, #57 // 0xc2.0 + ext $INlo.16b, $t1.16b, $t1.16b, #8 + ushr $t2.2d, $t3.2d, #63 + dup $t1.4s, $t1.s[1] + ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 + ushr $t2.2d, $INlo.2d, #63 + sshr $t1.4s, $t1.4s, #31 // broadcast carry bit + and $t2.16b, $t2.16b, $t0.16b + shl $INlo.2d, $INlo.2d, #1 + ext $t2.16b, $t2.16b, $t2.16b, #8 + and $t0.16b, $t0.16b, $t1.16b + orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 + eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H + st1 {$Hlo.2d}, [x0] // store Htable[0] + ret +.size GFp_gcm_init_neon,.-GFp_gcm_init_neon + +.global GFp_gcm_gmult_neon +.type GFp_gcm_gmult_neon,%function +.align 4 +GFp_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$INlo.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $INlo.16b, $INlo.16b // byteswap Xi + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + + mov $len, #16 + b .Lgmult_neon +.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon + +.global GFp_gcm_ghash_neon +.type GFp_gcm_ghash_neon,%function +.align 4 +GFp_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {$Xl.16b}, [$Xi] // load Xi + ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H + ld1 {$Hhi.1d}, [$Htbl] + adrp x9, :pg_hi21:.Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] + rev64 $Xl.16b, $Xl.16b // byteswap Xi + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {$INlo.16b}, [$inp], #16 // load inp + rev64 $INlo.16b, $INlo.16b // byteswap inp + ext $INlo.16b, $INlo.16b, $INlo.16b, #8 + eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into $INlo and $INhi. (The upper halves are unused, + // so it is okay to leave them alone.) 
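One more aside from me before the multiplication calls (not part of the quoted file): the three clmul64x64 expansions that follow are a single Karatsuba step over GF(2)[x], exactly as the H.lo*Xi.lo / (H.lo+H.hi)*(Xi.lo+Xi.hi) / H.hi*Xi.hi comments say, producing the 256-bit Xh|Xl product that the reduction then folds back modulo the GHASH polynomial. The twisted-H and bit-reflection details are not modelled here; this is only the multiplication skeleton, with illustrative names:

def clmul(a, b):
    # Same reference carry-less multiply as in the earlier aside.
    r = 0
    while b:
        if b & 1:
            r ^= a
        a <<= 1
        b >>= 1
    return r

def karatsuba_clmul128(h, x):
    hlo, hhi = h & (2**64 - 1), h >> 64
    xlo, xhi = x & (2**64 - 1), x >> 64
    lo  = clmul(hlo, xlo)                          # H.lo * Xi.lo
    hi  = clmul(hhi, xhi)                          # H.hi * Xi.hi
    mid = clmul(hlo ^ hhi, xlo ^ xhi) ^ lo ^ hi    # Karatsuba post-processing
    return (hi << 128) ^ (mid << 64) ^ lo          # Xh|Xl, before reduction

import random
h, x = random.getrandbits(128), random.getrandbits(128)
assert karatsuba_clmul128(h, x) == clmul(h, x)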
+ ins $INhi.d[0], $INlo.d[1] +___ +&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo +$code .= <<___; + eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing +___ +&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) +&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi +$code .= <<___; + ext $t0.16b, $Xl.16b, $Xh.16b, #8 + eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing + eor $Xm.16b, $Xm.16b, $Xh.16b + eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi + ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins $Xh.d[0], $Xm.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl $t1.2d, $Xl.2d, #57 // 1st phase + shl $t2.2d, $Xl.2d, #62 + eor $t2.16b, $t2.16b, $t1.16b // + shl $t1.2d, $Xl.2d, #63 + eor $t2.16b, $t2.16b, $t1.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. + eor $t2.16b, $t2.16b, $Xm.16b + ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] + ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] + + ushr $t2.2d, $Xl.2d, #1 // 2nd phase + eor $Xh.16b, $Xh.16b,$Xl.16b + eor $Xl.16b, $Xl.16b,$t2.16b // + ushr $t2.2d, $t2.2d, #6 + ushr $Xl.2d, $Xl.2d, #1 // + eor $Xl.16b, $Xl.16b, $Xh.16b // + eor $Xl.16b, $Xl.16b, $t2.16b // + + subs $len, $len, #16 + bne .Loop_neon + + rev64 $Xl.16b, $Xl.16b // byteswap Xi and write + ext $Xl.16b, $Xl.16b, $Xl.16b, #8 + st1 {$Xl.16b}, [$Xi] + + ret +.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.asciz "GHASH for ARMv8, derived from ARMv4 version by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + print $_,"\n"; +} +close STDOUT or die "error closing STDOUT"; # enforce flush -- Efraim Flashner רנשלפ םירפא GPG key = A28B F40C 3E55 1372 662D 14F7 41AA E7DC CA3D 8351 Confidentiality cannot be guaranteed on emails sent or received unencrypted