Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/abi_self_test.cc =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/abi_self_test.cc +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/abi_self_test.cc @@ -521,3 +521,289 @@ TEST(ABITest, AArch64) { CHECK_ABI_NO_UNWIND(abi_test_clobber_v15_upper); } #endif // OPENSSL_AARCH64 && SUPPORTS_ABI_TEST + +#if defined(OPENSSL_PPC64LE) && defined(SUPPORTS_ABI_TEST) +extern "C" { +void abi_test_clobber_r0(void); +// r1 is the stack pointer. +void abi_test_clobber_r2(void); +void abi_test_clobber_r3(void); +void abi_test_clobber_r4(void); +void abi_test_clobber_r5(void); +void abi_test_clobber_r6(void); +void abi_test_clobber_r7(void); +void abi_test_clobber_r8(void); +void abi_test_clobber_r9(void); +void abi_test_clobber_r10(void); +void abi_test_clobber_r11(void); +void abi_test_clobber_r12(void); +// r13 is the thread pointer. +void abi_test_clobber_r14(void); +void abi_test_clobber_r15(void); +void abi_test_clobber_r16(void); +void abi_test_clobber_r17(void); +void abi_test_clobber_r18(void); +void abi_test_clobber_r19(void); +void abi_test_clobber_r20(void); +void abi_test_clobber_r21(void); +void abi_test_clobber_r22(void); +void abi_test_clobber_r23(void); +void abi_test_clobber_r24(void); +void abi_test_clobber_r25(void); +void abi_test_clobber_r26(void); +void abi_test_clobber_r27(void); +void abi_test_clobber_r28(void); +void abi_test_clobber_r29(void); +void abi_test_clobber_r30(void); +void abi_test_clobber_r31(void); + +void abi_test_clobber_f0(void); +void abi_test_clobber_f1(void); +void abi_test_clobber_f2(void); +void abi_test_clobber_f3(void); +void abi_test_clobber_f4(void); +void abi_test_clobber_f5(void); +void abi_test_clobber_f6(void); +void abi_test_clobber_f7(void); +void abi_test_clobber_f8(void); +void abi_test_clobber_f9(void); +void abi_test_clobber_f10(void); +void abi_test_clobber_f11(void); +void abi_test_clobber_f12(void); +void abi_test_clobber_f13(void); +void abi_test_clobber_f14(void); +void abi_test_clobber_f15(void); +void abi_test_clobber_f16(void); +void abi_test_clobber_f17(void); +void abi_test_clobber_f18(void); +void abi_test_clobber_f19(void); +void abi_test_clobber_f20(void); +void abi_test_clobber_f21(void); +void abi_test_clobber_f22(void); +void abi_test_clobber_f23(void); +void abi_test_clobber_f24(void); +void abi_test_clobber_f25(void); +void abi_test_clobber_f26(void); +void abi_test_clobber_f27(void); +void abi_test_clobber_f28(void); +void abi_test_clobber_f29(void); +void abi_test_clobber_f30(void); +void abi_test_clobber_f31(void); + +void abi_test_clobber_v0(void); +void abi_test_clobber_v1(void); +void abi_test_clobber_v2(void); +void abi_test_clobber_v3(void); +void abi_test_clobber_v4(void); +void abi_test_clobber_v5(void); +void abi_test_clobber_v6(void); +void abi_test_clobber_v7(void); +void abi_test_clobber_v8(void); +void abi_test_clobber_v9(void); +void abi_test_clobber_v10(void); +void abi_test_clobber_v11(void); +void abi_test_clobber_v12(void); +void abi_test_clobber_v13(void); +void abi_test_clobber_v14(void); +void abi_test_clobber_v15(void); +void abi_test_clobber_v16(void); +void abi_test_clobber_v17(void); +void abi_test_clobber_v18(void); +void abi_test_clobber_v19(void); +void abi_test_clobber_v20(void); +void abi_test_clobber_v21(void); +void abi_test_clobber_v22(void); +void abi_test_clobber_v23(void); +void abi_test_clobber_v24(void); +void 
abi_test_clobber_v25(void);
+void abi_test_clobber_v26(void);
+void abi_test_clobber_v27(void);
+void abi_test_clobber_v28(void);
+void abi_test_clobber_v29(void);
+void abi_test_clobber_v30(void);
+void abi_test_clobber_v31(void);
+
+void abi_test_clobber_cr0(void);
+void abi_test_clobber_cr1(void);
+void abi_test_clobber_cr2(void);
+void abi_test_clobber_cr3(void);
+void abi_test_clobber_cr4(void);
+void abi_test_clobber_cr5(void);
+void abi_test_clobber_cr6(void);
+void abi_test_clobber_cr7(void);
+
+void abi_test_clobber_ctr(void);
+void abi_test_clobber_lr(void);
+
+} // extern "C"
+
+TEST(ABITest, PPC64LE) {
+  // abi_test_trampoline hides unsaved registers from the caller, so we can
+  // safely call the abi_test_clobber_* functions below.
+  abi_test::internal::CallerState state;
+  RAND_bytes(reinterpret_cast<uint8_t *>(&state), sizeof(state));
+  CHECK_ABI_NO_UNWIND(abi_test_trampoline,
+                      reinterpret_cast<crypto_word_t>(abi_test_clobber_r14),
+                      &state, nullptr, 0, 0 /* no breakpoint */);
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r0);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r2);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r3);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r4);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r5);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r6);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r7);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r8);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r9);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r10);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r11);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_r12);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r14),
+                          "r14 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r15),
+                          "r15 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r16),
+                          "r16 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r17),
+                          "r17 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r18),
+                          "r18 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r19),
+                          "r19 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r20),
+                          "r20 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r21),
+                          "r21 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r22),
+                          "r22 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r23),
+                          "r23 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r24),
+                          "r24 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r25),
+                          "r25 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r26),
+                          "r26 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r27),
+                          "r27 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r28),
+                          "r28 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r29),
+                          "r29 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r30),
+                          "r30 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_r31),
+                          "r31 was not restored after return");
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_f0);
CHECK_ABI_NO_UNWIND(abi_test_clobber_f1); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f2); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f3); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f4); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f5); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f6); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f7); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f8); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f9); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f10); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f11); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f12); + CHECK_ABI_NO_UNWIND(abi_test_clobber_f13); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f14), + "f14 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f15), + "f15 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f16), + "f16 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f17), + "f17 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f18), + "f18 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f19), + "f19 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f20), + "f20 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f21), + "f21 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f22), + "f22 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f23), + "f23 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f24), + "f24 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f25), + "f25 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f26), + "f26 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f27), + "f27 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f28), + "f28 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f29), + "f29 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f30), + "f30 was not restored after return"); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_f31), + "f31 was not restored after return"); + + CHECK_ABI_NO_UNWIND(abi_test_clobber_v0); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v1); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v2); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v3); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v4); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v5); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v6); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v7); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v8); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v9); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v10); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v11); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v12); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v13); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v14); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v15); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v16); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v17); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v18); + CHECK_ABI_NO_UNWIND(abi_test_clobber_v19); + EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v20), + "v20 was not restored after return"); + 
EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v21),
+                          "v21 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v22),
+                          "v22 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v23),
+                          "v23 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v24),
+                          "v24 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v25),
+                          "v25 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v26),
+                          "v26 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v27),
+                          "v27 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v28),
+                          "v28 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v29),
+                          "v29 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v30),
+                          "v30 was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_v31),
+                          "v31 was not restored after return");
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_cr0);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_cr1);
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_cr2),
+                          "cr was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_cr3),
+                          "cr was not restored after return");
+  EXPECT_NONFATAL_FAILURE(CHECK_ABI_NO_UNWIND(abi_test_clobber_cr4),
+                          "cr was not restored after return");
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_cr5);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_cr6);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_cr7);
+
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_ctr);
+  CHECK_ABI_NO_UNWIND(abi_test_clobber_lr);
+}
+#endif // OPENSSL_PPC64LE && SUPPORTS_ABI_TEST
Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/cpu_ppc64le.c
===================================================================
--- /dev/null
+++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/cpu_ppc64le.c
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include
+
+#if defined(OPENSSL_PPC64LE)
+
+#include <sys/auxv.h>
+
+#include "internal.h"
+
+
+#if !defined(PPC_FEATURE2_HAS_VCRYPTO)
+// PPC_FEATURE2_HAS_VCRYPTO was taken from section 4.1.2.3 of the “OpenPOWER
+// ABI for Linux Supplement”.
+#define PPC_FEATURE2_HAS_VCRYPTO 0x02000000
+#endif
+
+void OPENSSL_cpuid_setup(void) {
+  OPENSSL_ppc64le_hwcap2 = getauxval(AT_HWCAP2);
+}
+
+int CRYPTO_is_PPC64LE_vcrypto_capable(void) {
+  return (OPENSSL_ppc64le_hwcap2 & PPC_FEATURE2_HAS_VCRYPTO) != 0;
+}
+
+#endif // OPENSSL_PPC64LE
Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/crypto.c
===================================================================
--- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/crypto.c
+++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/crypto.c
@@ -66,6 +66,10 @@ uint32_t OPENSSL_get_ia32cap(int idx) {
   return OPENSSL_ia32cap_P[idx];
 }
 
+#elif defined(OPENSSL_PPC64LE)
+
+HIDDEN unsigned long OPENSSL_ppc64le_hwcap2 = 0;
+
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
 #include <openssl/arm_arch.h>
Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/aes/asm/aesp8-ppc.pl
===================================================================
--- /dev/null
+++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/aes/asm/aesp8-ppc.pl
@@ -0,0 +1,3809 @@
+#! /usr/bin/env perl
+# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. Data alignment in parallelizable modes is
+# handled with VSX loads and stores, which implies MSR.VSX flag being
+# set. It should also be noted that ISA specification doesn't prohibit
+# alignment exceptions for these instructions on page boundaries.
+# Initially alignment was handled in pure AltiVec/VMX way [when data
+# is aligned programmatically, which in turn guarantees exception-
+# free execution], but it turned to hamper performance when vcipher
+# instructions are interleaved. It's reckoned that eventual
+# misalignment penalties at page boundaries are in average lower
+# than additional overhead in pure AltiVec approach.
+#
+# May 2016
+#
+# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
+# systems were measured.
+#
+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+# +# CBC en-/decrypt CTR XTS +# POWER8[le] 3.96/0.72 0.74 1.1 +# POWER8[be] 3.75/0.65 0.66 1.0 +# POWER9[le] 4.02/0.86 0.84 1.05 +# POWER9[be] 3.99/0.78 0.79 0.97 + +$flavour = shift; +$output = shift; + +if ($flavour =~ /64/) { + $SIZE_T =8; + $LRSAVE =2*$SIZE_T; + $STU ="stdu"; + $POP ="ld"; + $PUSH ="std"; + $UCMP ="cmpld"; + $SHL ="sldi"; +} elsif ($flavour =~ /32/) { + $SIZE_T =4; + $LRSAVE =$SIZE_T; + $STU ="stwu"; + $POP ="lwz"; + $PUSH ="stw"; + $UCMP ="cmplw"; + $SHL ="slwi"; +} else { die "nonsense $flavour"; } + +$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| $^X \"$xlate\" $flavour \"$output\"" || die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$FRAME=8*$SIZE_T; +$prefix="aes_hw"; + +$sp="r1"; +$vrsave="r12"; + +######################################################################### +{{{ # Key setup procedures # +my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8)); +my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6)); +my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11)); + +$code.=<<___; +.machine "any" + +.text + +.align 7 +Lrcon: +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev +.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev +.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev +.long 0,0,0,0 ?asis +Lconsts: + mflr r0 + bcl 20,31,\$+4 + mflr $ptr #vvvvv "distance between . and rcon + addi $ptr,$ptr,-0x48 + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.asciz "AES for PowerISA 2.07, CRYPTOGAMS by " + +.globl .${prefix}_set_encrypt_key +.align 5 +.${prefix}_set_encrypt_key: +Lset_encrypt_key: + mflr r11 + $PUSH r11,$LRSAVE($sp) + + li $ptr,-1 + ${UCMP}i $inp,0 + beq- Lenc_key_abort # if ($inp==0) return -1; + ${UCMP}i $out,0 + beq- Lenc_key_abort # if ($out==0) return -1; + li $ptr,-2 + cmpwi $bits,128 + blt- Lenc_key_abort + cmpwi $bits,256 + bgt- Lenc_key_abort + andi. 
r0,$bits,0x3f + bne- Lenc_key_abort + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + bl Lconsts + mtlr r11 + + neg r9,$inp + lvx $in0,0,$inp + addi $inp,$inp,15 # 15 is not typo + lvsr $key,0,r9 # borrow $key + li r8,0x20 + cmpwi $bits,192 + lvx $in1,0,$inp + le?vspltisb $mask,0x0f # borrow $mask + lvx $rcon,0,$ptr + le?vxor $key,$key,$mask # adjust for byte swap + lvx $mask,r8,$ptr + addi $ptr,$ptr,0x10 + vperm $in0,$in0,$in1,$key # align [and byte swap in LE] + li $cnt,8 + vxor $zero,$zero,$zero + mtctr $cnt + + ?lvsr $outperm,0,$out + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$zero,$outmask,$outperm + + blt Loop128 + addi $inp,$inp,8 + beq L192 + addi $inp,$inp,8 + b L256 + +.align 4 +Loop128: + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + bdnz Loop128 + + lvx $rcon,0,$ptr # last two round keys + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + + vperm $key,$in0,$in0,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + + addi $inp,$out,15 # 15 is not typo + addi $out,$out,0x50 + + li $rounds,10 + b Ldone + +.align 4 +L192: + lvx $tmp,0,$inp + li $cnt,4 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + vspltisb $key,8 # borrow $key + mtctr $cnt + vsububm $mask,$mask,$key # adjust the mask + +Loop192: + vperm $key,$in1,$in1,$mask # roate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vcipherlast $key,$key,$rcon + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + + vsldoi $stage,$zero,$in1,8 + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vsldoi $stage,$stage,$in0,8 + + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vsldoi $stage,$in0,$in1,8 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vperm 
$outtail,$stage,$stage,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + stvx $stage,0,$out + addi $out,$out,16 + + vspltw $tmp,$in0,3 + vxor $tmp,$tmp,$in1 + vsldoi $in1,$zero,$in1,12 # >>32 + vadduwm $rcon,$rcon,$rcon + vxor $in1,$in1,$tmp + vxor $in0,$in0,$key + vxor $in1,$in1,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdnz Loop192 + + li $rounds,12 + addi $out,$out,0x20 + b Ldone + +.align 4 +L256: + lvx $tmp,0,$inp + li $cnt,7 + li $rounds,14 + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $out,$out,16 + vperm $in1,$in1,$tmp,$key # align [and byte swap in LE] + mtctr $cnt + +Loop256: + vperm $key,$in1,$in1,$mask # rotate-n-splat + vsldoi $tmp,$zero,$in0,12 # >>32 + vperm $outtail,$in1,$in1,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + vcipherlast $key,$key,$rcon + stvx $stage,0,$out + addi $out,$out,16 + + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in0,$in0,$tmp + vadduwm $rcon,$rcon,$rcon + vxor $in0,$in0,$key + vperm $outtail,$in0,$in0,$outperm # rotate + vsel $stage,$outhead,$outtail,$outmask + vmr $outhead,$outtail + stvx $stage,0,$out + addi $inp,$out,15 # 15 is not typo + addi $out,$out,16 + bdz Ldone + + vspltw $key,$in0,3 # just splat + vsldoi $tmp,$zero,$in1,12 # >>32 + vsbox $key,$key + + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + vsldoi $tmp,$zero,$tmp,12 # >>32 + vxor $in1,$in1,$tmp + + vxor $in1,$in1,$key + b Loop256 + +.align 4 +Ldone: + lvx $in1,0,$inp # redundant in aligned case + vsel $in1,$outhead,$in1,$outmask + stvx $in1,0,$inp + li $ptr,0 + mtspr 256,$vrsave + stw $rounds,0($out) + +Lenc_key_abort: + mr r3,$ptr + blr + .long 0 + .byte 0,12,0x14,1,0,0,3,0 + .long 0 +.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key + +.globl .${prefix}_set_decrypt_key +.align 5 +.${prefix}_set_decrypt_key: + $STU $sp,-$FRAME($sp) + mflr r10 + $PUSH r10,`$FRAME+$LRSAVE`($sp) + bl Lset_encrypt_key + mtlr r10 + + cmpwi r3,0 + bne- Ldec_key_abort + + slwi $cnt,$rounds,4 + subi $inp,$out,240 # first round key + srwi $rounds,$rounds,1 + add $out,$inp,$cnt # last round key + mtctr $rounds + +Ldeckey: + lwz r0, 0($inp) + lwz r6, 4($inp) + lwz r7, 8($inp) + lwz r8, 12($inp) + addi $inp,$inp,16 + lwz r9, 0($out) + lwz r10,4($out) + lwz r11,8($out) + lwz r12,12($out) + stw r0, 0($out) + stw r6, 4($out) + stw r7, 8($out) + stw r8, 12($out) + subi $out,$out,16 + stw r9, -16($inp) + stw r10,-12($inp) + stw r11,-8($inp) + stw r12,-4($inp) + bdnz Ldeckey + + xor r3,r3,r3 # return value +Ldec_key_abort: + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,0,3,0 + .long 0 +.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key +___ +}}} +######################################################################### +{{{ # Single block en- and decrypt procedures # +sub gen_block () { +my $dir = shift; +my $n = $dir eq "de" ? 
"n" : ""; +my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7)); + +$code.=<<___; +.globl .${prefix}_${dir}crypt +.align 5 +.${prefix}_${dir}crypt: + lwz $rounds,240($key) + lis r0,0xfc00 + mfspr $vrsave,256 + li $idx,15 # 15 is not typo + mtspr 256,r0 + + lvx v0,0,$inp + neg r11,$out + lvx v1,$idx,$inp + lvsl v2,0,$inp # inpperm + le?vspltisb v4,0x0f + ?lvsl v3,0,r11 # outperm + le?vxor v2,v2,v4 + li $idx,16 + vperm v0,v0,v1,v2 # align [and byte swap in LE] + lvx v1,0,$key + ?lvsl v5,0,$key # keyperm + srwi $rounds,$rounds,1 + lvx v2,$idx,$key + addi $idx,$idx,16 + subi $rounds,$rounds,1 + ?vperm v1,v1,v2,v5 # align round key + + vxor v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + mtctr $rounds + +Loop_${dir}c: + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + addi $idx,$idx,16 + ?vperm v1,v1,v2,v5 + v${n}cipher v0,v0,v1 + lvx v1,$idx,$key + addi $idx,$idx,16 + bdnz Loop_${dir}c + + ?vperm v2,v2,v1,v5 + v${n}cipher v0,v0,v2 + lvx v2,$idx,$key + ?vperm v1,v1,v2,v5 + v${n}cipherlast v0,v0,v1 + + vspltisb v2,-1 + vxor v1,v1,v1 + li $idx,15 # 15 is not typo + ?vperm v2,v1,v2,v3 # outmask + le?vxor v3,v3,v4 + lvx v1,0,$out # outhead + vperm v0,v0,v0,v3 # rotate [and byte swap in LE] + vsel v1,v1,v0,v2 + lvx v4,$idx,$out + stvx v1,0,$out + vsel v0,v0,v4,v2 + stvx v0,$idx,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 + .long 0 +.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} +######################################################################### +{{{ # CBC en- and decrypt procedures # +my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)= + map("v$_",(4..10)); +$code.=<<___; +.globl .${prefix}_cbc_encrypt +.align 5 +.${prefix}_cbc_encrypt: + ${UCMP}i $len,16 + bltlr- + + cmpwi $enc,0 # test direction + lis r0,0xffe0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + beq Lcbc_dec + +Lcbc_enc: + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $inout,$inout,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + vxor $inout,$inout,$ivec + +Loop_cbc_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm 
$rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $ivec,$inout,$rndkey0 + ${UCMP}i $len,16 + + vperm $tmp,$ivec,$ivec,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_enc + + b Lcbc_done + +.align 4 +Lcbc_dec: + ${UCMP}i $len,128 + bge _aesp8_cbc_decrypt8x + vmr $tmp,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + mtctr $rounds + subi $len,$len,16 # len-=16 + + lvx $rndkey0,0,$key + vperm $tmp,$tmp,$inptail,$inpperm + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$tmp,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + +Loop_cbc_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_cbc_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipherlast $inout,$inout,$rndkey0 + ${UCMP}i $len,16 + + vxor $inout,$inout,$ivec + vmr $ivec,$tmp + vperm $tmp,$inout,$inout,$outperm + vsel $inout,$outhead,$tmp,$outmask + vmr $outhead,$tmp + stvx $inout,0,$out + addi $out,$out,16 + bge Lcbc_dec + +Lcbc_done: + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + neg $enc,$ivp # write [unaligned] iv + li $idx,15 # 15 is not typo + vxor $rndkey0,$rndkey0,$rndkey0 + vspltisb $outmask,-1 + le?vspltisb $tmp,0x0f + ?lvsl $outperm,0,$enc + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + lvx $outhead,0,$ivp + vperm $ivec,$ivec,$ivec,$outperm + vsel $inout,$outhead,$ivec,$outmask + lvx $inptail,$idx,$ivp + stvx $inout,0,$ivp + vsel $inout,$ivec,$inptail,$outmask + stvx $inout,$idx,$ivp + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CBC decrypt procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment + +$code.=<<___; +.align 5 +_aesp8_cbc_decrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH 
r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + subi $len,$len,128 # bias + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,`$FRAME+15` + mtctr $rounds + +Load_cbc_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_cbc_dec_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,`$FRAME+15` # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + #lvx $inptail,0,$inp # "caller" already did this + #addi $inp,$inp,15 # 15 is not typo + subi $inp,$inp,15 # undo "caller" + + le?li $idx,8 + lvx_u $in0,$x00,$inp # load first 8 "words" + le?lvsl $inpperm,0,$idx + le?vspltisb $tmp,0x0f + lvx_u $in1,$x10,$inp + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + lvx_u $in2,$x20,$inp + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in3,$x30,$inp + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in4,$x40,$inp + le?vperm $in2,$in2,$in2,$inpperm + vxor $out0,$in0,$rndkey0 + lvx_u $in5,$x50,$inp + le?vperm $in3,$in3,$in3,$inpperm + vxor $out1,$in1,$rndkey0 + lvx_u $in6,$x60,$inp + le?vperm $in4,$in4,$in4,$inpperm + vxor $out2,$in2,$rndkey0 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + le?vperm $in5,$in5,$in5,$inpperm + vxor $out3,$in3,$rndkey0 + le?vperm $in6,$in6,$in6,$inpperm + vxor $out4,$in4,$rndkey0 + le?vperm $in7,$in7,$in7,$inpperm + vxor $out5,$in5,$rndkey0 + vxor $out6,$in6,$rndkey0 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + b Loop_cbc_dec8x +.align 5 +Loop_cbc_dec8x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x + + subic $len,$len,128 # $len-=128 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + and r0,r0,$len + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vncipher $out0,$out0,v30 + vxor $ivec,$ivec,v31 # xor with last round key + vncipher $out1,$out1,v30 + vxor $in0,$in0,v31 + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + vncipherlast $out0,$out0,$ivec + vncipherlast $out1,$out1,$in0 + lvx_u $in0,$x00,$inp # load next input block + vncipherlast $out2,$out2,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out3,$out3,$in2 + le?vperm $in0,$in0,$in0,$inpperm + lvx_u $in2,$x20,$inp + vncipherlast $out4,$out4,$in3 + le?vperm $in1,$in1,$in1,$inpperm + lvx_u $in3,$x30,$inp + vncipherlast $out5,$out5,$in4 + le?vperm $in2,$in2,$in2,$inpperm + lvx_u $in4,$x40,$inp + vncipherlast $out6,$out6,$in5 + le?vperm $in3,$in3,$in3,$inpperm + lvx_u $in5,$x50,$inp + vncipherlast $out7,$out7,$in6 + le?vperm $in4,$in4,$in4,$inpperm + lvx_u $in6,$x60,$inp + vmr $ivec,$in7 + le?vperm $in5,$in5,$in5,$inpperm + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $in6,$in6,$in6,$inpperm + vxor $out0,$in0,$rndkey0 + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $in7,$in7,$in7,$inpperm + vxor $out1,$in1,$rndkey0 + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$rndkey0 + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$rndkey0 + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$rndkey0 + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + vxor $out5,$in5,$rndkey0 + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + vxor $out6,$in6,$rndkey0 + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + vxor $out7,$in7,$rndkey0 + + mtctr $rounds + beq Loop_cbc_dec8x # did $len-=128 borrow? + + addic. $len,$len,128 + beq Lcbc_dec8x_done + nop + nop + +Loop_cbc_dec8x_tail: # up to 7 "words" tail... 
+ vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_cbc_dec8x_tail + + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + vncipher $out6,$out6,v24 + vncipher $out7,$out7,v24 + + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + vncipher $out6,$out6,v25 + vncipher $out7,$out7,v25 + + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + vncipher $out6,$out6,v26 + vncipher $out7,$out7,v26 + + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + vncipher $out6,$out6,v27 + vncipher $out7,$out7,v27 + + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + vncipher $out6,$out6,v28 + vncipher $out7,$out7,v28 + + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + vncipher $out6,$out6,v29 + vncipher $out7,$out7,v29 + + vncipher $out1,$out1,v30 + vxor $ivec,$ivec,v31 # last round key + vncipher $out2,$out2,v30 + vxor $in1,$in1,v31 + vncipher $out3,$out3,v30 + vxor $in2,$in2,v31 + vncipher $out4,$out4,v30 + vxor $in3,$in3,v31 + vncipher $out5,$out5,v30 + vxor $in4,$in4,v31 + vncipher $out6,$out6,v30 + vxor $in5,$in5,v31 + vncipher $out7,$out7,v30 + vxor $in6,$in6,v31 + + cmplwi $len,32 # switch($len) + blt Lcbc_dec8x_one + nop + beq Lcbc_dec8x_two + cmplwi $len,64 + blt Lcbc_dec8x_three + nop + beq Lcbc_dec8x_four + cmplwi $len,96 + blt Lcbc_dec8x_five + nop + beq Lcbc_dec8x_six + +Lcbc_dec8x_seven: + vncipherlast $out1,$out1,$ivec + vncipherlast $out2,$out2,$in1 + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out1,$out1,$out1,$inpperm + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x00,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x10,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x20,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x30,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x40,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x50,$out + stvx_u $out7,$x60,$out + addi $out,$out,0x70 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_six: + vncipherlast $out2,$out2,$ivec + vncipherlast $out3,$out3,$in2 + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out2,$out2,$out2,$inpperm + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x00,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x10,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x20,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x30,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u 
$out6,$x40,$out + stvx_u $out7,$x50,$out + addi $out,$out,0x60 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_five: + vncipherlast $out3,$out3,$ivec + vncipherlast $out4,$out4,$in3 + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out3,$out3,$out3,$inpperm + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x00,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x10,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x20,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x30,$out + stvx_u $out7,$x40,$out + addi $out,$out,0x50 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_four: + vncipherlast $out4,$out4,$ivec + vncipherlast $out5,$out5,$in4 + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out4,$out4,$out4,$inpperm + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x00,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x10,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x20,$out + stvx_u $out7,$x30,$out + addi $out,$out,0x40 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_three: + vncipherlast $out5,$out5,$ivec + vncipherlast $out6,$out6,$in5 + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out5,$out5,$out5,$inpperm + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x00,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x10,$out + stvx_u $out7,$x20,$out + addi $out,$out,0x30 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_two: + vncipherlast $out6,$out6,$ivec + vncipherlast $out7,$out7,$in6 + vmr $ivec,$in7 + + le?vperm $out6,$out6,$out6,$inpperm + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x00,$out + stvx_u $out7,$x10,$out + addi $out,$out,0x20 + b Lcbc_dec8x_done + +.align 5 +Lcbc_dec8x_one: + vncipherlast $out7,$out7,$ivec + vmr $ivec,$in7 + + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out7,0,$out + addi $out,$out,0x10 + +Lcbc_dec8x_done: + le?vperm $ivec,$ivec,$ivec,$inpperm + stvx_u $ivec,0,$ivp # write [unaligned] iv + + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt +___ +}} }}} + +######################################################################### +{{{ # CTR procedure[s] # +my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3)); +my 
($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)= + map("v$_",(4..11)); +my $dat=$tmp; + +$code.=<<___; +.globl .${prefix}_ctr32_encrypt_blocks +.align 5 +.${prefix}_ctr32_encrypt_blocks: + ${UCMP}i $len,1 + bltlr- + + lis r0,0xfff0 + mfspr $vrsave,256 + mtspr 256,r0 + + li $idx,15 + vxor $rndkey0,$rndkey0,$rndkey0 + le?vspltisb $tmp,0x0f + + lvx $ivec,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + vspltisb $one,1 + le?vxor $inpperm,$inpperm,$tmp + vperm $ivec,$ivec,$inptail,$inpperm + vsldoi $one,$rndkey0,$one,1 + + neg r11,$inp + ?lvsl $keyperm,0,$key # prepare for unaligned key + lwz $rounds,240($key) + + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inptail,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + srwi $rounds,$rounds,1 + li $idx,16 + subi $rounds,$rounds,1 + + ${UCMP}i $len,8 + bge _aesp8_ctr32_encrypt8x + + ?lvsr $outperm,0,$out # prepare for unaligned store + vspltisb $outmask,-1 + lvx $outhead,0,$out + ?vperm $outmask,$rndkey0,$outmask,$outperm + le?vxor $outperm,$outperm,$tmp + + lvx $rndkey0,0,$key + mtctr $rounds + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + b Loop_ctr32_enc + +.align 5 +Loop_ctr32_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + bdnz Loop_ctr32_enc + + vadduwm $ivec,$ivec,$one + vmr $dat,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + subic. $len,$len,1 # blocks-- + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key + vperm $dat,$dat,$inptail,$inpperm + li $idx,16 + ?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm + lvx $rndkey0,0,$key + vxor $dat,$dat,$rndkey1 # last round key + vcipherlast $inout,$inout,$dat + + lvx $rndkey1,$idx,$key + addi $idx,$idx,16 + vperm $inout,$inout,$inout,$outperm + vsel $dat,$outhead,$inout,$outmask + mtctr $rounds + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vmr $outhead,$inout + vxor $inout,$ivec,$rndkey0 + lvx $rndkey0,$idx,$key + addi $idx,$idx,16 + stvx $dat,0,$out + addi $out,$out,16 + bne Loop_ctr32_enc + + addi $out,$out,-1 + lvx $inout,0,$out # redundant in aligned case + vsel $inout,$outhead,$inout,$outmask + stvx $inout,0,$out + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,6,0 + .long 0 +___ +######################################################################### +{{ # Optimized CTR procedure # +my $key_="r11"; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14)); +my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment +my ($two,$three,$four)=($outhead,$outperm,$outmask); + +$code.=<<___; +.align 5 +_aesp8_ctr32_encrypt8x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + li r10,`$FRAME+8*16+15` + li r11,`$FRAME+8*16+31` + stvx v20,r10,$sp # ABI says so + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + 
stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key # load key schedule + lvx v30,$x10,$key + addi $key,$key,0x20 + lvx v31,$x00,$key + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,`$FRAME+15` + mtctr $rounds + +Load_ctr32_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key + addi $key,$key,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_ctr32_enc_key + + lvx v26,$x10,$key + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,`$FRAME+15` # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key + ?vperm v29,v29,v30,$keyperm + lvx $out0,$x70,$key # borrow $out0 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$out0,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vadduwm $two,$one,$one + subi $inp,$inp,15 # undo "caller" + $SHL $len,$len,4 + + vadduwm $out1,$ivec,$one # counter values ... + vadduwm $out2,$ivec,$two + vxor $out0,$ivec,$rndkey0 # ... 
xored with rndkey[0] + le?li $idx,8 + vadduwm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + le?lvsl $inpperm,0,$idx + vadduwm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + le?vspltisb $tmp,0x0f + vadduwm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u + vadduwm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vadduwm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + vadduwm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + vxor $out7,$out7,$rndkey0 + + mtctr $rounds + b Loop_ctr32_enc8x +.align 5 +Loop_ctr32_enc8x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 +Loop_ctr32_enc8x_middle: + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_ctr32_enc8x + + subic r11,$len,256 # $len-256, borrow $key_ + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + vcipher $out6,$out6,v24 + vcipher $out7,$out7,v24 + + subfe r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + vcipher $out6,$out6,v25 + vcipher $out7,$out7,v25 + + and r0,r0,r11 + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + vcipher $out6,$out6,v26 + vcipher $out7,$out7,v26 + lvx v24,$x00,$key_ # re-pre-load round[1] + + subic $len,$len,129 # $len-=129 + vcipher $out0,$out0,v27 + addi $len,$len,1 # $len-=128 really + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + vcipher $out6,$out6,v27 + vcipher $out7,$out7,v27 + lvx v25,$x10,$key_ # re-pre-load round[2] + + vcipher $out0,$out0,v28 + lvx_u $in0,$x00,$inp # load input + vcipher $out1,$out1,v28 + lvx_u $in1,$x10,$inp + vcipher $out2,$out2,v28 + lvx_u $in2,$x20,$inp + vcipher $out3,$out3,v28 + lvx_u $in3,$x30,$inp + vcipher $out4,$out4,v28 + lvx_u $in4,$x40,$inp + vcipher $out5,$out5,v28 + lvx_u $in5,$x50,$inp + vcipher $out6,$out6,v28 + lvx_u $in6,$x60,$inp + vcipher $out7,$out7,v28 + lvx_u $in7,$x70,$inp + addi $inp,$inp,0x80 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v29 + le?vperm $in1,$in1,$in1,$inpperm + vcipher $out2,$out2,v29 + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out3,$out3,v29 + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out4,$out4,v29 + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out5,$out5,v29 + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out6,$out6,v29 + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out7,$out7,v29 + le?vperm $in7,$in7,$in7,$inpperm + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in7 are loaded + # with last "words" + subfe. 
r0,r0,r0 # borrow?-1:0 + vcipher $out0,$out0,v30 + vxor $in0,$in0,v31 # xor with last round key + vcipher $out1,$out1,v30 + vxor $in1,$in1,v31 + vcipher $out2,$out2,v30 + vxor $in2,$in2,v31 + vcipher $out3,$out3,v30 + vxor $in3,$in3,v31 + vcipher $out4,$out4,v30 + vxor $in4,$in4,v31 + vcipher $out5,$out5,v30 + vxor $in5,$in5,v31 + vcipher $out6,$out6,v30 + vxor $in6,$in6,v31 + vcipher $out7,$out7,v30 + vxor $in7,$in7,v31 + + bne Lctr32_enc8x_break # did $len-129 borrow? + + vcipherlast $in0,$out0,$in0 + vcipherlast $in1,$out1,$in1 + vadduwm $out1,$ivec,$one # counter values ... + vcipherlast $in2,$out2,$in2 + vadduwm $out2,$ivec,$two + vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0] + vcipherlast $in3,$out3,$in3 + vadduwm $out3,$out1,$two + vxor $out1,$out1,$rndkey0 + vcipherlast $in4,$out4,$in4 + vadduwm $out4,$out2,$two + vxor $out2,$out2,$rndkey0 + vcipherlast $in5,$out5,$in5 + vadduwm $out5,$out3,$two + vxor $out3,$out3,$rndkey0 + vcipherlast $in6,$out6,$in6 + vadduwm $out6,$out4,$two + vxor $out4,$out4,$rndkey0 + vcipherlast $in7,$out7,$in7 + vadduwm $out7,$out5,$two + vxor $out5,$out5,$rndkey0 + le?vperm $in0,$in0,$in0,$inpperm + vadduwm $ivec,$out6,$two # next counter value + vxor $out6,$out6,$rndkey0 + le?vperm $in1,$in1,$in1,$inpperm + vxor $out7,$out7,$rndkey0 + mtctr $rounds + + vcipher $out0,$out0,v24 + stvx_u $in0,$x00,$out + le?vperm $in2,$in2,$in2,$inpperm + vcipher $out1,$out1,v24 + stvx_u $in1,$x10,$out + le?vperm $in3,$in3,$in3,$inpperm + vcipher $out2,$out2,v24 + stvx_u $in2,$x20,$out + le?vperm $in4,$in4,$in4,$inpperm + vcipher $out3,$out3,v24 + stvx_u $in3,$x30,$out + le?vperm $in5,$in5,$in5,$inpperm + vcipher $out4,$out4,v24 + stvx_u $in4,$x40,$out + le?vperm $in6,$in6,$in6,$inpperm + vcipher $out5,$out5,v24 + stvx_u $in5,$x50,$out + le?vperm $in7,$in7,$in7,$inpperm + vcipher $out6,$out6,v24 + stvx_u $in6,$x60,$out + vcipher $out7,$out7,v24 + stvx_u $in7,$x70,$out + addi $out,$out,0x80 + + b Loop_ctr32_enc8x_middle + +.align 5 +Lctr32_enc8x_break: + cmpwi $len,-0x60 + blt Lctr32_enc8x_one + nop + beq Lctr32_enc8x_two + cmpwi $len,-0x40 + blt Lctr32_enc8x_three + nop + beq Lctr32_enc8x_four + cmpwi $len,-0x20 + blt Lctr32_enc8x_five + nop + beq Lctr32_enc8x_six + cmpwi $len,0x00 + blt Lctr32_enc8x_seven + +Lctr32_enc8x_eight: + vcipherlast $out0,$out0,$in0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + vcipherlast $out5,$out5,$in5 + vcipherlast $out6,$out6,$in6 + vcipherlast $out7,$out7,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + le?vperm $out7,$out7,$out7,$inpperm + stvx_u $out6,$x60,$out + stvx_u $out7,$x70,$out + addi $out,$out,0x80 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_seven: + vcipherlast $out0,$out0,$in1 + vcipherlast $out1,$out1,$in2 + vcipherlast $out2,$out2,$in3 + vcipherlast $out3,$out3,$in4 + vcipherlast $out4,$out4,$in5 + vcipherlast $out5,$out5,$in6 + vcipherlast $out6,$out6,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u 
$out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + le?vperm $out6,$out6,$out6,$inpperm + stvx_u $out5,$x50,$out + stvx_u $out6,$x60,$out + addi $out,$out,0x70 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_six: + vcipherlast $out0,$out0,$in2 + vcipherlast $out1,$out1,$in3 + vcipherlast $out2,$out2,$in4 + vcipherlast $out3,$out3,$in5 + vcipherlast $out4,$out4,$in6 + vcipherlast $out5,$out5,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + le?vperm $out5,$out5,$out5,$inpperm + stvx_u $out4,$x40,$out + stvx_u $out5,$x50,$out + addi $out,$out,0x60 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_five: + vcipherlast $out0,$out0,$in3 + vcipherlast $out1,$out1,$in4 + vcipherlast $out2,$out2,$in5 + vcipherlast $out3,$out3,$in6 + vcipherlast $out4,$out4,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$inpperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_four: + vcipherlast $out0,$out0,$in4 + vcipherlast $out1,$out1,$in5 + vcipherlast $out2,$out2,$in6 + vcipherlast $out3,$out3,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$inpperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_three: + vcipherlast $out0,$out0,$in5 + vcipherlast $out1,$out1,$in6 + vcipherlast $out2,$out2,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + le?vperm $out2,$out2,$out2,$inpperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_two: + vcipherlast $out0,$out0,$in6 + vcipherlast $out1,$out1,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + le?vperm $out1,$out1,$out1,$inpperm + stvx_u $out0,$x00,$out + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + b Lctr32_enc8x_done + +.align 5 +Lctr32_enc8x_one: + vcipherlast $out0,$out0,$in7 + + le?vperm $out0,$out0,$out0,$inpperm + stvx_u $out0,0,$out + addi $out,$out,0x10 + +Lctr32_enc8x_done: + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $inpperm,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + stvx $inpperm,r10,$sp + addi r10,r10,32 + stvx $inpperm,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + 
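For orientation, the vadduwm $one/$two chain above generates eight consecutive counter blocks per pass, each then xored with round key 0, and the routine's name (${prefix}_ctr32_encrypt_blocks) suggests the usual ctr32 contract in which only the last 32-bit word of the 16-byte counter is incremented as a big-endian value. A minimal scalar sketch of that per-block increment, under that assumption (the helper name is illustrative, not part of the patch):

#include <stdint.h>

// Scalar sketch of the ctr32 counter update that the vadduwm $one/$two chain
// performs eight blocks at a time: only bytes 12..15 of the counter block are
// treated as a big-endian 32-bit counter, wrapping modulo 2^32.
static void ctr32_increment(uint8_t counter[16]) {
  uint32_t c = ((uint32_t)counter[12] << 24) | ((uint32_t)counter[13] << 16) |
               ((uint32_t)counter[14] << 8) | (uint32_t)counter[15];
  c++;  // wraps at 2^32; callers are responsible for not crossing that limit
  counter[12] = (uint8_t)(c >> 24);
  counter[13] = (uint8_t)(c >> 16);
  counter[14] = (uint8_t)(c >> 8);
  counter[15] = (uint8_t)c;
}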
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks +___ +}} }}} + +######################################################################### +{{{ # XTS procedures # +# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, # +# const AES_KEY *key1, const AES_KEY *key2, # +# [const] unsigned char iv[16]); # +# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which # +# input tweak value is assumed to be encrypted already, and last tweak # +# value, one suitable for consecutive call on same chunk of data, is # +# written back to original buffer. In addition, in "tweak chaining" # +# mode only complete input blocks are processed. # + +my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10)); +my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2)); +my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7)); +my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12)); +my $taillen = $key2; + + ($inp,$idx) = ($idx,$inp); # reassign + +$code.=<<___; +.globl .${prefix}_xts_encrypt +.align 5 +.${prefix}_xts_encrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff0 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? + beq Lxts_enc_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_enc + +Lxts_enc_no_key2: + li $idx,-16 + and $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_enc: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_encrypt6x + + andi. 
$taillen,$len,15 + subic r0,$len,32 + subi $taillen,$taillen,16 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + b Loop_xts_enc + +.align 5 +Loop_xts_enc: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_enc + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vcipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. $len,$len,16 + beq Lxts_enc_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + subic r0,$len,32 + subfe r0,r0,r0 + and r0,r0,$taillen + add $inp,$inp,r0 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $output,$output,$rndkey0 # just in case $len<16 + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_enc + + vxor $output,$output,$tweak + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + subi r11,$out,17 + subi $out,$out,16 + mtctr $len + li $len,16 +Loop_xts_enc_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_enc_steal + + mtctr $rounds + b Loop_xts_enc # one more time... + +Lxts_enc_done: + ${UCMP}i $ivp,0 + beq Lxts_enc_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt + +.globl .${prefix}_xts_decrypt +.align 5 +.${prefix}_xts_decrypt: + mr $inp,r3 # reassign + li r3,-1 + ${UCMP}i $len,16 + bltlr- + + lis r0,0xfff8 + mfspr r12,256 # save vrsave + li r11,0 + mtspr 256,r0 + + andi. r0,$len,15 + neg r0,r0 + andi. r0,r0,16 + sub $len,$len,r0 + + vspltisb $seven,0x07 # 0x070707..07 + le?lvsl $leperm,r11,r11 + le?vspltisb $tmp,0x0f + le?vxor $leperm,$leperm,$seven + + li $idx,15 + lvx $tweak,0,$ivp # load [unaligned] iv + lvsl $inpperm,0,$ivp + lvx $inptail,$idx,$ivp + le?vxor $inpperm,$inpperm,$tmp + vperm $tweak,$tweak,$inptail,$inpperm + + neg r11,$inp + lvsr $inpperm,0,r11 # prepare for unaligned load + lvx $inout,0,$inp + addi $inp,$inp,15 # 15 is not typo + le?vxor $inpperm,$inpperm,$tmp + + ${UCMP}i $key2,0 # key2==NULL? 
+ beq Lxts_dec_no_key2 + + ?lvsl $keyperm,0,$key2 # prepare for unaligned key + lwz $rounds,240($key2) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + lvx $rndkey0,0,$key2 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + mtctr $rounds + +Ltweak_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipher $tweak,$tweak,$rndkey0 + lvx $rndkey0,$idx,$key2 + addi $idx,$idx,16 + bdnz Ltweak_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vcipher $tweak,$tweak,$rndkey1 + lvx $rndkey1,$idx,$key2 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vcipherlast $tweak,$tweak,$rndkey0 + + li $ivp,0 # don't chain the tweak + b Lxts_dec + +Lxts_dec_no_key2: + neg $idx,$len + andi. $idx,$idx,15 + add $len,$len,$idx # in "tweak chaining" + # mode only complete + # blocks are processed +Lxts_dec: + lvx $inptail,0,$inp + addi $inp,$inp,16 + + ?lvsl $keyperm,0,$key1 # prepare for unaligned key + lwz $rounds,240($key1) + srwi $rounds,$rounds,1 + subi $rounds,$rounds,1 + li $idx,16 + + vslb $eighty7,$seven,$seven # 0x808080..80 + vor $eighty7,$eighty7,$seven # 0x878787..87 + vspltisb $tmp,1 # 0x010101..01 + vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01 + + ${UCMP}i $len,96 + bge _aesp8_xts_decrypt6x + + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + mtctr $rounds + + ${UCMP}i $len,16 + blt Ltail_xts_dec + be?b Loop_xts_dec + +.align 5 +Loop_xts_dec: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + addi $out,$out,16 + + subic. 
$len,$len,16 + beq Lxts_dec_done + + vmr $inout,$inptail + lvx $inptail,0,$inp + addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $inout,$inout,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + mtctr $rounds + ${UCMP}i $len,16 + bge Loop_xts_dec + +Ltail_xts_dec: + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak1,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak1,$tweak1,$tmp + + subi $inp,$inp,16 + add $inp,$inp,$len + + vxor $inout,$inout,$tweak # :-( + vxor $inout,$inout,$tweak1 # :-) + +Loop_xts_dec_short: + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vncipher $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + bdnz Loop_xts_dec_short + + ?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm + vncipher $inout,$inout,$rndkey1 + lvx $rndkey1,$idx,$key1 + li $idx,16 + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + vxor $rndkey0,$rndkey0,$tweak1 + vncipherlast $output,$inout,$rndkey0 + + le?vperm $tmp,$output,$output,$leperm + be?nop + le?stvx_u $tmp,0,$out + be?stvx_u $output,0,$out + + vmr $inout,$inptail + lvx $inptail,0,$inp + #addi $inp,$inp,16 + lvx $rndkey0,0,$key1 + lvx $rndkey1,$idx,$key1 + addi $idx,$idx,16 + vperm $inout,$inout,$inptail,$inpperm + ?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm + + lvsr $inpperm,0,$len # $inpperm is no longer needed + vxor $inptail,$inptail,$inptail # $inptail is no longer needed + vspltisb $tmp,-1 + vperm $inptail,$inptail,$tmp,$inpperm + vsel $inout,$inout,$output,$inptail + + vxor $rndkey0,$rndkey0,$tweak + vxor $inout,$inout,$rndkey0 + lvx $rndkey0,$idx,$key1 + addi $idx,$idx,16 + + subi r11,$out,1 + mtctr $len + li $len,16 +Loop_xts_dec_steal: + lbzu r0,1(r11) + stb r0,16(r11) + bdnz Loop_xts_dec_steal + + mtctr $rounds + b Loop_xts_dec # one more time... 
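The recurring vsrab/vaddubm/vsldoi/vand/vxor sequence marked "next tweak value" multiplies the 128-bit tweak by x in GF(2^128), folding the carry out of the top bit back in through the 0x87,0x01,...,0x01 mask built earlier ($eighty7). A byte-wise sketch of the same update, assuming the little-endian block convention of IEEE P1619 (the helper name is illustrative, not part of the patch):

#include <stdint.h>

// Byte-wise sketch of the tweak update: multiply the 128-bit tweak by x in
// GF(2^128), reducing with x^128 + x^7 + x^2 + x + 1, i.e. xoring 0x87 into
// byte 0 when a bit is carried out of the top of the block.
static void xts_double_tweak(uint8_t tweak[16]) {
  uint8_t carry = 0;
  for (int i = 0; i < 16; i++) {
    uint8_t next_carry = tweak[i] >> 7;            // bit shifted out of byte i
    tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
    carry = next_carry;
  }
  if (carry) {
    tweak[0] ^= 0x87;                              // reduce modulo the polynomial
  }
}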
+ +Lxts_dec_done: + ${UCMP}i $ivp,0 + beq Lxts_dec_ret + + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $tweak,$tweak,$tmp + + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec_ret: + mtspr 256,r12 # restore vrsave + li r3,0 + blr + .long 0 + .byte 0,12,0x04,0,0x80,6,6,0 + .long 0 +.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt +___ +######################################################################### +{{ # Optimized XTS procedures # +my $key_=$key2; +my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31)); + $x00=0 if ($flavour =~ /osx/); +my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5)); +my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16)); +my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22)); +my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys + # v26-v31 last 6 round keys +my ($keyperm)=($out0); # aliases with "caller", redundant assignment +my $taillen=$x70; + +$code.=<<___; +.align 5 +_aesp8_xts_encrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,`$FRAME+15` + mtctr $rounds + +Load_xts_enc_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_enc_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,`$FRAME+15` # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor 
$twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. $taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_enc6x + +.align 5 +Loop_xts_enc6x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v24 + vcipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vcipher $out4,$out4,v25 + vcipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out0,$out0,v26 + vcipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v26 + vcipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vcipher $out0,$out0,v27 + vcipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vcipher $out4,$out4,v27 + vcipher $out5,$out5,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vxor $tweak,$tweak,$tmp + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipher $out4,$out4,v28 + vcipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vcipher $out0,$out0,v29 + vcipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vcipher $out4,$out4,v29 + vcipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vcipher $out0,$out0,v30 + vcipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vcipher $out4,$out4,v30 + vcipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vcipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vcipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vcipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vcipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vcipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vcipherlast $tmp,$out5,$in5 # last block might be needed + # in stealing mode + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$tmp,$tmp,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + le?stvx_u $out5,$x50,$out + be?stvx_u $tmp, $x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr 
$rounds + beq Loop_xts_enc6x # did $len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_enc6x_zero + cmpwi $len,0x20 + blt Lxts_enc6x_one + nop + beq Lxts_enc6x_two + cmpwi $len,0x40 + blt Lxts_enc6x_three + nop + beq Lxts_enc6x_four + +Lxts_enc6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $tmp,$out4,$twk5 # last block prep for stealing + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $tmp,$out3,$twk4 # last block prep for stealing + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $tmp,$out2,$twk3 # last block prep for stealing + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_enc5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vxor $tmp,$out1,$twk2 # last block prep for stealing + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_enc1x: + vcipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_enc1x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + + lvsr $inpperm,0,$taillen + vcipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vcipher $out0,$out0,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vcipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vcipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vcipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vcipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vxor $tmp,$out0,$twk1 # last block prep for stealing + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + bne Lxts_enc6x_steal + b Lxts_enc6x_done + +.align 4 +Lxts_enc6x_zero: + cmpwi 
$taillen,0 + beq Lxts_enc6x_done + + add $inp,$inp,$taillen + subi $inp,$inp,16 + lvx_u $in0,0,$inp + lvsr $inpperm,0,$taillen # $in5 is no more + le?vperm $in0,$in0,$in0,$leperm + vperm $in0,$in0,$in0,$inpperm + vxor $tmp,$tmp,$twk0 +Lxts_enc6x_steal: + vxor $in0,$in0,$twk0 + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember? + + subi r30,$out,17 + subi $out,$out,16 + mtctr $taillen +Loop_xts_enc6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_enc6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_enc1x # one more time... + +.align 4 +Lxts_enc6x_done: + ${UCMP}i $ivp,0 + beq Lxts_enc6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_enc6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_enc5x: + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_enc5x + + add $inp,$inp,$taillen + cmpwi $taillen,0 + vcipher $out0,$out0,v24 + vcipher $out1,$out1,v24 + vcipher $out2,$out2,v24 + vcipher $out3,$out3,v24 + vcipher $out4,$out4,v24 + + subi $inp,$inp,16 + vcipher $out0,$out0,v25 + vcipher $out1,$out1,v25 + vcipher $out2,$out2,v25 + vcipher $out3,$out3,v25 + vcipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + vcipher $out0,$out0,v26 + lvsr $inpperm,0,$taillen # $in5 is no more + vcipher $out1,$out1,v26 + vcipher $out2,$out2,v26 + vcipher $out3,$out3,v26 + vcipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vcipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vcipher $out1,$out1,v27 + vcipher $out2,$out2,v27 + vcipher $out3,$out3,v27 + vcipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vcipher $out0,$out0,v28 + vcipher $out1,$out1,v28 + vcipher $out2,$out2,v28 + vcipher $out3,$out3,v28 + vcipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vcipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vcipher $out1,$out1,v29 + vcipher $out2,$out2,v29 + vcipher $out3,$out3,v29 + vcipher $out4,$out4,v29 + lvx v25,$x10,$key_ # 
re-pre-load round[2] + vxor $in4,$twk4,v31 + + vcipher $out0,$out0,v30 + vperm $in0,$in0,$in0,$inpperm + vcipher $out1,$out1,v30 + vcipher $out2,$out2,v30 + vcipher $out3,$out3,v30 + vcipher $out4,$out4,v30 + + vcipherlast $out0,$out0,$twk0 + vcipherlast $out1,$out1,$in1 + vcipherlast $out2,$out2,$in2 + vcipherlast $out3,$out3,$in3 + vcipherlast $out4,$out4,$in4 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.align 5 +_aesp8_xts_decrypt6x: + $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp) + mflr r11 + li r7,`$FRAME+8*16+15` + li r3,`$FRAME+8*16+31` + $PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp) + stvx v20,r7,$sp # ABI says so + addi r7,r7,32 + stvx v21,r3,$sp + addi r3,r3,32 + stvx v22,r7,$sp + addi r7,r7,32 + stvx v23,r3,$sp + addi r3,r3,32 + stvx v24,r7,$sp + addi r7,r7,32 + stvx v25,r3,$sp + addi r3,r3,32 + stvx v26,r7,$sp + addi r7,r7,32 + stvx v27,r3,$sp + addi r3,r3,32 + stvx v28,r7,$sp + addi r7,r7,32 + stvx v29,r3,$sp + addi r3,r3,32 + stvx v30,r7,$sp + stvx v31,r3,$sp + li r0,-1 + stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave + li $x10,0x10 + $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp) + li $x20,0x20 + $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp) + li $x30,0x30 + $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp) + li $x40,0x40 + $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp) + li $x50,0x50 + $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp) + li $x60,0x60 + $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp) + li $x70,0x70 + mtspr 256,r0 + + subi $rounds,$rounds,3 # -4 in total + + lvx $rndkey0,$x00,$key1 # load key schedule + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + lvx v31,$x00,$key1 + ?vperm $rndkey0,$rndkey0,v30,$keyperm + addi $key_,$sp,`$FRAME+15` + mtctr $rounds + +Load_xts_dec_key: + ?vperm v24,v30,v31,$keyperm + lvx v30,$x10,$key1 + addi $key1,$key1,0x20 + stvx v24,$x00,$key_ # off-load round[1] + ?vperm v25,v31,v30,$keyperm + lvx v31,$x00,$key1 + stvx v25,$x10,$key_ # off-load round[2] + addi $key_,$key_,0x20 + bdnz Load_xts_dec_key + + lvx v26,$x10,$key1 + ?vperm v24,v30,v31,$keyperm + lvx v27,$x20,$key1 + stvx v24,$x00,$key_ # off-load round[3] + ?vperm v25,v31,v26,$keyperm + lvx v28,$x30,$key1 + stvx v25,$x10,$key_ # off-load round[4] + addi $key_,$sp,`$FRAME+15` # rewind $key_ + ?vperm v26,v26,v27,$keyperm + lvx v29,$x40,$key1 + ?vperm v27,v27,v28,$keyperm + lvx v30,$x50,$key1 + ?vperm v28,v28,v29,$keyperm + lvx v31,$x60,$key1 + ?vperm v29,v29,v30,$keyperm + lvx $twk5,$x70,$key1 # borrow $twk5 + ?vperm v30,v30,v31,$keyperm + lvx v24,$x00,$key_ # pre-load round[1] + ?vperm v31,v31,$twk5,$keyperm + lvx v25,$x10,$key_ # pre-load round[2] + + vperm $in0,$inout,$inptail,$inpperm + subi $inp,$inp,31 # undo "caller" + vxor $twk0,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vand $tmp,$tmp,$eighty7 + vxor $out0,$in0,$twk0 + vxor $tweak,$tweak,$tmp + + lvx_u $in1,$x10,$inp + vxor $twk1,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in1,$in1,$in1,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out1,$in1,$twk1 + vxor $tweak,$tweak,$tmp + + lvx_u $in2,$x20,$inp + andi. 
$taillen,$len,15 + vxor $twk2,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in2,$in2,$in2,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out2,$in2,$twk2 + vxor $tweak,$tweak,$tmp + + lvx_u $in3,$x30,$inp + sub $len,$len,$taillen + vxor $twk3,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in3,$in3,$in3,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out3,$in3,$twk3 + vxor $tweak,$tweak,$tmp + + lvx_u $in4,$x40,$inp + subi $len,$len,0x60 + vxor $twk4,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in4,$in4,$in4,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out4,$in4,$twk4 + vxor $tweak,$tweak,$tmp + + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + vxor $twk5,$tweak,$rndkey0 + vsrab $tmp,$tweak,$seven # next tweak value + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + le?vperm $in5,$in5,$in5,$leperm + vand $tmp,$tmp,$eighty7 + vxor $out5,$in5,$twk5 + vxor $tweak,$tweak,$tmp + + vxor v31,v31,$rndkey0 + mtctr $rounds + b Loop_xts_dec6x + +.align 5 +Loop_xts_dec6x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec6x + + subic $len,$len,96 # $len-=96 + vxor $in0,$twk0,v31 # xor with last round key + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk0,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v24 + vncipher $out5,$out5,v24 + + subfe. 
r0,r0,r0 # borrow?-1:0 + vand $tmp,$tmp,$eighty7 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vxor $in1,$twk1,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk1,$tweak,$rndkey0 + vncipher $out4,$out4,v25 + vncipher $out5,$out5,v25 + + and r0,r0,$len + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v26 + vncipher $out5,$out5,v26 + + add $inp,$inp,r0 # $inp is adjusted in such + # way that at exit from the + # loop inX-in5 are loaded + # with last "words" + vxor $in2,$twk2,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk2,$tweak,$rndkey0 + vaddubm $tweak,$tweak,$tweak + vncipher $out0,$out0,v27 + vncipher $out1,$out1,v27 + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vand $tmp,$tmp,$eighty7 + vncipher $out4,$out4,v27 + vncipher $out5,$out5,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vxor $tweak,$tweak,$tmp + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vxor $in3,$twk3,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk3,$tweak,$rndkey0 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipher $out4,$out4,v28 + vncipher $out5,$out5,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vand $tmp,$tmp,$eighty7 + + vncipher $out0,$out0,v29 + vncipher $out1,$out1,v29 + vxor $tweak,$tweak,$tmp + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vxor $in4,$twk4,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk4,$tweak,$rndkey0 + vncipher $out4,$out4,v29 + vncipher $out5,$out5,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vand $tmp,$tmp,$eighty7 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vxor $tweak,$tweak,$tmp + vncipher $out4,$out4,v30 + vncipher $out5,$out5,v30 + vxor $in5,$twk5,v31 + vsrab $tmp,$tweak,$seven # next tweak value + vxor $twk5,$tweak,$rndkey0 + + vncipherlast $out0,$out0,$in0 + lvx_u $in0,$x00,$inp # load next input block + vaddubm $tweak,$tweak,$tweak + vsldoi $tmp,$tmp,$tmp,15 + vncipherlast $out1,$out1,$in1 + lvx_u $in1,$x10,$inp + vncipherlast $out2,$out2,$in2 + le?vperm $in0,$in0,$in0,$leperm + lvx_u $in2,$x20,$inp + vand $tmp,$tmp,$eighty7 + vncipherlast $out3,$out3,$in3 + le?vperm $in1,$in1,$in1,$leperm + lvx_u $in3,$x30,$inp + vncipherlast $out4,$out4,$in4 + le?vperm $in2,$in2,$in2,$leperm + lvx_u $in4,$x40,$inp + vxor $tweak,$tweak,$tmp + vncipherlast $out5,$out5,$in5 + le?vperm $in3,$in3,$in3,$leperm + lvx_u $in5,$x50,$inp + addi $inp,$inp,0x60 + le?vperm $in4,$in4,$in4,$leperm + le?vperm $in5,$in5,$in5,$leperm + + le?vperm $out0,$out0,$out0,$leperm + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk0 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + vxor $out1,$in1,$twk1 + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + vxor $out2,$in2,$twk2 + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + vxor $out3,$in3,$twk3 + le?vperm $out5,$out5,$out5,$leperm + stvx_u $out4,$x40,$out + vxor $out4,$in4,$twk4 + stvx_u $out5,$x50,$out + vxor $out5,$in5,$twk5 + addi $out,$out,0x60 + + mtctr $rounds + beq Loop_xts_dec6x # did 
$len-=96 borrow? + + addic. $len,$len,0x60 + beq Lxts_dec6x_zero + cmpwi $len,0x20 + blt Lxts_dec6x_one + nop + beq Lxts_dec6x_two + cmpwi $len,0x40 + blt Lxts_dec6x_three + nop + beq Lxts_dec6x_four + +Lxts_dec6x_five: + vxor $out0,$in1,$twk0 + vxor $out1,$in2,$twk1 + vxor $out2,$in3,$twk2 + vxor $out3,$in4,$twk3 + vxor $out4,$in5,$twk4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk5 # unused tweak + vxor $twk1,$tweak,$rndkey0 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk1 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + le?vperm $out4,$out4,$out4,$leperm + stvx_u $out3,$x30,$out + stvx_u $out4,$x40,$out + addi $out,$out,0x50 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_four: + vxor $out0,$in2,$twk0 + vxor $out1,$in3,$twk1 + vxor $out2,$in4,$twk2 + vxor $out3,$in5,$twk3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk4 # unused tweak + vmr $twk1,$twk5 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk5 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + le?vperm $out3,$out3,$out3,$leperm + stvx_u $out2,$x20,$out + stvx_u $out3,$x30,$out + addi $out,$out,0x40 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_three: + vxor $out0,$in3,$twk0 + vxor $out1,$in4,$twk1 + vxor $out2,$in5,$twk2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk3 # unused tweak + vmr $twk1,$twk4 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk4 + le?vperm $out2,$out2,$out2,$leperm + stvx_u $out1,$x10,$out + stvx_u $out2,$x20,$out + addi $out,$out,0x30 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_two: + vxor $out0,$in4,$twk0 + vxor $out1,$in5,$twk1 + vxor $out2,$out2,$out2 + vxor $out3,$out3,$out3 + vxor $out4,$out4,$out4 + + bl _aesp8_xts_dec5x + + le?vperm $out0,$out0,$out0,$leperm + vmr $twk0,$twk2 # unused tweak + vmr $twk1,$twk3 + le?vperm $out1,$out1,$out1,$leperm + stvx_u $out0,$x00,$out # store output + vxor $out0,$in0,$twk3 + stvx_u $out1,$x10,$out + addi $out,$out,0x20 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_one: + vxor $out0,$in5,$twk0 + nop +Loop_xts_dec1x: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Loop_xts_dec1x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk0,$twk0,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + mtctr $rounds + vncipherlast $out0,$out0,$twk0 + + vmr $twk0,$twk1 # unused tweak + vmr $twk1,$twk2 + le?vperm $out0,$out0,$out0,$leperm + stvx_u $out0,$x00,$out # store output + addi $out,$out,0x10 + vxor $out0,$in0,$twk2 + bne Lxts_dec6x_steal + b Lxts_dec6x_done + +.align 4 +Lxts_dec6x_zero: + cmpwi $taillen,0 + beq Lxts_dec6x_done + + lvx_u $in0,0,$inp + le?vperm $in0,$in0,$in0,$leperm + vxor $out0,$in0,$twk1 +Lxts_dec6x_steal: + vncipher $out0,$out0,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + lvx v25,$x10,$key_ # round[4] + bdnz Lxts_dec6x_steal + + add $inp,$inp,$taillen + vncipher $out0,$out0,v24 + + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + + lvx_u $in0,0,$inp + vncipher $out0,$out0,v26 + + lvsr $inpperm,0,$taillen # $in5 is no more + vncipher $out0,$out0,v27 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vncipher $out0,$out0,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + + vncipher $out0,$out0,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $twk1,$twk1,v31 + + le?vperm $in0,$in0,$in0,$leperm + vncipher $out0,$out0,v30 + + vperm $in0,$in0,$in0,$inpperm + vncipherlast $tmp,$out0,$twk1 + + le?vperm $out0,$tmp,$tmp,$leperm + le?stvx_u $out0,0,$out + be?stvx_u $tmp,0,$out + + vxor $out0,$out0,$out0 + vspltisb $out1,-1 + vperm $out0,$out0,$out1,$inpperm + vsel $out0,$in0,$tmp,$out0 + vxor $out0,$out0,$twk0 + + subi r30,$out,1 + mtctr $taillen +Loop_xts_dec6x_steal: + lbzu r0,1(r30) + stb r0,16(r30) + bdnz Loop_xts_dec6x_steal + + li $taillen,0 + mtctr $rounds + b Loop_xts_dec1x # one more time... 
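The *_steal loops above implement standard XTS ciphertext stealing for a trailing partial block: the tail borrows bytes from the previous ciphertext block and the two final outputs swap places in the output stream. A functional sketch of the encryption direction under that reading (decryption mirrors it with the tweak order swapped); xts_encrypt_block is a hypothetical single-block callback, not a function in this patch:

#include <stdint.h>
#include <string.h>

// last_c holds the ciphertext of the last full block on entry; on return it
// holds the final full ciphertext block, and out_partial receives the t-byte
// partial block that follows it.  xts_encrypt_block computes one block of
// AES-XTS, i.e. E(K, P ^ T) ^ T, for the supplied tweak.
static void xts_steal_tail(uint8_t last_c[16], const uint8_t *tail, size_t t,
                           uint8_t *out_partial, const uint8_t next_tweak[16],
                           void (*xts_encrypt_block)(uint8_t block[16],
                                                     const uint8_t tweak[16])) {
  uint8_t buf[16];
  memcpy(buf, tail, t);                 // plaintext tail (1..15 bytes)
  memcpy(buf + t, last_c + t, 16 - t);  // steal trailing ciphertext bytes
  memcpy(out_partial, last_c, t);       // truncated previous block is emitted last
  xts_encrypt_block(buf, next_tweak);   // encrypt the recombined block
  memcpy(last_c, buf, 16);              // ...and store it in the last full slot
}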
+ +.align 4 +Lxts_dec6x_done: + ${UCMP}i $ivp,0 + beq Lxts_dec6x_ret + + vxor $tweak,$twk0,$rndkey0 + le?vperm $tweak,$tweak,$tweak,$leperm + stvx_u $tweak,0,$ivp + +Lxts_dec6x_ret: + mtlr r11 + li r10,`$FRAME+15` + li r11,`$FRAME+31` + stvx $seven,r10,$sp # wipe copies of round keys + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + stvx $seven,r10,$sp + addi r10,r10,32 + stvx $seven,r11,$sp + addi r11,r11,32 + + mtspr 256,$vrsave + lvx v20,r10,$sp # ABI says so + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp) + $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp) + $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp) + $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp) + $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp) + $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp) + addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T` + blr + .long 0 + .byte 0,12,0x04,1,0x80,6,6,0 + .long 0 + +.align 5 +_aesp8_xts_dec5x: + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + lvx v24,$x20,$key_ # round[3] + addi $key_,$key_,0x20 + + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + lvx v25,$x10,$key_ # round[4] + bdnz _aesp8_xts_dec5x + + subi r0,$taillen,1 + vncipher $out0,$out0,v24 + vncipher $out1,$out1,v24 + vncipher $out2,$out2,v24 + vncipher $out3,$out3,v24 + vncipher $out4,$out4,v24 + + andi. 
r0,r0,16 + cmpwi $taillen,0 + vncipher $out0,$out0,v25 + vncipher $out1,$out1,v25 + vncipher $out2,$out2,v25 + vncipher $out3,$out3,v25 + vncipher $out4,$out4,v25 + vxor $twk0,$twk0,v31 + + sub $inp,$inp,r0 + vncipher $out0,$out0,v26 + vncipher $out1,$out1,v26 + vncipher $out2,$out2,v26 + vncipher $out3,$out3,v26 + vncipher $out4,$out4,v26 + vxor $in1,$twk1,v31 + + vncipher $out0,$out0,v27 + lvx_u $in0,0,$inp + vncipher $out1,$out1,v27 + vncipher $out2,$out2,v27 + vncipher $out3,$out3,v27 + vncipher $out4,$out4,v27 + vxor $in2,$twk2,v31 + + addi $key_,$sp,`$FRAME+15` # rewind $key_ + vncipher $out0,$out0,v28 + vncipher $out1,$out1,v28 + vncipher $out2,$out2,v28 + vncipher $out3,$out3,v28 + vncipher $out4,$out4,v28 + lvx v24,$x00,$key_ # re-pre-load round[1] + vxor $in3,$twk3,v31 + + vncipher $out0,$out0,v29 + le?vperm $in0,$in0,$in0,$leperm + vncipher $out1,$out1,v29 + vncipher $out2,$out2,v29 + vncipher $out3,$out3,v29 + vncipher $out4,$out4,v29 + lvx v25,$x10,$key_ # re-pre-load round[2] + vxor $in4,$twk4,v31 + + vncipher $out0,$out0,v30 + vncipher $out1,$out1,v30 + vncipher $out2,$out2,v30 + vncipher $out3,$out3,v30 + vncipher $out4,$out4,v30 + + vncipherlast $out0,$out0,$twk0 + vncipherlast $out1,$out1,$in1 + vncipherlast $out2,$out2,$in2 + vncipherlast $out3,$out3,$in3 + vncipherlast $out4,$out4,$in4 + mtctr $rounds + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +___ +}} }}} + +my $consts=1; +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/geo; + + # constants table endian-specific conversion + if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) { + my $conv=$3; + my @bytes=(); + + # convert to endian-agnostic format + if ($1 eq "long") { + foreach (split(/,\s*/,$2)) { + my $l = /^0/?oct:int; + push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff; + } + } else { + @bytes = map(/^0/?oct:int,split(/,\s*/,$2)); + } + + # little-endian conversion + if ($flavour =~ /le$/o) { + SWITCH: for($conv) { + /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; }; + /\?rev/ && do { @bytes=reverse(@bytes); last; }; + } + } + + #emit + print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n"; + next; + } + $consts=0 if (m/Lconsts:/o); # end of table + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... 
+ if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o or + s/\?lvsr/lvsl/o or + s/\?lvsl/lvsr/o or + s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or + s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or + s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o; + } else { # big-endian + s/le\?/#le#/o or + s/be\?//o or + s/\?([a-z]+)/$1/o; + } + + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h @@ -59,6 +59,12 @@ OPENSSL_INLINE int vpaes_capable(void) { OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } #endif +#elif defined(OPENSSL_PPC64LE) +#define HWAES + +OPENSSL_INLINE int hwaes_capable(void) { + return CRYPTO_is_PPC64LE_vcrypto_capable(); +} #endif #endif // !NO_ASM Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/bcm.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/bcm.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/bcm.c @@ -102,6 +102,7 @@ #include "self_check/fips.c" #include "self_check/self_check.c" #include "service_indicator/service_indicator.c" +#include "sha/sha1-altivec.c" #include "sha/sha1.c" #include "sha/sha256.c" #include "sha/sha512.c" Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c @@ -384,6 +384,23 @@ int bn_expand(BIGNUM *bn, size_t bits) { } int bn_resize_words(BIGNUM *bn, size_t words) { +#if defined(OPENSSL_PPC64LE) + // This is a workaround for a miscompilation bug in Clang 7.0.1 on POWER. + // The unittests catch the miscompilation, if it occurs, and it manifests + // as a crash in |bn_fits_in_words|. + // + // The bug only triggers if building in FIPS mode and with -O3. Clang 8.0.1 + // has the same bug but this workaround is not effective there---I've not + // been able to find a workaround for 8.0.1. + // + // At the time of writing (2019-08-08), Clang git does *not* have this bug + // and does not need this workaroud. The current git version should go on to + // be Clang 10 thus, once we can depend on that, this can be removed. 
+ if (value_barrier_w((size_t)bn->width == words)) { + return 1; + } +#endif + if ((size_t)bn->width <= words) { if (!bn_wexpand(bn, words)) { return 0; Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c @@ -1455,6 +1455,8 @@ int EVP_has_aes_hardware(void) { return hwaes_capable() && crypto_gcm_clmul_enabled(); #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) return hwaes_capable() && CRYPTO_is_ARMv8_PMULL_capable(); +#elif defined(OPENSSL_PPC64LE) + return CRYPTO_is_PPC64LE_vcrypto_capable(); #else return 0; #endif Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/asm/ghashp8-ppc.pl =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/asm/ghashp8-ppc.pl @@ -0,0 +1,671 @@ +#! /usr/bin/env perl +# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# GHASH for for PowerISA v2.07. +# +# July 2014 +# +# Accurate performance measurements are problematic, because it's +# always virtualized setup with possibly throttled processor. +# Relative comparison is therefore more informative. This initial +# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x +# faster than "4-bit" integer-only compiler-generated 64-bit code. +# "Initial version" means that there is room for futher improvement. + +# May 2016 +# +# 2x aggregated reduction improves performance by 50% (resulting +# performance on POWER8 is 1 cycle per processed byte), and 4x +# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb). 
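For orientation, the vpmsumd-based routines below implement GHASH's multiplication in GF(2^128); the aggregated reduction mentioned above amounts to folding several ciphertext blocks per field reduction, e.g. Xi <- ((Xi xor C0)*H^2) xor (C1*H) in the two-block case. A bit-serial reference for the underlying Xi*H multiply (NIST SP 800-38D, Algorithm 1), with an illustrative name, is sketched here:

#include <stdint.h>
#include <string.h>

// Bit-serial reference for the GF(2^128) multiply that gcm_gmult_p8 performs
// with vpmsumd.  Blocks are 16 bytes, bit 0 being the most significant bit of
// byte 0; the reduction constant is R = 0xE1 || 0^120.
static void gcm_gmult_ref(uint8_t Xi[16], const uint8_t H[16]) {
  uint8_t Z[16] = {0}, V[16];
  memcpy(V, H, 16);
  for (int i = 0; i < 128; i++) {
    if (Xi[i / 8] & (0x80 >> (i % 8))) {      // if bit i of Xi is set, Z ^= V
      for (int j = 0; j < 16; j++) Z[j] ^= V[j];
    }
    int lsb = V[15] & 1;                      // V <- V * x (shift right by one bit)
    for (int j = 15; j > 0; j--) {
      V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
    }
    V[0] >>= 1;
    if (lsb) V[0] ^= 0xE1;                    // reduce with the GCM polynomial
  }
  memcpy(Xi, Z, 16);
}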
+ +$flavour=shift; +$output =shift; + +if ($flavour =~ /64/) { + $SIZE_T=8; + $LRSAVE=2*$SIZE_T; + $STU="stdu"; + $POP="ld"; + $PUSH="std"; + $UCMP="cmpld"; + $SHRI="srdi"; +} elsif ($flavour =~ /32/) { + $SIZE_T=4; + $LRSAVE=$SIZE_T; + $STU="stwu"; + $POP="lwz"; + $PUSH="stw"; + $UCMP="cmplw"; + $SHRI="srwi"; +} else { die "nonsense $flavour"; } + +$sp="r1"; +$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT,"| $^X \"$xlate\" $flavour \"$output\"" || die "can't call $xlate: $!"; +*STDOUT=*OUT; + +my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block + +my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3)); +my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12)); +my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19)); +my $vrsave="r12"; + +$code=<<___; +.machine "any" + +.text + +.globl .gcm_init_p8 +.align 5 +.gcm_init_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $H,0,r4 # load H + + vspltisb $xC2,-16 # 0xf0 + vspltisb $t0,1 # one + vaddubm $xC2,$xC2,$xC2 # 0xe0 + vxor $zero,$zero,$zero + vor $xC2,$xC2,$t0 # 0xe1 + vsldoi $xC2,$xC2,$zero,15 # 0xe1... + vsldoi $t1,$zero,$t0,1 # ...1 + vaddubm $xC2,$xC2,$xC2 # 0xc2... + vspltisb $t2,7 + vor $xC2,$xC2,$t1 # 0xc2....01 + vspltb $t1,$H,0 # most significant byte + vsl $H,$H,$t0 # H<<=1 + vsrab $t1,$t1,$t2 # broadcast carry bit + vand $t1,$t1,$xC2 + vxor $IN,$H,$t1 # twisted H + + vsldoi $H,$IN,$IN,8 # twist even more ... + vsldoi $xC2,$zero,$xC2,8 # 0xc2.0 + vsldoi $Hl,$zero,$H,8 # ... and split + vsldoi $Hh,$H,$zero,8 + + stvx_u $xC2,0,r3 # save pre-computed table + stvx_u $Hl,r8,r3 + li r8,0x40 + stvx_u $H, r9,r3 + li r9,0x50 + stvx_u $Hh,r10,r3 + li r10,0x60 + + vpmsumd $Xl,$IN,$Hl # H.lo·H.lo + vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi + vpmsumd $Xh,$IN,$Hh # H.hi·H.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $IN1,$Xl,$t1 + + vsldoi $H2,$IN1,$IN1,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi $H2h,$H2,$zero,8 + + stvx_u $H2l,r8,r3 # save H^2 + li r8,0x70 + stvx_u $H2,r9,r3 + li r9,0x80 + stvx_u $H2h,r10,r3 + li r10,0x90 +___ +{ +my ($t4,$t5,$t6) = ($Hl,$H,$Hh); +$code.=<<___; + vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo + vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo + vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi + vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi + vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi + vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vsldoi $t4,$Xm1,$zero,8 + vsldoi $t5,$zero,$Xm1,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + vxor $Xl1,$Xl1,$t4 + vxor $Xh1,$Xh1,$t5 + + vsldoi $Xl,$Xl,$Xl,8 + vsldoi $Xl1,$Xl1,$Xl1,8 + vxor $Xl,$Xl,$t2 + vxor $Xl1,$Xl1,$t6 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vpmsumd $Xl1,$Xl1,$xC2 + vxor $t1,$t1,$Xh + vxor $t5,$t5,$Xh1 + vxor $Xl,$Xl,$t1 + vxor $Xl1,$Xl1,$t5 + + vsldoi $H,$Xl,$Xl,8 + vsldoi $H2,$Xl1,$Xl1,8 + vsldoi $Hl,$zero,$H,8 + vsldoi $Hh,$H,$zero,8 + vsldoi $H2l,$zero,$H2,8 + vsldoi 
$H2h,$H2,$zero,8 + + stvx_u $Hl,r8,r3 # save H^3 + li r8,0xa0 + stvx_u $H,r9,r3 + li r9,0xb0 + stvx_u $Hh,r10,r3 + li r10,0xc0 + stvx_u $H2l,r8,r3 # save H^4 + stvx_u $H2,r9,r3 + stvx_u $H2h,r10,r3 + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_init_p8,.-.gcm_init_p8 +___ +} +$code.=<<___; +.globl .gcm_gmult_p8 +.align 5 +.gcm_gmult_p8: + lis r0,0xfff8 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $IN,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $IN,$IN,$IN,$lemask + vxor $zero,$zero,$zero + + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 +.size .gcm_gmult_p8,.-.gcm_gmult_p8 + +.globl .gcm_ghash_p8 +.align 5 +.gcm_ghash_p8: + li r0,-4096 + li r8,0x10 + mfspr $vrsave,256 + li r9,0x20 + mtspr 256,r0 + li r10,0x30 + lvx_u $Xl,0,$Xip # load Xi + + lvx_u $Hl,r8,$Htbl # load pre-computed table + li r8,0x40 + le?lvsl $lemask,r0,r0 + lvx_u $H, r9,$Htbl + li r9,0x50 + le?vspltisb $t0,0x07 + lvx_u $Hh,r10,$Htbl + li r10,0x60 + le?vxor $lemask,$lemask,$t0 + lvx_u $xC2,0,$Htbl + le?vperm $Xl,$Xl,$Xl,$lemask + vxor $zero,$zero,$zero + + ${UCMP}i $len,64 + bge Lgcm_ghash_p8_4x + + lvx_u $IN,0,$inp + addi $inp,$inp,16 + subic. $len,$len,16 + le?vperm $IN,$IN,$IN,$lemask + vxor $IN,$IN,$Xl + beq Lshort + + lvx_u $H2l,r8,$Htbl # load H^2 + li r8,16 + lvx_u $H2, r9,$Htbl + add r9,$inp,$len # end of input + lvx_u $H2h,r10,$Htbl + be?b Loop_2x + +.align 5 +Loop_2x: + lvx_u $IN1,0,$inp + le?vperm $IN1,$IN1,$IN1,$lemask + + subic $len,$len,32 + vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo + vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo + subfe r0,r0,r0 # borrow?-1:0 + vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi + vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi + and r0,r0,$len + vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi + vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi + add $inp,$inp,r0 + + vxor $Xl,$Xl,$Xl1 + vxor $Xm,$Xm,$Xm1 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh1 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + lvx_u $IN,r8,$inp + addi $inp,$inp,32 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + le?vperm $IN,$IN,$IN,$lemask + vxor $t1,$t1,$Xh + vxor $IN,$IN,$t1 + vxor $IN,$IN,$Xl + $UCMP r9,$inp + bgt Loop_2x # done yet? 
+ + cmplwi $len,0 + bne Leven + +Lshort: + vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo + vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi + vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + +Leven: + vxor $Xl,$Xl,$t1 + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + mtspr 256,$vrsave + blr + .long 0 + .byte 0,12,0x14,0,0,0,4,0 + .long 0 +___ +{ +my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h, + $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31)); +my $IN0=$IN; +my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h); + +$code.=<<___; +.align 5 +.gcm_ghash_p8_4x: +Lgcm_ghash_p8_4x: + $STU $sp,-$FRAME($sp) + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + li r10,0x60 + stvx v31,r11,$sp + li r0,-1 + stw $vrsave,`$FRAME-4`($sp) # save vrsave + mtspr 256,r0 # preserve all AltiVec registers + + lvsl $t0,0,r8 # 0x0001..0e0f + #lvx_u $H2l,r8,$Htbl # load H^2 + li r8,0x70 + lvx_u $H2, r9,$Htbl + li r9,0x80 + vspltisb $t1,8 # 0x0808..0808 + #lvx_u $H2h,r10,$Htbl + li r10,0x90 + lvx_u $H3l,r8,$Htbl # load H^3 + li r8,0xa0 + lvx_u $H3, r9,$Htbl + li r9,0xb0 + lvx_u $H3h,r10,$Htbl + li r10,0xc0 + lvx_u $H4l,r8,$Htbl # load H^4 + li r8,0x10 + lvx_u $H4, r9,$Htbl + li r9,0x20 + lvx_u $H4h,r10,$Htbl + li r10,0x30 + + vsldoi $t2,$zero,$t1,8 # 0x0000..0808 + vaddubm $hiperm,$t0,$t2 # 0x0001..1617 + vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f + + $SHRI $len,$len,4 # this allows to use sign bit + # as carry + lvx_u $IN0,0,$inp # load input + lvx_u $IN1,r8,$inp + subic. $len,$len,8 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + + vxor $Xh,$IN0,$Xl + + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vperm $H21l,$H2,$H,$hiperm + vperm $t0,$IN2,$IN3,$loperm + vperm $H21h,$H2,$H,$loperm + vperm $t1,$IN2,$IN3,$hiperm + vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi + + vxor $Xm2,$Xm2,$Xm1 + vxor $Xl3,$Xl3,$Xl1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh3,$Xh3,$Xh1 + + blt Ltail_4x + +Loop_4x: + lvx_u $IN0,0,$inp + lvx_u $IN1,r8,$inp + subic. 
$len,$len,4 + lvx_u $IN2,r9,$inp + lvx_u $IN3,r10,$inp + addi $inp,$inp,0x40 + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + le?vperm $IN3,$IN3,$IN3,$lemask + le?vperm $IN0,$IN0,$IN0,$lemask + + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + vpmsumd $Xl1,$IN1,$H3l + vpmsumd $Xm1,$IN1,$H3 + vpmsumd $Xh1,$IN1,$H3h + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + vxor $Xh,$Xh,$Xh3 + vperm $t0,$IN2,$IN3,$loperm + vperm $t1,$IN2,$IN3,$hiperm + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi + vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi + vpmsumd $Xl,$Xl,$xC2 + + vxor $Xl3,$Xl3,$Xl1 + vxor $Xh3,$Xh3,$Xh1 + vxor $Xh,$Xh,$IN0 + vxor $Xm2,$Xm2,$Xm1 + vxor $Xh,$Xh,$t1 + vxor $Xm3,$Xm3,$Xm2 + vxor $Xh,$Xh,$Xl + bge Loop_4x + +Ltail_4x: + vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo + vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi + vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi + + vxor $Xl,$Xl,$Xl3 + vxor $Xm,$Xm,$Xm3 + + vpmsumd $t2,$Xl,$xC2 # 1st reduction phase + + vsldoi $t0,$Xm,$zero,8 + vsldoi $t1,$zero,$Xm,8 + vxor $Xh,$Xh,$Xh3 + vxor $Xl,$Xl,$t0 + vxor $Xh,$Xh,$t1 + + vsldoi $Xl,$Xl,$Xl,8 + vxor $Xl,$Xl,$t2 + + vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase + vpmsumd $Xl,$Xl,$xC2 + vxor $t1,$t1,$Xh + vxor $Xl,$Xl,$t1 + + addic. $len,$len,4 + beq Ldone_4x + + lvx_u $IN0,0,$inp + ${UCMP}i $len,2 + li $len,-4 + blt Lone + lvx_u $IN1,r8,$inp + beq Ltwo + +Lthree: + lvx_u $IN2,r9,$inp + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + le?vperm $IN2,$IN2,$IN2,$lemask + + vxor $Xh,$IN0,$Xl + vmr $H4l,$H3l + vmr $H4, $H3 + vmr $H4h,$H3h + + vperm $t0,$IN1,$IN2,$loperm + vperm $t1,$IN1,$IN2,$hiperm + vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo + vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi + vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo + vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi + + vxor $Xm3,$Xm3,$Xm2 + b Ltail_4x + +.align 4 +Ltwo: + le?vperm $IN0,$IN0,$IN0,$lemask + le?vperm $IN1,$IN1,$IN1,$lemask + + vxor $Xh,$IN0,$Xl + vperm $t0,$zero,$IN1,$loperm + vperm $t1,$zero,$IN1,$hiperm + + vsldoi $H4l,$zero,$H2,8 + vmr $H4, $H2 + vsldoi $H4h,$H2,$zero,8 + + vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo + vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi + vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi + + b Ltail_4x + +.align 4 +Lone: + le?vperm $IN0,$IN0,$IN0,$lemask + + vsldoi $H4l,$zero,$H,8 + vmr $H4, $H + vsldoi $H4h,$H,$zero,8 + + vxor $Xh,$IN0,$Xl + vxor $Xl3,$Xl3,$Xl3 + vxor $Xm3,$Xm3,$Xm3 + vxor $Xh3,$Xh3,$Xh3 + + b Ltail_4x + +Ldone_4x: + le?vperm $Xl,$Xl,$Xl,$lemask + stvx_u $Xl,0,$Xip # write out Xi + + li r10,`15+6*$SIZE_T` + li r11,`31+6*$SIZE_T` + mtspr 256,$vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,0x04,0,0x80,0,4,0 + .long 0 +___ 
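For reference, each vpmsumd above forms two 64x64-bit carry-less products (one per doubleword lane) and XORs them together; the three-product pattern in the code (lo.lo, hi.hi, and the combined cross terms, per its own comments) assembles a full 128x128-bit carry-less product, which the two "reduction phase" steps then fold back to 128 bits using the precomputed 0xc2... constant. A bit-at-a-time model of the 64-bit primitive, shown only to pin down what one lane computes:

    #include <stdint.h>

    /* One vpmsumd lane: multiply two 64-bit polynomials over GF(2) and return
       the 128-bit product (no carries between bit positions). */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
      uint64_t h = 0, l = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          l ^= a << i;
          h ^= (i == 0) ? 0 : a >> (64 - i);
        }
      }
      *hi = h;
      *lo = l;
    }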
+} +$code.=<<___; +.size .gcm_ghash_p8,.-.gcm_ghash_p8 + +.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by " +.align 2 +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + if ($flavour =~ /le$/o) { # little-endian + s/le\?//o or + s/be\?/#be#/o; + } else { + s/le\?/#le#/o or + s/be\?//o; + } + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; # enforce flush Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c @@ -228,6 +228,13 @@ void CRYPTO_ghash_init(gmult_func *out_m *out_hash = gcm_ghash_neon; return; } +#elif defined(GHASH_ASM_PPC64LE) + if (CRYPTO_is_PPC64LE_vcrypto_capable()) { + gcm_init_p8(out_table, H); + *out_mult = gcm_gmult_p8; + *out_hash = gcm_ghash_p8; + return; + } #endif gcm_init_nohw(out_table, H); Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_test.cc =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_test.cc +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_test.cc @@ -209,5 +209,15 @@ TEST(GCMTest, ABI) { } } #endif + +#if defined(GHASH_ASM_PPC64LE) + if (CRYPTO_is_PPC64LE_vcrypto_capable()) { + CHECK_ABI(gcm_init_p8, Htable, kH); + CHECK_ABI(gcm_gmult_p8, X, Htable); + for (size_t blocks : kBlockCounts) { + CHECK_ABI(gcm_ghash_p8, X, Htable, buf, 16 * blocks); + } + } +#endif // GHASH_ASM_PPC64LE } #endif // SUPPORTS_ABI_TEST && !OPENSSL_NO_ASM Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h @@ -325,6 +325,13 @@ void aes_gcm_dec_kernel(const uint8_t *i const u128 Htable[16]); #endif +#elif defined(OPENSSL_PPC64LE) +#define GHASH_ASM_PPC64LE +#define GCM_FUNCREF +void gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]); +void gcm_gmult_p8(uint8_t Xi[16], const u128 Htable[16]); +void gcm_ghash_p8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, + size_t len); #endif #endif // OPENSSL_NO_ASM Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h @@ -30,6 +30,8 @@ #define EXPECTED_NR_getrandom 278 #elif defined(OPENSSL_ARM) #define EXPECTED_NR_getrandom 384 +#elif defined(OPENSSL_PPC64LE) +#define EXPECTED_NR_getrandom 359 #elif defined(OPENSSL_RISCV64) #define EXPECTED_NR_getrandom 278 #endif Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c @@ -431,6 +431,11 @@ void RAND_bytes_with_additional_data(uin // Take a read 
lock around accesses to |state->drbg|. This is needed to // avoid returning bad entropy if we race with // |rand_thread_state_clear_all|. + // + // This lock must be taken after any calls to |CRYPTO_sysrand| to avoid a + // bug on ppc64le. glibc may implement pthread locks by wrapping user code + // in a hardware transaction, but, on some older versions of glibc and the + // kernel, syscalls made with |syscall| did not abort the transaction. CRYPTO_MUTEX_lock_read(&state->clear_drbg_lock); #endif if (!CTR_DRBG_reseed(&state->drbg, seed, reseed_additional_data, Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h @@ -23,6 +23,16 @@ extern "C" { #endif +#if defined(OPENSSL_PPC64LE) +// POWER has an intrinsics-based implementation of SHA-1 and thus the functions +// normally defined in assembly are available even with |OPENSSL_NO_ASM| in +// this case. +#define SHA1_ASM_PPC64 +void sha1_block_data_order_ppc64(uint32_t *state, const uint8_t *in, + size_t num_blocks); +#endif + + // Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is // defined in assembly. Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c @@ -0,0 +1,361 @@ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] */ + +// Altivec-optimized SHA1 in C. This is tested on ppc64le only. +// +// References: +// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 +// http://arctic.org/~dean/crypto/sha1.html +// +// This code used the generic SHA-1 from OpenSSL as a basis and AltiVec +// optimisations were added on top. + +#include + +#if defined(OPENSSL_PPC64LE) + +#include + +void sha1_block_data_order_ppc64(uint32_t *state, const uint8_t *data, size_t num); + +static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); } + +typedef vector unsigned int vec_uint32_t; +typedef vector unsigned char vec_uint8_t; + +// Vector constants +static const vec_uint8_t k_swap_endianness = {3, 2, 1, 0, 7, 6, 5, 4, + 11, 10, 9, 8, 15, 14, 13, 12}; + +// Shift amounts for byte and bit shifts and rotations +static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32}; +static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96}; + +#define K_00_19 0x5a827999UL +#define K_20_39 0x6ed9eba1UL +#define K_40_59 0x8f1bbcdcUL +#define K_60_79 0xca62c1d6UL + +// Vector versions of the above. +static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19}; +static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39}; +static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59}; +static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79}; + +// vector message scheduling: compute message schedule for round i..i+3 where i +// is divisible by 4. We return the schedule w[i..i+3] as a vector. In +// addition, we also precompute sum w[i..+3] and an additive constant K. This +// is done to offload some computation of f() in the integer execution units. +// +// Byte shifting code below may not be correct for big-endian systems. 
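For comparison with the vectorized schedule below, this is the standard scalar SHA-1 message schedule that sched_00_15/sched_16_31/sched_32_79 reproduce four words at a time (a reference sketch, not code from this file); in the vector code the per-round constant K is additionally pre-added to each w before it is stored.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* m[] holds the sixteen big-endian 32-bit words of one 64-byte block. */
    static void sha1_schedule(const uint32_t m[16], uint32_t w[80]) {
      for (int i = 0; i < 16; i++) {
        w[i] = m[i];
      }
      for (int i = 16; i < 80; i++) {
        w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
      }
    }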
+static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data, + vec_uint32_t k) { + const vector unsigned char unaligned_data = + vec_vsx_ld(0, (const unsigned char*) data); + const vec_uint32_t v = (vec_uint32_t) unaligned_data; + const vec_uint32_t w = vec_perm(v, v, k_swap_endianness); + vec_st(w + k, 0, pre_added); + return w; +} + +// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28] +// +// w'[i ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1 +// w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1 +// w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1 +// w'[i+3] = ( 0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1 +// +// w[ i] = w'[ i] +// w[i+1] = w'[i+1] +// w[i+2] = w'[i+2] +// w[i+3] = w'[i+3] ^ (w'[i] <<< 1) +static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4, + vec_uint32_t minus_8, vec_uint32_t minus_12, + vec_uint32_t minus_16, vec_uint32_t k) { + const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes); + const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8); + const vec_uint32_t k_1_bit = vec_splat_u32(1); + const vec_uint32_t w_prime = + vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit); + const vec_uint32_t w = + w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit); + vec_st(w + k, 0, pre_added); + return w; +} + +// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76] +// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]), 2) <<< 2 +static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4, + vec_uint32_t minus_8, vec_uint32_t minus_16, + vec_uint32_t minus_28, vec_uint32_t minus_32, + vec_uint32_t k) { + const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8); + const vec_uint32_t k_2_bits = vec_splat_u32(2); + const vec_uint32_t w = + vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits); + vec_st(w + k, 0, pre_added); + return w; +} + +// As pointed out by Wei Dai , F() below can be simplified +// to the code in F_00_19. Wei attributes these optimisations to Peter +// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define +// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another +// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a +#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) +#define F_20_39(b, c, d) ((b) ^ (c) ^ (d)) +#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) +#define F_60_79(b, c, d) F_20_39(b, c, d) + +// We pre-added the K constants during message scheduling. 
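To see why the w' fix-up in sched_16_31 above is needed: within a group of four, lane 3 needs w[i], which is produced by lane 0 of the same vector operation. The group is therefore computed with that input zeroed and patched afterwards, relying on rotation distributing over XOR. A scalar illustration (rol32 as in the sketch above; the names here are chosen for exposition only):

    /* One schedule group for 16 <= i < 32, i divisible by 4. */
    static void sched_group_16_31(const uint32_t w[], uint32_t out[4], int i) {
      uint32_t wp[4];
      for (int j = 0; j < 4; j++) {
        /* w[i] is not available yet when computing lane 3, so use 0 there. */
        uint32_t w_minus_3 = (j < 3) ? w[i + j - 3] : 0;
        wp[j] = rol32(w_minus_3 ^ w[i + j - 8] ^ w[i + j - 14] ^ w[i + j - 16], 1);
      }
      out[0] = wp[0];
      out[1] = wp[1];
      out[2] = wp[2];
      out[3] = wp[3] ^ rol32(wp[0], 1);  /* patch in the missing w[i] term */
    }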
+#define BODY_00_19(i, a, b, c, d, e, f) \ + do { \ + (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \ + (b) = rotate((b), 30); \ + } while (0) + +#define BODY_20_39(i, a, b, c, d, e, f) \ + do { \ + (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \ + (b) = rotate((b), 30); \ + } while (0) + +#define BODY_40_59(i, a, b, c, d, e, f) \ + do { \ + (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \ + (b) = rotate((b), 30); \ + } while (0) + +#define BODY_60_79(i, a, b, c, d, e, f) \ + do { \ + (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \ + (b) = rotate((b), 30); \ + } while (0) + +void sha1_block_data_order_ppc64(uint32_t *state, const uint8_t *data, size_t num) { + uint32_t A, B, C, D, E, T; + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + E = state[4]; + + for (;;) { + vec_uint32_t vw[20]; + const uint32_t *w = (const uint32_t *)&vw; + + vec_uint32_t k = K_00_19_x_4; + const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k); + BODY_00_19(0, A, B, C, D, E, T); + BODY_00_19(1, T, A, B, C, D, E); + BODY_00_19(2, E, T, A, B, C, D); + BODY_00_19(3, D, E, T, A, B, C); + + const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k); + BODY_00_19(4, C, D, E, T, A, B); + BODY_00_19(5, B, C, D, E, T, A); + BODY_00_19(6, A, B, C, D, E, T); + BODY_00_19(7, T, A, B, C, D, E); + + const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k); + BODY_00_19(8, E, T, A, B, C, D); + BODY_00_19(9, D, E, T, A, B, C); + BODY_00_19(10, C, D, E, T, A, B); + BODY_00_19(11, B, C, D, E, T, A); + + const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k); + BODY_00_19(12, A, B, C, D, E, T); + BODY_00_19(13, T, A, B, C, D, E); + BODY_00_19(14, E, T, A, B, C, D); + BODY_00_19(15, D, E, T, A, B, C); + + const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k); + BODY_00_19(16, C, D, E, T, A, B); + BODY_00_19(17, B, C, D, E, T, A); + BODY_00_19(18, A, B, C, D, E, T); + BODY_00_19(19, T, A, B, C, D, E); + + k = K_20_39_x_4; + const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k); + BODY_20_39(20, E, T, A, B, C, D); + BODY_20_39(21, D, E, T, A, B, C); + BODY_20_39(22, C, D, E, T, A, B); + BODY_20_39(23, B, C, D, E, T, A); + + const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k); + BODY_20_39(24, A, B, C, D, E, T); + BODY_20_39(25, T, A, B, C, D, E); + BODY_20_39(26, E, T, A, B, C, D); + BODY_20_39(27, D, E, T, A, B, C); + + const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k); + BODY_20_39(28, C, D, E, T, A, B); + BODY_20_39(29, B, C, D, E, T, A); + BODY_20_39(30, A, B, C, D, E, T); + BODY_20_39(31, T, A, B, C, D, E); + + const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k); + BODY_20_39(32, E, T, A, B, C, D); + BODY_20_39(33, D, E, T, A, B, C); + BODY_20_39(34, C, D, E, T, A, B); + BODY_20_39(35, B, C, D, E, T, A); + + const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k); + BODY_20_39(36, A, B, C, D, E, T); + BODY_20_39(37, T, A, B, C, D, E); + BODY_20_39(38, E, T, A, B, C, D); + BODY_20_39(39, D, E, T, A, B, C); + + k = K_40_59_x_4; + const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k); + BODY_40_59(40, C, D, E, T, A, B); + BODY_40_59(41, B, C, D, E, T, A); + BODY_40_59(42, A, B, C, D, E, T); + BODY_40_59(43, T, A, B, C, D, E); + + const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k); + BODY_40_59(44, E, T, A, B, C, D); + BODY_40_59(45, D, E, T, A, B, C); + BODY_40_59(46, C, D, E, T, A, B); + BODY_40_59(47, B, C, D, E, T, A); + + 
const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k); + BODY_40_59(48, A, B, C, D, E, T); + BODY_40_59(49, T, A, B, C, D, E); + BODY_40_59(50, E, T, A, B, C, D); + BODY_40_59(51, D, E, T, A, B, C); + + const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k); + BODY_40_59(52, C, D, E, T, A, B); + BODY_40_59(53, B, C, D, E, T, A); + BODY_40_59(54, A, B, C, D, E, T); + BODY_40_59(55, T, A, B, C, D, E); + + const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k); + BODY_40_59(56, E, T, A, B, C, D); + BODY_40_59(57, D, E, T, A, B, C); + BODY_40_59(58, C, D, E, T, A, B); + BODY_40_59(59, B, C, D, E, T, A); + + k = K_60_79_x_4; + const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k); + BODY_60_79(60, A, B, C, D, E, T); + BODY_60_79(61, T, A, B, C, D, E); + BODY_60_79(62, E, T, A, B, C, D); + BODY_60_79(63, D, E, T, A, B, C); + + const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k); + BODY_60_79(64, C, D, E, T, A, B); + BODY_60_79(65, B, C, D, E, T, A); + BODY_60_79(66, A, B, C, D, E, T); + BODY_60_79(67, T, A, B, C, D, E); + + const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k); + BODY_60_79(68, E, T, A, B, C, D); + BODY_60_79(69, D, E, T, A, B, C); + BODY_60_79(70, C, D, E, T, A, B); + BODY_60_79(71, B, C, D, E, T, A); + + const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k); + BODY_60_79(72, A, B, C, D, E, T); + BODY_60_79(73, T, A, B, C, D, E); + BODY_60_79(74, E, T, A, B, C, D); + BODY_60_79(75, D, E, T, A, B, C); + + // We don't use the last value + (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k); + BODY_60_79(76, C, D, E, T, A, B); + BODY_60_79(77, B, C, D, E, T, A); + BODY_60_79(78, A, B, C, D, E, T); + BODY_60_79(79, T, A, B, C, D, E); + + const uint32_t mask = 0xffffffffUL; + state[0] = (state[0] + E) & mask; + state[1] = (state[1] + T) & mask; + state[2] = (state[2] + A) & mask; + state[3] = (state[3] + B) & mask; + state[4] = (state[4] + C) & mask; + + data += 64; + if (--num == 0) { + break; + } + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + E = state[4]; + } +} + +#endif // OPENSSL_PPC64LE + +#undef K_00_19 +#undef K_20_39 +#undef K_40_59 +#undef K_60_79 +#undef F_00_19 +#undef F_20_39 +#undef F_40_59 +#undef F_60_79 +#undef BODY_00_19 +#undef BODY_20_39 +#undef BODY_40_59 +#undef BODY_60_79 Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/internal.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/internal.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/internal.h @@ -183,8 +183,9 @@ extern "C" { #if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) -// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call -// for the asm to work correctly, unless compiled without asm code. +// x86, x86_64, the ARMs, and ppc64le need to record the result of a +// cpuid/getauxval call for the asm to work correctly, unless compiled without +// asm code. #define NEED_CPUID // OPENSSL_cpuid_setup initializes the platform-specific feature cache. 
This @@ -1657,6 +1658,16 @@ OPENSSL_INLINE int CRYPTO_is_ARMv8_SHA51 #endif // OPENSSL_ARM || OPENSSL_AARCH64 +#if defined(OPENSSL_PPC64LE) + +// CRYPTO_is_PPC64LE_vcrypto_capable returns true iff the current CPU supports +// the Vector.AES category of instructions. +int CRYPTO_is_PPC64LE_vcrypto_capable(void); + +extern unsigned long OPENSSL_ppc64le_hwcap2; + +#endif // OPENSSL_PPC64LE + #if defined(BORINGSSL_DISPATCH_TEST) // Runtime CPU dispatch testing support Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/perlasm/ppc-xlate.pl =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/perlasm/ppc-xlate.pl @@ -0,0 +1,320 @@ +#! /usr/bin/env perl +# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +my $flavour = shift; +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my %TYPES; +my $dotinlocallabels=($flavour=~/linux/)?1:0; + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $type = sub { + my ($dir,$name,$type) = @_; + + $TYPES{$name} = $type; + if ($flavour =~ /linux/) { + $name =~ s|^\.||; + ".type $name,$type"; + } else { + ""; + } +}; +my $globl = sub { + my $junk = shift; + my $name = shift; + my $global = \$GLOBALS{$name}; + my $type = \$TYPES{$name}; + my $ret; + + $name =~ s|^\.||; + + SWITCH: for ($flavour) { + /aix/ && do { if (!$$type) { + $$type = "\@function"; + } + if ($$type =~ /function/) { + $name = ".$name"; + } + last; + }; + /osx/ && do { $name = "_$name"; + last; + }; + /linux.*(32|64le)/ + && do { $ret .= ".globl $name"; + if (!$$type) { + $ret .= "\n.type $name,\@function"; + $$type = "\@function"; + } + last; + }; + /linux.*64/ && do { $ret .= ".globl $name"; + if (!$$type) { + $ret .= "\n.type $name,\@function"; + $$type = "\@function"; + } + if ($$type =~ /function/) { + $ret .= "\n.section \".opd\",\"aw\""; + $ret .= "\n.align 3"; + $ret .= "\n$name:"; + $ret .= "\n.quad .$name,.TOC.\@tocbase,0"; + $ret .= "\n.previous"; + $name = ".$name"; + } + last; + }; + } + + $ret = ".globl $name" if (!$ret); + $$global = $name; + $ret; +}; +my $text = sub { + my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; + $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); + $ret; +}; +my $machine = sub { + my $junk = shift; + my $arch = shift; + if ($flavour =~ /osx/) + { $arch =~ s/\"//g; + $arch = ($flavour=~/64/) ? "ppc970-64" : "ppc970" if ($arch eq "any"); + } + ".machine $arch"; +}; +my $size = sub { + if ($flavour =~ /linux/) + { shift; + my $name = shift; + my $real = $GLOBALS{$name} ? \$GLOBALS{$name} : \$name; + my $ret = ".size $$real,.-$$real"; + $name =~ s|^\.||; + if ($$real ne $name) { + $ret .= "\n.size $name,.-$$real"; + } + $ret; + } + else + { ""; } +}; +my $asciz = sub { + shift; + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; } + else + { ""; } +}; +my $quad = sub { + shift; + my @ret; + my ($hi,$lo); + for (@_) { + if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) + { $hi=$1?"0x$1":"0"; $lo="0x$2"; } + elsif (/^([0-9]+)$/o) + { $hi=$1>>32; $lo=$1&0xffffffff; } # error-prone with 32-bit perl + else + { $hi=undef; $lo=$_; } + + if (defined($hi)) + { push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo"); } + else + { push(@ret,".quad $lo"); } + } + join("\n",@ret); +}; + +################################################################ +# simplified mnemonics not handled by at least one assembler +################################################################ +my $cmplw = sub { + my $f = shift; + my $cr = 0; $cr = shift if ($#_>1); + # Some out-of-date 32-bit GNU assembler just can't handle cmplw... + ($flavour =~ /linux.*32/) ? + " .long ".sprintf "0x%x",31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64 : + " cmplw ".join(',',$cr,@_); +}; +my $bdnz = sub { + my $f = shift; + my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint + " bc $bo,0,".shift; +} if ($flavour!~/linux/); +my $bltlr = sub { + my $f = shift; + my $bo = $f=~/\-/ ? 12+2 : 12; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%x",19<<26|$bo<<21|16<<1 : + " bclr $bo,0"; +}; +my $bnelr = sub { + my $f = shift; + my $bo = $f=~/\-/ ? 4+2 : 4; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%x",19<<26|$bo<<21|2<<16|16<<1 : + " bclr $bo,2"; +}; +my $beqlr = sub { + my $f = shift; + my $bo = $f=~/-/ ? 12+2 : 12; # optional "not to be taken" hint + ($flavour =~ /linux/) ? # GNU as doesn't allow most recent hints + " .long ".sprintf "0x%X",19<<26|$bo<<21|2<<16|16<<1 : + " bclr $bo,2"; +}; +# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two +# arguments is 64, with "operand out of range" error. +my $extrdi = sub { + my ($f,$ra,$rs,$n,$b) = @_; + $b = ($b+$n)&63; $n = 64-$n; + " rldicl $ra,$rs,$b,$n"; +}; +my $vmr = sub { + my ($f,$vx,$vy) = @_; + " vor $vx,$vy,$vy"; +}; + +# Some ABIs specify vrsave, special-purpose register #256, as reserved +# for system use. 
+my $no_vrsave = ($flavour =~ /aix|linux64le/); +my $mtspr = sub { + my ($f,$idx,$ra) = @_; + if ($idx == 256 && $no_vrsave) { + " or $ra,$ra,$ra"; + } else { + " mtspr $idx,$ra"; + } +}; +my $mfspr = sub { + my ($f,$rd,$idx) = @_; + if ($idx == 256 && $no_vrsave) { + " li $rd,-1"; + } else { + " mfspr $rd,$idx"; + } +}; + +# PowerISA 2.06 stuff +sub vsxmem_op { + my ($f, $vrt, $ra, $rb, $op) = @_; + " .long ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1); +} +# made-up unaligned memory reference AltiVec/VMX instructions +my $lvx_u = sub { vsxmem_op(@_, 844); }; # lxvd2x +my $stvx_u = sub { vsxmem_op(@_, 972); }; # stxvd2x +my $lvdx_u = sub { vsxmem_op(@_, 588); }; # lxsdx +my $stvdx_u = sub { vsxmem_op(@_, 716); }; # stxsdx +my $lvx_4w = sub { vsxmem_op(@_, 780); }; # lxvw4x +my $stvx_4w = sub { vsxmem_op(@_, 908); }; # stxvw4x + +# PowerISA 2.07 stuff +sub vcrypto_op { + my ($f, $vrt, $vra, $vrb, $op) = @_; + " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op; +} +my $vcipher = sub { vcrypto_op(@_, 1288); }; +my $vcipherlast = sub { vcrypto_op(@_, 1289); }; +my $vncipher = sub { vcrypto_op(@_, 1352); }; +my $vncipherlast= sub { vcrypto_op(@_, 1353); }; +my $vsbox = sub { vcrypto_op(@_, 0, 1480); }; +my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); }; +my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); }; +my $vpmsumb = sub { vcrypto_op(@_, 1032); }; +my $vpmsumd = sub { vcrypto_op(@_, 1224); }; +my $vpmsubh = sub { vcrypto_op(@_, 1096); }; +my $vpmsumw = sub { vcrypto_op(@_, 1160); }; +my $vaddudm = sub { vcrypto_op(@_, 192); }; + +my $mtsle = sub { + my ($f, $arg) = @_; + " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2); +}; + +# PowerISA 3.0 stuff +my $maddhdu = sub { + my ($f, $rt, $ra, $rb, $rc) = @_; + " .long ".sprintf "0x%X",(4<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($rc<<6)|49; +}; +my $maddld = sub { + my ($f, $rt, $ra, $rb, $rc) = @_; + " .long ".sprintf "0x%X",(4<<26)|($rt<<21)|($ra<<16)|($rb<<11)|($rc<<6)|51; +}; + +my $darn = sub { + my ($f, $rt, $l) = @_; + " .long ".sprintf "0x%X",(31<<26)|($rt<<21)|($l<<16)|(755<<1); +}; + +print <<___; +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) +#define OPENSSL_NO_ASM +#endif +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) && defined(__ELF__) +___ + +while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|\.L(\w+)|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + my $xlated = ($GLOBALS{$label} or $label); + print "$xlated:"; + if ($flavour =~ /linux.*64le/) { + if ($TYPES{$label} =~ /function/) { + printf "\n.localentry %s,0\n",$xlated; + } + } + } + } + + { + $line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $f = $3; + my $opcode = eval("\$$mnemonic"); + $line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." 
and $flavour !~ /osx/); + if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); } + elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; } + } + + print $line if ($line); + print "\n"; +} + +print <<___; +#endif // !OPENSSL_NO_ASM && __powerpc64__ && __ELF__ +#if defined(__ELF__) +// See https://www.airs.com/blog/archives/518. +.section .note.GNU-stack,"",\%progbits +#endif +___ + +close STDOUT or die "error closing STDOUT: $!"; Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/test/abi_test.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/test/abi_test.h +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/test/abi_test.h @@ -179,7 +179,78 @@ struct alignas(16) Reg128 { CALLER_STATE_REGISTER(uint64_t, x28) \ CALLER_STATE_REGISTER(uint64_t, x29) -#endif // X86_64 || X86 || ARM || AARCH64 +#elif defined(OPENSSL_PPC64LE) + +// CRReg only compares the CR2-CR4 bits of a CR register. +struct CRReg { + uint32_t masked() const { return value & 0x00fff000; } + bool operator==(CRReg r) const { return masked() == r.masked(); } + bool operator!=(CRReg r) const { return masked() != r.masked(); } + uint32_t value; +}; + +// References: +// ELFv2: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf +// +// Note vector and floating-point registers on POWER have two different names. +// Originally, there were 32 floating-point registers and 32 vector registers, +// labelled f0-f31 and v0-v31 respectively. Later, VSX (Vector Scalar Extension) +// unified them into 64 registers vs0-vs63. f0-f31 map to the lower halves of +// vs0-vs31. v0-v31 map to vs32-vs63. The ABI was defined in terms of pre-VSX +// names, so we use those names here. In particular, f14-f31 are +// callee-saved, but the upper halves of vs14-vs31 are not. 
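The 0x00fff000 mask in CRReg::masked() above selects exactly the CR2-CR4 fields: CR packs eight 4-bit fields with CR0 in the most significant nibble, and only CR2-CR4 are callee-saved under ELFv2. A small check of that arithmetic:

    #include <stdint.h>

    static uint32_t cr_field_mask(int n) {   /* n = 0..7 */
      return 0xFu << (4 * (7 - n));          /* CR0 occupies bits 31..28 */
    }

    /* cr_field_mask(2) | cr_field_mask(3) | cr_field_mask(4) == 0x00fff000 */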
+#define LOOP_CALLER_STATE_REGISTERS() \ + CALLER_STATE_REGISTER(Reg128, v20) \ + CALLER_STATE_REGISTER(Reg128, v21) \ + CALLER_STATE_REGISTER(Reg128, v22) \ + CALLER_STATE_REGISTER(Reg128, v23) \ + CALLER_STATE_REGISTER(Reg128, v24) \ + CALLER_STATE_REGISTER(Reg128, v25) \ + CALLER_STATE_REGISTER(Reg128, v26) \ + CALLER_STATE_REGISTER(Reg128, v27) \ + CALLER_STATE_REGISTER(Reg128, v28) \ + CALLER_STATE_REGISTER(Reg128, v29) \ + CALLER_STATE_REGISTER(Reg128, v30) \ + CALLER_STATE_REGISTER(Reg128, v31) \ + CALLER_STATE_REGISTER(uint64_t, r14) \ + CALLER_STATE_REGISTER(uint64_t, r15) \ + CALLER_STATE_REGISTER(uint64_t, r16) \ + CALLER_STATE_REGISTER(uint64_t, r17) \ + CALLER_STATE_REGISTER(uint64_t, r18) \ + CALLER_STATE_REGISTER(uint64_t, r19) \ + CALLER_STATE_REGISTER(uint64_t, r20) \ + CALLER_STATE_REGISTER(uint64_t, r21) \ + CALLER_STATE_REGISTER(uint64_t, r22) \ + CALLER_STATE_REGISTER(uint64_t, r23) \ + CALLER_STATE_REGISTER(uint64_t, r24) \ + CALLER_STATE_REGISTER(uint64_t, r25) \ + CALLER_STATE_REGISTER(uint64_t, r26) \ + CALLER_STATE_REGISTER(uint64_t, r27) \ + CALLER_STATE_REGISTER(uint64_t, r28) \ + CALLER_STATE_REGISTER(uint64_t, r29) \ + CALLER_STATE_REGISTER(uint64_t, r30) \ + CALLER_STATE_REGISTER(uint64_t, r31) \ + CALLER_STATE_REGISTER(uint64_t, f14) \ + CALLER_STATE_REGISTER(uint64_t, f15) \ + CALLER_STATE_REGISTER(uint64_t, f16) \ + CALLER_STATE_REGISTER(uint64_t, f17) \ + CALLER_STATE_REGISTER(uint64_t, f18) \ + CALLER_STATE_REGISTER(uint64_t, f19) \ + CALLER_STATE_REGISTER(uint64_t, f20) \ + CALLER_STATE_REGISTER(uint64_t, f21) \ + CALLER_STATE_REGISTER(uint64_t, f22) \ + CALLER_STATE_REGISTER(uint64_t, f23) \ + CALLER_STATE_REGISTER(uint64_t, f24) \ + CALLER_STATE_REGISTER(uint64_t, f25) \ + CALLER_STATE_REGISTER(uint64_t, f26) \ + CALLER_STATE_REGISTER(uint64_t, f27) \ + CALLER_STATE_REGISTER(uint64_t, f28) \ + CALLER_STATE_REGISTER(uint64_t, f29) \ + CALLER_STATE_REGISTER(uint64_t, f30) \ + CALLER_STATE_REGISTER(uint64_t, f31) \ + CALLER_STATE_REGISTER(CRReg, cr) + +#endif // X86_64 || X86 || ARM || AARCH64 || PPC64LE // Enable ABI testing if all of the following are true. // @@ -231,6 +302,12 @@ inline crypto_word_t ToWord(T t) { // on 32-bit architectures for simplicity. static_assert(sizeof(T) == 4, "parameter types must be word-sized"); return (crypto_word_t)t; +#elif defined(OPENSSL_PPC64LE) + // ELFv2, section 2.2.2.3 says the parameter save area sign- or zero-extends + // parameters passed in memory. Section 2.2.3 is unclear on how to handle + // register parameters, but section 2.2.2.3 additionally says that the memory + // copy of a parameter is identical to the register one. + return (crypto_word_t)t; #elif defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64) // AAPCS64, section 5.4.2, clauses C.7 and C.14 says any remaining bits in // aarch are unspecified. iOS64 contradicts this and says the callee extends @@ -285,9 +362,9 @@ inline crypto_word_t ToWord(T t) { template inline crypto_word_t CheckImpl(Result *out, bool unwind, R (*func)(Args...), typename DeductionGuard::Type... args) { - // We only support up to 8 arguments, so all arguments on aarch64 are passed - // in registers. This is simpler and avoids the iOS discrepancy around packing - // small arguments on the stack. (See the iOS64 reference.) + // We only support up to 8 arguments, so all arguments on aarch64 and ppc64le + // are passed in registers. This is simpler and avoids the iOS discrepancy + // around packing small arguments on the stack. (See the iOS64 reference.) 
static_assert(sizeof...(args) <= 8, "too many arguments for abi_test_trampoline"); Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/test/asm/trampoline-ppc.pl =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/test/asm/trampoline-ppc.pl @@ -0,0 +1,262 @@ +#!/usr/bin/env perl +# Copyright (c) 2019, Google Inc. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION +# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +# This file defines helper functions for crypto/test/abi_test.h on ppc64le. See +# that header for details on how to use this. +# +# For convenience, this file is linked into libcrypto, where consuming builds +# already support architecture-specific sources. The static linker should drop +# this code in non-test binaries. This includes a shared library build of +# libcrypto, provided --gc-sections or equivalent is used. +# +# References: +# +# ELFv2: http://openpowerfoundation.org/wp-content/uploads/resources/leabi/leabi-20170510.pdf + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +my $dir = $1; +my $xlate; +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or +die "can't locate ppc-xlate.pl"; + +open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT = *OUT; + +unless ($flavour =~ /linux.*64le/) { + die "This file only supports the ELFv2 ABI, used by ppc64le"; +} + +my $code = ""; + +sub load_or_store_regs { + # $op is "l" or "st". + my ($op, $base_reg, $base_offset) = @_; + # Vector registers. + foreach (20..31) { + my $offset = $base_offset + ($_ - 20) * 16; + # Vector registers only support indexed register addressing. + $code .= "\tli\tr11, $offset\n"; + $code .= "\t${op}vx\tv$_, r11, $base_reg\n"; + } + # Save general registers. + foreach (14..31) { + my $offset = $base_offset + 192 + ($_ - 14) * 8; + $code .= "\t${op}d\tr$_, $offset($base_reg)\n"; + } + # Save floating point registers. + foreach (14..31) { + my $offset = $base_offset + 336 + ($_ - 14) * 8; + $code .= "\t${op}fd\tf$_, $offset($base_reg)\n"; + } +} + +sub load_regs { + my ($base_reg, $base_offset) = @_; + load_or_store_regs("l", $base_reg, $base_offset); +} + +sub store_regs { + my ($base_reg, $base_offset) = @_; + load_or_store_regs("st", $base_reg, $base_offset); +} + +my ($func, $state, $argv, $argc) = ("r3", "r4", "r5", "r6"); +$code .= <<____; +.machine "any" +.text + +# abi_test_trampoline loads callee-saved registers from |state|, calls |func| +# with |argv|, then saves the callee-saved registers into |state|. It returns +# the result of |func|. The |unwind| argument is unused. 
+# uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +# const uint64_t *argv, size_t argc, +# uint64_t unwind); +.globl abi_test_trampoline +.align 5 +abi_test_trampoline: + # LR is saved into the caller's stack frame. + mflr r0 + std r0, 16(r1) + + # Allocate 66*8 = 528 bytes of stack frame. From the top of the stack + # to the bottom, the stack frame is: + # + # 0(r1) - Back chain pointer + # 8(r1) - CR save area + # 16(r1) - LR save area (for |func|) + # 24(r1) - TOC pointer save area + # 32(r1) - Saved copy of |state| + # 40(r1) - Padding + # 48(r1) - Vector register save area (v20-v31, 12 registers) + # 240(r1) - General register save area (r14-r31, 18 registers) + # 384(r1) - Floating point register save area (f14-f31, 18 registers) + # + # Note the layouts of the register save areas and CallerState match. + # + # In the ELFv2 ABI, the parameter save area is optional if the function + # is non-variadic and all parameters fit in registers. We only support + # such functions, so we omit it to test that |func| does not rely on it. + stdu r1, -528(r1) + + mfcr r0 + std r0, 8(r1) # Save CR + std r2, 24(r1) # Save TOC + std $state, 32(r1) # Save |state| +____ +# Save registers to the stack. +store_regs("r1", 48); +# Load registers from the caller. +load_regs($state, 0); +$code .= <<____; + # Load CR from |state|. + ld r0, 480($state) + mtcr r0 + + # Move parameters into temporary registers so they are not clobbered. + addi r11, $argv, -8 # Adjust for ldu below + mr r12, $func + + # Load parameters into registers. + cmpdi $argc, 0 + beq .Largs_done + mtctr $argc + ldu r3, 8(r11) + bdz .Largs_done + ldu r4, 8(r11) + bdz .Largs_done + ldu r5, 8(r11) + bdz .Largs_done + ldu r6, 8(r11) + bdz .Largs_done + ldu r7, 8(r11) + bdz .Largs_done + ldu r8, 8(r11) + bdz .Largs_done + ldu r9, 8(r11) + bdz .Largs_done + ldu r10, 8(r11) + +.Largs_done: + li r2, 0 # Clear TOC to test |func|'s global entry point + mtctr r12 + bctrl + ld r2, 24(r1) # Restore TOC + + ld $state, 32(r1) # Reload |state| +____ +# Output resulting registers to the caller. +store_regs($state, 0); +# Restore registers from the stack. +load_regs("r1", 48); +$code .= <<____; + mfcr r0 + std r0, 480($state) # Output CR to caller + ld r0, 8(r1) + mtcrf 0b00111000, r0 # Restore CR2-CR4 + addi r1, r1, 528 + ld r0, 16(r1) # Restore LR + mtlr r0 + blr +.size abi_test_trampoline,.-abi_test_trampoline +____ + +# abi_test_clobber_* clobbers the corresponding register. These are used to test +# the ABI-testing framework. +foreach (0..31) { + # r1 is the stack pointer. r13 is the thread pointer. + next if ($_ == 1 || $_ == 13); + $code .= <<____; +.globl abi_test_clobber_r$_ +.align 5 +abi_test_clobber_r$_: + li r$_, 0 + blr +.size abi_test_clobber_r$_,.-abi_test_clobber_r$_ +____ +} + +foreach (0..31) { + $code .= <<____; +.globl abi_test_clobber_f$_ +.align 4 +abi_test_clobber_f$_: + li r0, 0 + # Use the red zone. + std r0, -8(r1) + lfd f$_, -8(r1) + blr +.size abi_test_clobber_f$_,.-abi_test_clobber_f$_ +____ +} + +foreach (0..31) { + $code .= <<____; +.globl abi_test_clobber_v$_ +.align 4 +abi_test_clobber_v$_: + vxor v$_, v$_, v$_ + blr +.size abi_test_clobber_v$_,.-abi_test_clobber_v$_ +____ +} + +foreach (0..7) { + # PPC orders CR fields in big-endian, so the mask is reversed from what one + # would expect. + my $mask = 1 << (7 - $_); + $code .= <<____; +.globl abi_test_clobber_cr$_ +.align 4 +abi_test_clobber_cr$_: + # Flip the bits on cr$_ rather than setting to zero. 
With a four-bit + # register, zeroing it will do nothing 1 in 16 times. + mfcr r0 + not r0, r0 + mtcrf $mask, r0 + blr +.size abi_test_clobber_cr$_,.-abi_test_clobber_cr$_ +____ +} + +$code .= <<____; +.globl abi_test_clobber_ctr +.align 4 +abi_test_clobber_ctr: + li r0, 0 + mtctr r0 + blr +.size abi_test_clobber_ctr,.-abi_test_clobber_ctr + +.globl abi_test_clobber_lr +.align 4 +abi_test_clobber_lr: + mflr r0 + mtctr r0 + li r0, 0 + mtlr r0 + bctr +.size abi_test_clobber_lr,.-abi_test_clobber_lr + +____ + +print $code; +close STDOUT or die "error closing STDOUT: $!"; Index: chromium-128.0.6613.113/third_party/boringssl/src/include/openssl/target.h =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/include/openssl/target.h +++ chromium-128.0.6613.113/third_party/boringssl/src/include/openssl/target.h @@ -34,6 +34,9 @@ #elif defined(__ARMEL__) || defined(_M_ARM) #define OPENSSL_32_BIT #define OPENSSL_ARM +#elif (defined(__PPC64__) || defined(__powerpc64__)) && defined(_LITTLE_ENDIAN) +#define OPENSSL_64_BIT +#define OPENSSL_PPC64LE #elif defined(__MIPSEL__) && !defined(__LP64__) #define OPENSSL_32_BIT #define OPENSSL_MIPS Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/acvp/modulewrapper/main.cc =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/util/fipstools/acvp/modulewrapper/main.cc +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/acvp/modulewrapper/main.cc @@ -37,6 +37,8 @@ int main(int argc, char **argv) { puts("ARM (32-bit)"); #elif defined(OPENSSL_AARCH64) puts("aarch64 (64-bit)"); +#elif defined(OPENSSL_PPC64LE) + puts("PPC64LE (64-bit)"); #else #error "FIPS build not supported on this architecture" #endif Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate.go =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/util/fipstools/delocate/delocate.go +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate.go @@ -54,7 +54,8 @@ type stringWriter interface { type processorType int const ( - x86_64 processorType = iota + 1 + ppc64le processorType = iota + 1 + x86_64 aarch64 ) @@ -67,6 +68,8 @@ type delocation struct { // symbols is the set of symbols defined in the module. symbols map[string]struct{} + // localEntrySymbols is the set of symbols with .localentry directives. + localEntrySymbols map[string]struct{} // redirectors maps from out-call symbol name to the name of a // redirector function for that symbol. E.g. “memcpy” -> // “bcm_redirector_memcpy”. @@ -75,6 +78,9 @@ type delocation struct { // should be used to reference it. E.g. “P384_data_storage” -> // “P384_data_storage”. bssAccessorsNeeded map[string]string + // tocLoaders is a set of symbol names for which TOC helper functions + // are required. (ppc64le only.) + tocLoaders map[string]struct{} // gotExternalsNeeded is a set of symbol names for which we need // “delta” symbols: symbols that contain the offset from their location // to the memory in question. 
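Returning to abi_test_trampoline above: the 528-byte frame it allocates can be pictured as the struct below, reconstructed from the offsets in its comment. The type is purely illustrative and does not appear in the source; note how the three register save areas mirror the CallerState layout.

    #include <stdint.h>

    struct TrampolineFrame {
      uint64_t back_chain;         /*   0(r1) */
      uint64_t cr_save;            /*   8(r1) */
      uint64_t lr_save;            /*  16(r1), for |func| */
      uint64_t toc_save;           /*  24(r1) */
      uint64_t saved_state;        /*  32(r1), copy of |state| */
      uint64_t padding;            /*  40(r1) */
      uint8_t  vec_save[12 * 16];  /*  48(r1), v20-v31 */
      uint64_t gpr_save[18];       /* 240(r1), r14-r31 */
      uint64_t fpr_save[18];       /* 384(r1), f14-f31 */
    };                             /* sizeof == 528 */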
@@ -151,6 +157,8 @@ func (d *delocation) processInput(input switch d.processor { case x86_64: statement, err = d.processIntelInstruction(statement, node.up) + case ppc64le: + statement, err = d.processPPCInstruction(statement, node.up) case aarch64: statement, err = d.processAarch64Instruction(statement, node.up) default: @@ -247,7 +255,7 @@ func (d *delocation) processDirective(st d.writeNode(statement) break - case ".debug", ".note": + case ".debug", ".note", ".toc": d.writeNode(statement) break @@ -336,6 +344,10 @@ func (d *delocation) processLabelContain d.output.WriteString("\t" + name + "\t" + strings.Join(args, ", ") + "\n") } + if name == ".localentry" { + d.output.WriteString(localEntryName(args[0]) + ":\n") + } + return statement, nil } @@ -646,6 +658,191 @@ func (d *delocation) processAarch64Instr return statement, nil } +/* ppc64le + +[PABI]: “64-Bit ELF V2 ABI Specification. Power Architecture.” March 21st, + 2017 + +(Also useful is “Power ISA Version 2.07 B”. Note that version three of that +document is /not/ good as that's POWER9 specific.) + +ppc64le doesn't have IP-relative addressing and does a lot to work around this. +Rather than reference a PLT and GOT direction, it has a single structure called +the TOC (Table Of Contents). Within the TOC is the contents of .rodata, .data, +.got, .plt, .bss, etc sections [PABI;3.3]. + +A pointer to the TOC is maintained in r2 and the following pattern is used to +load the address of an element into a register: + + addis
<address register>, 2, foo@toc@ha + addi
<address register>, <address register>
, foo@toc@l + +The “addis” instruction shifts a signed constant left 16 bits and adds the +result to its second argument, saving the result in the first argument. The +“addi” instruction does the same, but without shifting. Thus the “@toc@ha" +suffix on a symbol means “the top 16 bits of the TOC offset” and “@toc@l” means +“the bottom 16 bits of the offset”. However, note that both values are signed, +thus offsets in the top half of a 64KB chunk will have an @ha value that's one +greater than expected and a negative @l value. + +The TOC is specific to a “module” (basically an executable or shared object). +This means that there's not a single TOC in a process and that r2 needs to +change as control moves between modules. Thus functions have two entry points: +the “global” entry point and the “local” entry point. Jumps from within the +same module can use the local entry while jumps from other modules must use the +global entry. The global entry establishes the correct value of r2 before +running the function and the local entry skips that code. + +The global entry point for a function is defined by its label. The local entry +is a power-of-two number of bytes from the global entry, set by the +“.localentry” directive. (ppc64le instructions are always 32 bits, so an offset +of 1 or 2 bytes is treated as an offset of zero.) + +In order to help the global entry code set r2 to point to the local TOC, r12 is +set to the address of the global entry point when called [PABI;2.2.1.1]. Thus +the global entry will typically use an addis+addi pair to add a known offset to +r12 and store it in r2. For example: + +foo: + addis 2, 12, .TOC. - foo@ha + addi 2, 2, .TOC. - foo@l + +(It's worth noting that the '@' operator binds very loosely, so the 3rd +arguments parse as (.TOC. - foo)@ha and (.TOC. - foo)@l.) + +When calling a function, the compiler doesn't know whether that function is in +the same module or not. Thus it doesn't know whether r12 needs to be set nor +whether r2 will be clobbered on return. Rather than always assume the worst, +the linker fixes stuff up once it knows that a call is going out of module: + +Firstly, calling, say, memcpy (which we assume to be in a different module) +won't actually jump directly to memcpy, or even a PLT resolution function. +It'll call a synthesised function that: + a) saves r2 in the caller's stack frame + b) loads the address of memcpy@PLT into r12 + c) jumps to r12. + +As this synthesised function loads memcpy@PLT, a call to memcpy from the +compiled code just references “memcpy” directly, not “memcpy@PLT”. + +Since it jumps directly to memcpy@PLT, it can't restore r2 on return. Thus +calls must be followed by a nop. If the call ends up going out-of-module, the +linker will rewrite that nop to load r2 from the stack. + +Speaking of the stack, the stack pointer is kept in r1 and there's a 288-byte +red-zone. The format of the stack frame is defined [PABI;2.2.2] and must be +followed as called functions will write into their parent's stack frame. For +example, the synthesised out-of-module trampolines will save r2 24 bytes into +the caller's frame and all non-leaf functions save the return address 16 bytes +into the caller's frame. + +A final point worth noting: some RISC ISAs have r0 wired to zero: all reads +result in zero and all writes are discarded. POWER does something a little like +that, but r0 is only special in certain argument positions for certain +instructions. You just have to read the manual to know which they are. 
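The @ha/@l behaviour described above, in arithmetic form: @l is the sign-extended low 16 bits of the offset and @ha is chosen so that (ha << 16) + l reproduces it, which is why an offset whose low half is 0x8000 or more gets an @ha one larger than the plain high half and a negative @l. A small sketch of that arithmetic:

    #include <stdint.h>

    static int64_t toc_l(int64_t off) {
      int64_t low = off & 0xffff;                    /* low 16 bits */
      return low >= 0x8000 ? low - 0x10000 : low;    /* sign-extend */
    }

    static int64_t toc_ha(int64_t off) {
      /* Exact division: off - toc_l(off) is always a multiple of 0x10000.
         Equivalent to ((off + 0x8000) >> 16) with an arithmetic shift. */
      return (off - toc_l(off)) / 0x10000;
    }

    /* For any offset: (toc_ha(off) << 16) + toc_l(off) == off. */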
+ + +Delocation is easier than Intel because there's just TOC references, but it's +also harder because there's no IP-relative addressing. + +Jumps are IP-relative however, and have a 24-bit immediate value. So we can +jump to functions that set a register to the needed value. (r3 is the +return-value register and so that's what is generally used here.) */ + +// isPPC64LEAPair recognises an addis+addi pair that's adding the offset of +// source to relative and writing the result to target. +func (d *delocation) isPPC64LEAPair(statement *node32) (target, source, relative string, ok bool) { + instruction := skipWS(statement.up).up + assertNodeType(instruction, ruleInstructionName) + name1 := d.contents(instruction) + args1 := instructionArgs(instruction.next) + + statement = statement.next + instruction = skipWS(statement.up).up + assertNodeType(instruction, ruleInstructionName) + name2 := d.contents(instruction) + args2 := instructionArgs(instruction.next) + + if name1 != "addis" || + len(args1) != 3 || + name2 != "addi" || + len(args2) != 3 { + return "", "", "", false + } + + target = d.contents(args1[0]) + relative = d.contents(args1[1]) + source1 := d.contents(args1[2]) + source2 := d.contents(args2[2]) + + if !strings.HasSuffix(source1, "@ha") || + !strings.HasSuffix(source2, "@l") || + source1[:len(source1)-3] != source2[:len(source2)-2] || + d.contents(args2[0]) != target || + d.contents(args2[1]) != target { + return "", "", "", false + } + + source = source1[:len(source1)-3] + ok = true + return +} + +// establishTOC writes the global entry prelude for a function. The standard +// prelude involves relocations so this version moves the relocation outside +// the integrity-checked area. +func establishTOC(w stringWriter) { + w.WriteString("999:\n") + w.WriteString("\taddis 2, 12, .LBORINGSSL_external_toc-999b@ha\n") + w.WriteString("\taddi 2, 2, .LBORINGSSL_external_toc-999b@l\n") + w.WriteString("\tld 12, 0(2)\n") + w.WriteString("\tadd 2, 2, 12\n") +} + +// loadTOCFuncName returns the name of a synthesized function that sets r3 to +// the value of “symbol+offset”. +func loadTOCFuncName(symbol, offset string) string { + symbol = strings.Replace(symbol, ".", "_dot_", -1) + ret := ".Lbcm_loadtoc_" + symbol + if len(offset) != 0 { + offset = strings.Replace(offset, "+", "_plus_", -1) + offset = strings.Replace(offset, "-", "_minus_", -1) + ret += "_" + offset + } + return ret +} + +func (d *delocation) loadFromTOC(w stringWriter, symbol, offset, dest string) wrapperFunc { + d.tocLoaders[symbol+"\x00"+offset] = struct{}{} + + return func(k func()) { + w.WriteString("\taddi 1, 1, -288\n") // Clear the red zone. + w.WriteString("\tmflr " + dest + "\n") // Stash the link register. + w.WriteString("\tstd " + dest + ", -8(1)\n") + // The TOC loader will use r3, so stash it if necessary. + if dest != "3" { + w.WriteString("\tstd 3, -16(1)\n") + } + + // Because loadTOCFuncName returns a “.L” name, we don't need a + // nop after this call. + w.WriteString("\tbl " + loadTOCFuncName(symbol, offset) + "\n") + + // Cycle registers around. We need r3 -> destReg, -8(1) -> + // lr and, optionally, -16(1) -> r3. 
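+		// Stashing the loaded value first frees r3 to carry the saved
+		// link register into mtlr; dest and (when spilled) the caller's
+		// r3 are then reloaded from the red-zone slots.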
+ w.WriteString("\tstd 3, -24(1)\n") + w.WriteString("\tld 3, -8(1)\n") + w.WriteString("\tmtlr 3\n") + w.WriteString("\tld " + dest + ", -24(1)\n") + if dest != "3" { + w.WriteString("\tld 3, -16(1)\n") + } + w.WriteString("\taddi 1, 1, 288\n") + + k() + } +} + func (d *delocation) gatherOffsets(symRef *node32, offsets string) (*node32, string) { for symRef != nil && symRef.pegRule == ruleOffset { offset := d.contents(symRef) @@ -700,6 +897,215 @@ func (d *delocation) parseMemRef(memRef return } +func (d *delocation) processPPCInstruction(statement, instruction *node32) (*node32, error) { + assertNodeType(instruction, ruleInstructionName) + instructionName := d.contents(instruction) + isBranch := instructionName[0] == 'b' + + argNodes := instructionArgs(instruction.next) + + var wrappers wrapperStack + var args []string + changed := false + +Args: + for i, arg := range argNodes { + fullArg := arg + isIndirect := false + + if arg.pegRule == ruleIndirectionIndicator { + arg = arg.next + isIndirect = true + } + + switch arg.pegRule { + case ruleRegisterOrConstant, ruleLocalLabelRef: + args = append(args, d.contents(fullArg)) + + case ruleTOCRefLow: + return nil, errors.New("Found low TOC reference outside preamble pattern") + + case ruleTOCRefHigh: + target, _, relative, ok := d.isPPC64LEAPair(statement) + if !ok { + return nil, errors.New("Found high TOC reference outside preamble pattern") + } + + if relative != "12" { + return nil, fmt.Errorf("preamble is relative to %q, not r12", relative) + } + + if target != "2" { + return nil, fmt.Errorf("preamble is setting %q, not r2", target) + } + + statement = statement.next + establishTOC(d.output) + instructionName = "" + changed = true + break Args + + case ruleMemoryRef: + symbol, offset, section, didChange, symbolIsLocal, memRef := d.parseMemRef(arg.up) + changed = didChange + + if len(symbol) > 0 { + if _, localEntrySymbol := d.localEntrySymbols[symbol]; localEntrySymbol && isBranch { + symbol = localEntryName(symbol) + changed = true + } else if _, knownSymbol := d.symbols[symbol]; knownSymbol { + symbol = localTargetName(symbol) + changed = true + } else if !symbolIsLocal && !isSynthesized(symbol) && len(section) == 0 { + changed = true + d.redirectors[symbol] = redirectorName(symbol) + symbol = redirectorName(symbol) + // TODO(davidben): This should sanity-check the next + // instruction is a nop and ideally remove it. + wrappers = append(wrappers, func(k func()) { + k() + // Like the linker's PLT stubs, redirector functions + // expect callers to restore r2. + d.output.WriteString("\tld 2, 24(1)\n") + }) + } + } + + switch section { + case "": + + case "tls": + // This section identifier just tells the + // assembler to use r13, the pointer to the + // thread-local data [PABI;3.7.3.3]. + + case "toc@ha": + // Delete toc@ha instructions. Per + // [PABI;3.6.3], the linker is allowed to erase + // toc@ha instructions. We take advantage of + // this by unconditionally erasing the toc@ha + // instructions and doing the full lookup when + // processing toc@l. + // + // Note that any offset here applies before @ha + // and @l. That is, 42+foo@toc@ha is + // #ha(42+foo-.TOC.), not 42+#ha(foo-.TOC.). Any + // corresponding toc@l references are required + // by the ABI to have the same offset. The + // offset will be incorporated in full when + // those are processed. 
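+				// For example, an "addis 9, 2, 5+foo@toc@ha" is simply
+				// dropped here; the paired toc@l instruction is rewritten
+				// below and carries the full "5+foo" reference itself.
+				// (The register numbers here are only illustrative.)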
+ if instructionName != "addis" || len(argNodes) != 3 || i != 2 || args[1] != "2" { + return nil, errors.New("can't process toc@ha reference") + } + changed = true + instructionName = "" + break Args + + case "toc@l": + // Per [PAB;3.6.3], this instruction must take + // as input a register which was the output of + // a toc@ha computation and compute the actual + // address of some symbol. The toc@ha + // computation was elided, so we ignore that + // input register and compute the address + // directly. + changed = true + + // For all supported toc@l instructions, the + // destination register is the first argument. + destReg := args[0] + + wrappers = append(wrappers, d.loadFromTOC(d.output, symbol, offset, destReg)) + switch instructionName { + case "addi": + // The original instruction was: + // addi destReg, tocHaReg, offset+symbol@toc@l + instructionName = "" + + case "ld", "lhz", "lwz": + // The original instruction was: + // l?? destReg, offset+symbol@toc@l(tocHaReg) + // + // We transform that into the + // equivalent dereference of destReg: + // l?? destReg, 0(destReg) + origInstructionName := instructionName + instructionName = "" + + assertNodeType(memRef, ruleBaseIndexScale) + assertNodeType(memRef.up, ruleRegisterOrConstant) + if memRef.next != nil || memRef.up.next != nil { + return nil, errors.New("expected single register in BaseIndexScale for ld argument") + } + + baseReg := destReg + if baseReg == "0" { + // Register zero is special as the base register for a load. + // Avoid it by spilling and using r3 instead. + baseReg = "3" + wrappers = append(wrappers, func(k func()) { + d.output.WriteString("\taddi 1, 1, -288\n") // Clear the red zone. + d.output.WriteString("\tstd " + baseReg + ", -8(1)\n") + d.output.WriteString("\tmr " + baseReg + ", " + destReg + "\n") + k() + d.output.WriteString("\tld " + baseReg + ", -8(1)\n") + d.output.WriteString("\taddi 1, 1, 288\n") // Clear the red zone. + }) + } + + wrappers = append(wrappers, func(k func()) { + d.output.WriteString("\t" + origInstructionName + " " + destReg + ", 0(" + baseReg + ")\n") + }) + default: + return nil, fmt.Errorf("can't process TOC argument to %q", instructionName) + } + + default: + return nil, fmt.Errorf("Unknown section type %q", section) + } + + argStr := "" + if isIndirect { + argStr += "*" + } + argStr += symbol + if len(offset) > 0 { + argStr += offset + } + if len(section) > 0 { + argStr += "@" + argStr += section + } + + for ; memRef != nil; memRef = memRef.next { + argStr += d.contents(memRef) + } + + args = append(args, argStr) + + default: + panic(fmt.Sprintf("unknown instruction argument type %q", rul3s[arg.pegRule])) + } + } + + if changed { + d.writeCommentedNode(statement) + + var replacement string + if len(instructionName) > 0 { + replacement = "\t" + instructionName + "\t" + strings.Join(args, ", ") + "\n" + } + + wrappers.do(func() { + d.output.WriteString(replacement) + }) + } else { + d.writeNode(statement) + } + + return statement, nil +} + /* Intel */ type instructionType int @@ -1332,6 +1738,8 @@ func writeAarch64Function(w stringWriter func transform(w stringWriter, inputs []inputFile) error { // symbols contains all defined symbols. symbols := make(map[string]struct{}) + // localEntrySymbols contains all symbols with a .localentry directive. + localEntrySymbols := make(map[string]struct{}) // fileNumbers is the set of IDs seen in .file directives. 
fileNumbers := make(map[int]struct{}) // maxObservedFileNumber contains the largest seen file number in a @@ -1355,6 +1763,25 @@ func transform(w stringWriter, inputs [] }, ruleStatement, ruleLabel, ruleSymbolName) forEachPath(input.ast.up, func(node *node32) { + node = node.up + assertNodeType(node, ruleLabelContainingDirectiveName) + directive := input.contents[node.begin:node.end] + if directive != ".localentry" { + return + } + // Extract the first argument. + node = skipWS(node.next) + assertNodeType(node, ruleSymbolArgs) + node = node.up + assertNodeType(node, ruleSymbolArg) + symbol := input.contents[node.begin:node.end] + if _, ok := localEntrySymbols[symbol]; ok { + panic(fmt.Sprintf("Duplicate .localentry directive found: %q in %q", symbol, input.path)) + } + localEntrySymbols[symbol] = struct{}{} + }, ruleStatement, ruleLabelContainingDirective) + + forEachPath(input.ast.up, func(node *node32) { assertNodeType(node, ruleLocationDirective) directive := input.contents[node.begin:node.end] if !strings.HasPrefix(directive, ".file") { @@ -1402,11 +1829,13 @@ func transform(w stringWriter, inputs [] d := &delocation{ symbols: symbols, + localEntrySymbols: localEntrySymbols, processor: processor, commentIndicator: commentIndicator, output: w, redirectors: make(map[string]string), bssAccessorsNeeded: make(map[string]string), + tocLoaders: make(map[string]struct{}), gotExternalsNeeded: make(map[string]struct{}), gotOffsetsNeeded: make(map[string]struct{}), gotOffOffsetsNeeded: make(map[string]struct{}), @@ -1441,6 +1870,22 @@ func transform(w stringWriter, inputs [] for _, name := range redirectorNames { redirector := d.redirectors[name] switch d.processor { + case ppc64le: + w.WriteString(".section \".toc\", \"aw\"\n") + w.WriteString(".Lredirector_toc_" + name + ":\n") + w.WriteString(".quad " + name + "\n") + w.WriteString(".text\n") + w.WriteString(".type " + redirector + ", @function\n") + w.WriteString(redirector + ":\n") + // |name| will clobber r2, so save it. This is matched by a restore in + // redirector calls. + w.WriteString("\tstd 2, 24(1)\n") + // Load and call |name|'s global entry point. 
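+			// The TOC entry written above holds |name|'s full 64-bit
+			// address; loading it into r12 before the bctr also satisfies
+			// the ABI rule that r12 hold the callee's global entry address
+			// on entry [PABI;2.2.1.1].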
+ w.WriteString("\taddis 12, 2, .Lredirector_toc_" + name + "@toc@ha\n") + w.WriteString("\tld 12, .Lredirector_toc_" + name + "@toc@l(12)\n") + w.WriteString("\tmtctr 12\n") + w.WriteString("\tbctr\n") + case aarch64: writeAarch64Function(w, redirector, func(w stringWriter) { w.WriteString("\tb " + name + "\n") @@ -1465,6 +1910,13 @@ func transform(w stringWriter, inputs [] target := d.bssAccessorsNeeded[name] switch d.processor { + case ppc64le: + w.WriteString(".type " + funcName + ", @function\n") + w.WriteString(funcName + ":\n") + w.WriteString("\taddis 3, 2, " + target + "@toc@ha\n") + w.WriteString("\taddi 3, 3, " + target + "@toc@l\n") + w.WriteString("\tblr\n") + case x86_64: w.WriteString(".type " + funcName + ", @function\n") w.WriteString(funcName + ":\n") @@ -1480,6 +1932,26 @@ func transform(w stringWriter, inputs [] } switch d.processor { + case ppc64le: + loadTOCNames := sortedSet(d.tocLoaders) + for _, symbolAndOffset := range loadTOCNames { + parts := strings.SplitN(symbolAndOffset, "\x00", 2) + symbol, offset := parts[0], parts[1] + + funcName := loadTOCFuncName(symbol, offset) + ref := symbol + offset + + w.WriteString(".type " + funcName[2:] + ", @function\n") + w.WriteString(funcName[2:] + ":\n") + w.WriteString(funcName + ":\n") + w.WriteString("\taddis 3, 2, " + ref + "@toc@ha\n") + w.WriteString("\taddi 3, 3, " + ref + "@toc@l\n") + w.WriteString("\tblr\n") + } + + w.WriteString(".LBORINGSSL_external_toc:\n") + w.WriteString(".quad .TOC.-.LBORINGSSL_external_toc\n") + case aarch64: externalNames := sortedSet(d.gotExternalsNeeded) for _, symbol := range externalNames { @@ -1790,6 +2262,10 @@ func localTargetName(name string) string return ".L" + name + "_local_target" } +func localEntryName(name string) string { + return ".L" + name + "_local_entry" +} + func isSynthesized(symbol string) bool { return strings.HasSuffix(symbol, "_bss_get") || symbol == "OPENSSL_ia32cap_get" || @@ -1845,6 +2321,8 @@ func detectProcessor(input inputFile) pr switch instructionName { case "movq", "call", "leaq": return x86_64 + case "addis", "addi", "mflr": + return ppc64le case "str", "bl", "ldr", "st1": return aarch64 } Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate.peg =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/util/fipstools/delocate/delocate.peg +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate.peg @@ -12,7 +12,7 @@ # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -# This is a rough parser for x86-64 and aarch64 assembly designed to work with +# This is a rough parser for x86-64 and ppc64le assembly designed to work with # https://github.com/pointlander/peg. delocate.go has a go:generate line for # rebuilding delocate.peg.go from this file. 
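
The synthesized ".Lbcm_loadtoc_*" helpers that appear throughout the ppc64le
test expectations below get their names from loadTOCFuncName's mangling. As a
standalone illustration (not part of the patch), this sketch copies that
function from the change above and prints two of the names seen in the test
data:

  package main

  import (
          "fmt"
          "strings"
  )

  func loadTOCFuncName(symbol, offset string) string {
          symbol = strings.Replace(symbol, ".", "_dot_", -1)
          ret := ".Lbcm_loadtoc_" + symbol
          if len(offset) != 0 {
                  offset = strings.Replace(offset, "+", "_plus_", -1)
                  offset = strings.Replace(offset, "-", "_minus_", -1)
                  ret += "_" + offset
          }
          return ret
  }

  func main() {
          fmt.Println(loadTOCFuncName("bar", ""))
          // .Lbcm_loadtoc_bar
          fmt.Println(loadTOCFuncName(".Lfoo_local_target", "+10"))
          // .Lbcm_loadtoc__dot_Lfoo_local_target__plus_10
  }
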
Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate_test.go =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/util/fipstools/delocate/delocate_test.go +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/delocate_test.go @@ -39,6 +39,11 @@ func (test *delocateTest) Path(file stri var delocateTests = []delocateTest{ {"generic-FileDirectives", []string{"in.s"}, "out.s"}, + {"ppc64le-GlobalEntry", []string{"in.s"}, "out.s"}, + {"ppc64le-LoadToR0", []string{"in.s"}, "out.s"}, + {"ppc64le-Sample2", []string{"in.s"}, "out.s"}, + {"ppc64le-Sample", []string{"in.s"}, "out.s"}, + {"ppc64le-TOCWithOffset", []string{"in.s"}, "out.s"}, {"x86_64-Basic", []string{"in.s"}, "out.s"}, {"x86_64-BSS", []string{"in.s"}, "out.s"}, {"x86_64-GOTRewrite", []string{"in.s"}, "out.s"}, Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-GlobalEntry/in.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-GlobalEntry/in.s @@ -0,0 +1,9 @@ + .text +foo: +.LCF0: +0: + addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry foo,.-foo +.LVL0: + bl Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-GlobalEntry/out.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-GlobalEntry/out.s @@ -0,0 +1,62 @@ +.text +.file 1 "inserted_by_delocate.c" +.loc 1 1 0 +BORINGSSL_bcm_text_start: + .text +.Lfoo_local_target: +foo: +.LCF0: + +0: + +999: + addis 2, 12, .LBORINGSSL_external_toc-999b@ha + addi 2, 2, .LBORINGSSL_external_toc-999b@l + ld 12, 0(2) + add 2, 2, 12 +# WAS addi 2,2,.TOC.-.LCF0@l + .localentry foo,.-foo +.Lfoo_local_entry: +.LVL0: + + bl +.text +.loc 1 2 0 +BORINGSSL_bcm_text_end: +.LBORINGSSL_external_toc: +.quad .TOC.-.LBORINGSSL_external_toc +.type BORINGSSL_bcm_text_hash, @object +.size BORINGSSL_bcm_text_hash, 32 +BORINGSSL_bcm_text_hash: +.byte 0xae +.byte 0x2c +.byte 0xea +.byte 0x2a +.byte 0xbd +.byte 0xa6 +.byte 0xf3 +.byte 0xec +.byte 0x97 +.byte 0x7f +.byte 0x9b +.byte 0xf6 +.byte 0x94 +.byte 0x9a +.byte 0xfc +.byte 0x83 +.byte 0x68 +.byte 0x27 +.byte 0xcb +.byte 0xa0 +.byte 0xa0 +.byte 0x9f +.byte 0x6b +.byte 0x6f +.byte 0xde +.byte 0x52 +.byte 0xcd +.byte 0xe2 +.byte 0xcd +.byte 0xff +.byte 0x31 +.byte 0x80 Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-LoadToR0/in.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-LoadToR0/in.s @@ -0,0 +1,4 @@ + .text +foo: + addis 22,2,bar@toc@ha + ld 0,bar@toc@l(22) Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-LoadToR0/out.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-LoadToR0/out.s @@ -0,0 +1,72 @@ +.text +.file 1 "inserted_by_delocate.c" +.loc 1 1 0 +BORINGSSL_bcm_text_start: + .text +.Lfoo_local_target: +foo: +# WAS addis 22,2,bar@toc@ha +# WAS ld 0,bar@toc@l(22) + addi 1, 1, -288 + mflr 0 + std 0, -8(1) + std 3, -16(1) + bl 
.Lbcm_loadtoc_bar + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 0, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + addi 1, 1, -288 + std 3, -8(1) + mr 3, 0 + ld 0, 0(3) + ld 3, -8(1) + addi 1, 1, 288 +.text +.loc 1 2 0 +BORINGSSL_bcm_text_end: +.type bcm_loadtoc_bar, @function +bcm_loadtoc_bar: +.Lbcm_loadtoc_bar: + addis 3, 2, bar@toc@ha + addi 3, 3, bar@toc@l + blr +.LBORINGSSL_external_toc: +.quad .TOC.-.LBORINGSSL_external_toc +.type BORINGSSL_bcm_text_hash, @object +.size BORINGSSL_bcm_text_hash, 32 +BORINGSSL_bcm_text_hash: +.byte 0xae +.byte 0x2c +.byte 0xea +.byte 0x2a +.byte 0xbd +.byte 0xa6 +.byte 0xf3 +.byte 0xec +.byte 0x97 +.byte 0x7f +.byte 0x9b +.byte 0xf6 +.byte 0x94 +.byte 0x9a +.byte 0xfc +.byte 0x83 +.byte 0x68 +.byte 0x27 +.byte 0xcb +.byte 0xa0 +.byte 0xa0 +.byte 0x9f +.byte 0x6b +.byte 0x6f +.byte 0xde +.byte 0x52 +.byte 0xcd +.byte 0xe2 +.byte 0xcd +.byte 0xff +.byte 0x31 +.byte 0x80 Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample/in.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample/in.s @@ -0,0 +1,161 @@ + .file "foo.c" + .abiversion 2 + .section ".toc","aw" + .section ".text" + .section .rodata + .align 3 + .type kString, @object + .size kString, 12 +kString: + .string "hello world" + .globl kExportedString + .align 3 + .type kExportedString, @object + .size kExportedString, 26 +kExportedString: + .string "hello world, more visibly" + .align 2 + .type kGiantArray, @object + .size kGiantArray, 400000 +kGiantArray: + .long 1 + .long 0 + .zero 399992 + .lcomm bss,20,4 + .type bss, @object + .align 3 +.LC1: + .string "kString is %p\n" + .align 3 +.LC2: + .string "kExportedString is %p\n" + .align 3 +.LC4: + .string "function is %p\n" + .align 3 +.LC5: + .string "exported_function is %p\n" + .align 3 +.LC7: + .string "&kString[5] is %p\n" + .align 3 +.LC9: + .string "&kGiantArray[0x12345] is %p\n" + .section ".toc","aw" +.LC0: + .quad stderr +.LC3: + .quad kExportedString +.LC6: + .quad exported_function +.LC8: + .quad kString+5 +.LC10: + .quad kGiantArray+298260 + .section ".text" + .align 2 + .type function, @function +function: +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l + .localentry function,.-function + mflr 0 + std 0,16(1) + std 31,-8(1) + stdu 1,-112(1) + mr 31,1 + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC1@toc@ha + addi 4,4,.LC1@toc@l + addis 5,2,kString@toc@ha + addi 5,5,kString@toc@l + bl fprintf + nop + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC2@toc@ha + addi 4,4,.LC2@toc@l + addis 9,2,.LC3@toc@ha + ld 5,.LC3@toc@l(9) + bl fprintf + nop + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC4@toc@ha + addi 4,4,.LC4@toc@l + addis 5,2,function@toc@ha + addi 5,5,function@toc@l + bl fprintf + nop + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC5@toc@ha + addi 4,4,.LC5@toc@l + addis 9,2,.LC6@toc@ha + ld 5,.LC6@toc@l(9) + bl fprintf + nop + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC7@toc@ha + addi 4,4,.LC7@toc@l + addis 9,2,.LC8@toc@ha + ld 5,.LC8@toc@l(9) + bl fprintf + nop + addis 10,2,.LC0@toc@ha + ld 9,.LC0@toc@l(10) + ld 9,0(9) + mr 3,9 + addis 4,2,.LC9@toc@ha + addi 4,4,.LC9@toc@l + addis 9,2,.LC10@toc@ha + ld 5,.LC10@toc@l(9) + bl fprintf + nop + bl exported_function + nop + mr 3,9 + addi 
1,31,112 + ld 0,16(1) + mtlr 0 + ld 31,-8(1) + blr + .long 0 + .byte 0,0,0,1,128,1,0,1 + .size function,.-function + .align 2 + .globl exported_function + .type exported_function, @function +exported_function: +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l + .localentry exported_function,.-exported_function + mflr 0 + std 0,16(1) + std 31,-8(1) + stdu 1,-48(1) + mr 31,1 + bl function + mr 3,9 + addi 1,31,48 + ld 0,16(1) + mtlr 0 + ld 31,-8(1) + blr + .long 0 + .byte 0,0,0,1,128,1,0,1 + .size exported_function,.-exported_function + .ident "GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2" + .section .note.GNU-stack,"",@progbits Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample/out.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample/out.s @@ -0,0 +1,552 @@ +.text +.file 1 "inserted_by_delocate.c" +.loc 1 1 0 +BORINGSSL_bcm_text_start: + .file "foo.c" + .abiversion 2 + .section ".toc","aw" +# WAS .section ".text" +.text +# WAS .section .rodata +.text + .align 3 + .type kString, @object + .size kString, 12 +.LkString_local_target: +kString: + .string "hello world" + .globl kExportedString + .align 3 + .type kExportedString, @object + .size kExportedString, 26 +.LkExportedString_local_target: +kExportedString: + .string "hello world, more visibly" + .align 2 + .type kGiantArray, @object + .size kGiantArray, 400000 +.LkGiantArray_local_target: +kGiantArray: + .long 1 + .long 0 + .zero 399992 + .lcomm bss,20,4 + .type bss, @object + .align 3 +.LC1: + + .string "kString is %p\n" + .align 3 +.LC2: + + .string "kExportedString is %p\n" + .align 3 +.LC4: + + .string "function is %p\n" + .align 3 +.LC5: + + .string "exported_function is %p\n" + .align 3 +.LC7: + + .string "&kString[5] is %p\n" + .align 3 +.LC9: + + .string "&kGiantArray[0x12345] is %p\n" + .section ".toc","aw" +.LC0: + + .quad stderr +.LC3: + + .quad kExportedString +.LC6: + + .quad exported_function +.LC8: + + .quad kString+5 +.LC10: + + .quad kGiantArray+298260 +# WAS .section ".text" +.text + .align 2 + .type function, @function +.Lfunction_local_target: +function: +0: +999: + addis 2, 12, .LBORINGSSL_external_toc-999b@ha + addi 2, 2, .LBORINGSSL_external_toc-999b@l + ld 12, 0(2) + add 2, 2, 12 +# WAS addi 2,2,.TOC.-0b@l + .localentry function,.-function +.Lfunction_local_entry: + mflr 0 + std 0,16(1) + std 31,-8(1) + stdu 1,-112(1) + mr 31,1 +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC1@toc@ha +# WAS addi 4,4,.LC1@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC1 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 5,2,kString@toc@ha +# WAS addi 5,5,kString@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LkString_local_target + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 
288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC2@toc@ha +# WAS addi 4,4,.LC2@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC2 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 9,2,.LC3@toc@ha +# WAS ld 5,.LC3@toc@l(9) + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC3 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 5, 0(5) +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC4@toc@ha +# WAS addi 4,4,.LC4@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC4 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 5,2,function@toc@ha +# WAS addi 5,5,function@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfunction_local_target + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC5@toc@ha +# WAS addi 4,4,.LC5@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC5 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 9,2,.LC6@toc@ha +# WAS ld 5,.LC6@toc@l(9) + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC6 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 5, 0(5) +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC7@toc@ha +# WAS addi 4,4,.LC7@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC7 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 9,2,.LC8@toc@ha +# WAS ld 5,.LC8@toc@l(9) + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC8 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 5, 0(5) +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS addis 10,2,.LC0@toc@ha +# WAS ld 9,.LC0@toc@l(10) + addi 1, 1, -288 + mflr 9 + std 9, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 9, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 9, 0(9) + ld 9,0(9) + mr 3,9 +# WAS addis 4,2,.LC9@toc@ha +# WAS addi 4,4,.LC9@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC9 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 9,2,.LC10@toc@ha +# WAS ld 5,.LC10@toc@l(9) + addi 1, 1, -288 + mflr 5 + 
std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC10 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 5, 0(5) +# WAS bl fprintf + bl bcm_redirector_fprintf + ld 2, 24(1) + nop +# WAS bl exported_function + bl .Lexported_function_local_entry + nop + mr 3,9 + addi 1,31,112 + ld 0,16(1) + mtlr 0 + ld 31,-8(1) + blr + .long 0 + .byte 0,0,0,1,128,1,0,1 + .size function,.-function + .align 2 + .globl exported_function + .type exported_function, @function +.Lexported_function_local_target: +exported_function: +0: +999: + addis 2, 12, .LBORINGSSL_external_toc-999b@ha + addi 2, 2, .LBORINGSSL_external_toc-999b@l + ld 12, 0(2) + add 2, 2, 12 +# WAS addi 2,2,.TOC.-0b@l + .localentry exported_function,.-exported_function +.Lexported_function_local_entry: + mflr 0 + std 0,16(1) + std 31,-8(1) + stdu 1,-48(1) + mr 31,1 +# WAS bl function + bl .Lfunction_local_entry + mr 3,9 + addi 1,31,48 + ld 0,16(1) + mtlr 0 + ld 31,-8(1) + blr + .long 0 + .byte 0,0,0,1,128,1,0,1 + .size exported_function,.-exported_function + .ident "GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2" + .section .note.GNU-stack,"",@progbits +.text +.loc 1 2 0 +BORINGSSL_bcm_text_end: +.section ".toc", "aw" +.Lredirector_toc_fprintf: +.quad fprintf +.text +.type bcm_redirector_fprintf, @function +bcm_redirector_fprintf: + std 2, 24(1) + addis 12, 2, .Lredirector_toc_fprintf@toc@ha + ld 12, .Lredirector_toc_fprintf@toc@l(12) + mtctr 12 + bctr +.type bss_bss_get, @function +bss_bss_get: + addis 3, 2, bss@toc@ha + addi 3, 3, bss@toc@l + blr +.type bcm_loadtoc__dot_LC0, @function +bcm_loadtoc__dot_LC0: +.Lbcm_loadtoc__dot_LC0: + addis 3, 2, .LC0@toc@ha + addi 3, 3, .LC0@toc@l + blr +.type bcm_loadtoc__dot_LC1, @function +bcm_loadtoc__dot_LC1: +.Lbcm_loadtoc__dot_LC1: + addis 3, 2, .LC1@toc@ha + addi 3, 3, .LC1@toc@l + blr +.type bcm_loadtoc__dot_LC10, @function +bcm_loadtoc__dot_LC10: +.Lbcm_loadtoc__dot_LC10: + addis 3, 2, .LC10@toc@ha + addi 3, 3, .LC10@toc@l + blr +.type bcm_loadtoc__dot_LC2, @function +bcm_loadtoc__dot_LC2: +.Lbcm_loadtoc__dot_LC2: + addis 3, 2, .LC2@toc@ha + addi 3, 3, .LC2@toc@l + blr +.type bcm_loadtoc__dot_LC3, @function +bcm_loadtoc__dot_LC3: +.Lbcm_loadtoc__dot_LC3: + addis 3, 2, .LC3@toc@ha + addi 3, 3, .LC3@toc@l + blr +.type bcm_loadtoc__dot_LC4, @function +bcm_loadtoc__dot_LC4: +.Lbcm_loadtoc__dot_LC4: + addis 3, 2, .LC4@toc@ha + addi 3, 3, .LC4@toc@l + blr +.type bcm_loadtoc__dot_LC5, @function +bcm_loadtoc__dot_LC5: +.Lbcm_loadtoc__dot_LC5: + addis 3, 2, .LC5@toc@ha + addi 3, 3, .LC5@toc@l + blr +.type bcm_loadtoc__dot_LC6, @function +bcm_loadtoc__dot_LC6: +.Lbcm_loadtoc__dot_LC6: + addis 3, 2, .LC6@toc@ha + addi 3, 3, .LC6@toc@l + blr +.type bcm_loadtoc__dot_LC7, @function +bcm_loadtoc__dot_LC7: +.Lbcm_loadtoc__dot_LC7: + addis 3, 2, .LC7@toc@ha + addi 3, 3, .LC7@toc@l + blr +.type bcm_loadtoc__dot_LC8, @function +bcm_loadtoc__dot_LC8: +.Lbcm_loadtoc__dot_LC8: + addis 3, 2, .LC8@toc@ha + addi 3, 3, .LC8@toc@l + blr +.type bcm_loadtoc__dot_LC9, @function +bcm_loadtoc__dot_LC9: +.Lbcm_loadtoc__dot_LC9: + addis 3, 2, .LC9@toc@ha + addi 3, 3, .LC9@toc@l + blr +.type bcm_loadtoc__dot_Lfunction_local_target, @function +bcm_loadtoc__dot_Lfunction_local_target: +.Lbcm_loadtoc__dot_Lfunction_local_target: + addis 3, 2, .Lfunction_local_target@toc@ha + addi 3, 3, .Lfunction_local_target@toc@l + blr +.type bcm_loadtoc__dot_LkString_local_target, @function +bcm_loadtoc__dot_LkString_local_target: +.Lbcm_loadtoc__dot_LkString_local_target: + addis 3, 2, .LkString_local_target@toc@ha + 
addi 3, 3, .LkString_local_target@toc@l + blr +.LBORINGSSL_external_toc: +.quad .TOC.-.LBORINGSSL_external_toc +.type BORINGSSL_bcm_text_hash, @object +.size BORINGSSL_bcm_text_hash, 32 +BORINGSSL_bcm_text_hash: +.byte 0xae +.byte 0x2c +.byte 0xea +.byte 0x2a +.byte 0xbd +.byte 0xa6 +.byte 0xf3 +.byte 0xec +.byte 0x97 +.byte 0x7f +.byte 0x9b +.byte 0xf6 +.byte 0x94 +.byte 0x9a +.byte 0xfc +.byte 0x83 +.byte 0x68 +.byte 0x27 +.byte 0xcb +.byte 0xa0 +.byte 0xa0 +.byte 0x9f +.byte 0x6b +.byte 0x6f +.byte 0xde +.byte 0x52 +.byte 0xcd +.byte 0xe2 +.byte 0xcd +.byte 0xff +.byte 0x31 +.byte 0x80 Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample2/in.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample2/in.s @@ -0,0 +1,226 @@ + .file "foo.c" + .abiversion 2 + .section ".toc","aw" + .section ".text" + .section ".toc","aw" +.LC0: + .quad stderr +.LC3: + .quad kExportedString +.LC6: + .quad exported_function + .section ".text" + .align 2 + .p2align 4,,15 + .globl exported_function + .type exported_function, @function +exported_function: +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l + .localentry exported_function,.-exported_function + mflr 0 + std 19,-104(1) + std 20,-96(1) + std 21,-88(1) + std 22,-80(1) + addis 21,2,.LC1@toc@ha + addis 22,2,.LC2@toc@ha + std 23,-72(1) + std 24,-64(1) + addis 23,2,.LC4@toc@ha + addis 24,2,function@toc@ha + std 25,-56(1) + std 26,-48(1) + addis 25,2,.LC5@toc@ha + addis 26,2,.LC7@toc@ha + std 27,-40(1) + std 28,-32(1) + addis 28,2,.LC8@toc@ha + addi 21,21,.LC1@toc@l + std 29,-24(1) + std 30,-16(1) + addis 29,2,.LANCHOR0@toc@ha + addi 22,22,.LC2@toc@l + std 31,-8(1) + std 0,16(1) + addi 29,29,.LANCHOR0@toc@l + addi 23,23,.LC4@toc@l + stdu 1,-208(1) + addis 31,2,.LC0@toc@ha # gpr load fusion, type long + ld 31,.LC0@toc@l(31) + addis 19,2,.LC3@toc@ha # gpr load fusion, type long + ld 19,.LC3@toc@l(19) + addis 30,29,0x5 + addi 24,24,function@toc@l + addis 20,2,.LC6@toc@ha # gpr load fusion, type long + ld 20,.LC6@toc@l(20) + addi 25,25,.LC5@toc@l + addi 26,26,.LC7@toc@l + addi 27,29,5 + addi 28,28,.LC8@toc@l + addi 30,30,-29404 + .p2align 4,,15 +.L2: + ld 3,0(31) + mr 5,21 + mr 6,29 + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + mr 5,22 + mr 6,19 + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + mr 5,23 + mr 6,24 + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + mr 5,25 + mr 6,20 + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + mr 5,26 + mr 6,27 + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + li 4,1 + mr 5,28 + mr 6,30 + bl __fprintf_chk + nop + b .L2 + .long 0 + .byte 0,0,0,1,128,13,0,0 + .size exported_function,.-exported_function + .section ".toc","aw" + .set .LC11,.LC0 + .set .LC12,.LC3 + .set .LC13,.LC6 + .section ".text" + .align 2 + .p2align 4,,15 + .type function, @function +function: +0: addis 2,12,.TOC.-0b@ha + addi 2,2,.TOC.-0b@l + .localentry function,.-function + mflr 0 + std 31,-8(1) + addis 31,2,.LC11@toc@ha # gpr load fusion, type long + ld 31,.LC11@toc@l(31) + addis 5,2,.LC1@toc@ha + std 30,-16(1) + addis 30,2,.LANCHOR0@toc@ha + addi 5,5,.LC1@toc@l + addi 30,30,.LANCHOR0@toc@l + li 4,1 + mr 6,30 + std 0,16(1) + stdu 1,-112(1) + ld 3,0(31) + bl __fprintf_chk + nop + addis 6,2,.LC12@toc@ha # gpr load fusion, type long + ld 6,.LC12@toc@l(6) + ld 3,0(31) + addis 5,2,.LC2@toc@ha + li 4,1 + addi 5,5,.LC2@toc@l + bl __fprintf_chk + nop + ld 3,0(31) + addis 5,2,.LC4@toc@ha + 
addis 6,2,function@toc@ha + addi 5,5,.LC4@toc@l + addi 6,6,function@toc@l + li 4,1 + bl __fprintf_chk + nop + addis 6,2,.LC13@toc@ha # gpr load fusion, type long + ld 6,.LC13@toc@l(6) + ld 3,0(31) + addis 5,2,.LC5@toc@ha + li 4,1 + addi 5,5,.LC5@toc@l + bl __fprintf_chk + nop + ld 3,0(31) + addis 5,2,.LC7@toc@ha + addi 6,30,5 + addi 5,5,.LC7@toc@l + li 4,1 + bl __fprintf_chk + nop + ld 3,0(31) + addis 6,30,0x5 + addis 5,2,.LC8@toc@ha + li 4,1 + addi 5,5,.LC8@toc@l + addi 6,6,-29404 + bl __fprintf_chk + nop + bl exported_function + nop + addi 1,1,112 + ld 0,16(1) + ld 30,-16(1) + ld 31,-8(1) + mtlr 0 + blr + .long 0 + .byte 0,0,0,1,128,2,0,0 + .size function,.-function + .globl kExportedString + .section .rodata + .align 4 + .set .LANCHOR0,. + 0 + .type kString, @object + .size kString, 12 +kString: + .string "hello world" + .zero 4 + .type kGiantArray, @object + .size kGiantArray, 400000 +kGiantArray: + .long 1 + .long 0 + .zero 399992 + .type kExportedString, @object + .size kExportedString, 26 +kExportedString: + .string "hello world, more visibly" + .section .rodata.str1.8,"aMS",@progbits,1 + .align 3 +.LC1: + .string "kString is %p\n" + .zero 1 +.LC2: + .string "kExportedString is %p\n" + .zero 1 +.LC4: + .string "function is %p\n" +.LC5: + .string "exported_function is %p\n" + .zero 7 +.LC7: + .string "&kString[5] is %p\n" + .zero 5 +.LC8: + .string "&kGiantArray[0x12345] is %p\n" + .section ".bss" + .align 2 + .type bss, @object + .size bss, 20 +bss: + .zero 20 + .ident "GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2" + .section .note.GNU-stack,"",@progbits Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample2/out.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-Sample2/out.s @@ -0,0 +1,677 @@ +.text +.file 1 "inserted_by_delocate.c" +.loc 1 1 0 +BORINGSSL_bcm_text_start: + .file "foo.c" + .abiversion 2 + .section ".toc","aw" +# WAS .section ".text" +.text + .section ".toc","aw" +.LC0: + + .quad stderr +.LC3: + + .quad kExportedString +.LC6: + + .quad exported_function +# WAS .section ".text" +.text + .align 2 + .p2align 4,,15 + .globl exported_function + .type exported_function, @function +.Lexported_function_local_target: +exported_function: +0: +999: + addis 2, 12, .LBORINGSSL_external_toc-999b@ha + addi 2, 2, .LBORINGSSL_external_toc-999b@l + ld 12, 0(2) + add 2, 2, 12 +# WAS addi 2,2,.TOC.-0b@l + .localentry exported_function,.-exported_function +.Lexported_function_local_entry: + mflr 0 + std 19,-104(1) + std 20,-96(1) + std 21,-88(1) + std 22,-80(1) +# WAS addis 21,2,.LC1@toc@ha +# WAS addis 22,2,.LC2@toc@ha + std 23,-72(1) + std 24,-64(1) +# WAS addis 23,2,.LC4@toc@ha +# WAS addis 24,2,function@toc@ha + std 25,-56(1) + std 26,-48(1) +# WAS addis 25,2,.LC5@toc@ha +# WAS addis 26,2,.LC7@toc@ha + std 27,-40(1) + std 28,-32(1) +# WAS addis 28,2,.LC8@toc@ha +# WAS addi 21,21,.LC1@toc@l + addi 1, 1, -288 + mflr 21 + std 21, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC1 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 21, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + std 29,-24(1) + std 30,-16(1) +# WAS addis 29,2,.LANCHOR0@toc@ha +# WAS addi 22,22,.LC2@toc@l + addi 1, 1, -288 + mflr 22 + std 22, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC2 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 22, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + std 31,-8(1) + std 0,16(1) +# WAS addi 29,29,.LANCHOR0@toc@l + addi 1, 1, -288 + mflr 29 
+ std 29, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LANCHOR0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 29, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addi 23,23,.LC4@toc@l + addi 1, 1, -288 + mflr 23 + std 23, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC4 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 23, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + stdu 1,-208(1) +# WAS addis 31,2,.LC0@toc@ha # gpr load fusion, type long +# WAS ld 31,.LC0@toc@l(31) + addi 1, 1, -288 + mflr 31 + std 31, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 31, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 31, 0(31) +# WAS addis 19,2,.LC3@toc@ha # gpr load fusion, type long +# WAS ld 19,.LC3@toc@l(19) + addi 1, 1, -288 + mflr 19 + std 19, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC3 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 19, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 19, 0(19) + addis 30,29,0x5 +# WAS addi 24,24,function@toc@l + addi 1, 1, -288 + mflr 24 + std 24, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfunction_local_target + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 24, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addis 20,2,.LC6@toc@ha # gpr load fusion, type long +# WAS ld 20,.LC6@toc@l(20) + addi 1, 1, -288 + mflr 20 + std 20, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC6 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 20, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 20, 0(20) +# WAS addi 25,25,.LC5@toc@l + addi 1, 1, -288 + mflr 25 + std 25, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC5 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 25, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addi 26,26,.LC7@toc@l + addi 1, 1, -288 + mflr 26 + std 26, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC7 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 26, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + addi 27,29,5 +# WAS addi 28,28,.LC8@toc@l + addi 1, 1, -288 + mflr 28 + std 28, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC8 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 28, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + addi 30,30,-29404 + .p2align 4,,15 +.L2: + + ld 3,0(31) + mr 5,21 + mr 6,29 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + mr 5,22 + mr 6,19 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + mr 5,23 + mr 6,24 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + mr 5,25 + mr 6,20 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + mr 5,26 + mr 6,27 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + li 4,1 + mr 5,28 + mr 6,30 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + b .L2 + .long 0 + .byte 0,0,0,1,128,13,0,0 + .size exported_function,.-exported_function + .section ".toc","aw" + .set .LC11,.LC0 + .set .LC12,.LC3 + .set .LC13,.LC6 +# WAS .section ".text" +.text + .align 2 + .p2align 4,,15 + .type function, @function +.Lfunction_local_target: +function: +0: +999: + addis 2, 12, .LBORINGSSL_external_toc-999b@ha + addi 2, 2, .LBORINGSSL_external_toc-999b@l + ld 12, 0(2) + add 2, 2, 12 +# WAS addi 2,2,.TOC.-0b@l + .localentry function,.-function +.Lfunction_local_entry: + mflr 0 + std 31,-8(1) +# WAS addis 31,2,.LC11@toc@ha # gpr load fusion, type long +# WAS ld 31,.LC11@toc@l(31) + addi 1, 1, -288 + mflr 31 + std 31, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC11 + 
std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 31, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 31, 0(31) +# WAS addis 5,2,.LC1@toc@ha + std 30,-16(1) +# WAS addis 30,2,.LANCHOR0@toc@ha +# WAS addi 5,5,.LC1@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC1 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addi 30,30,.LANCHOR0@toc@l + addi 1, 1, -288 + mflr 30 + std 30, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LANCHOR0 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 30, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + li 4,1 + mr 6,30 + std 0,16(1) + stdu 1,-112(1) + ld 3,0(31) +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop +# WAS addis 6,2,.LC12@toc@ha # gpr load fusion, type long +# WAS ld 6,.LC12@toc@l(6) + addi 1, 1, -288 + mflr 6 + std 6, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC12 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 6, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 6, 0(6) + ld 3,0(31) +# WAS addis 5,2,.LC2@toc@ha + li 4,1 +# WAS addi 5,5,.LC2@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC2 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) +# WAS addis 5,2,.LC4@toc@ha +# WAS addis 6,2,function@toc@ha +# WAS addi 5,5,.LC4@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC4 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS addi 6,6,function@toc@l + addi 1, 1, -288 + mflr 6 + std 6, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfunction_local_target + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 6, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop +# WAS addis 6,2,.LC13@toc@ha # gpr load fusion, type long +# WAS ld 6,.LC13@toc@l(6) + addi 1, 1, -288 + mflr 6 + std 6, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC13 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 6, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 6, 0(6) + ld 3,0(31) +# WAS addis 5,2,.LC5@toc@ha + li 4,1 +# WAS addi 5,5,.LC5@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC5 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) +# WAS addis 5,2,.LC7@toc@ha + addi 6,30,5 +# WAS addi 5,5,.LC7@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC7 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + li 4,1 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop + ld 3,0(31) + addis 6,30,0x5 +# WAS addis 5,2,.LC8@toc@ha + li 4,1 +# WAS addi 5,5,.LC8@toc@l + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_LC8 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + addi 6,6,-29404 +# WAS bl __fprintf_chk + bl bcm_redirector___fprintf_chk + ld 2, 24(1) + nop +# WAS bl exported_function + bl .Lexported_function_local_entry + nop + addi 1,1,112 + ld 0,16(1) + ld 30,-16(1) + ld 31,-8(1) + mtlr 0 + blr + .long 0 + .byte 0,0,0,1,128,2,0,0 + .size function,.-function + .globl kExportedString +# WAS .section .rodata +.text + .align 4 + .set .LANCHOR0,. 
+ 0 + .type kString, @object + .size kString, 12 +.LkString_local_target: +kString: + .string "hello world" + .zero 4 + .type kGiantArray, @object + .size kGiantArray, 400000 +.LkGiantArray_local_target: +kGiantArray: + .long 1 + .long 0 + .zero 399992 + .type kExportedString, @object + .size kExportedString, 26 +.LkExportedString_local_target: +kExportedString: + .string "hello world, more visibly" +# WAS .section .rodata.str1.8,"aMS",@progbits,1 +.text + .align 3 +.LC1: + + .string "kString is %p\n" + .zero 1 +.LC2: + + .string "kExportedString is %p\n" + .zero 1 +.LC4: + + .string "function is %p\n" +.LC5: + + .string "exported_function is %p\n" + .zero 7 +.LC7: + + .string "&kString[5] is %p\n" + .zero 5 +.LC8: + + .string "&kGiantArray[0x12345] is %p\n" + .section ".bss" + .align 2 + .type bss, @object + .size bss, 20 +bss: +.Lbss_local_target: + + .zero 20 + .ident "GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2" + .section .note.GNU-stack,"",@progbits +.text +.loc 1 2 0 +BORINGSSL_bcm_text_end: +.section ".toc", "aw" +.Lredirector_toc___fprintf_chk: +.quad __fprintf_chk +.text +.type bcm_redirector___fprintf_chk, @function +bcm_redirector___fprintf_chk: + std 2, 24(1) + addis 12, 2, .Lredirector_toc___fprintf_chk@toc@ha + ld 12, .Lredirector_toc___fprintf_chk@toc@l(12) + mtctr 12 + bctr +.type bss_bss_get, @function +bss_bss_get: + addis 3, 2, .Lbss_local_target@toc@ha + addi 3, 3, .Lbss_local_target@toc@l + blr +.type bcm_loadtoc__dot_LANCHOR0, @function +bcm_loadtoc__dot_LANCHOR0: +.Lbcm_loadtoc__dot_LANCHOR0: + addis 3, 2, .LANCHOR0@toc@ha + addi 3, 3, .LANCHOR0@toc@l + blr +.type bcm_loadtoc__dot_LC0, @function +bcm_loadtoc__dot_LC0: +.Lbcm_loadtoc__dot_LC0: + addis 3, 2, .LC0@toc@ha + addi 3, 3, .LC0@toc@l + blr +.type bcm_loadtoc__dot_LC1, @function +bcm_loadtoc__dot_LC1: +.Lbcm_loadtoc__dot_LC1: + addis 3, 2, .LC1@toc@ha + addi 3, 3, .LC1@toc@l + blr +.type bcm_loadtoc__dot_LC11, @function +bcm_loadtoc__dot_LC11: +.Lbcm_loadtoc__dot_LC11: + addis 3, 2, .LC11@toc@ha + addi 3, 3, .LC11@toc@l + blr +.type bcm_loadtoc__dot_LC12, @function +bcm_loadtoc__dot_LC12: +.Lbcm_loadtoc__dot_LC12: + addis 3, 2, .LC12@toc@ha + addi 3, 3, .LC12@toc@l + blr +.type bcm_loadtoc__dot_LC13, @function +bcm_loadtoc__dot_LC13: +.Lbcm_loadtoc__dot_LC13: + addis 3, 2, .LC13@toc@ha + addi 3, 3, .LC13@toc@l + blr +.type bcm_loadtoc__dot_LC2, @function +bcm_loadtoc__dot_LC2: +.Lbcm_loadtoc__dot_LC2: + addis 3, 2, .LC2@toc@ha + addi 3, 3, .LC2@toc@l + blr +.type bcm_loadtoc__dot_LC3, @function +bcm_loadtoc__dot_LC3: +.Lbcm_loadtoc__dot_LC3: + addis 3, 2, .LC3@toc@ha + addi 3, 3, .LC3@toc@l + blr +.type bcm_loadtoc__dot_LC4, @function +bcm_loadtoc__dot_LC4: +.Lbcm_loadtoc__dot_LC4: + addis 3, 2, .LC4@toc@ha + addi 3, 3, .LC4@toc@l + blr +.type bcm_loadtoc__dot_LC5, @function +bcm_loadtoc__dot_LC5: +.Lbcm_loadtoc__dot_LC5: + addis 3, 2, .LC5@toc@ha + addi 3, 3, .LC5@toc@l + blr +.type bcm_loadtoc__dot_LC6, @function +bcm_loadtoc__dot_LC6: +.Lbcm_loadtoc__dot_LC6: + addis 3, 2, .LC6@toc@ha + addi 3, 3, .LC6@toc@l + blr +.type bcm_loadtoc__dot_LC7, @function +bcm_loadtoc__dot_LC7: +.Lbcm_loadtoc__dot_LC7: + addis 3, 2, .LC7@toc@ha + addi 3, 3, .LC7@toc@l + blr +.type bcm_loadtoc__dot_LC8, @function +bcm_loadtoc__dot_LC8: +.Lbcm_loadtoc__dot_LC8: + addis 3, 2, .LC8@toc@ha + addi 3, 3, .LC8@toc@l + blr +.type bcm_loadtoc__dot_Lfunction_local_target, @function +bcm_loadtoc__dot_Lfunction_local_target: +.Lbcm_loadtoc__dot_Lfunction_local_target: + addis 3, 2, .Lfunction_local_target@toc@ha + addi 3, 3, 
.Lfunction_local_target@toc@l + blr +.LBORINGSSL_external_toc: +.quad .TOC.-.LBORINGSSL_external_toc +.type BORINGSSL_bcm_text_hash, @object +.size BORINGSSL_bcm_text_hash, 32 +BORINGSSL_bcm_text_hash: +.byte 0xae +.byte 0x2c +.byte 0xea +.byte 0x2a +.byte 0xbd +.byte 0xa6 +.byte 0xf3 +.byte 0xec +.byte 0x97 +.byte 0x7f +.byte 0x9b +.byte 0xf6 +.byte 0x94 +.byte 0x9a +.byte 0xfc +.byte 0x83 +.byte 0x68 +.byte 0x27 +.byte 0xcb +.byte 0xa0 +.byte 0xa0 +.byte 0x9f +.byte 0x6b +.byte 0x6f +.byte 0xde +.byte 0x52 +.byte 0xcd +.byte 0xe2 +.byte 0xcd +.byte 0xff +.byte 0x31 +.byte 0x80 Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-TOCWithOffset/in.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-TOCWithOffset/in.s @@ -0,0 +1,23 @@ + .text +foo: + # TOC references may have offsets. + addis 3, 2, 5+foo@toc@ha + addi 3, 3, 10+foo@toc@l + + addis 3, 2, 15+foo@toc@ha + addi 3, 3, 20+foo@toc@l + + addis 4, 2, foo@toc@ha + addi 4, 4, foo@toc@l + + addis 5, 2, 5+foo@toc@ha + ld 5, 10+foo@toc@l(5) + + addis 4, 2, foo-10@toc@ha + addi 4, 4, foo-10@toc@l + + addis 4, 2, foo@toc@ha+25 + addi 4, 4, foo@toc@l+25 + + addis 4, 2, 1+foo-2@toc@ha+3 + addi 4, 4, 1+foo-2@toc@l+3 Index: chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-TOCWithOffset/out.s =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/src/util/fipstools/delocate/testdata/ppc64le-TOCWithOffset/out.s @@ -0,0 +1,178 @@ +.text +.file 1 "inserted_by_delocate.c" +.loc 1 1 0 +BORINGSSL_bcm_text_start: + .text +.Lfoo_local_target: +foo: + # TOC references may have offsets. 
+# WAS addis 3, 2, 5+foo@toc@ha +# WAS addi 3, 3, 10+foo@toc@l + addi 1, 1, -288 + mflr 3 + std 3, -8(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__plus_10 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 3, -24(1) + addi 1, 1, 288 + +# WAS addis 3, 2, 15+foo@toc@ha +# WAS addi 3, 3, 20+foo@toc@l + addi 1, 1, -288 + mflr 3 + std 3, -8(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__plus_20 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 3, -24(1) + addi 1, 1, 288 + +# WAS addis 4, 2, foo@toc@ha +# WAS addi 4, 4, foo@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + +# WAS addis 5, 2, 5+foo@toc@ha +# WAS ld 5, 10+foo@toc@l(5) + addi 1, 1, -288 + mflr 5 + std 5, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__plus_10 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 5, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + ld 5, 0(5) + +# WAS addis 4, 2, foo-10@toc@ha +# WAS addi 4, 4, foo-10@toc@l + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__minus_10 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + +# WAS addis 4, 2, foo@toc@ha+25 +# WAS addi 4, 4, foo@toc@l+25 + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__plus_25 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 + +# WAS addis 4, 2, 1+foo-2@toc@ha+3 +# WAS addi 4, 4, 1+foo-2@toc@l+3 + addi 1, 1, -288 + mflr 4 + std 4, -8(1) + std 3, -16(1) + bl .Lbcm_loadtoc__dot_Lfoo_local_target__plus_1_minus_2_plus_3 + std 3, -24(1) + ld 3, -8(1) + mtlr 3 + ld 4, -24(1) + ld 3, -16(1) + addi 1, 1, 288 +.text +.loc 1 2 0 +BORINGSSL_bcm_text_end: +.type bcm_loadtoc__dot_Lfoo_local_target, @function +bcm_loadtoc__dot_Lfoo_local_target: +.Lbcm_loadtoc__dot_Lfoo_local_target: + addis 3, 2, .Lfoo_local_target@toc@ha + addi 3, 3, .Lfoo_local_target@toc@l + blr +.type bcm_loadtoc__dot_Lfoo_local_target__plus_1_minus_2_plus_3, @function +bcm_loadtoc__dot_Lfoo_local_target__plus_1_minus_2_plus_3: +.Lbcm_loadtoc__dot_Lfoo_local_target__plus_1_minus_2_plus_3: + addis 3, 2, .Lfoo_local_target+1-2+3@toc@ha + addi 3, 3, .Lfoo_local_target+1-2+3@toc@l + blr +.type bcm_loadtoc__dot_Lfoo_local_target__plus_10, @function +bcm_loadtoc__dot_Lfoo_local_target__plus_10: +.Lbcm_loadtoc__dot_Lfoo_local_target__plus_10: + addis 3, 2, .Lfoo_local_target+10@toc@ha + addi 3, 3, .Lfoo_local_target+10@toc@l + blr +.type bcm_loadtoc__dot_Lfoo_local_target__plus_20, @function +bcm_loadtoc__dot_Lfoo_local_target__plus_20: +.Lbcm_loadtoc__dot_Lfoo_local_target__plus_20: + addis 3, 2, .Lfoo_local_target+20@toc@ha + addi 3, 3, .Lfoo_local_target+20@toc@l + blr +.type bcm_loadtoc__dot_Lfoo_local_target__plus_25, @function +bcm_loadtoc__dot_Lfoo_local_target__plus_25: +.Lbcm_loadtoc__dot_Lfoo_local_target__plus_25: + addis 3, 2, .Lfoo_local_target+25@toc@ha + addi 3, 3, .Lfoo_local_target+25@toc@l + blr +.type bcm_loadtoc__dot_Lfoo_local_target__minus_10, @function +bcm_loadtoc__dot_Lfoo_local_target__minus_10: +.Lbcm_loadtoc__dot_Lfoo_local_target__minus_10: + addis 3, 2, .Lfoo_local_target-10@toc@ha + addi 3, 3, .Lfoo_local_target-10@toc@l + blr +.LBORINGSSL_external_toc: +.quad .TOC.-.LBORINGSSL_external_toc +.type BORINGSSL_bcm_text_hash, @object +.size BORINGSSL_bcm_text_hash, 32 +BORINGSSL_bcm_text_hash: +.byte 0xae +.byte 0x2c +.byte 0xea +.byte 0x2a +.byte 0xbd +.byte 0xa6 
+.byte 0xf3 +.byte 0xec +.byte 0x97 +.byte 0x7f +.byte 0x9b +.byte 0xf6 +.byte 0x94 +.byte 0x9a +.byte 0xfc +.byte 0x83 +.byte 0x68 +.byte 0x27 +.byte 0xcb +.byte 0xa0 +.byte 0xa0 +.byte 0x9f +.byte 0x6b +.byte 0x6f +.byte 0xde +.byte 0x52 +.byte 0xcd +.byte 0xe2 +.byte 0xcd +.byte 0xff +.byte 0x31 +.byte 0x80 Index: chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c +++ chromium-128.0.6613.113/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c @@ -416,6 +416,10 @@ static void sha1_block_data_order(uint32 return; } #endif +#if defined(SHA1_ASM_PPC64) + sha1_block_data_order_ppc64(state, data, num); + return; +#endif sha1_block_data_order_nohw(state, data, num); } Index: chromium-128.0.6613.113/third_party/boringssl/src/build.json =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/build.json +++ chromium-128.0.6613.113/third_party/boringssl/src/build.json @@ -122,6 +122,10 @@ {"src": "crypto/fipsmodule/sha/asm/sha512-armv4.pl"}, {"src": "crypto/fipsmodule/aes/asm/vpaes-armv7.pl"} ], + "perlasm_ppc64le": [ + {"src": "crypto/fipsmodule/aes/asm/aesp8-ppc.pl"}, + {"src": "crypto/fipsmodule/modes/asm/ghashp8-ppc.pl"} + ], "perlasm_x86": [ {"src": "crypto/fipsmodule/aes/asm/aesni-x86.pl"}, {"src": "crypto/fipsmodule/bn/asm/bn-586.pl"}, @@ -225,6 +229,7 @@ "crypto/cpu_arm_freebsd.c", "crypto/cpu_arm_linux.c", "crypto/cpu_intel.c", + "crypto/cpu_ppc64le.c", "crypto/crypto.c", "crypto/curve25519/curve25519.c", "crypto/curve25519/curve25519_64_adx.c", @@ -775,6 +780,9 @@ "perlasm_arm": [ {"src": "crypto/test/asm/trampoline-armv4.pl"} ], + "perlasm_ppc64le": [ + {"src": "crypto/test/asm/trampoline-ppc.pl"} + ], "perlasm_x86": [ {"src": "crypto/test/asm/trampoline-x86.pl"} ], Index: chromium-128.0.6613.113/third_party/boringssl/src/util/pregenerate/build.go =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/src/util/pregenerate/build.go +++ chromium-128.0.6613.113/third_party/boringssl/src/util/pregenerate/build.go @@ -38,6 +38,7 @@ type InputTarget struct { // architecture. 
PerlasmAarch64 []PerlasmSource `json:"perlasm_aarch64,omitempty"` PerlasmArm []PerlasmSource `json:"perlasm_arm,omitempty"` + PerlasmPPC64LE []PerlasmSource `json:"perlasm_ppc64le,omitempty"` PerlasmX86 []PerlasmSource `json:"perlasm_x86,omitempty"` PerlasmX86_64 []PerlasmSource `json:"perlasm_x86_64,omitempty"` } @@ -116,6 +117,9 @@ func (in *InputTarget) Pregenerate(name for _, p := range in.PerlasmArm { addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux32"}) } + for _, p := range in.PerlasmPPC64LE { + addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"linux64le"}) + } for _, p := range in.PerlasmX86 { addPerlasmTask(&out.Asm, &p, "-apple.S", []string{"macosx", "-fPIC"}) addPerlasmTask(&out.Asm, &p, "-linux.S", []string{"elf", "-fPIC"}) Index: chromium-128.0.6613.113/third_party/boringssl/BUILD.generated.gni =================================================================== --- chromium-128.0.6613.113.orig/third_party/boringssl/BUILD.generated.gni +++ chromium-128.0.6613.113/third_party/boringssl/BUILD.generated.gni @@ -92,6 +92,7 @@ crypto_sources = [ "src/crypto/cpu_arm_linux.c", "src/crypto/cpu_arm_linux.h", "src/crypto/cpu_intel.c", + "src/crypto/cpu_ppc64le.c", "src/crypto/crypto.c", "src/crypto/curve25519/curve25519.c", "src/crypto/curve25519/curve25519_64_adx.c", @@ -335,6 +336,7 @@ crypto_sources_asm = [ "src/gen/bcm/aesv8-gcm-armv8-apple.S", "src/gen/bcm/aesv8-gcm-armv8-linux.S", "src/gen/bcm/aesv8-gcm-armv8-win.S", + "src/gen/bcm/aesp8-ppc-linux.S", "src/gen/bcm/armv4-mont-linux.S", "src/gen/bcm/armv8-mont-apple.S", "src/gen/bcm/armv8-mont-linux.S", @@ -351,6 +353,7 @@ crypto_sources_asm = [ "src/gen/bcm/ghash-neon-armv8-apple.S", "src/gen/bcm/ghash-neon-armv8-linux.S", "src/gen/bcm/ghash-neon-armv8-win.S", + "src/gen/bcm/ghashp8-ppc-linux.S", "src/gen/bcm/ghash-ssse3-x86-apple.S", "src/gen/bcm/ghash-ssse3-x86-linux.S", "src/gen/bcm/ghash-ssse3-x86_64-apple.S", Index: chromium-128.0.6613.113/third_party/boringssl/README.ppc64le =================================================================== --- /dev/null +++ chromium-128.0.6613.113/third_party/boringssl/README.ppc64le @@ -0,0 +1,8 @@ +============================================================== +To recreate boringssl pregenerated files patch for ppc64le: + +cd third_party/boringssl/src +cp -Rp gen gen.orig +go run ./util/pregenerate +cd ../../../../ +diff -urN chromium-*/third_party/boringssl/src/gen.orig chromium-*/third_party/boringssl/src/gen