You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
309 lines
9.3 KiB
309 lines
9.3 KiB
9 months ago
|
commit e941e0ae80626b7661c1db8953a673cafd3b8b19
|
||
|
Author: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
|
||
|
Date: Fri Apr 30 18:12:08 2021 -0300
|
||
|
|
||
|
powerpc64le: Optimize memcpy for POWER10
|
||
|
|
||
|
This implementation is based on __memcpy_power8_cached and integrates
|
||
|
suggestions from Anton Blanchard.
|
||
|
It benefits from loads and stores with length for short lengths and for
|
||
|
tail code, simplifying the code.
|
||
|
|
||
|
All unaligned memory accesses use instructions that do not generate
|
||
|
alignment interrupts on POWER10, making it safe to use on
|
||
|
caching-inhibited memory.
|
||
|
|
||
|
The main loop has also been modified in order to increase instruction
|
||
|
throughput by reducing the dependency on updates from previous iterations.
|
||
|
|
||
|
On average, this implementation provides around 30% improvement when
|
||
|
compared to __memcpy_power7 and 10% improvement in comparison to
|
||
|
__memcpy_power8_cached.
|
||
|
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
|
||
|
new file mode 100644
|
||
|
index 0000000000000000..ad1414db4a3a8b9f
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
|
||
|
@@ -0,0 +1,198 @@
|
||
|
+/* Optimized memcpy implementation for POWER10.
|
||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <https://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <sysdep.h>
|
||
|
+
|
||
|
+
|
||
|
+#ifndef MEMCPY
|
||
|
+# define MEMCPY memcpy
|
||
|
+#endif
|
||
|
+
|
||
|
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
|
||
|
+ Returns 'dst'. */
|
||
|
+
|
||
|
+ .machine power9
|
||
|
+ENTRY_TOCLESS (MEMCPY, 5)
|
||
|
+ CALL_MCOUNT 3
|
||
|
+
|
||
|
+ /* Copy up to 16 bytes. */
|
||
|
+ sldi r6,r5,56 /* Prepare [l|st]xvl counter. */
|
||
|
+ lxvl v10,r4,r6
|
||
|
+ stxvl v10,r3,r6
|
||
|
+ subic. r6,r5,16 /* Return if len <= 16. */
|
||
|
+ blelr
|
||
|
+
|
||
|
+ /* If len >= 256, assume nothing got copied before and copy
|
||
|
+ again. This might cause issues with overlapping memory, but memcpy
|
||
|
+ is not expected to handle overlapping memory. */
|
||
|
+ cmpdi r5,256
|
||
|
+ bge L(copy_ge_256)
|
||
|
+ /* 16 < len < 256 and the first 16 bytes have already been copied. */
|
||
|
+ addi r10,r3,16 /* Keep r3 intact as return value. */
|
||
|
+ addi r4,r4,16
|
||
|
+ subi r5,r5,16
|
||
|
+ b L(copy_lt_256) /* Avoid the main loop if len < 256. */
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(copy_ge_256):
|
||
|
+ mr r10,r3 /* Keep r3 intact as return value. */
|
||
|
+ /* Align dst to 16 bytes. */
|
||
|
+ andi. r9,r10,0xf
|
||
|
+ beq L(dst_is_align_16)
|
||
|
+ lxv v10,0(r4)
|
||
|
+ subfic r12,r9,16
|
||
|
+ subf r5,r12,r5
|
||
|
+ add r4,r4,r12
|
||
|
+ stxv v10,0(r3)
|
||
|
+ add r10,r3,r12
|
||
|
+
|
||
|
+L(dst_is_align_16):
|
||
|
+ srdi r9,r5,7 /* Divide by 128. */
|
||
|
+ mtctr r9
|
||
|
+ addi r6,r4,64
|
||
|
+ addi r7,r10,64
|
||
|
+
|
||
|
+
|
||
|
+ /* Main loop, copy 128 bytes per iteration.
|
||
|
+ Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
|
||
|
+ r4 and r10. */
|
||
|
+ .p2align 5
|
||
|
+L(copy_128):
|
||
|
+
|
||
|
+ lxv v10, 0(r4)
|
||
|
+ lxv v11, 16(r4)
|
||
|
+ lxv v12, 32(r4)
|
||
|
+ lxv v13, 48(r4)
|
||
|
+
|
||
|
+ addi r4,r4,128
|
||
|
+
|
||
|
+ stxv v10, 0(r10)
|
||
|
+ stxv v11, 16(r10)
|
||
|
+ stxv v12, 32(r10)
|
||
|
+ stxv v13, 48(r10)
|
||
|
+
|
||
|
+ addi r10,r10,128
|
||
|
+
|
||
|
+ lxv v10, 0(r6)
|
||
|
+ lxv v11, 16(r6)
|
||
|
+ lxv v12, 32(r6)
|
||
|
+ lxv v13, 48(r6)
|
||
|
+
|
||
|
+ addi r6,r6,128
|
||
|
+
|
||
|
+ stxv v10, 0(r7)
|
||
|
+ stxv v11, 16(r7)
|
||
|
+ stxv v12, 32(r7)
|
||
|
+ stxv v13, 48(r7)
|
||
|
+
|
||
|
+ addi r7,r7,128
|
||
|
+
|
||
|
+ bdnz L(copy_128)
|
||
|
+
|
||
|
+ clrldi. r5,r5,64-7 /* Have we copied everything? */
|
||
|
+ beqlr
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(copy_lt_256):
|
||
|
+ cmpdi r5,16
|
||
|
+ ble L(copy_le_16)
|
||
|
+ srdi. r9,r5,5 /* Divide by 32. */
|
||
|
+ beq L(copy_lt_32)
|
||
|
+ mtctr r9
|
||
|
+ /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
|
||
|
+ the dependency on r4 and r10. */
|
||
|
+ addi r6,r4,32
|
||
|
+ addi r7,r10,32
|
||
|
+ addi r8,r4,64
|
||
|
+ addi r9,r10,64
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+ /* Copy 32 bytes at a time, unaligned.
|
||
|
+ The loop is unrolled 3 times in order to reduce the dependency on
|
||
|
+ r4 and r10, copying up to 96 bytes per iteration. */
|
||
|
+L(copy_32):
|
||
|
+ lxv v10, 0(r4)
|
||
|
+ lxv v11, 16(r4)
|
||
|
+ stxv v10, 0(r10)
|
||
|
+ stxv v11, 16(r10)
|
||
|
+ bdz L(end_copy_32a)
|
||
|
+ addi r4,r4,96
|
||
|
+ addi r10,r10,96
|
||
|
+
|
||
|
+ lxv v10, 0(r6)
|
||
|
+ lxv v11, 16(r6)
|
||
|
+ addi r6,r6,96
|
||
|
+ stxv v10, 0(r7)
|
||
|
+ stxv v11, 16(r7)
|
||
|
+ bdz L(end_copy_32b)
|
||
|
+ addi r7,r7,96
|
||
|
+
|
||
|
+ lxv v12, 0(r8)
|
||
|
+ lxv v13, 16(r8)
|
||
|
+ addi r8,r8,96
|
||
|
+ stxv v12, 0(r9)
|
||
|
+ stxv v13, 16(r9)
|
||
|
+ addi r9,r9,96
|
||
|
+ bdnz L(copy_32)
|
||
|
+
|
||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||
|
+ beqlr
|
||
|
+ cmpdi r5,16
|
||
|
+ ble L(copy_le_16)
|
||
|
+ b L(copy_lt_32)
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(end_copy_32a):
|
||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||
|
+ beqlr
|
||
|
+ /* 32 bytes have been copied since the last update of r4 and r10. */
|
||
|
+ addi r4,r4,32
|
||
|
+ addi r10,r10,32
|
||
|
+ cmpdi r5,16
|
||
|
+ ble L(copy_le_16)
|
||
|
+ b L(copy_lt_32)
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(end_copy_32b):
|
||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||
|
+ beqlr
|
||
|
+ /* The last iteration of the loop copied 64 bytes. Update r4 and r10
|
||
|
+ accordingly. */
|
||
|
+ addi r4,r4,-32
|
||
|
+ addi r10,r10,-32
|
||
|
+ cmpdi r5,16
|
||
|
+ ble L(copy_le_16)
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(copy_lt_32):
|
||
|
+ lxv v10, 0(r4)
|
||
|
+ stxv v10, 0(r10)
|
||
|
+ addi r4,r4,16
|
||
|
+ addi r10,r10,16
|
||
|
+ subi r5,r5,16
|
||
|
+
|
||
|
+ .p2align 5
|
||
|
+L(copy_le_16):
|
||
|
+ sldi r6,r5,56
|
||
|
+ lxvl v10,r4,r6
|
||
|
+ stxvl v10,r10,r6
|
||
|
+ blr
|
||
|
+
|
||
|
+
|
||
|
+END_GEN_TB (MEMCPY,TB_TOCLESS)
|
||
|
+libc_hidden_builtin_def (memcpy)
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644
|
||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||
|
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||
|
strncase-power8
|
||
|
|
||
|
ifneq (,$(filter %le,$(config-machine)))
|
||
|
-sysdep_routines += memmove-power10 \
|
||
|
+sysdep_routines += memcpy-power10 memmove-power10 \
|
||
|
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||
|
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
|
||
|
strlen-power10
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
index 4ce04bc51574cca1..9d5a14e480c02171 100644
|
||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||
|
@@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||
|
#ifdef SHARED
|
||
|
/* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */
|
||
|
IFUNC_IMPL (i, name, memcpy,
|
||
|
+#ifdef __LITTLE_ENDIAN__
|
||
|
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||
|
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||
|
+ && hwcap & PPC_FEATURE_HAS_VSX,
|
||
|
+ __memcpy_power10)
|
||
|
+#endif
|
||
|
IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||
|
__memcpy_power8_cached)
|
||
|
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
|
||
|
new file mode 100644
|
||
|
index 0000000000000000..70e0fc3ed610cdc3
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
|
||
|
@@ -0,0 +1,26 @@
|
||
|
+/* Optimized memcpy implementation for POWER10.
|
||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <https://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
|
||
|
+#define MEMCPY __memcpy_power10
|
||
|
+
|
||
|
+#undef libc_hidden_builtin_def
|
||
|
+#define libc_hidden_builtin_def(name)
|
||
|
+
|
||
|
+#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S>
|
||
|
+#endif
|
||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||
|
index 44dea594f3770673..be0e47f32dde2ccf 100644
|
||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||
|
@@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
|
||
|
extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
|
||
|
extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
|
||
|
extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden;
|
||
|
+# if defined __LITTLE_ENDIAN__
|
||
|
+extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden;
|
||
|
+# endif
|
||
|
|
||
|
libc_ifunc (__libc_memcpy,
|
||
|
+# if defined __LITTLE_ENDIAN__
|
||
|
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
|
||
|
+ ? __memcpy_power10 :
|
||
|
+# endif
|
||
|
((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
|
||
|
? __memcpy_power8_cached :
|
||
|
(hwcap & PPC_FEATURE_HAS_VSX)
|