You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
537 lines
18 KiB
537 lines
18 KiB
6 months ago
|
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
|
||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
Date: Tue, 4 May 2021 19:02:40 -0400
|
||
|
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
|
||
|
Content-type: text/plain; charset=UTF-8
|
||
|
|
||
|
No bug.
|
||
|
|
||
|
This commit adds a new implementation for EVEX memchr that is not safe
|
||
|
for RTM because it uses vzeroupper. The benefit is that by using
|
||
|
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
|
||
|
faster than the RTM safe version which cannot use vpcmpeq because
|
||
|
there is no EVEX encoding for the instruction. All parts of the
|
||
|
implementation aside from the 4x loop are the same for the two
|
||
|
versions and the optimization is only relevant for large sizes.
|
||
|
|
||
|
Tigerlake:
|
||
|
size , algn , Pos , Cur T , New T , Win , Dif
|
||
|
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
|
||
|
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
|
||
|
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
|
||
|
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
|
||
|
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
|
||
|
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
|
||
|
|
||
|
Icelake:
|
||
|
size , algn , Pos , Cur T , New T , Win , Dif
|
||
|
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
|
||
|
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
|
||
|
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
|
||
|
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
|
||
|
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
|
||
|
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
|
||
|
|
||
|
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
|
||
|
|
||
|
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||
|
---
|
||
|
sysdeps/x86_64/multiarch/Makefile | 7 +-
|
||
|
sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++
|
||
|
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++
|
||
|
sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 +
|
||
|
sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++----
|
||
|
sysdeps/x86_64/multiarch/memchr.c | 2 +-
|
||
|
sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 +
|
||
|
sysdeps/x86_64/multiarch/rawmemchr.c | 2 +-
|
||
|
sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 +
|
||
|
sysdeps/x86_64/multiarch/wmemchr.c | 2 +-
|
||
|
10 files changed, 217 insertions(+), 41 deletions(-)
|
||
|
create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
|
||
|
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||
|
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||
|
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||
|
|
||
|
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||
|
index 65fde4eb..26be4095 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/Makefile
|
||
|
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||
|
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||
|
strncmp-evex \
|
||
|
strncpy-evex \
|
||
|
strnlen-evex \
|
||
|
- strrchr-evex
|
||
|
+ strrchr-evex \
|
||
|
+ memchr-evex-rtm \
|
||
|
+ rawmemchr-evex-rtm
|
||
|
CFLAGS-varshift.c += -msse4
|
||
|
CFLAGS-strcspn-c.c += -msse4
|
||
|
CFLAGS-strpbrk-c.c += -msse4
|
||
|
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||
|
wcsnlen-evex \
|
||
|
wcsrchr-evex \
|
||
|
wmemchr-evex \
|
||
|
- wmemcmp-evex-movbe
|
||
|
+ wmemcmp-evex-movbe \
|
||
|
+ wmemchr-evex-rtm
|
||
|
endif
|
||
|
|
||
|
ifeq ($(subdir),debug)
|
||
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
||
|
new file mode 100644
|
||
|
index 00000000..fc391edb
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
||
|
@@ -0,0 +1,55 @@
|
||
|
+/* Common definition for ifunc selection optimized with EVEX.
|
||
|
+ All versions must be listed in ifunc-impl-list.c.
|
||
|
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||
|
+ This file is part of the GNU C Library.
|
||
|
+
|
||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||
|
+ modify it under the terms of the GNU Lesser General Public
|
||
|
+ License as published by the Free Software Foundation; either
|
||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+ Lesser General Public License for more details.
|
||
|
+
|
||
|
+ You should have received a copy of the GNU Lesser General Public
|
||
|
+ License along with the GNU C Library; if not, see
|
||
|
+ <https://www.gnu.org/licenses/>. */
|
||
|
+
|
||
|
+#include <init-arch.h>
|
||
|
+
|
||
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
|
||
|
+
|
||
|
+
|
||
|
+static inline void *
|
||
|
+IFUNC_SELECTOR (void)
|
||
|
+{
|
||
|
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
||
|
+
|
||
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||
|
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||
|
+ {
|
||
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||
|
+ {
|
||
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||
|
+ return OPTIMIZE (evex_rtm);
|
||
|
+
|
||
|
+ return OPTIMIZE (evex);
|
||
|
+ }
|
||
|
+
|
||
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||
|
+ return OPTIMIZE (avx2_rtm);
|
||
|
+
|
||
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||
|
+ return OPTIMIZE (avx2);
|
||
|
+ }
|
||
|
+
|
||
|
+ return OPTIMIZE (sse2);
|
||
|
+}
|
||
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||
|
index d59d65f8..ac097e8d 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||
|
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||
|
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||
|
&& CPU_FEATURE_USABLE (AVX512BW)
|
||
|
&& CPU_FEATURE_USABLE (BMI2)),
|
||
|
__memchr_evex)
|
||
|
+ IFUNC_IMPL_ADD (array, i, memchr,
|
||
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
||
|
+ __memchr_evex_rtm)
|
||
|
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
|
||
|
|
||
|
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
|
||
|
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||
|
&& CPU_FEATURE_USABLE (AVX512BW)
|
||
|
&& CPU_FEATURE_USABLE (BMI2)),
|
||
|
__rawmemchr_evex)
|
||
|
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
||
|
+ __rawmemchr_evex_rtm)
|
||
|
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
|
||
|
|
||
|
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
||
|
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||
|
&& CPU_FEATURE_USABLE (AVX512BW)
|
||
|
&& CPU_FEATURE_USABLE (BMI2)),
|
||
|
__wmemchr_evex)
|
||
|
+ IFUNC_IMPL_ADD (array, i, wmemchr,
|
||
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
||
|
+ __wmemchr_evex_rtm)
|
||
|
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
|
||
|
|
||
|
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||
|
new file mode 100644
|
||
|
index 00000000..19871882
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||
|
@@ -0,0 +1,8 @@
|
||
|
+#ifndef MEMCHR
|
||
|
+# define MEMCHR __memchr_evex_rtm
|
||
|
+#endif
|
||
|
+
|
||
|
+#define USE_IN_RTM 1
|
||
|
+#define SECTION(p) p##.evex.rtm
|
||
|
+
|
||
|
+#include "memchr-evex.S"
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||
|
index f3fdad4f..4d0ed6d1 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||
|
@@ -38,10 +38,32 @@
|
||
|
# define CHAR_SIZE 1
|
||
|
# endif
|
||
|
|
||
|
+ /* In the 4x loop the RTM and non-RTM versions have data pointer
|
||
|
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
|
||
|
+ This is represented by BASE_OFFSET. As well because the RTM
|
||
|
+ version uses vpcmp which stores a bit per element compared where
|
||
|
+ the non-RTM version uses vpcmpeq which stores a bit per byte
|
||
|
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
|
||
|
+ version. */
|
||
|
+# ifdef USE_IN_RTM
|
||
|
+# define VZEROUPPER
|
||
|
+# define BASE_OFFSET (VEC_SIZE * 4)
|
||
|
+# define RET_SCALE CHAR_SIZE
|
||
|
+# else
|
||
|
+# define VZEROUPPER vzeroupper
|
||
|
+# define BASE_OFFSET 0
|
||
|
+# define RET_SCALE 1
|
||
|
+# endif
|
||
|
+
|
||
|
+ /* In the return from 4x loop memchr and rawmemchr versions have
|
||
|
+ data pointers off by VEC_SIZE * 4 with memchr version being
|
||
|
+ VEC_SIZE * 4 greater. */
|
||
|
# ifdef USE_AS_RAWMEMCHR
|
||
|
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
|
||
|
# define RAW_PTR_REG rcx
|
||
|
# define ALGN_PTR_REG rdi
|
||
|
# else
|
||
|
+# define RET_OFFSET BASE_OFFSET
|
||
|
# define RAW_PTR_REG rdi
|
||
|
# define ALGN_PTR_REG rcx
|
||
|
# endif
|
||
|
@@ -57,11 +79,15 @@
|
||
|
# define YMM5 ymm21
|
||
|
# define YMM6 ymm22
|
||
|
|
||
|
+# ifndef SECTION
|
||
|
+# define SECTION(p) p##.evex
|
||
|
+# endif
|
||
|
+
|
||
|
# define VEC_SIZE 32
|
||
|
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||
|
# define PAGE_SIZE 4096
|
||
|
|
||
|
- .section .text.evex,"ax",@progbits
|
||
|
+ .section SECTION(.text),"ax",@progbits
|
||
|
ENTRY (MEMCHR)
|
||
|
# ifndef USE_AS_RAWMEMCHR
|
||
|
/* Check for zero length. */
|
||
|
@@ -237,14 +263,15 @@ L(cross_page_continue):
|
||
|
/* Check if at last CHAR_PER_VEC * 4 length. */
|
||
|
subq $(CHAR_PER_VEC * 4), %rdx
|
||
|
jbe L(last_4x_vec_or_less_cmpeq)
|
||
|
- addq $VEC_SIZE, %rdi
|
||
|
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
|
||
|
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
||
|
|
||
|
/* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
||
|
*/
|
||
|
# ifdef USE_AS_WMEMCHR
|
||
|
movl %edi, %ecx
|
||
|
andq $-(4 * VEC_SIZE), %rdi
|
||
|
- andl $(VEC_SIZE * 4 - 1), %ecx
|
||
|
+ subl %edi, %ecx
|
||
|
/* NB: Divide bytes by 4 to get the wchar_t count. */
|
||
|
sarl $2, %ecx
|
||
|
addq %rcx, %rdx
|
||
|
@@ -254,15 +281,28 @@ L(cross_page_continue):
|
||
|
subq %rdi, %rdx
|
||
|
# endif
|
||
|
# else
|
||
|
- addq $VEC_SIZE, %rdi
|
||
|
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
||
|
andq $-(4 * VEC_SIZE), %rdi
|
||
|
# endif
|
||
|
-
|
||
|
+# ifdef USE_IN_RTM
|
||
|
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||
|
+# else
|
||
|
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
|
||
|
+ encodable with EVEX registers (ymm16-ymm31). */
|
||
|
+ vmovdqa64 %YMMMATCH, %ymm0
|
||
|
+# endif
|
||
|
|
||
|
/* Compare 4 * VEC at a time forward. */
|
||
|
.p2align 4
|
||
|
L(loop_4x_vec):
|
||
|
+ /* Two versions of the loop. One that does not require
|
||
|
+ vzeroupper by not using ymm0-ymm15 and another does that require
|
||
|
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
|
||
|
+ is used at all is because there is no EVEX encoding vpcmpeq and
|
||
|
+ with vpcmpeq this loop can be performed more efficiently. The
|
||
|
+ non-vzeroupper version is safe for RTM while the vzeroupper
|
||
|
+ version should be prefered if RTM are not supported. */
|
||
|
+# ifdef USE_IN_RTM
|
||
|
/* It would be possible to save some instructions using 4x VPCMP
|
||
|
but bottleneck on port 5 makes it not woth it. */
|
||
|
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
||
|
@@ -273,12 +313,55 @@ L(loop_4x_vec):
|
||
|
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||
|
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
||
|
VPCMP $0, %YMM3, %YMMZERO, %k2
|
||
|
+# else
|
||
|
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
|
||
|
+ seperately with EVEX vpcmp. */
|
||
|
+# ifdef USE_AS_WMEMCHR
|
||
|
+ /* vptern can only accept masks for epi32/epi64 so can only save
|
||
|
+ instruction using not equals mask on vptern with wmemchr. */
|
||
|
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
|
||
|
+# else
|
||
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||
|
+# endif
|
||
|
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
|
||
|
+ */
|
||
|
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
||
|
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
||
|
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
||
|
+# ifdef USE_AS_WMEMCHR
|
||
|
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
|
||
|
+ combines result from VEC0 with zero mask. */
|
||
|
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
|
||
|
+ vpmovmskb %ymm4, %ecx
|
||
|
+# else
|
||
|
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
|
||
|
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
|
||
|
+ vpmovmskb %ymm4, %ecx
|
||
|
+ kmovd %k1, %eax
|
||
|
+# endif
|
||
|
+# endif
|
||
|
+
|
||
|
# ifdef USE_AS_RAWMEMCHR
|
||
|
subq $-(VEC_SIZE * 4), %rdi
|
||
|
+# endif
|
||
|
+# ifdef USE_IN_RTM
|
||
|
kortestd %k2, %k3
|
||
|
+# else
|
||
|
+# ifdef USE_AS_WMEMCHR
|
||
|
+ /* ecx contains not of matches. All 1s means no matches. incl will
|
||
|
+ overflow and set zeroflag if that is the case. */
|
||
|
+ incl %ecx
|
||
|
+# else
|
||
|
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
|
||
|
+ to ecx is not an issue because if eax is non-zero it will be
|
||
|
+ used for returning the match. If it is zero the add does
|
||
|
+ nothing. */
|
||
|
+ addq %rax, %rcx
|
||
|
+# endif
|
||
|
+# endif
|
||
|
+# ifdef USE_AS_RAWMEMCHR
|
||
|
jz L(loop_4x_vec)
|
||
|
# else
|
||
|
- kortestd %k2, %k3
|
||
|
jnz L(loop_4x_vec_end)
|
||
|
|
||
|
subq $-(VEC_SIZE * 4), %rdi
|
||
|
@@ -288,10 +371,11 @@ L(loop_4x_vec):
|
||
|
|
||
|
/* Fall through into less than 4 remaining vectors of length case.
|
||
|
*/
|
||
|
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||
|
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
|
||
|
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
|
||
|
kmovd %k0, %eax
|
||
|
- addq $(VEC_SIZE * 3), %rdi
|
||
|
- .p2align 4
|
||
|
+ VZEROUPPER
|
||
|
+
|
||
|
L(last_4x_vec_or_less):
|
||
|
/* Check if first VEC contained match. */
|
||
|
testl %eax, %eax
|
||
|
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
|
||
|
/* rawmemchr will fall through into this if match was found in
|
||
|
loop. */
|
||
|
|
||
|
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
|
||
|
/* k1 has not of matches with VEC1. */
|
||
|
kmovd %k1, %eax
|
||
|
-# ifdef USE_AS_WMEMCHR
|
||
|
+# ifdef USE_AS_WMEMCHR
|
||
|
subl $((1 << CHAR_PER_VEC) - 1), %eax
|
||
|
-# else
|
||
|
+# else
|
||
|
incl %eax
|
||
|
+# endif
|
||
|
+# else
|
||
|
+ /* eax already has matches for VEC1. */
|
||
|
+ testl %eax, %eax
|
||
|
# endif
|
||
|
jnz L(last_vec_x1_return)
|
||
|
|
||
|
+# ifdef USE_IN_RTM
|
||
|
VPCMP $0, %YMM2, %YMMZERO, %k0
|
||
|
kmovd %k0, %eax
|
||
|
+# else
|
||
|
+ vpmovmskb %ymm2, %eax
|
||
|
+# endif
|
||
|
testl %eax, %eax
|
||
|
jnz L(last_vec_x2_return)
|
||
|
|
||
|
+# ifdef USE_IN_RTM
|
||
|
kmovd %k2, %eax
|
||
|
testl %eax, %eax
|
||
|
jnz L(last_vec_x3_return)
|
||
|
|
||
|
kmovd %k3, %eax
|
||
|
tzcntl %eax, %eax
|
||
|
-# ifdef USE_AS_RAWMEMCHR
|
||
|
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
# else
|
||
|
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
+ vpmovmskb %ymm3, %eax
|
||
|
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
|
||
|
+ salq $VEC_SIZE, %rcx
|
||
|
+ orq %rcx, %rax
|
||
|
+ tzcntq %rax, %rax
|
||
|
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
|
||
|
+ VZEROUPPER
|
||
|
# endif
|
||
|
ret
|
||
|
|
||
|
.p2align 4
|
||
|
L(last_vec_x1_return):
|
||
|
tzcntl %eax, %eax
|
||
|
-# ifdef USE_AS_RAWMEMCHR
|
||
|
-# ifdef USE_AS_WMEMCHR
|
||
|
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
|
||
|
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||
|
-# else
|
||
|
- addq %rdi, %rax
|
||
|
-# endif
|
||
|
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
# else
|
||
|
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
+ addq %rdi, %rax
|
||
|
# endif
|
||
|
+ VZEROUPPER
|
||
|
ret
|
||
|
|
||
|
.p2align 4
|
||
|
L(last_vec_x2_return):
|
||
|
tzcntl %eax, %eax
|
||
|
-# ifdef USE_AS_RAWMEMCHR
|
||
|
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
-# else
|
||
|
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
-# endif
|
||
|
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
|
||
|
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
|
||
|
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
|
||
|
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
|
||
|
+ VZEROUPPER
|
||
|
ret
|
||
|
|
||
|
+# ifdef USE_IN_RTM
|
||
|
.p2align 4
|
||
|
L(last_vec_x3_return):
|
||
|
tzcntl %eax, %eax
|
||
|
-# ifdef USE_AS_RAWMEMCHR
|
||
|
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
-# else
|
||
|
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||
|
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
-# endif
|
||
|
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
||
|
ret
|
||
|
-
|
||
|
+# endif
|
||
|
|
||
|
# ifndef USE_AS_RAWMEMCHR
|
||
|
L(last_4x_vec_or_less_cmpeq):
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
|
||
|
index 016f5784..f28aea77 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memchr.c
|
||
|
+++ b/sysdeps/x86_64/multiarch/memchr.c
|
||
|
@@ -24,7 +24,7 @@
|
||
|
# undef memchr
|
||
|
|
||
|
# define SYMBOL_NAME memchr
|
||
|
-# include "ifunc-avx2.h"
|
||
|
+# include "ifunc-evex.h"
|
||
|
|
||
|
libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
|
||
|
strong_alias (memchr, __memchr)
|
||
|
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||
|
new file mode 100644
|
||
|
index 00000000..deda1ca3
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||
|
@@ -0,0 +1,3 @@
|
||
|
+#define MEMCHR __rawmemchr_evex_rtm
|
||
|
+#define USE_AS_RAWMEMCHR 1
|
||
|
+#include "memchr-evex-rtm.S"
|
||
|
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
|
||
|
index 8a0bc313..1f764f35 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
|
||
|
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
|
||
|
@@ -26,7 +26,7 @@
|
||
|
# undef __rawmemchr
|
||
|
|
||
|
# define SYMBOL_NAME rawmemchr
|
||
|
-# include "ifunc-avx2.h"
|
||
|
+# include "ifunc-evex.h"
|
||
|
|
||
|
libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
|
||
|
IFUNC_SELECTOR ());
|
||
|
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||
|
new file mode 100644
|
||
|
index 00000000..a346cd35
|
||
|
--- /dev/null
|
||
|
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||
|
@@ -0,0 +1,3 @@
|
||
|
+#define MEMCHR __wmemchr_evex_rtm
|
||
|
+#define USE_AS_WMEMCHR 1
|
||
|
+#include "memchr-evex-rtm.S"
|
||
|
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
|
||
|
index 6d833702..f9c91915 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/wmemchr.c
|
||
|
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
|
||
|
@@ -26,7 +26,7 @@
|
||
|
# undef __wmemchr
|
||
|
|
||
|
# define SYMBOL_NAME wmemchr
|
||
|
-# include "ifunc-avx2.h"
|
||
|
+# include "ifunc-evex.h"
|
||
|
|
||
|
libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
|
||
|
weak_alias (__wmemchr, wmemchr)
|
||
|
--
|
||
|
GitLab
|
||
|
|