You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
957 lines
20 KiB
957 lines
20 KiB
commit ffe75982cc0bb2d25d55ed566a3731b9c3017e6f
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Fri Apr 15 12:28:00 2022 -0500
|
|
|
|
x86: Remove memcmp-sse4.S
|
|
|
|
Code didn't actually use any sse4 instructions since `ptest` was
|
|
removed in:
|
|
|
|
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Wed Nov 10 16:18:56 2021 -0600
|
|
|
|
x86: Shrink memcmp-sse4.S code size
|
|
|
|
The new memcmp-sse2 implementation is also faster.
|
|
|
|
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
|
|
|
|
Note there are two regressions preferring SSE2 for Size = 1 and Size =
|
|
65.
|
|
|
|
Size = 1:
|
|
size, align0, align1, ret, New Time/Old Time
|
|
1, 1, 1, 0, 1.2
|
|
1, 1, 1, 1, 1.197
|
|
1, 1, 1, -1, 1.2
|
|
|
|
This is intentional. Size == 1 is significantly less hot based on
|
|
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
|
|
hotter).
|
|
|
|
Python3 Size = 1 -> 13.64%
|
|
Python3 Size = [4, 8] -> 60.92%
|
|
|
|
GCC11 Size = 1 -> 1.29%
|
|
GCC11 Size = [4, 8] -> 33.86%
|
|
|
|
size, align0, align1, ret, New Time/Old Time
|
|
4, 4, 4, 0, 0.622
|
|
4, 4, 4, 1, 0.797
|
|
4, 4, 4, -1, 0.805
|
|
5, 5, 5, 0, 0.623
|
|
5, 5, 5, 1, 0.777
|
|
5, 5, 5, -1, 0.802
|
|
6, 6, 6, 0, 0.625
|
|
6, 6, 6, 1, 0.813
|
|
6, 6, 6, -1, 0.788
|
|
7, 7, 7, 0, 0.625
|
|
7, 7, 7, 1, 0.799
|
|
7, 7, 7, -1, 0.795
|
|
8, 8, 8, 0, 0.625
|
|
8, 8, 8, 1, 0.848
|
|
8, 8, 8, -1, 0.914
|
|
9, 9, 9, 0, 0.625
|
|
|
|
Size = 65:
|
|
size, align0, align1, ret, New Time/Old Time
|
|
65, 0, 0, 0, 1.103
|
|
65, 0, 0, 1, 1.216
|
|
65, 0, 0, -1, 1.227
|
|
65, 65, 0, 0, 1.091
|
|
65, 0, 65, 1, 1.19
|
|
65, 65, 65, -1, 1.215
|
|
|
|
This is because A) the checks in range [65, 96] are now unrolled 2x
|
|
and B) because smaller values <= 16 are now given a hotter path. By
|
|
contrast the SSE4 version has a branch for Size = 80. The unrolled
|
|
version gets better performance for returns which need both
|
|
comparisons.
|
|
|
|
size, align0, align1, ret, New Time/Old Time
|
|
128, 4, 8, 0, 0.858
|
|
128, 4, 8, 1, 0.879
|
|
128, 4, 8, -1, 0.888
|
|
|
|
As well, in environments outside of microbenchmarks, which are not fully
|
|
predictable, the branch will have a real cost.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
index bca82e38d86cc440..b503e4b81e92a11c 100644
|
|
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
@@ -11,7 +11,6 @@ sysdep_routines += \
|
|
memcmp-avx2-movbe-rtm \
|
|
memcmp-evex-movbe \
|
|
memcmp-sse2 \
|
|
- memcmp-sse4 \
|
|
memcmp-ssse3 \
|
|
memcpy-ssse3 \
|
|
memcpy-ssse3-back \
|
|
@@ -174,7 +173,6 @@ sysdep_routines += \
|
|
wmemcmp-avx2-movbe-rtm \
|
|
wmemcmp-c \
|
|
wmemcmp-evex-movbe \
|
|
- wmemcmp-sse4 \
|
|
wmemcmp-ssse3 \
|
|
# sysdep_routines
|
|
endif
|
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
index 4c7834dd0b951fa4..e5e48b36c3175e68 100644
|
|
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
&& CPU_FEATURE_USABLE (BMI2)
|
|
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
__memcmp_evex_movbe)
|
|
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
- __memcmp_sse4_1)
|
|
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
|
__memcmp_ssse3)
|
|
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
|
|
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
&& CPU_FEATURE_USABLE (BMI2)
|
|
&& CPU_FEATURE_USABLE (MOVBE)),
|
|
__wmemcmp_evex_movbe)
|
|
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
|
- __wmemcmp_sse4_1)
|
|
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
|
__wmemcmp_ssse3)
|
|
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
|
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
index 89e2129968e1e49c..5b92594093c1e0bb 100644
|
|
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
|
@@ -21,7 +21,6 @@
|
|
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
|
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
|
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
|
|
return OPTIMIZE (avx2_movbe);
|
|
}
|
|
|
|
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
|
- return OPTIMIZE (sse4_1);
|
|
-
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
|
|
return OPTIMIZE (ssse3);
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
|
deleted file mode 100644
|
|
index 97c102a9c5ab2b91..0000000000000000
|
|
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
|
+++ /dev/null
|
|
@@ -1,804 +0,0 @@
|
|
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
|
|
- Copyright (C) 2010-2021 Free Software Foundation, Inc.
|
|
- Contributed by Intel Corporation.
|
|
- This file is part of the GNU C Library.
|
|
-
|
|
- The GNU C Library is free software; you can redistribute it and/or
|
|
- modify it under the terms of the GNU Lesser General Public
|
|
- License as published by the Free Software Foundation; either
|
|
- version 2.1 of the License, or (at your option) any later version.
|
|
-
|
|
- The GNU C Library is distributed in the hope that it will be useful,
|
|
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
- Lesser General Public License for more details.
|
|
-
|
|
- You should have received a copy of the GNU Lesser General Public
|
|
- License along with the GNU C Library; if not, see
|
|
- <https://www.gnu.org/licenses/>. */
|
|
-
|
|
-#if IS_IN (libc)
|
|
-
|
|
-# include <sysdep.h>
|
|
-
|
|
-# ifndef MEMCMP
|
|
-# define MEMCMP __memcmp_sse4_1
|
|
-# endif
|
|
-
|
|
-#ifdef USE_AS_WMEMCMP
|
|
-# define CMPEQ pcmpeqd
|
|
-# define CHAR_SIZE 4
|
|
-#else
|
|
-# define CMPEQ pcmpeqb
|
|
-# define CHAR_SIZE 1
|
|
-#endif
|
|
-
|
|
-
|
|
-/* Warning!
|
|
- wmemcmp has to use SIGNED comparison for elements.
|
|
- memcmp has to use UNSIGNED comparison for elemnts.
|
|
-*/
|
|
-
|
|
- .section .text.sse4.1,"ax",@progbits
|
|
-ENTRY (MEMCMP)
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- shl $2, %RDX_LP
|
|
-# elif defined __ILP32__
|
|
- /* Clear the upper 32 bits. */
|
|
- mov %edx, %edx
|
|
-# endif
|
|
- cmp $79, %RDX_LP
|
|
- ja L(79bytesormore)
|
|
-
|
|
- cmp $CHAR_SIZE, %RDX_LP
|
|
- jbe L(firstbyte)
|
|
-
|
|
- /* N in (CHAR_SIZE, 79) bytes. */
|
|
- cmpl $32, %edx
|
|
- ja L(more_32_bytes)
|
|
-
|
|
- cmpl $16, %edx
|
|
- jae L(16_to_32_bytes)
|
|
-
|
|
-# ifndef USE_AS_WMEMCMP
|
|
- cmpl $8, %edx
|
|
- jae L(8_to_16_bytes)
|
|
-
|
|
- cmpl $4, %edx
|
|
- jb L(2_to_3_bytes)
|
|
-
|
|
- movl (%rdi), %eax
|
|
- movl (%rsi), %ecx
|
|
-
|
|
- bswap %eax
|
|
- bswap %ecx
|
|
-
|
|
- shlq $32, %rax
|
|
- shlq $32, %rcx
|
|
-
|
|
- movl -4(%rdi, %rdx), %edi
|
|
- movl -4(%rsi, %rdx), %esi
|
|
-
|
|
- bswap %edi
|
|
- bswap %esi
|
|
-
|
|
- orq %rdi, %rax
|
|
- orq %rsi, %rcx
|
|
- subq %rcx, %rax
|
|
- cmovne %edx, %eax
|
|
- sbbl %ecx, %ecx
|
|
- orl %ecx, %eax
|
|
- ret
|
|
-
|
|
- .p2align 4,, 8
|
|
-L(2_to_3_bytes):
|
|
- movzwl (%rdi), %eax
|
|
- movzwl (%rsi), %ecx
|
|
- shll $8, %eax
|
|
- shll $8, %ecx
|
|
- bswap %eax
|
|
- bswap %ecx
|
|
- movzbl -1(%rdi, %rdx), %edi
|
|
- movzbl -1(%rsi, %rdx), %esi
|
|
- orl %edi, %eax
|
|
- orl %esi, %ecx
|
|
- subl %ecx, %eax
|
|
- ret
|
|
-
|
|
- .p2align 4,, 8
|
|
-L(8_to_16_bytes):
|
|
- movq (%rdi), %rax
|
|
- movq (%rsi), %rcx
|
|
-
|
|
- bswap %rax
|
|
- bswap %rcx
|
|
-
|
|
- subq %rcx, %rax
|
|
- jne L(8_to_16_bytes_done)
|
|
-
|
|
- movq -8(%rdi, %rdx), %rax
|
|
- movq -8(%rsi, %rdx), %rcx
|
|
-
|
|
- bswap %rax
|
|
- bswap %rcx
|
|
-
|
|
- subq %rcx, %rax
|
|
-
|
|
-L(8_to_16_bytes_done):
|
|
- cmovne %edx, %eax
|
|
- sbbl %ecx, %ecx
|
|
- orl %ecx, %eax
|
|
- ret
|
|
-# else
|
|
- xorl %eax, %eax
|
|
- movl (%rdi), %ecx
|
|
- cmpl (%rsi), %ecx
|
|
- jne L(8_to_16_bytes_done)
|
|
- movl 4(%rdi), %ecx
|
|
- cmpl 4(%rsi), %ecx
|
|
- jne L(8_to_16_bytes_done)
|
|
- movl -4(%rdi, %rdx), %ecx
|
|
- cmpl -4(%rsi, %rdx), %ecx
|
|
- jne L(8_to_16_bytes_done)
|
|
- ret
|
|
-# endif
|
|
-
|
|
- .p2align 4,, 3
|
|
-L(ret_zero):
|
|
- xorl %eax, %eax
|
|
-L(zero):
|
|
- ret
|
|
-
|
|
- .p2align 4,, 8
|
|
-L(firstbyte):
|
|
- jb L(ret_zero)
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- xorl %eax, %eax
|
|
- movl (%rdi), %ecx
|
|
- cmpl (%rsi), %ecx
|
|
- je L(zero)
|
|
-L(8_to_16_bytes_done):
|
|
- setg %al
|
|
- leal -1(%rax, %rax), %eax
|
|
-# else
|
|
- movzbl (%rdi), %eax
|
|
- movzbl (%rsi), %ecx
|
|
- sub %ecx, %eax
|
|
-# endif
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(vec_return_begin_48):
|
|
- addq $16, %rdi
|
|
- addq $16, %rsi
|
|
-L(vec_return_begin_32):
|
|
- bsfl %eax, %eax
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- movl 32(%rdi, %rax), %ecx
|
|
- xorl %edx, %edx
|
|
- cmpl 32(%rsi, %rax), %ecx
|
|
- setg %dl
|
|
- leal -1(%rdx, %rdx), %eax
|
|
-# else
|
|
- movzbl 32(%rsi, %rax), %ecx
|
|
- movzbl 32(%rdi, %rax), %eax
|
|
- subl %ecx, %eax
|
|
-# endif
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(vec_return_begin_16):
|
|
- addq $16, %rdi
|
|
- addq $16, %rsi
|
|
-L(vec_return_begin):
|
|
- bsfl %eax, %eax
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- movl (%rdi, %rax), %ecx
|
|
- xorl %edx, %edx
|
|
- cmpl (%rsi, %rax), %ecx
|
|
- setg %dl
|
|
- leal -1(%rdx, %rdx), %eax
|
|
-# else
|
|
- movzbl (%rsi, %rax), %ecx
|
|
- movzbl (%rdi, %rax), %eax
|
|
- subl %ecx, %eax
|
|
-# endif
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(vec_return_end_16):
|
|
- subl $16, %edx
|
|
-L(vec_return_end):
|
|
- bsfl %eax, %eax
|
|
- addl %edx, %eax
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- movl -16(%rdi, %rax), %ecx
|
|
- xorl %edx, %edx
|
|
- cmpl -16(%rsi, %rax), %ecx
|
|
- setg %dl
|
|
- leal -1(%rdx, %rdx), %eax
|
|
-# else
|
|
- movzbl -16(%rsi, %rax), %ecx
|
|
- movzbl -16(%rdi, %rax), %eax
|
|
- subl %ecx, %eax
|
|
-# endif
|
|
- ret
|
|
-
|
|
- .p2align 4,, 8
|
|
-L(more_32_bytes):
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu (%rsi), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu 16(%rdi), %xmm0
|
|
- movdqu 16(%rsi), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- cmpl $64, %edx
|
|
- jbe L(32_to_64_bytes)
|
|
- movdqu 32(%rdi), %xmm0
|
|
- movdqu 32(%rsi), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(32_to_64_bytes):
|
|
- movdqu -32(%rdi, %rdx), %xmm0
|
|
- movdqu -32(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end_16)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(16_to_32_bytes):
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu (%rsi), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
-
|
|
- .p2align 4
|
|
-L(79bytesormore):
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu (%rsi), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
-
|
|
- mov %rsi, %rcx
|
|
- and $-16, %rsi
|
|
- add $16, %rsi
|
|
- sub %rsi, %rcx
|
|
-
|
|
- sub %rcx, %rdi
|
|
- add %rcx, %rdx
|
|
- test $0xf, %rdi
|
|
- jz L(2aligned)
|
|
-
|
|
- cmp $128, %rdx
|
|
- ja L(128bytesormore)
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(less128bytes):
|
|
- movdqu (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqu 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqu 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- cmp $96, %rdx
|
|
- jb L(32_to_64_bytes)
|
|
-
|
|
- addq $64, %rdi
|
|
- addq $64, %rsi
|
|
- subq $64, %rdx
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(last_64_bytes):
|
|
- movdqu (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqu -32(%rdi, %rdx), %xmm0
|
|
- movdqu -32(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end_16)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(128bytesormore):
|
|
- cmp $256, %rdx
|
|
- ja L(unaligned_loop)
|
|
-L(less256bytes):
|
|
- movdqu (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqu 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqu 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- addq $64, %rdi
|
|
- addq $64, %rsi
|
|
-
|
|
- movdqu (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqu 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqu 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqu 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- addq $-128, %rdx
|
|
- subq $-64, %rsi
|
|
- subq $-64, %rdi
|
|
-
|
|
- cmp $64, %rdx
|
|
- ja L(less128bytes)
|
|
-
|
|
- cmp $32, %rdx
|
|
- ja L(last_64_bytes)
|
|
-
|
|
- movdqu -32(%rdi, %rdx), %xmm0
|
|
- movdqu -32(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end_16)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(unaligned_loop):
|
|
-# ifdef DATA_CACHE_SIZE_HALF
|
|
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
|
-# else
|
|
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
|
-# endif
|
|
- movq %r8, %r9
|
|
- addq %r8, %r8
|
|
- addq %r9, %r8
|
|
- cmpq %r8, %rdx
|
|
- ja L(L2_L3_cache_unaligned)
|
|
- sub $64, %rdx
|
|
- .p2align 4
|
|
-L(64bytesormore_loop):
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu 16(%rdi), %xmm1
|
|
- movdqu 32(%rdi), %xmm2
|
|
- movdqu 48(%rdi), %xmm3
|
|
-
|
|
- CMPEQ (%rsi), %xmm0
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm2
|
|
- CMPEQ 48(%rsi), %xmm3
|
|
-
|
|
- pand %xmm0, %xmm1
|
|
- pand %xmm2, %xmm3
|
|
- pand %xmm1, %xmm3
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- incw %ax
|
|
- jnz L(64bytesormore_loop_end)
|
|
-
|
|
- add $64, %rsi
|
|
- add $64, %rdi
|
|
- sub $64, %rdx
|
|
- ja L(64bytesormore_loop)
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(loop_tail):
|
|
- addq %rdx, %rdi
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu 16(%rdi), %xmm1
|
|
- movdqu 32(%rdi), %xmm2
|
|
- movdqu 48(%rdi), %xmm3
|
|
-
|
|
- addq %rdx, %rsi
|
|
- movdqu (%rsi), %xmm4
|
|
- movdqu 16(%rsi), %xmm5
|
|
- movdqu 32(%rsi), %xmm6
|
|
- movdqu 48(%rsi), %xmm7
|
|
-
|
|
- CMPEQ %xmm4, %xmm0
|
|
- CMPEQ %xmm5, %xmm1
|
|
- CMPEQ %xmm6, %xmm2
|
|
- CMPEQ %xmm7, %xmm3
|
|
-
|
|
- pand %xmm0, %xmm1
|
|
- pand %xmm2, %xmm3
|
|
- pand %xmm1, %xmm3
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- incw %ax
|
|
- jnz L(64bytesormore_loop_end)
|
|
- ret
|
|
-
|
|
-L(L2_L3_cache_unaligned):
|
|
- subq $64, %rdx
|
|
- .p2align 4
|
|
-L(L2_L3_unaligned_128bytes_loop):
|
|
- prefetchnta 0x1c0(%rdi)
|
|
- prefetchnta 0x1c0(%rsi)
|
|
-
|
|
- movdqu (%rdi), %xmm0
|
|
- movdqu 16(%rdi), %xmm1
|
|
- movdqu 32(%rdi), %xmm2
|
|
- movdqu 48(%rdi), %xmm3
|
|
-
|
|
- CMPEQ (%rsi), %xmm0
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm2
|
|
- CMPEQ 48(%rsi), %xmm3
|
|
-
|
|
- pand %xmm0, %xmm1
|
|
- pand %xmm2, %xmm3
|
|
- pand %xmm1, %xmm3
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- incw %ax
|
|
- jnz L(64bytesormore_loop_end)
|
|
-
|
|
- add $64, %rsi
|
|
- add $64, %rdi
|
|
- sub $64, %rdx
|
|
- ja L(L2_L3_unaligned_128bytes_loop)
|
|
- jmp L(loop_tail)
|
|
-
|
|
-
|
|
- /* This case is for machines which are sensitive for unaligned
|
|
- * instructions. */
|
|
- .p2align 4
|
|
-L(2aligned):
|
|
- cmp $128, %rdx
|
|
- ja L(128bytesormorein2aligned)
|
|
-L(less128bytesin2aligned):
|
|
- movdqa (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqa 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqa 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqa 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- cmp $96, %rdx
|
|
- jb L(32_to_64_bytes)
|
|
-
|
|
- addq $64, %rdi
|
|
- addq $64, %rsi
|
|
- subq $64, %rdx
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(aligned_last_64_bytes):
|
|
- movdqa (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqa 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqu -32(%rdi, %rdx), %xmm0
|
|
- movdqu -32(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end_16)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(128bytesormorein2aligned):
|
|
- cmp $256, %rdx
|
|
- ja L(aligned_loop)
|
|
-L(less256bytesin2alinged):
|
|
- movdqa (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqa 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqa 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqa 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- addq $64, %rdi
|
|
- addq $64, %rsi
|
|
-
|
|
- movdqa (%rdi), %xmm1
|
|
- CMPEQ (%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin)
|
|
-
|
|
- movdqa 16(%rdi), %xmm1
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_16)
|
|
-
|
|
- movdqa 32(%rdi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_32)
|
|
-
|
|
- movdqa 48(%rdi), %xmm1
|
|
- CMPEQ 48(%rsi), %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_begin_48)
|
|
-
|
|
- addq $-128, %rdx
|
|
- subq $-64, %rsi
|
|
- subq $-64, %rdi
|
|
-
|
|
- cmp $64, %rdx
|
|
- ja L(less128bytesin2aligned)
|
|
-
|
|
- cmp $32, %rdx
|
|
- ja L(aligned_last_64_bytes)
|
|
-
|
|
- movdqu -32(%rdi, %rdx), %xmm0
|
|
- movdqu -32(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end_16)
|
|
-
|
|
- movdqu -16(%rdi, %rdx), %xmm0
|
|
- movdqu -16(%rsi, %rdx), %xmm1
|
|
- CMPEQ %xmm0, %xmm1
|
|
- pmovmskb %xmm1, %eax
|
|
- incw %ax
|
|
- jnz L(vec_return_end)
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(aligned_loop):
|
|
-# ifdef DATA_CACHE_SIZE_HALF
|
|
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
|
-# else
|
|
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
|
-# endif
|
|
- movq %r8, %r9
|
|
- addq %r8, %r8
|
|
- addq %r9, %r8
|
|
- cmpq %r8, %rdx
|
|
- ja L(L2_L3_cache_aligned)
|
|
-
|
|
- sub $64, %rdx
|
|
- .p2align 4
|
|
-L(64bytesormore_loopin2aligned):
|
|
- movdqa (%rdi), %xmm0
|
|
- movdqa 16(%rdi), %xmm1
|
|
- movdqa 32(%rdi), %xmm2
|
|
- movdqa 48(%rdi), %xmm3
|
|
-
|
|
- CMPEQ (%rsi), %xmm0
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm2
|
|
- CMPEQ 48(%rsi), %xmm3
|
|
-
|
|
- pand %xmm0, %xmm1
|
|
- pand %xmm2, %xmm3
|
|
- pand %xmm1, %xmm3
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- incw %ax
|
|
- jnz L(64bytesormore_loop_end)
|
|
- add $64, %rsi
|
|
- add $64, %rdi
|
|
- sub $64, %rdx
|
|
- ja L(64bytesormore_loopin2aligned)
|
|
- jmp L(loop_tail)
|
|
-
|
|
-L(L2_L3_cache_aligned):
|
|
- subq $64, %rdx
|
|
- .p2align 4
|
|
-L(L2_L3_aligned_128bytes_loop):
|
|
- prefetchnta 0x1c0(%rdi)
|
|
- prefetchnta 0x1c0(%rsi)
|
|
- movdqa (%rdi), %xmm0
|
|
- movdqa 16(%rdi), %xmm1
|
|
- movdqa 32(%rdi), %xmm2
|
|
- movdqa 48(%rdi), %xmm3
|
|
-
|
|
- CMPEQ (%rsi), %xmm0
|
|
- CMPEQ 16(%rsi), %xmm1
|
|
- CMPEQ 32(%rsi), %xmm2
|
|
- CMPEQ 48(%rsi), %xmm3
|
|
-
|
|
- pand %xmm0, %xmm1
|
|
- pand %xmm2, %xmm3
|
|
- pand %xmm1, %xmm3
|
|
-
|
|
- pmovmskb %xmm3, %eax
|
|
- incw %ax
|
|
- jnz L(64bytesormore_loop_end)
|
|
-
|
|
- addq $64, %rsi
|
|
- addq $64, %rdi
|
|
- subq $64, %rdx
|
|
- ja L(L2_L3_aligned_128bytes_loop)
|
|
- jmp L(loop_tail)
|
|
-
|
|
- .p2align 4
|
|
-L(64bytesormore_loop_end):
|
|
- pmovmskb %xmm0, %ecx
|
|
- incw %cx
|
|
- jnz L(loop_end_ret)
|
|
-
|
|
- pmovmskb %xmm1, %ecx
|
|
- notw %cx
|
|
- sall $16, %ecx
|
|
- jnz L(loop_end_ret)
|
|
-
|
|
- pmovmskb %xmm2, %ecx
|
|
- notw %cx
|
|
- shlq $32, %rcx
|
|
- jnz L(loop_end_ret)
|
|
-
|
|
- addq $48, %rdi
|
|
- addq $48, %rsi
|
|
- movq %rax, %rcx
|
|
-
|
|
- .p2align 4,, 6
|
|
-L(loop_end_ret):
|
|
- bsfq %rcx, %rcx
|
|
-# ifdef USE_AS_WMEMCMP
|
|
- movl (%rdi, %rcx), %eax
|
|
- xorl %edx, %edx
|
|
- cmpl (%rsi, %rcx), %eax
|
|
- setg %dl
|
|
- leal -1(%rdx, %rdx), %eax
|
|
-# else
|
|
- movzbl (%rdi, %rcx), %eax
|
|
- movzbl (%rsi, %rcx), %ecx
|
|
- subl %ecx, %eax
|
|
-# endif
|
|
- ret
|
|
-END (MEMCMP)
|
|
-#endif
|