You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
460 lines
14 KiB
460 lines
14 KiB
9 months ago
|
From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
|
||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
Date: Sun, 6 Feb 2022 00:54:18 -0600
|
||
|
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
|
||
|
Content-type: text/plain; charset=UTF-8
|
||
|
|
||
|
No bug.
|
||
|
|
||
|
Split vec generation into multiple steps. This allows the
|
||
|
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
|
||
|
case. This saves an expensive lane-cross instruction and removes
|
||
|
the need for 'vzeroupper'.
|
||
|
|
||
|
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
|
||
|
byte broadcast.
|
||
|
|
||
|
Results for memset-avx2 small (geomean of N = 20 benchset runs).
|
||
|
|
||
|
size, New Time, Old Time, New / Old
|
||
|
0, 4.100, 3.831, 0.934
|
||
|
1, 5.074, 4.399, 0.867
|
||
|
2, 4.433, 4.411, 0.995
|
||
|
4, 4.487, 4.415, 0.984
|
||
|
8, 4.454, 4.396, 0.987
|
||
|
16, 4.502, 4.443, 0.987
|
||
|
|
||
|
All relevant string/wcsmbs tests are passing.
|
||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||
|
---
|
||
|
sysdeps/x86_64/memset.S | 21 ++-
|
||
|
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
|
||
|
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
|
||
|
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
|
||
|
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
|
||
|
5 files changed, 152 insertions(+), 87 deletions(-)
|
||
|
|
||
|
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||
|
index 8672b030..27debd2b 100644
|
||
|
--- a/sysdeps/x86_64/memset.S
|
||
|
+++ b/sysdeps/x86_64/memset.S
|
||
|
@@ -28,17 +28,22 @@
|
||
|
#define VMOVU movups
|
||
|
#define VMOVA movaps
|
||
|
|
||
|
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
movd d, %xmm0; \
|
||
|
- movq r, %rax; \
|
||
|
- punpcklbw %xmm0, %xmm0; \
|
||
|
- punpcklwd %xmm0, %xmm0; \
|
||
|
- pshufd $0, %xmm0, %xmm0
|
||
|
+ pxor %xmm1, %xmm1; \
|
||
|
+ pshufb %xmm1, %xmm0; \
|
||
|
+ movq r, %rax
|
||
|
|
||
|
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
movd d, %xmm0; \
|
||
|
- movq r, %rax; \
|
||
|
- pshufd $0, %xmm0, %xmm0
|
||
|
+ pshufd $0, %xmm0, %xmm0; \
|
||
|
+ movq r, %rax
|
||
|
+
|
||
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||
|
+
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||
|
|
||
|
#define SECTION(p) p
|
||
|
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||
|
index 1af668af..c0bf2875 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||
|
@@ -10,15 +10,18 @@
|
||
|
# define VMOVU vmovdqu
|
||
|
# define VMOVA vmovdqa
|
||
|
|
||
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
vmovd d, %xmm0; \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastb %xmm0, %ymm0
|
||
|
+ movq r, %rax;
|
||
|
|
||
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
- vmovd d, %xmm0; \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastd %xmm0, %ymm0
|
||
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
||
|
+
|
||
|
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
|
||
|
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
|
||
|
+
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
|
||
|
|
||
|
# ifndef SECTION
|
||
|
# define SECTION(p) p##.avx
|
||
|
@@ -30,5 +33,6 @@
|
||
|
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
||
|
# endif
|
||
|
|
||
|
+# define USE_XMM_LESS_VEC
|
||
|
# include "memset-vec-unaligned-erms.S"
|
||
|
#endif
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||
|
index f14d6f84..5241216a 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||
|
@@ -15,13 +15,19 @@
|
||
|
|
||
|
# define VZEROUPPER
|
||
|
|
||
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastb d, %VEC0
|
||
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+ vpbroadcastb d, %VEC0; \
|
||
|
+ movq r, %rax
|
||
|
|
||
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastd d, %VEC0
|
||
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+ vpbroadcastd d, %VEC0; \
|
||
|
+ movq r, %rax
|
||
|
+
|
||
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||
|
+
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||
|
|
||
|
# define SECTION(p) p##.evex512
|
||
|
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||
|
index 64b09e77..63700215 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||
|
@@ -15,13 +15,19 @@
|
||
|
|
||
|
# define VZEROUPPER
|
||
|
|
||
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastb d, %VEC0
|
||
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+ vpbroadcastb d, %VEC0; \
|
||
|
+ movq r, %rax
|
||
|
|
||
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||
|
- movq r, %rax; \
|
||
|
- vpbroadcastd d, %VEC0
|
||
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||
|
+ vpbroadcastd d, %VEC0; \
|
||
|
+ movq r, %rax
|
||
|
+
|
||
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||
|
+
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||
|
|
||
|
# define SECTION(p) p##.evex
|
||
|
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||
|
index f08b7323..a67f9833 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||
|
@@ -58,8 +58,10 @@
|
||
|
#ifndef MOVQ
|
||
|
# if VEC_SIZE > 16
|
||
|
# define MOVQ vmovq
|
||
|
+# define MOVD vmovd
|
||
|
# else
|
||
|
# define MOVQ movq
|
||
|
+# define MOVD movd
|
||
|
# endif
|
||
|
#endif
|
||
|
|
||
|
@@ -72,9 +74,17 @@
|
||
|
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||
|
# define END_REG rcx
|
||
|
# define LOOP_REG rdi
|
||
|
+# define LESS_VEC_REG rax
|
||
|
#else
|
||
|
# define END_REG rdi
|
||
|
# define LOOP_REG rdx
|
||
|
+# define LESS_VEC_REG rdi
|
||
|
+#endif
|
||
|
+
|
||
|
+#ifdef USE_XMM_LESS_VEC
|
||
|
+# define XMM_SMALL 1
|
||
|
+#else
|
||
|
+# define XMM_SMALL 0
|
||
|
#endif
|
||
|
|
||
|
#define PAGE_SIZE 4096
|
||
|
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||
|
|
||
|
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||
|
shl $2, %RDX_LP
|
||
|
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
- jmp L(entry_from_bzero)
|
||
|
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
+ WMEMSET_VDUP_TO_VEC0_LOW()
|
||
|
+ cmpq $VEC_SIZE, %rdx
|
||
|
+ jb L(less_vec_no_vdup)
|
||
|
+ WMEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+ jmp L(entry_from_wmemset)
|
||
|
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||
|
#endif
|
||
|
|
||
|
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||
|
#endif
|
||
|
|
||
|
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||
|
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
# ifdef __ILP32__
|
||
|
/* Clear the upper 32 bits. */
|
||
|
mov %edx, %edx
|
||
|
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||
|
L(entry_from_bzero):
|
||
|
cmpq $VEC_SIZE, %rdx
|
||
|
jb L(less_vec)
|
||
|
+ MEMSET_VDUP_TO_VEC0_HIGH()
|
||
|
+L(entry_from_wmemset):
|
||
|
cmpq $(VEC_SIZE * 2), %rdx
|
||
|
ja L(more_2x_vec)
|
||
|
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||
|
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||
|
# endif
|
||
|
|
||
|
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
||
|
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||
|
# ifdef __ILP32__
|
||
|
/* Clear the upper 32 bits. */
|
||
|
mov %edx, %edx
|
||
|
# endif
|
||
|
cmp $VEC_SIZE, %RDX_LP
|
||
|
jb L(less_vec)
|
||
|
+ MEMSET_VDUP_TO_VEC0_HIGH ()
|
||
|
cmp $(VEC_SIZE * 2), %RDX_LP
|
||
|
ja L(stosb_more_2x_vec)
|
||
|
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
||
|
- */
|
||
|
- VMOVU %VEC(0), (%rax)
|
||
|
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||
|
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||
|
+ VMOVU %VEC(0), (%rdi)
|
||
|
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||
|
VZEROUPPER_RETURN
|
||
|
#endif
|
||
|
|
||
|
- .p2align 4,, 10
|
||
|
+ .p2align 4,, 4
|
||
|
L(last_2x_vec):
|
||
|
#ifdef USE_LESS_VEC_MASK_STORE
|
||
|
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
||
|
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
||
|
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||
|
#else
|
||
|
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
||
|
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
||
|
@@ -212,6 +228,7 @@ L(last_2x_vec):
|
||
|
#ifdef USE_LESS_VEC_MASK_STORE
|
||
|
.p2align 4,, 10
|
||
|
L(less_vec):
|
||
|
+L(less_vec_no_vdup):
|
||
|
/* Less than 1 VEC. */
|
||
|
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||
|
# error Unsupported VEC_SIZE!
|
||
|
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
|
||
|
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
||
|
and (4x, 8x] jump to target. */
|
||
|
L(more_2x_vec):
|
||
|
-
|
||
|
- /* Two different methods of setting up pointers / compare. The
|
||
|
- two methods are based on the fact that EVEX/AVX512 mov
|
||
|
- instructions take more bytes then AVX2/SSE2 mov instructions. As
|
||
|
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
||
|
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
||
|
- this saves code size and keeps a few targets in one fetch block.
|
||
|
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
||
|
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||
|
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
||
|
- LOOP_4X_OFFSET) with LEA_BID. */
|
||
|
-
|
||
|
- /* END_REG is rcx for EVEX/AVX512. */
|
||
|
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||
|
-#endif
|
||
|
-
|
||
|
- /* Stores to first 2x VEC before cmp as any path forward will
|
||
|
- require it. */
|
||
|
- VMOVU %VEC(0), (%rax)
|
||
|
- VMOVU %VEC(0), VEC_SIZE(%rax)
|
||
|
+ /* Store first 2x vec regardless. */
|
||
|
+ VMOVU %VEC(0), (%rdi)
|
||
|
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
|
||
|
|
||
|
|
||
|
+ /* Two different methods of setting up pointers / compare. The two
|
||
|
+ methods are based on the fact that EVEX/AVX512 mov instructions take
|
||
|
+ more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
|
||
|
+ machines also have fast LEA_BID. Both set up END_REG to avoid complex
|
||
|
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
|
||
|
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
|
||
|
+ bottlenecks. */
|
||
|
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||
|
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
||
|
addq %rdx, %END_REG
|
||
|
@@ -292,6 +299,15 @@ L(more_2x_vec):
|
||
|
cmpq $(VEC_SIZE * 4), %rdx
|
||
|
jbe L(last_2x_vec)
|
||
|
|
||
|
+
|
||
|
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||
|
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
|
||
|
+ LEA_BID. */
|
||
|
+
|
||
|
+ /* END_REG is rcx for EVEX/AVX512. */
|
||
|
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||
|
+#endif
|
||
|
+
|
||
|
/* Store next 2x vec regardless. */
|
||
|
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
||
|
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
||
|
@@ -355,65 +371,93 @@ L(stosb_local):
|
||
|
/* Define L(less_vec) only if not otherwise defined. */
|
||
|
.p2align 4
|
||
|
L(less_vec):
|
||
|
+ /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
|
||
|
+ xmm). This only does anything for AVX2. */
|
||
|
+ MEMSET_VDUP_TO_VEC0_LOW ()
|
||
|
+L(less_vec_no_vdup):
|
||
|
#endif
|
||
|
L(cross_page):
|
||
|
#if VEC_SIZE > 32
|
||
|
cmpl $32, %edx
|
||
|
- jae L(between_32_63)
|
||
|
+ jge L(between_32_63)
|
||
|
#endif
|
||
|
#if VEC_SIZE > 16
|
||
|
cmpl $16, %edx
|
||
|
- jae L(between_16_31)
|
||
|
+ jge L(between_16_31)
|
||
|
+#endif
|
||
|
+#ifndef USE_XMM_LESS_VEC
|
||
|
+ MOVQ %XMM0, %rcx
|
||
|
#endif
|
||
|
- MOVQ %XMM0, %rdi
|
||
|
cmpl $8, %edx
|
||
|
- jae L(between_8_15)
|
||
|
+ jge L(between_8_15)
|
||
|
cmpl $4, %edx
|
||
|
- jae L(between_4_7)
|
||
|
+ jge L(between_4_7)
|
||
|
cmpl $1, %edx
|
||
|
- ja L(between_2_3)
|
||
|
- jb L(return)
|
||
|
- movb %sil, (%rax)
|
||
|
- VZEROUPPER_RETURN
|
||
|
+ jg L(between_2_3)
|
||
|
+ jl L(between_0_0)
|
||
|
+ movb %sil, (%LESS_VEC_REG)
|
||
|
+L(between_0_0):
|
||
|
+ ret
|
||
|
|
||
|
- /* Align small targets only if not doing so would cross a fetch
|
||
|
- line. */
|
||
|
+ /* Align small targets only if not doing so would cross a fetch line.
|
||
|
+ */
|
||
|
#if VEC_SIZE > 32
|
||
|
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||
|
/* From 32 to 63. No branch when size == 32. */
|
||
|
L(between_32_63):
|
||
|
- VMOVU %YMM0, (%rax)
|
||
|
- VMOVU %YMM0, -32(%rax, %rdx)
|
||
|
+ VMOVU %YMM0, (%LESS_VEC_REG)
|
||
|
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
|
||
|
VZEROUPPER_RETURN
|
||
|
#endif
|
||
|
|
||
|
#if VEC_SIZE >= 32
|
||
|
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
|
||
|
L(between_16_31):
|
||
|
/* From 16 to 31. No branch when size == 16. */
|
||
|
- VMOVU %XMM0, (%rax)
|
||
|
- VMOVU %XMM0, -16(%rax, %rdx)
|
||
|
- VZEROUPPER_RETURN
|
||
|
+ VMOVU %XMM0, (%LESS_VEC_REG)
|
||
|
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
|
||
|
+ ret
|
||
|
#endif
|
||
|
|
||
|
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||
|
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||
|
+ */
|
||
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
|
||
|
L(between_8_15):
|
||
|
/* From 8 to 15. No branch when size == 8. */
|
||
|
- movq %rdi, (%rax)
|
||
|
- movq %rdi, -8(%rax, %rdx)
|
||
|
- VZEROUPPER_RETURN
|
||
|
+#ifdef USE_XMM_LESS_VEC
|
||
|
+ MOVQ %XMM0, (%rdi)
|
||
|
+ MOVQ %XMM0, -8(%rdi, %rdx)
|
||
|
+#else
|
||
|
+ movq %rcx, (%LESS_VEC_REG)
|
||
|
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
||
|
+#endif
|
||
|
+ ret
|
||
|
|
||
|
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
||
|
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||
|
+ */
|
||
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
|
||
|
L(between_4_7):
|
||
|
/* From 4 to 7. No branch when size == 4. */
|
||
|
- movl %edi, (%rax)
|
||
|
- movl %edi, -4(%rax, %rdx)
|
||
|
- VZEROUPPER_RETURN
|
||
|
+#ifdef USE_XMM_LESS_VEC
|
||
|
+ MOVD %XMM0, (%rdi)
|
||
|
+ MOVD %XMM0, -4(%rdi, %rdx)
|
||
|
+#else
|
||
|
+ movl %ecx, (%LESS_VEC_REG)
|
||
|
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
||
|
+#endif
|
||
|
+ ret
|
||
|
|
||
|
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||
|
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
|
||
|
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
|
||
|
L(between_2_3):
|
||
|
/* From 2 to 3. No branch when size == 2. */
|
||
|
- movw %di, (%rax)
|
||
|
- movb %dil, -1(%rax, %rdx)
|
||
|
- VZEROUPPER_RETURN
|
||
|
+#ifdef USE_XMM_LESS_VEC
|
||
|
+ movb %sil, (%rdi)
|
||
|
+ movb %sil, 1(%rdi)
|
||
|
+ movb %sil, -1(%rdi, %rdx)
|
||
|
+#else
|
||
|
+ movw %cx, (%LESS_VEC_REG)
|
||
|
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
|
||
|
+#endif
|
||
|
+ ret
|
||
|
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||
|
--
|
||
|
GitLab
|
||
|
|