glibc/SOURCES/glibc-RHEL-15696-78.patch

From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 6 Feb 2022 00:54:18 -0600
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8

No bug.

Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.

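For SSE2 replace the 'punpcklbw' / 'punpcklwd' / 'pshufd' sequence with a
zero-idiom 'pxor' plus 'pshufb' for byte broadcast.
NOTE(review): 'pshufb' is an SSSE3 instruction, not SSE2; confirm that the
follow-up change restoring an SSE2-only broadcast in memset.S is also applied.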

Results for memset-avx2 small (geomean of N = 20 benchset runs).

size, Old Time, New Time, New / Old
   0,    4.100,    3.831,     0.934
   1,    5.074,    4.399,     0.867
   2,    4.433,    4.411,     0.995
   4,    4.487,    4.415,     0.984
   8,    4.454,    4.396,     0.987
  16,    4.502,    4.443,     0.987

All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
 sysdeps/x86_64/memset.S                       |  21 ++-
 .../multiarch/memset-avx2-unaligned-erms.S    |  18 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  18 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  18 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 164 +++++++++++-------
 5 files changed, 152 insertions(+), 87 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 8672b030..27debd2b 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
 #define VMOVU     movups
 #define VMOVA     movaps
 
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  pxor %xmm1, %xmm1; \
+  pshufb %xmm1, %xmm0; \
+  movq r, %rax
 
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
-  movq r, %rax; \
-  pshufd $0, %xmm0, %xmm0
+  pshufd $0, %xmm0, %xmm0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 #define SECTION(p)		p
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af..c0bf2875 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
 # define VMOVU     vmovdqu
 # define VMOVA     vmovdqa
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastb %xmm0, %ymm0
+  movq r, %rax;
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
-  movq r, %rax; \
-  vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
@@ -30,5 +33,6 @@
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
 
+# define USE_XMM_LESS_VEC
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f84..5241216a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77..63700215 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
 
 # define VZEROUPPER
 
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastb d, %VEC0; \
+  movq r, %rax
 
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movq r, %rax; \
-  vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+  vpbroadcastd d, %VEC0; \
+  movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f08b7323..a67f9833 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
+#  define MOVD				vmovd
 # else
 #  define MOVQ				movq
+#  define MOVD				movd
 # endif
 #endif
 
@@ -72,9 +74,17 @@
 #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
+# define LESS_VEC_REG	rax
 #else
 # define END_REG	rdi
 # define LOOP_REG	rdx
+# define LESS_VEC_REG	rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL	1
+#else
+# define XMM_SMALL	0
 #endif
 
 #define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
-	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
-	jmp	L(entry_from_bzero)
+	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+	WMEMSET_VDUP_TO_VEC0_LOW()
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec_no_vdup)
+	WMEMSET_VDUP_TO_VEC0_HIGH()
+	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
-	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
 	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
+	MEMSET_VDUP_TO_VEC0_HIGH ()
 	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.
-	 */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
-	.p2align 4,, 10
+	.p2align 4,, 4
 L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
-	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
-	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
 #else
 	VMOVU	%VEC(0), (VEC_SIZE * -2)(%rdi)
 	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
 	.p2align 4,, 10
 L(less_vec):
+L(less_vec_no_vdup):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
 	/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
 	   and (4x, 8x] jump to target.  */
 L(more_2x_vec):
-
-	/* Two different methods of setting up pointers / compare. The
-	   two methods are based on the fact that EVEX/AVX512 mov
-	   instructions take more bytes then AVX2/SSE2 mov instructions. As
-	   well that EVEX/AVX512 machines also have fast LEA_BID. Both
-	   setup and END_REG to avoid complex address mode. For EVEX/AVX512
-	   this saves code size and keeps a few targets in one fetch block.
-	   For AVX2/SSE2 this helps prevent AGU bottlenecks.  */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
-	   LOOP_4X_OFFSET) with LEA_BID.  */
-
-	/* END_REG is rcx for EVEX/AVX512.  */
-	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
-	/* Stores to first 2x VEC before cmp as any path forward will
-	   require it.  */
-	VMOVU	%VEC(0), (%rax)
-	VMOVU	%VEC(0), VEC_SIZE(%rax)
+	/* Store next 2x vec regardless.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * 1)(%rdi)
 
 
+	/* Two different methods of setting up pointers / compare. The two
+	   methods are based on the fact that EVEX/AVX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+	   bottlenecks.  */
 #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jbe	L(last_2x_vec)
 
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+	   LEA_BID.  */
+
+	/* END_REG is rcx for EVEX/AVX512.  */
+	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
 	/* Store next 2x vec regardless.  */
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rax)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
 	/* Define L(less_vec) only if not otherwise defined.  */
 	.p2align 4
 L(less_vec):
+	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+	   xmm). This is only does anything for AVX2.  */
+	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
-	jae	L(between_32_63)
+	jge	L(between_32_63)
 #endif
 #if VEC_SIZE > 16
 	cmpl	$16, %edx
-	jae	L(between_16_31)
+	jge	L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, %rcx
 #endif
-	MOVQ	%XMM0, %rdi
 	cmpl	$8, %edx
-	jae	L(between_8_15)
+	jge	L(between_8_15)
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jge	L(between_4_7)
 	cmpl	$1, %edx
-	ja	L(between_2_3)
-	jb	L(return)
-	movb	%sil, (%rax)
-	VZEROUPPER_RETURN
+	jg	L(between_2_3)
+	jl	L(between_0_0)
+	movb	%sil, (%LESS_VEC_REG)
+L(between_0_0):
+	ret
 
-	/* Align small targets only if not doing so would cross a fetch
-	   line.  */
+	/* Align small targets only if not doing so would cross a fetch line.
+	 */
 #if VEC_SIZE > 32
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%rax)
-	VMOVU	%YMM0, -32(%rax, %rdx)
+	VMOVU	%YMM0, (%LESS_VEC_REG)
+	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
 #if VEC_SIZE >= 32
-	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%rax)
-	VMOVU	%XMM0, -16(%rax, %rdx)
-	VZEROUPPER_RETURN
+	VMOVU	%XMM0, (%LESS_VEC_REG)
+	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	ret
 #endif
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
-	movq	%rdi, (%rax)
-	movq	%rdi, -8(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVQ	%XMM0, (%rdi)
+	MOVQ	%XMM0, -8(%rdi, %rdx)
+#else
+	movq	%rcx, (%LESS_VEC_REG)
+	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	 */
+	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%edi, (%rax)
-	movl	%edi, -4(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	MOVD	%XMM0, (%rdi)
+	MOVD	%XMM0, -4(%rdi, %rdx)
+#else
+	movl	%ecx, (%LESS_VEC_REG)
+	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 
-	.p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+	/* 4 * XMM_SMALL for the third mov for AVX2.  */
+	.p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%di, (%rax)
-	movb	%dil, -1(%rax, %rdx)
-	VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+	movb	%sil, (%rdi)
+	movb	%sil, 1(%rdi)
+	movb	%sil, -1(%rdi, %rdx)
+#else
+	movw	%cx, (%LESS_VEC_REG)
+	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
GitLab
import glibc-2.28-251.el8 9 months ago			`From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001`
			`From: Noah Goldstein <goldstein.w.n@gmail.com>`
			`Date: Sun, 6 Feb 2022 00:54:18 -0600`
			`Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S`
			`Content-type: text/plain; charset=UTF-8`

			`No bug.`

			`Split vec generation into multiple steps. This allows the`
			`broadcast in AVX2 to use 'xmm' registers for the L(less_vec)`
			`case. This saves an expensive lane-cross instruction and removes`
			`the need for 'vzeroupper'.`

			`For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for`
			`byte broadcast.`

			`Results for memset-avx2 small (geomean of N = 20 benchset runs).`

			`size, Old Time, New Time, New / Old`
			`0, 4.100, 3.831, 0.934`
			`1, 5.074, 4.399, 0.867`
			`2, 4.433, 4.411, 0.995`
			`4, 4.487, 4.415, 0.984`
			`8, 4.454, 4.396, 0.987`
			`16, 4.502, 4.443, 0.987`

			`All relevant string/wcsmbs tests are passing.`
			`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`
			`---`
			`sysdeps/x86_64/memset.S \| 21 ++-`
			`.../multiarch/memset-avx2-unaligned-erms.S \| 18 +-`
			`.../multiarch/memset-avx512-unaligned-erms.S \| 18 +-`
			`.../multiarch/memset-evex-unaligned-erms.S \| 18 +-`
			`.../multiarch/memset-vec-unaligned-erms.S \| 164 +++++++++++-------`
			`5 files changed, 152 insertions(+), 87 deletions(-)`

			`diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S`
			`index 8672b030..27debd2b 100644`
			`--- a/sysdeps/x86_64/memset.S`
			`+++ b/sysdeps/x86_64/memset.S`
			`@@ -28,17 +28,22 @@`
			`#define VMOVU movups`
			`#define VMOVA movaps`

			`-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`movd d, %xmm0; \`
			`- movq r, %rax; \`
			`- punpcklbw %xmm0, %xmm0; \`
			`- punpcklwd %xmm0, %xmm0; \`
			`- pshufd $0, %xmm0, %xmm0`
			`+ pxor %xmm1, %xmm1; \`
			`+ pshufb %xmm1, %xmm0; \`
			`+ movq r, %rax`

			`-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`movd d, %xmm0; \`
			`- movq r, %rax; \`
			`- pshufd $0, %xmm0, %xmm0`
			`+ pshufd $0, %xmm0, %xmm0; \`
			`+ movq r, %rax`
			`+`
			`+# define MEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define MEMSET_VDUP_TO_VEC0_LOW()`
			`+`
			`+# define WMEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define WMEMSET_VDUP_TO_VEC0_LOW()`

			`#define SECTION(p) p`

			`diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S`
			`index 1af668af..c0bf2875 100644`
			`--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S`
			`+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S`
			`@@ -10,15 +10,18 @@`
			`# define VMOVU vmovdqu`
			`# define VMOVA vmovdqa`

			`-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`vmovd d, %xmm0; \`
			`- movq r, %rax; \`
			`- vpbroadcastb %xmm0, %ymm0`
			`+ movq r, %rax;`

			`-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`- vmovd d, %xmm0; \`
			`- movq r, %rax; \`
			`- vpbroadcastd %xmm0, %ymm0`
			`+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)`
			`+`
			`+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0`
			`+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0`
			`+`
			`+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0`
			`+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0`

			`# ifndef SECTION`
			`# define SECTION(p) p##.avx`
			`@@ -30,5 +33,6 @@`
			`# define WMEMSET_SYMBOL(p,s) p##_avx2_##s`
			`# endif`

			`+# define USE_XMM_LESS_VEC`
			`# include "memset-vec-unaligned-erms.S"`
			`#endif`
			`diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S`
			`index f14d6f84..5241216a 100644`
			`--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S`
			`+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S`
			`@@ -15,13 +15,19 @@`

			`# define VZEROUPPER`

			`-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`- movq r, %rax; \`
			`- vpbroadcastb d, %VEC0`
			`+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`+ vpbroadcastb d, %VEC0; \`
			`+ movq r, %rax`

			`-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`- movq r, %rax; \`
			`- vpbroadcastd d, %VEC0`
			`+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`+ vpbroadcastd d, %VEC0; \`
			`+ movq r, %rax`
			`+`
			`+# define MEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define MEMSET_VDUP_TO_VEC0_LOW()`
			`+`
			`+# define WMEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define WMEMSET_VDUP_TO_VEC0_LOW()`

			`# define SECTION(p) p##.evex512`
			`# define MEMSET_SYMBOL(p,s) p##_avx512_##s`
			`diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S`
			`index 64b09e77..63700215 100644`
			`--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S`
			`+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S`
			`@@ -15,13 +15,19 @@`

			`# define VZEROUPPER`

			`-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`- movq r, %rax; \`
			`- vpbroadcastb d, %VEC0`
			`+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`+ vpbroadcastb d, %VEC0; \`
			`+ movq r, %rax`

			`-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \`
			`- movq r, %rax; \`
			`- vpbroadcastd d, %VEC0`
			`+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \`
			`+ vpbroadcastd d, %VEC0; \`
			`+ movq r, %rax`
			`+`
			`+# define MEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define MEMSET_VDUP_TO_VEC0_LOW()`
			`+`
			`+# define WMEMSET_VDUP_TO_VEC0_HIGH()`
			`+# define WMEMSET_VDUP_TO_VEC0_LOW()`

			`# define SECTION(p) p##.evex`
			`# define MEMSET_SYMBOL(p,s) p##_evex_##s`
			`diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S`
			`index f08b7323..a67f9833 100644`
			`--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S`
			`+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S`
			`@@ -58,8 +58,10 @@`
			`#ifndef MOVQ`
			`# if VEC_SIZE > 16`
			`# define MOVQ vmovq`
			`+# define MOVD vmovd`
			`# else`
			`# define MOVQ movq`
			`+# define MOVD movd`
			`# endif`
			`#endif`

			`@@ -72,9 +74,17 @@`
			`#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512`
			`# define END_REG rcx`
			`# define LOOP_REG rdi`
			`+# define LESS_VEC_REG rax`
			`#else`
			`# define END_REG rdi`
			`# define LOOP_REG rdx`
			`+# define LESS_VEC_REG rdi`
			`+#endif`
			`+`
			`+#ifdef USE_XMM_LESS_VEC`
			`+# define XMM_SMALL 1`
			`+#else`
			`+# define XMM_SMALL 0`
			`#endif`

			`#define PAGE_SIZE 4096`
			`@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))`

			`ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))`
			`shl $2, %RDX_LP`
			`- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`- jmp L(entry_from_bzero)`
			`+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`+ WMEMSET_VDUP_TO_VEC0_LOW()`
			`+ cmpq $VEC_SIZE, %rdx`
			`+ jb L(less_vec_no_vdup)`
			`+ WMEMSET_VDUP_TO_VEC0_HIGH()`
			`+ jmp L(entry_from_wmemset)`
			`END (WMEMSET_SYMBOL (__wmemset, unaligned))`
			`#endif`

			`@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))`
			`#endif`

			`ENTRY (MEMSET_SYMBOL (__memset, unaligned))`
			`- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`# ifdef __ILP32__`
			`/* Clear the upper 32 bits. */`
			`mov %edx, %edx`
			`@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))`
			`L(entry_from_bzero):`
			`cmpq $VEC_SIZE, %rdx`
			`jb L(less_vec)`
			`+ MEMSET_VDUP_TO_VEC0_HIGH()`
			`+L(entry_from_wmemset):`
			`cmpq $(VEC_SIZE * 2), %rdx`
			`ja L(more_2x_vec)`
			`/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */`
			`@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))`
			`# endif`

			`ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)`
			`- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)`
			`# ifdef __ILP32__`
			`/* Clear the upper 32 bits. */`
			`mov %edx, %edx`
			`# endif`
			`cmp $VEC_SIZE, %RDX_LP`
			`jb L(less_vec)`
			`+ MEMSET_VDUP_TO_VEC0_HIGH ()`
			`cmp $(VEC_SIZE * 2), %RDX_LP`
			`ja L(stosb_more_2x_vec)`
			`- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.`
			`- */`
			`- VMOVU %VEC(0), (%rax)`
			`- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)`
			`+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */`
			`+ VMOVU %VEC(0), (%rdi)`
			`+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)`
			`VZEROUPPER_RETURN`
			`#endif`

			`- .p2align 4,, 10`
			`+ .p2align 4,, 4`
			`L(last_2x_vec):`
			`#ifdef USE_LESS_VEC_MASK_STORE`
			`- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)`
			`- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)`
			`+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)`
			`+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)`
			`#else`
			`VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)`
			`VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)`
			`@@ -212,6 +228,7 @@ L(last_2x_vec):`
			`#ifdef USE_LESS_VEC_MASK_STORE`
			`.p2align 4,, 10`
			`L(less_vec):`
			`+L(less_vec_no_vdup):`
			`/* Less than 1 VEC. */`
			`# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64`
			`# error Unsupported VEC_SIZE!`
			`@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):`
			`/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]`
			`and (4x, 8x] jump to target. */`
			`L(more_2x_vec):`
			`-`
			`- /* Two different methods of setting up pointers / compare. The`
			`- two methods are based on the fact that EVEX/AVX512 mov`
			`- instructions take more bytes then AVX2/SSE2 mov instructions. As`
			`- well that EVEX/AVX512 machines also have fast LEA_BID. Both`
			`- setup and END_REG to avoid complex address mode. For EVEX/AVX512`
			`- this saves code size and keeps a few targets in one fetch block.`
			`- For AVX2/SSE2 this helps prevent AGU bottlenecks. */`
			`-#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512`
			`- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +`
			`- LOOP_4X_OFFSET) with LEA_BID. */`
			`-`
			`- /* END_REG is rcx for EVEX/AVX512. */`
			`- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG`
			`-#endif`
			`-`
			`- /* Stores to first 2x VEC before cmp as any path forward will`
			`- require it. */`
			`- VMOVU %VEC(0), (%rax)`
			`- VMOVU %VEC(0), VEC_SIZE(%rax)`
			`+ /* Store next 2x vec regardless. */`
			`+ VMOVU %VEC(0), (%rdi)`
			`+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)`


			`+ /* Two different methods of setting up pointers / compare. The two`
			`+ methods are based on the fact that EVEX/AVX512 mov instructions take`
			`+ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512`
			`+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex`
			`+ address mode. For EVEX/AVX512 this saves code size and keeps a few`
			`+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU`
			`+ bottlenecks. */`
			`#if !(defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512)`
			`/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */`
			`addq %rdx, %END_REG`
			`@@ -292,6 +299,15 @@ L(more_2x_vec):`
			`cmpq $(VEC_SIZE * 4), %rdx`
			`jbe L(last_2x_vec)`

			`+`
			`+#if defined USE_WITH_EVEX \|\| defined USE_WITH_AVX512`
			`+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with`
			`+ LEA_BID. */`
			`+`
			`+ /* END_REG is rcx for EVEX/AVX512. */`
			`+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG`
			`+#endif`
			`+`
			`/* Store next 2x vec regardless. */`
			`VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)`
			`VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)`
			`@@ -355,65 +371,93 @@ L(stosb_local):`
			`/* Define L(less_vec) only if not otherwise defined. */`
			`.p2align 4`
			`L(less_vec):`
			`+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to`
			`+ xmm). This is only does anything for AVX2. */`
			`+ MEMSET_VDUP_TO_VEC0_LOW ()`
			`+L(less_vec_no_vdup):`
			`#endif`
			`L(cross_page):`
			`#if VEC_SIZE > 32`
			`cmpl $32, %edx`
			`- jae L(between_32_63)`
			`+ jge L(between_32_63)`
			`#endif`
			`#if VEC_SIZE > 16`
			`cmpl $16, %edx`
			`- jae L(between_16_31)`
			`+ jge L(between_16_31)`
			`+#endif`
			`+#ifndef USE_XMM_LESS_VEC`
			`+ MOVQ %XMM0, %rcx`
			`#endif`
			`- MOVQ %XMM0, %rdi`
			`cmpl $8, %edx`
			`- jae L(between_8_15)`
			`+ jge L(between_8_15)`
			`cmpl $4, %edx`
			`- jae L(between_4_7)`
			`+ jge L(between_4_7)`
			`cmpl $1, %edx`
			`- ja L(between_2_3)`
			`- jb L(return)`
			`- movb %sil, (%rax)`
			`- VZEROUPPER_RETURN`
			`+ jg L(between_2_3)`
			`+ jl L(between_0_0)`
			`+ movb %sil, (%LESS_VEC_REG)`
			`+L(between_0_0):`
			`+ ret`

			`- /* Align small targets only if not doing so would cross a fetch`
			`- line. */`
			`+ /* Align small targets only if not doing so would cross a fetch line.`
			`+ */`
			`#if VEC_SIZE > 32`
			`.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)`
			`/* From 32 to 63. No branch when size == 32. */`
			`L(between_32_63):`
			`- VMOVU %YMM0, (%rax)`
			`- VMOVU %YMM0, -32(%rax, %rdx)`
			`+ VMOVU %YMM0, (%LESS_VEC_REG)`
			`+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)`
			`VZEROUPPER_RETURN`
			`#endif`

			`#if VEC_SIZE >= 32`
			`- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)`
			`+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)`
			`L(between_16_31):`
			`/* From 16 to 31. No branch when size == 16. */`
			`- VMOVU %XMM0, (%rax)`
			`- VMOVU %XMM0, -16(%rax, %rdx)`
			`- VZEROUPPER_RETURN`
			`+ VMOVU %XMM0, (%LESS_VEC_REG)`
			`+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)`
			`+ ret`
			`#endif`

			`- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)`
			`+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.`
			`+ */`
			`+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)`
			`L(between_8_15):`
			`/* From 8 to 15. No branch when size == 8. */`
			`- movq %rdi, (%rax)`
			`- movq %rdi, -8(%rax, %rdx)`
			`- VZEROUPPER_RETURN`
			`+#ifdef USE_XMM_LESS_VEC`
			`+ MOVQ %XMM0, (%rdi)`
			`+ MOVQ %XMM0, -8(%rdi, %rdx)`
			`+#else`
			`+ movq %rcx, (%LESS_VEC_REG)`
			`+ movq %rcx, -8(%LESS_VEC_REG, %rdx)`
			`+#endif`
			`+ ret`

			`- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)`
			`+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.`
			`+ */`
			`+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)`
			`L(between_4_7):`
			`/* From 4 to 7. No branch when size == 4. */`
			`- movl %edi, (%rax)`
			`- movl %edi, -4(%rax, %rdx)`
			`- VZEROUPPER_RETURN`
			`+#ifdef USE_XMM_LESS_VEC`
			`+ MOVD %XMM0, (%rdi)`
			`+ MOVD %XMM0, -4(%rdi, %rdx)`
			`+#else`
			`+ movl %ecx, (%LESS_VEC_REG)`
			`+ movl %ecx, -4(%LESS_VEC_REG, %rdx)`
			`+#endif`
			`+ ret`

			`- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)`
			`+ /* 4 * XMM_SMALL for the third mov for AVX2. */`
			`+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)`
			`L(between_2_3):`
			`/* From 2 to 3. No branch when size == 2. */`
			`- movw %di, (%rax)`
			`- movb %dil, -1(%rax, %rdx)`
			`- VZEROUPPER_RETURN`
			`+#ifdef USE_XMM_LESS_VEC`
			`+ movb %sil, (%rdi)`
			`+ movb %sil, 1(%rdi)`
			`+ movb %sil, -1(%rdi, %rdx)`
			`+#else`
			`+ movw %cx, (%LESS_VEC_REG)`
			`+ movb %sil, -1(%LESS_VEC_REG, %rdx)`
			`+#endif`
			`+ ret`
			`END (MEMSET_SYMBOL (__memset, unaligned_erms))`
			`--`
			`GitLab`