You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
450 lines
15 KiB
450 lines
15 KiB
10 months ago
|
From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
|
||
|
From: noah <goldstein.w.n@gmail.com>
|
||
|
Date: Sat, 3 Apr 2021 04:12:15 -0400
|
||
|
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
|
||
|
Content-type: text/plain; charset=UTF-8
|
||
|
|
||
|
No Bug. This commit updates the large memcpy case (no overlap). The
|
||
|
update is to perform memcpy on either 2 or 4 contiguous pages at
|
||
|
once. This 1) helps to alleviate the effects of false memory aliasing
|
||
|
when destination and source have a close 4k alignment and 2) in most
|
||
|
cases and for most DRAM units is a modestly more efficient access
|
||
|
pattern. These changes are a clear performance improvement for
|
||
|
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
|
||
|
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
|
||
|
pass.
|
||
|
|
||
|
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
---
|
||
|
.../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++----
|
||
|
1 file changed, 265 insertions(+), 73 deletions(-)
|
||
|
|
||
|
Conflicts:
|
||
|
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||
|
(different number of sections)
|
||
|
|
||
|
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||
|
index c475fed4..3e2dd6bc 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||
|
@@ -32,7 +32,16 @@
|
||
|
overlapping addresses.
|
||
|
6. If size >= __x86_shared_non_temporal_threshold and there is no
|
||
|
overlap between destination and source, use non-temporal store
|
||
|
- instead of aligned store. */
|
||
|
+ instead of aligned store copying from either 2 or 4 pages at
|
||
|
+ once.
|
||
|
+ 7. For point 6) if size < 16 * __x86_shared_non_temporal_threshold
|
||
|
+ and source and destination do not page alias, copy from 2 pages
|
||
|
+ at once using non-temporal stores. Page aliasing in this case is
|
||
|
+ considered true if destination's page alignment - source's page
|
||
|
+ alignment is less than 8 * VEC_SIZE.
|
||
|
+ 8. If size >= 16 * __x86_shared_non_temporal_threshold or source
|
||
|
+ and destination do page alias, copy from 4 pages at once using
|
||
|
+ non-temporal stores. */
|
||
|
|
||
|
#include <sysdep.h>
|
||
|
|
||
|
@@ -64,6 +73,34 @@
|
||
|
# endif
|
||
|
#endif
|
||
|
|
||
|
+#ifndef PAGE_SIZE
|
||
|
+# define PAGE_SIZE 4096
|
||
|
+#endif
|
||
|
+
|
||
|
+#if PAGE_SIZE != 4096
|
||
|
+# error Unsupported PAGE_SIZE
|
||
|
+#endif
|
||
|
+
|
||
|
+#ifndef LOG_PAGE_SIZE
|
||
|
+# define LOG_PAGE_SIZE 12
|
||
|
+#endif
|
||
|
+
|
||
|
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
|
||
|
+# error Invalid LOG_PAGE_SIZE
|
||
|
+#endif
|
||
|
+
|
||
|
+/* Bytes per page for large_memcpy inner loop. */
|
||
|
+#if VEC_SIZE == 64
|
||
|
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
|
||
|
+#else
|
||
|
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
|
||
|
+#endif
|
||
|
+
|
||
|
+/* Amount to shift rdx by to compare for large_memcpy_4x. */
|
||
|
+#ifndef LOG_4X_MEMCPY_THRESH
|
||
|
+# define LOG_4X_MEMCPY_THRESH 4
|
||
|
+#endif
|
||
|
+
|
||
|
/* Avoid short distance rep movsb only with non-SSE vector. */
|
||
|
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
|
||
|
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
|
||
|
@@ -103,6 +140,28 @@
|
||
|
# error Unsupported PREFETCH_SIZE!
|
||
|
#endif
|
||
|
|
||
|
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
|
||
|
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
|
||
|
+ VMOVU (offset)base, vec0; \
|
||
|
+ VMOVU ((offset) + VEC_SIZE)base, vec1;
|
||
|
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
|
||
|
+ VMOVNT vec0, (offset)base; \
|
||
|
+ VMOVNT vec1, ((offset) + VEC_SIZE)base;
|
||
|
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
|
||
|
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
||
|
+ VMOVU (offset)base, vec0; \
|
||
|
+ VMOVU ((offset) + VEC_SIZE)base, vec1; \
|
||
|
+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
|
||
|
+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
|
||
|
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
||
|
+ VMOVNT vec0, (offset)base; \
|
||
|
+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \
|
||
|
+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
|
||
|
+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
|
||
|
+#else
|
||
|
+# error Invalid LARGE_LOAD_SIZE
|
||
|
+#endif
|
||
|
+
|
||
|
#ifndef SECTION
|
||
|
# error SECTION is not defined!
|
||
|
#endif
|
||
|
@@ -390,6 +449,15 @@ L(last_4x_vec):
|
||
|
VZEROUPPER_RETURN
|
||
|
|
||
|
L(more_8x_vec):
|
||
|
+ /* Check if non-temporal move candidate. */
|
||
|
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||
|
+ /* Check non-temporal store threshold. */
|
||
|
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||
|
+ ja L(large_memcpy_2x)
|
||
|
+#endif
|
||
|
+ /* Entry if rdx is greater than non-temporal threshold but there
|
||
|
+ is overlap. */
|
||
|
+L(more_8x_vec_check):
|
||
|
cmpq %rsi, %rdi
|
||
|
ja L(more_8x_vec_backward)
|
||
|
/* Source == destination is less common. */
|
||
|
@@ -416,24 +484,21 @@ L(more_8x_vec):
|
||
|
subq %r8, %rdi
|
||
|
/* Adjust length. */
|
||
|
addq %r8, %rdx
|
||
|
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||
|
- /* Check non-temporal store threshold. */
|
||
|
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||
|
- ja L(large_forward)
|
||
|
-#endif
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
L(loop_4x_vec_forward):
|
||
|
/* Copy 4 * VEC a time forward. */
|
||
|
VMOVU (%rsi), %VEC(0)
|
||
|
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||
|
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||
|
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||
|
- addq $(VEC_SIZE * 4), %rsi
|
||
|
- subq $(VEC_SIZE * 4), %rdx
|
||
|
+ subq $-(VEC_SIZE * 4), %rsi
|
||
|
+ addq $-(VEC_SIZE * 4), %rdx
|
||
|
VMOVA %VEC(0), (%rdi)
|
||
|
VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||
|
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||
|
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||
|
- addq $(VEC_SIZE * 4), %rdi
|
||
|
+ subq $-(VEC_SIZE * 4), %rdi
|
||
|
cmpq $(VEC_SIZE * 4), %rdx
|
||
|
ja L(loop_4x_vec_forward)
|
||
|
/* Store the last 4 * VEC. */
|
||
|
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
|
||
|
subq %r8, %r9
|
||
|
/* Adjust length. */
|
||
|
subq %r8, %rdx
|
||
|
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||
|
- /* Check non-temporal store threshold. */
|
||
|
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||
|
- ja L(large_backward)
|
||
|
-#endif
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
L(loop_4x_vec_backward):
|
||
|
/* Copy 4 * VEC a time backward. */
|
||
|
VMOVU (%rcx), %VEC(0)
|
||
|
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||
|
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||
|
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||
|
- subq $(VEC_SIZE * 4), %rcx
|
||
|
- subq $(VEC_SIZE * 4), %rdx
|
||
|
+ addq $-(VEC_SIZE * 4), %rcx
|
||
|
+ addq $-(VEC_SIZE * 4), %rdx
|
||
|
VMOVA %VEC(0), (%r9)
|
||
|
VMOVA %VEC(1), -VEC_SIZE(%r9)
|
||
|
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||
|
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||
|
- subq $(VEC_SIZE * 4), %r9
|
||
|
+ addq $-(VEC_SIZE * 4), %r9
|
||
|
cmpq $(VEC_SIZE * 4), %rdx
|
||
|
ja L(loop_4x_vec_backward)
|
||
|
/* Store the first 4 * VEC. */
|
||
|
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
|
||
|
VZEROUPPER_RETURN
|
||
|
|
||
|
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||
|
-L(large_forward):
|
||
|
+ .p2align 4
|
||
|
+L(large_memcpy_2x):
|
||
|
+ /* Compute absolute value of difference between source and
|
||
|
+ destination. */
|
||
|
+ movq %rdi, %r9
|
||
|
+ subq %rsi, %r9
|
||
|
+ movq %r9, %r8
|
||
|
+ leaq -1(%r9), %rcx
|
||
|
+ sarq $63, %r8
|
||
|
+ xorq %r8, %r9
|
||
|
+ subq %r8, %r9
|
||
|
/* Don't use non-temporal store if there is overlap between
|
||
|
- destination and source since destination may be in cache
|
||
|
- when source is loaded. */
|
||
|
- leaq (%rdi, %rdx), %r10
|
||
|
- cmpq %r10, %rsi
|
||
|
- jb L(loop_4x_vec_forward)
|
||
|
-L(loop_large_forward):
|
||
|
+ destination and source since destination may be in cache when
|
||
|
+ source is loaded. */
|
||
|
+ cmpq %r9, %rdx
|
||
|
+ ja L(more_8x_vec_check)
|
||
|
+
|
||
|
+ /* Cache align destination. First store the first 64 bytes then
|
||
|
+ adjust alignments. */
|
||
|
+ VMOVU (%rsi), %VEC(8)
|
||
|
+#if VEC_SIZE < 64
|
||
|
+ VMOVU VEC_SIZE(%rsi), %VEC(9)
|
||
|
+#if VEC_SIZE < 32
|
||
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
|
||
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
|
||
|
+#endif
|
||
|
+#endif
|
||
|
+ VMOVU %VEC(8), (%rdi)
|
||
|
+#if VEC_SIZE < 64
|
||
|
+ VMOVU %VEC(9), VEC_SIZE(%rdi)
|
||
|
+#if VEC_SIZE < 32
|
||
|
+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
|
||
|
+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
|
||
|
+#endif
|
||
|
+#endif
|
||
|
+ /* Adjust source, destination, and size. */
|
||
|
+ movq %rdi, %r8
|
||
|
+ andq $63, %r8
|
||
|
+ /* Get the negative of offset for alignment. */
|
||
|
+ subq $64, %r8
|
||
|
+ /* Adjust source. */
|
||
|
+ subq %r8, %rsi
|
||
|
+ /* Adjust destination which should be aligned now. */
|
||
|
+ subq %r8, %rdi
|
||
|
+ /* Adjust length. */
|
||
|
+ addq %r8, %rdx
|
||
|
+
|
||
|
+ /* Test if source and destination addresses will alias. If they do
|
||
|
+ the larger pipeline in large_memcpy_4x alleviates the
|
||
|
+ performance drop. */
|
||
|
+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
|
||
|
+ jz L(large_memcpy_4x)
|
||
|
+
|
||
|
+ movq %rdx, %r10
|
||
|
+ shrq $LOG_4X_MEMCPY_THRESH, %r10
|
||
|
+ cmp __x86_shared_non_temporal_threshold(%rip), %r10
|
||
|
+ jae L(large_memcpy_4x)
|
||
|
+
|
||
|
+ /* edx will store remainder size for copying tail. */
|
||
|
+ andl $(PAGE_SIZE * 2 - 1), %edx
|
||
|
+ /* r10 stores outer loop counter. */
|
||
|
+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
|
||
|
+ /* Copy 4x VEC at a time from 2 pages. */
|
||
|
+ .p2align 4
|
||
|
+L(loop_large_memcpy_2x_outer):
|
||
|
+ /* ecx stores inner loop counter. */
|
||
|
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
||
|
+L(loop_large_memcpy_2x_inner):
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
|
||
|
+ /* Load vectors from rsi. */
|
||
|
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||
|
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||
|
+ subq $-LARGE_LOAD_SIZE, %rsi
|
||
|
+ /* Non-temporal store vectors to rdi. */
|
||
|
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||
|
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||
|
+ subq $-LARGE_LOAD_SIZE, %rdi
|
||
|
+ decl %ecx
|
||
|
+ jnz L(loop_large_memcpy_2x_inner)
|
||
|
+ addq $PAGE_SIZE, %rdi
|
||
|
+ addq $PAGE_SIZE, %rsi
|
||
|
+ decq %r10
|
||
|
+ jne L(loop_large_memcpy_2x_outer)
|
||
|
+ sfence
|
||
|
+
|
||
|
+ /* Check if only last 4 loads are needed. */
|
||
|
+ cmpl $(VEC_SIZE * 4), %edx
|
||
|
+ jbe L(large_memcpy_2x_end)
|
||
|
+
|
||
|
+ /* Handle the last 2 * PAGE_SIZE bytes. */
|
||
|
+L(loop_large_memcpy_2x_tail):
|
||
|
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
||
|
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||
|
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
||
|
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
||
|
VMOVU (%rsi), %VEC(0)
|
||
|
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||
|
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||
|
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||
|
- addq $PREFETCHED_LOAD_SIZE, %rsi
|
||
|
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
||
|
- VMOVNT %VEC(0), (%rdi)
|
||
|
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
||
|
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||
|
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||
|
- addq $PREFETCHED_LOAD_SIZE, %rdi
|
||
|
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||
|
- ja L(loop_large_forward)
|
||
|
- sfence
|
||
|
+ subq $-(VEC_SIZE * 4), %rsi
|
||
|
+ addl $-(VEC_SIZE * 4), %edx
|
||
|
+ VMOVA %VEC(0), (%rdi)
|
||
|
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||
|
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||
|
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||
|
+ subq $-(VEC_SIZE * 4), %rdi
|
||
|
+ cmpl $(VEC_SIZE * 4), %edx
|
||
|
+ ja L(loop_large_memcpy_2x_tail)
|
||
|
+
|
||
|
+L(large_memcpy_2x_end):
|
||
|
/* Store the last 4 * VEC. */
|
||
|
- VMOVU %VEC(5), (%rcx)
|
||
|
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||
|
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||
|
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||
|
- /* Store the first VEC. */
|
||
|
- VMOVU %VEC(4), (%r11)
|
||
|
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
||
|
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
||
|
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
||
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
||
|
+
|
||
|
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
||
|
VZEROUPPER_RETURN
|
||
|
|
||
|
-L(large_backward):
|
||
|
- /* Don't use non-temporal store if there is overlap between
|
||
|
- destination and source since destination may be in cache
|
||
|
- when source is loaded. */
|
||
|
- leaq (%rcx, %rdx), %r10
|
||
|
- cmpq %r10, %r9
|
||
|
- jb L(loop_4x_vec_backward)
|
||
|
-L(loop_large_backward):
|
||
|
- /* Copy 4 * VEC a time backward with non-temporal stores. */
|
||
|
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
||
|
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
||
|
- VMOVU (%rcx), %VEC(0)
|
||
|
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||
|
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||
|
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||
|
- subq $PREFETCHED_LOAD_SIZE, %rcx
|
||
|
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
||
|
- VMOVNT %VEC(0), (%r9)
|
||
|
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
||
|
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||
|
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||
|
- subq $PREFETCHED_LOAD_SIZE, %r9
|
||
|
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||
|
- ja L(loop_large_backward)
|
||
|
+ .p2align 4
|
||
|
+L(large_memcpy_4x):
|
||
|
+ movq %rdx, %r10
|
||
|
+ /* edx will store remainder size for copying tail. */
|
||
|
+ andl $(PAGE_SIZE * 4 - 1), %edx
|
||
|
+ /* r10 stores outer loop counter. */
|
||
|
+ shrq $(LOG_PAGE_SIZE + 2), %r10
|
||
|
+ /* Copy 4x VEC at a time from 4 pages. */
|
||
|
+ .p2align 4
|
||
|
+L(loop_large_memcpy_4x_outer):
|
||
|
+ /* ecx stores inner loop counter. */
|
||
|
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
||
|
+L(loop_large_memcpy_4x_inner):
|
||
|
+ /* Only one prefetch set per page as doing 4 pages gives more time
|
||
|
+ for prefetcher to keep up. */
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
|
||
|
+ /* Load vectors from rsi. */
|
||
|
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||
|
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||
|
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
||
|
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
||
|
+ subq $-LARGE_LOAD_SIZE, %rsi
|
||
|
+ /* Non-temporal store vectors to rdi. */
|
||
|
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||
|
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||
|
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
||
|
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
||
|
+ subq $-LARGE_LOAD_SIZE, %rdi
|
||
|
+ decl %ecx
|
||
|
+ jnz L(loop_large_memcpy_4x_inner)
|
||
|
+ addq $(PAGE_SIZE * 3), %rdi
|
||
|
+ addq $(PAGE_SIZE * 3), %rsi
|
||
|
+ decq %r10
|
||
|
+ jne L(loop_large_memcpy_4x_outer)
|
||
|
sfence
|
||
|
- /* Store the first 4 * VEC. */
|
||
|
- VMOVU %VEC(4), (%rdi)
|
||
|
- VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||
|
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||
|
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||
|
- /* Store the last VEC. */
|
||
|
- VMOVU %VEC(8), (%r11)
|
||
|
+ /* Check if only last 4 loads are needed. */
|
||
|
+ cmpl $(VEC_SIZE * 4), %edx
|
||
|
+ jbe L(large_memcpy_4x_end)
|
||
|
+
|
||
|
+ /* Handle the last 4 * PAGE_SIZE bytes. */
|
||
|
+L(loop_large_memcpy_4x_tail):
|
||
|
+ /* Copy 4 * VEC at a time forward with aligned stores. */
|
||
|
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||
|
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
||
|
+ VMOVU (%rsi), %VEC(0)
|
||
|
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||
|
+ subq $-(VEC_SIZE * 4), %rsi
|
||
|
+ addl $-(VEC_SIZE * 4), %edx
|
||
|
+ VMOVA %VEC(0), (%rdi)
|
||
|
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||
|
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||
|
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||
|
+ subq $-(VEC_SIZE * 4), %rdi
|
||
|
+ cmpl $(VEC_SIZE * 4), %edx
|
||
|
+ ja L(loop_large_memcpy_4x_tail)
|
||
|
+
|
||
|
+L(large_memcpy_4x_end):
|
||
|
+ /* Store the last 4 * VEC. */
|
||
|
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
||
|
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
||
|
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
||
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
||
|
+
|
||
|
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
||
|
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
||
|
VZEROUPPER_RETURN
|
||
|
#endif
|
||
|
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||
|
--
|
||
|
GitLab
|
||
|
|